Skip to content

Commit 1140dd1

Browse files
authored
Merge pull request #988 from douardda/swhid
2 parents ac41c20 + 5f26710 commit 1140dd1

File tree

6 files changed

+301
-9
lines changed

6 files changed

+301
-9
lines changed

docs/source/usage.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ Using ``repo2docker``
1414
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
1515
a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
1616
a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
17+
a `SWHID`_ of a directory of a revision archived in the
18+
`Software Heritage Archive <https://archive.softwareheritage.org>`_,
1719
or a path to a local directory.
1820

1921
It then performs these steps:
@@ -36,7 +38,8 @@ repo2docker is called with this command::
3638
where ``<source-repository>`` is:
3739

3840
* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
39-
* a Zenodo DOI (``10.5281/zenodo.1211089``), or
41+
* a Zenodo DOI (``10.5281/zenodo.1211089``),
42+
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
4043
* a path to a local directory (``a/local/directory``)
4144

4245
of the source repository you want to build.
@@ -132,3 +135,4 @@ Command line API
132135

133136

134137
.. _Pytudes: https://github.com/norvig/pytudes
138+
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

repo2docker/app.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ def _default_log_level(self):
148148
contentproviders.Figshare,
149149
contentproviders.Dataverse,
150150
contentproviders.Hydroshare,
151+
contentproviders.Swhid,
151152
contentproviders.Mercurial,
152153
contentproviders.Git,
153154
],
@@ -269,6 +270,18 @@ def _user_name_default(self):
269270
allow_none=True,
270271
)
271272

273+
swh_token = Unicode(
274+
None,
275+
help="""
276+
Token to use for authenticated SWH API access.
277+
278+
If unset, default to unauthenticated (limited) usage of the Software
279+
Heritage API.
280+
""",
281+
config=True,
282+
allow_none=True,
283+
)
284+
272285
cleanup_checkout = Bool(
273286
False,
274287
help="""
@@ -395,26 +408,29 @@ def fetch(self, url, ref, checkout_path):
395408
"No matching content provider found for " "{url}.".format(url=url)
396409
)
397410

411+
swh_token = self.config.get("swh_token", self.swh_token)
412+
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
413+
picked_content_provider.set_auth_token(swh_token)
414+
398415
for log_line in picked_content_provider.fetch(
399416
spec, checkout_path, yield_output=self.json_logs
400417
):
401418
self.log.info(log_line, extra=dict(phase="fetching"))
402419

403420
if not self.output_image_spec:
404-
self.output_image_spec = (
405-
"r2d" + escapism.escape(self.repo, escape_char="-").lower()
406-
)
421+
image_spec = "r2d" + self.repo
407422
# if we are building from a subdirectory include that in the
408423
# image name so we can tell builds from different sub-directories
409424
# apart.
410425
if self.subdir:
411-
self.output_image_spec += escapism.escape(
412-
self.subdir, escape_char="-"
413-
).lower()
426+
image_spec += self.subdir
414427
if picked_content_provider.content_id is not None:
415-
self.output_image_spec += picked_content_provider.content_id
428+
image_spec += picked_content_provider.content_id
416429
else:
417-
self.output_image_spec += str(int(time.time()))
430+
image_spec += str(int(time.time()))
431+
self.output_image_spec = escapism.escape(
432+
image_spec, escape_char="-"
433+
).lower()
418434

419435
def json_excepthook(self, etype, evalue, traceback):
420436
"""Called on an uncaught exception when using json logging

repo2docker/contentproviders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .dataverse import Dataverse
66
from .hydroshare import Hydroshare
77
from .mercurial import Mercurial
8+
from .swhid import Swhid
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import io
2+
import os
3+
import shutil
4+
import tarfile
5+
import time
6+
import re
7+
8+
from os import path
9+
10+
import requests
11+
12+
from .base import ContentProvider
13+
from ..utils import copytree
14+
from .. import __version__
15+
16+
17+
def parse_swhid(swhid):
18+
swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
19+
# only parse/check the <identifier_core> of the swhid
20+
# see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
21+
m = re.match(swhid_regexp, swhid.split(";")[0])
22+
if m:
23+
return m.groupdict()
24+
25+
26+
class Swhid(ContentProvider):
27+
"""Provide contents of a repository identified by a SWHID."""
28+
29+
retry_delay = 5
30+
31+
def __init__(self):
32+
self.swhid = None
33+
self.base_url = "https://archive.softwareheritage.org/api/1"
34+
self.session = requests.Session()
35+
self.session.headers.update(
36+
{
37+
"user-agent": "repo2docker {}".format(__version__),
38+
}
39+
)
40+
41+
def set_auth_token(self, token):
42+
header = {"Authorization": "Bearer {}".format(token)}
43+
self.session.headers.update(header)
44+
45+
def _request(self, url, method="GET"):
46+
if not url.endswith("/"):
47+
url = url + "/"
48+
49+
for retries in range(3):
50+
try:
51+
resp = self.session.request(method, url)
52+
if resp.ok:
53+
break
54+
except requests.ConnectionError:
55+
time.sleep(self.retry_delay)
56+
57+
return resp
58+
59+
@property
60+
def content_id(self):
61+
"""The SWHID record ID used for content retrieval"""
62+
return self.swhid
63+
64+
def detect(self, swhid, ref=None, extra_args=None):
65+
swhid_dict = parse_swhid(swhid)
66+
67+
if (
68+
swhid_dict
69+
and swhid_dict["type"] in ("dir", "rev")
70+
and swhid_dict["version"] == "1"
71+
):
72+
return {"swhid": swhid, "swhid_obj": swhid_dict}
73+
74+
def fetch_directory(self, dir_hash, output_dir):
75+
url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
76+
yield "Fetching directory {} from {}\n".format(dir_hash, url)
77+
resp = self._request(url, "POST")
78+
receipt = resp.json()
79+
status = receipt["status"]
80+
assert status != "failed", receipt
81+
while status not in ("failed", "done"):
82+
time.sleep(self.retry_delay)
83+
resp = self._request(url)
84+
status = resp.json()["status"]
85+
if status == "failed":
86+
yield "Error preparing the directory for download"
87+
raise Exception()
88+
resp = self._request(resp.json()["fetch_url"])
89+
archive = tarfile.open(fileobj=io.BytesIO(resp.content))
90+
archive.extractall(path=output_dir)
91+
# the output_dir should have only one subdir named after the dir_hash
92+
# move its content one level up
93+
copytree(path.join(output_dir, dir_hash), output_dir)
94+
shutil.rmtree(path.join(output_dir, dir_hash))
95+
yield "Fetched files: {}\n".format(os.listdir(output_dir))
96+
97+
def fetch(self, spec, output_dir, yield_output=False):
98+
swhid = spec["swhid"]
99+
swhid_obj = spec["swhid_obj"]
100+
101+
if swhid_obj["type"] == "rev":
102+
# need to get the directory for this revision
103+
sha1git = swhid_obj["hash"]
104+
url = "{}/revision/{}/".format(self.base_url, sha1git)
105+
yield "Fetching revision {} from {}\n".format(sha1git, url)
106+
resp = self._request(url)
107+
assert resp.ok, (resp.content, self.session.headers)
108+
directory = resp.json()["directory"]
109+
self.swhid = "swh:1:dir:{}".format(directory)
110+
yield from self.fetch_directory(directory, output_dir)
111+
elif swhid_obj["type"] == "dir":
112+
self.swhid = swhid
113+
yield from self.fetch_directory(swhid_obj["hash"], output_dir)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def get_identifier(json):
5656
"ruamel.yaml>=0.15",
5757
"toml",
5858
"semver",
59+
"requests",
5960
],
6061
python_requires=">=3.6",
6162
author="Project Jupyter Contributors",
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import json
2+
import os
3+
import io
4+
import tarfile
5+
import shutil
6+
import re
7+
import urllib
8+
import pytest
9+
import tempfile
10+
import logging
11+
import requests_mock
12+
13+
from os import makedirs
14+
from os.path import join
15+
from unittest.mock import patch, MagicMock, mock_open
16+
from zipfile import ZipFile
17+
18+
from repo2docker.contentproviders.swhid import Swhid, parse_swhid
19+
from repo2docker.contentproviders.base import ContentProviderException
20+
21+
22+
# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
23+
# We do not use this later to prevent having to depend on swh.model[cli]
24+
def swhid_of_dir(path):
25+
object = Directory.from_disk(path=path).get_data()
26+
return swhid(DIRECTORY, object)
27+
28+
29+
def test_content_id():
30+
swhid = Swhid()
31+
assert swhid.content_id is None
32+
33+
34+
swhids_ok = [
35+
"swh:1:dir:" + "0" * 40,
36+
"swh:1:rev:" + "0" * 40,
37+
]
38+
swhids_invalid = [
39+
"swh:1:dir:" + "0" * 39,
40+
"swh:2:dir:" + "0" * 40,
41+
"swh:1:rev:" + "0" * 41,
42+
"swh:1:cnt:" + "0" * 40,
43+
"swh:1:ori:" + "0" * 40,
44+
"swh:1:rel:" + "0" * 40,
45+
"swh:1:snp:" + "0" * 40,
46+
]
47+
48+
detect_values = [
49+
(swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
50+
] + [(swhid, None) for swhid in swhids_invalid]
51+
52+
53+
@pytest.mark.parametrize("swhid, expected", detect_values)
54+
def test_detect(swhid, expected):
55+
provider = Swhid()
56+
assert provider.detect(swhid) == expected
57+
58+
59+
def fake_urlopen(req):
60+
print(req)
61+
return req.headers
62+
63+
64+
def test_unresolving_swhid():
65+
provider = Swhid()
66+
67+
# swhid = "0" * 40
68+
# assert provider.swhid2url(swhid) is swhid
69+
70+
71+
NULLID = "0" * 40
72+
73+
74+
@pytest.fixture
75+
def gen_tarfile(tmpdir):
76+
rootdir = join(tmpdir, "tmp")
77+
makedirs(rootdir)
78+
with open(join(rootdir, "file1.txt"), "wb") as fobj:
79+
fobj.write(b"Some content\n")
80+
81+
# this directory hash can be computed using the swh.model package, but we do
82+
not want to depend on this later to limit dependencies and because it
83+
# does not support python 3.6;
84+
dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
85+
buf = io.BytesIO()
86+
tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
87+
tarf.add(rootdir, arcname=dirhash)
88+
tarf.close()
89+
shutil.rmtree(rootdir)
90+
return dirhash, buf.getvalue()
91+
92+
93+
def mocked_provider(tmpdir, dirhash, tarfile_buf):
94+
provider = Swhid()
95+
adapter = requests_mock.Adapter()
96+
provider.base_url = "mock://api/1"
97+
provider.retry_delay = 0.1
98+
provider.session.mount("mock://", adapter)
99+
100+
adapter.register_uri(
101+
"GET",
102+
"mock://api/1/revision/{}/".format(NULLID),
103+
json={
104+
"author": {"fullname": "John Doe <[email protected]>"},
105+
"directory": dirhash,
106+
},
107+
)
108+
adapter.register_uri(
109+
"POST",
110+
"mock://api/1/vault/directory/{}/".format(dirhash),
111+
json={
112+
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
113+
"status": "new",
114+
},
115+
)
116+
adapter.register_uri(
117+
"GET",
118+
"mock://api/1/vault/directory/{}/".format(dirhash),
119+
[
120+
{
121+
"json": {
122+
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
123+
"status": "pending",
124+
}
125+
},
126+
{
127+
"json": {
128+
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
129+
"status": "done",
130+
}
131+
},
132+
],
133+
)
134+
adapter.register_uri(
135+
"GET",
136+
"mock://api/1/vault/directory/{}/raw/".format(dirhash),
137+
content=tarfile_buf,
138+
)
139+
return provider
140+
141+
142+
def test_fetch_revision(tmpdir, gen_tarfile):
143+
dir_id, tarfile_buf = gen_tarfile
144+
provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
145+
swhid = "swh:1:rev:" + NULLID
146+
for log in provider.fetch(provider.detect(swhid), tmpdir):
147+
print(log)
148+
assert provider.content_id == "swh:1:dir:" + dir_id
149+
150+
151+
def test_fetch_directory(tmpdir, gen_tarfile):
152+
dir_id, tarfile_buf = gen_tarfile
153+
provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
154+
swhid = "swh:1:dir:" + dir_id
155+
for log in provider.fetch(provider.detect(swhid), tmpdir):
156+
print(log)
157+
assert provider.content_id == swhid

0 commit comments

Comments
 (0)