From 04b9f8587370f96366c6e53fb411473279ba7c02 Mon Sep 17 00:00:00 2001 From: Yousong Zhou Date: Thu, 28 Jun 2018 18:27:27 +0800 Subject: [PATCH] scripts/dl_github_archive.py: rename from download.py - Make the code more GitHub-specific - Requires mirror hash to work with .gitattributes - Use different API depending on whether PKG_SOURCE_VERSION is a complete commit id or other ref types like tags - Fix removing symbolic link - pre-clean dir_untar for possible leftovers from previous run Signed-off-by: Yousong Zhou --- include/download.mk | 14 +- scripts/{download.py => dl_github_archive.py} | 241 +++++++++--------- 2 files changed, 130 insertions(+), 125 deletions(-) rename scripts/{download.py => dl_github_archive.py} (64%) diff --git a/include/download.mk b/include/download.mk index 5cfc542042..1627b4fdfb 100644 --- a/include/download.mk +++ b/include/download.mk @@ -172,25 +172,25 @@ endef define DownloadMethod/git $(call wrap_mirror,$(1),$(2), \ - $(call DownloadMethod/git-raw) \ + $(call DownloadMethod/rawgit) \ ) endef -define DownloadMethod/github-tarball +define DownloadMethod/github_archive $(call wrap_mirror,$(1),$(2), \ - $(SCRIPT_DIR)/download.py dl \ + $(SCRIPT_DIR)/dl_github_archive.py \ --dl-dir="$(DL_DIR)" \ - --url $(foreach url,$(URL),"$(url)") \ - --proto="$(PROTO)" \ + --url="$(URL)" \ --version="$(VERSION)" \ --subdir="$(SUBDIR)" \ --source="$(FILE)" \ - || ( $(call DownloadMethod/git-raw) ); \ + --hash="$(MIRROR_HASH)" \ + || ( $(call DownloadMethod/rawgit) ); \ ) endef # Only intends to be called as a submethod from other DownloadMethod -define DownloadMethod/git-raw +define DownloadMethod/rawgit echo "Checking out files from the git repository..."; \ mkdir -p $(TMP_DIR)/dl && \ cd $(TMP_DIR)/dl && \ diff --git a/scripts/download.py b/scripts/dl_github_archive.py similarity index 64% rename from scripts/download.py rename to scripts/dl_github_archive.py index 779d7b3de2..5a5a016e37 100755 --- a/scripts/download.py +++ b/scripts/dl_github_archive.py @@ -10,6 +10,7 @@ import calendar import datetime import errno import fcntl +import hashlib import json import os import os.path @@ -23,26 +24,31 @@ import urllib2 TMPDIR = os.environ.get('TMP_DIR') or '/tmp' TMPDIR_DL = os.path.join(TMPDIR, 'dl') -DOWNLOAD_METHODS = [] + class PathException(Exception): pass -class DownloadException(Exception): pass +class DownloadGitHubError(Exception): pass class Path(object): """Context class for preparing and cleaning up directories. + If ```preclean` is ``False``, ``path`` will NOT be removed on context enter + If ``path`` ``isdir``, then it will be created on context enter. If ``keep`` is True, then ``path`` will NOT be removed on context exit """ - def __init__(self, path, isdir=True, keep=False): + def __init__(self, path, isdir=True, preclean=False, keep=False): self.path = path self.isdir = isdir + self.preclean = preclean self.keep = keep def __enter__(self): + if self.preclean: + self.rm_all(self.path) if self.isdir: self.mkdir_all(self.path) return self @@ -61,14 +67,11 @@ class Path(object): Path._mkdir(p) @staticmethod - def _rmdir_all(dir_): + def _rmdir_dir(dir_): names = Path._listdir(dir_) for name in names: p = os.path.join(dir_, name) - if os.path.isdir(p): - Path._rmdir_all(p) - else: - Path._remove(p) + Path.rm_all(p) Path._rmdir(dir_) @staticmethod @@ -105,8 +108,10 @@ class Path(object): @staticmethod def rm_all(path): """Same as rm -r.""" - if os.path.isdir(path): - Path._rmdir_all(path) + if os.path.islink(path): + Path._remove(path) + elif os.path.isdir(path): + Path._rmdir_dir(path) else: Path._remove(path) @@ -201,60 +206,47 @@ class GitHubCommitTsCache(object): fout.write(line) -class DownloadMethod(object): - """Base class of all download method.""" +class DownloadGitHubTarball(object): + """Download and repack archive tarabll from GitHub. - def __init__(self, args): - self.args = args - self.urls = args.urls - self.url = self.urls[0] - self.dl_dir = args.dl_dir + Compared with the method of packing after cloning the whole repo, this + method is more friendly to users with fragile internet connection. - @classmethod - def resolve(cls, args): - """Resolve download method to use. + However, there are limitations with this method - return instance of subclass of DownloadMethod - """ - for c in DOWNLOAD_METHODS: - if c.match(args): - return c(args) + - GitHub imposes a 60 reqs/hour limit for unauthenticated API access. + This affects fetching commit date for reproducible tarballs. Download + through the archive link is not affected. - @staticmethod - def match(args): - """Return True if it can do the download.""" - return NotImplemented + - GitHub archives do not contain source codes for submodules. - def download(self): - """Do the download and put it into the download dir.""" - return NotImplemented + - GitHub archives seem to respect .gitattributes and ignore pathes with + export-ignore attributes. + For the first two issues, the method will fail loudly to allow fallback to + clone-then-pack method. -class DownloadMethodGitHubTarball(DownloadMethod): - """Download and repack archive tarabll from GitHub.""" + As for the 3rd issue, to make sure that this method only produces identical + tarballs as the fallback method, we require the expected hash value to be + supplied. That means the first tarball will need to be prepared by the + clone-then-pack method + """ __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P[^/]+)/(?P[^/]+)') def __init__(self, args): - super(DownloadMethodGitHubTarball, self).__init__(args) - self._init_owner_repo() + self.dl_dir = args.dl_dir self.version = args.version self.subdir = args.subdir self.source = args.source + self.url = args.url + self._init_owner_repo() + self.xhash = args.hash + self._init_hasher() self.commit_ts = None # lazy load commit timestamp self.commit_ts_cache = GitHubCommitTsCache() self.name = 'github-tarball' - @staticmethod - def match(args): - """Match if it's a GitHub clone url.""" - url = args.urls[0] - proto = args.proto - if proto == 'git' and isinstance(url, basestring) \ - and (url.startswith('https://github.com/') or url.startswith('git://github.com/')): - return True - return False - def download(self): """Download and repack GitHub archive tarball.""" self._init_commit_ts() @@ -265,18 +257,23 @@ class DownloadMethodGitHubTarball(DownloadMethod): self._fetch(tarball_path) # unpack d = os.path.join(dir_dl.path, self.subdir + '.untar') - with Path(d) as dir_untar: + with Path(d, preclean=True) as dir_untar: tarball_prefix = Path.untar(tarball_path, into=dir_untar.path) dir0 = os.path.join(dir_untar.path, tarball_prefix) dir1 = os.path.join(dir_untar.path, self.subdir) # submodules check if self._has_submodule(dir0): - raise DownloadException('unable to fetch submodules\' source code') + raise self._error('Fetching submodules is not yet supported') # rename subdir os.rename(dir0, dir1) # repack into=os.path.join(TMPDIR_DL, self.source) Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts) + try: + self._hash_check(into) + except Exception: + Path.rm_all(into) + raise # move to target location file1 = os.path.join(self.dl_dir, self.source) if into != file1: @@ -291,10 +288,9 @@ class DownloadMethodGitHubTarball(DownloadMethod): return e.errno != errno.ENOENT def _init_owner_repo(self): - url = self.url - m = self.__repo_url_regex.search(url) + m = self.__repo_url_regex.search(self.url) if m is None: - raise DownloadException('invalid github url: %s' % url) + raise self._error('Invalid github url: {}'.format(self.url)) owner = m.group('owner') repo = m.group('repo') if repo.endswith('.git'): @@ -302,23 +298,79 @@ class DownloadMethodGitHubTarball(DownloadMethod): self.owner = owner self.repo = repo + def _init_hasher(self): + xhash = self.xhash + if len(xhash) == 64: + self.hasher = hashlib.sha256() + elif len(xhash) == 32: + self.hasher = hashlib.md5() + else: + raise self._error('Requires sha256sum for verification') + self.xhash = xhash + + def _hash_check(self, f): + with open(f, 'rb') as fin: + while True: + d = fin.read(4096) + if not d: + break + self.hasher.update(d) + xhash = self.hasher.hexdigest() + if xhash != self.xhash: + raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash)) + def _init_commit_ts(self): if self.commit_ts is not None: return - url = self._make_repo_url_path('git', 'commits', self.version) - ct = self.commit_ts_cache.get(url) - if ct is not None: - self.commit_ts = ct - return + # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more + # terse while API[2] provides more verbose info such as commit diff + # etc. That's the main reason why API[1] is preferred: the response + # size is predictable. + # + # However, API[1] only accepts complete commit sha1sum as the parameter + # while API[2] is more liberal accepting also partial commit id and + # tags, etc. + # + # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit + # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit + apis = [ + { + 'url': self._make_repo_url_path('git', 'commits', self.version), + 'attr_path': ('committer', 'date'), + }, { + 'url': self._make_repo_url_path('commits', self.version), + 'attr_path': ('commit', 'committer', 'date'), + }, + ] + version_is_sha1sum = len(self.version) == 40 + if not version_is_sha1sum: + apis.insert(0, apis.pop()) + for api in apis: + url = api['url'] + attr_path = api['attr_path'] + try: + ct = self.commit_ts_cache.get(url) + if ct is not None: + self.commit_ts = ct + return + ct = self._init_commit_ts_remote_get(url, attr_path) + self.commit_ts = ct + self.commit_ts_cache.set(url, ct) + return + except Exception: + pass + raise self._error('Cannot fetch commit ts: {}'.format(url)) + + def _init_commit_ts_remote_get(self, url, attrpath): resp = self._make_request(url) data = resp.read() - data = json.loads(data) - date = data['committer']['date'] + date = json.loads(data) + for attr in attrpath: + date = date[attr] date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ') date = date.timetuple() ct = calendar.timegm(date) - self.commit_ts = ct - self.commit_ts_cache.set(url, ct) + return ct def _fetch(self, path): """Fetch tarball of the specified version ref.""" @@ -350,72 +402,25 @@ class DownloadMethodGitHubTarball(DownloadMethod): fileobj = urllib2.urlopen(req, context=sslcontext) return fileobj - -class DownloadMethodCatchall(DownloadMethod): - """Dummy method that knows names but not ways of download.""" - - def __init__(self, args): - super(DownloadMethodCatchall, self).__init__(args) - self.args = args - self.proto = args.proto - self.name = self._resolve_name() - - def _resolve_name(self): - if self.proto: - return self.proto - methods_map = ( - ('default', ('@APACHE/', '@GITHUB/', '@GNOME/', '@GNU/', - '@KERNEL/', '@SF/', '@SAVANNAH/', 'ftp://', 'http://', - 'https://', 'file://')), - ('git', ('git://', )), - ('svn', ('svn://', )), - ('cvs', ('cvs://', )), - ('bzr', ('sftp://', )), - ('bzr', ('sftp://', )), - ('unknown', ('', )), - ) - for name, prefixes in methods_map: - if any(url.startswith(prefix) for prefix in prefixes for url in self.urls): - return name - - @staticmethod - def match(args): - """Return True.""" - return True - - def download(self): - """Not implemented. - - raise DownloadException - """ - raise DownloadException('download method for %s is not yet implemented' % self.name) - -# order matters -DOWNLOAD_METHODS = [ - DownloadMethodGitHubTarball, - DownloadMethodCatchall, -] + def _error(self, msg): + return DownloadGitHubError('{}: {}'.format(self.source, msg)) def main(): parser = argparse.ArgumentParser() - parser.add_argument('action', choices=('dl_method', 'dl'), help='Action to take') - parser.add_argument('--urls', nargs='+', metavar='URL', help='Download URLs') - parser.add_argument('--proto', help='Download proto') + parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir') + parser.add_argument('--url', help='Download URL') parser.add_argument('--subdir', help='Source code subdir name') parser.add_argument('--version', help='Source code version') parser.add_argument('--source', help='Source tarball filename') - parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir') + parser.add_argument('--hash', help='Source tarball\'s expected sha256sum') args = parser.parse_args() - if args.action == 'dl_method': - method = DownloadMethod.resolve(args) - sys.stdout.write(method.name + '\n') - elif args.action == 'dl': - method = DownloadMethod.resolve(args) - try: - method.download() - except Exception: - raise + method = DownloadGitHubTarball(args) + try: + method.download() + except Exception: + sys.stderr.write('download {} from {} failed\n'.format(args.source, args.url)) + raise if __name__ == '__main__': main()