Changeset - f0fbb0fe4462
[Not reviewed]
default
0 1 0
Mads Kiilerich (mads) - 5 years ago 2021-05-18 00:58:06
mads@kiilerich.com
git: update check for invalid URL characters to work with Python versions that include an attempt at fixing the very same problem

With changes like
https://github.com/python/cpython/commit/76cd81d60310d65d01f9d7b48a8985d8ab89c8b4
making it to Python 3.10 and being backported to previous Python versions, the
approach in a8a51a3bdb61 no longer works when combined with
urllib.parse.urlparse in d2f59de17bef: path will never contain the invalid
characters.

To catch this case anyway, add a new check to verify that the parsed URL can
round-trip back to the original representation with urllib.parse.urlunparse.

The actual exception might vary, but one of them should always fire.

There is a risk that the new check will reject some URLs that somehow aren't
normalized. No such cases have been found yet.
1 file changed with 8 insertions and 0 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/vcs/backends/git/repository.py
Show inline comments
 
@@ -99,204 +99,212 @@ class GitRepository(BaseRepository):
 

	
 
    @classmethod
    def _run_git_command(cls, cmd, cwd=None):
        """
        Execute git with the arguments in ``cmd`` and collect its output.

        Returns a ``(stdout, stderr)`` tuple of bytes ... or raises
        RepositoryError if the subprocess could not be started.

        :param cmd: list of git arguments (the git executable is prepended here)
        :param cwd: passed directly to subprocess
        """
        # Work on a private copy of the environment: GIT_DIR must not leak
        # into the child process, and GIT_CONFIG_NOGLOBAL is forced to '1'.
        child_env = {name: value for name, value in os.environ.items()
                     if name != 'GIT_DIR'}
        child_env['GIT_CONFIG_NOGLOBAL'] = '1'

        assert isinstance(cmd, list), cmd
        argv = [settings.GIT_EXECUTABLE_PATH, '-c', 'core.quotepath=false'] + cmd

        try:
            proc = subprocessio.SubprocessIOChunker(argv, cwd=cwd, env=child_env, shell=False)
        except (EnvironmentError, OSError) as exc:
            # output from the failing process is in str(EnvironmentError)
            details = (argv, type(exc).__name__, exc)
            failure = ("Couldn't run git command %s.\n"
                       "Subprocess failed with '%s': %s\n" % details).strip()
            log.error(failure)
            raise RepositoryError(failure)

        # Drain both streams completely, then close the process whatever happens.
        try:
            out_bytes = b''.join(proc.output)
            err_bytes = b''.join(proc.error)
        finally:
            proc.close()

        # TODO: introduce option to make commands fail if they have any stderr output?
        if not err_bytes:
            log.debug('stderr from %s: None', argv)
        else:
            log.debug('stderr from %s:\n%s', argv, err_bytes)
        return out_bytes, err_bytes
 

	
 
    def run_git_command(self, cmd):
        """
        Run ``cmd`` as a git command for this repository.

        The repository path is used as working directory when it is an
        existing directory; otherwise no working directory is set.
        Returns stdout converted with ``safe_str`` ... or raise RepositoryError.
        """
        workdir = self.path if os.path.isdir(self.path) else None
        out_bytes, _err_bytes = self._run_git_command(cmd, cwd=workdir)
        return safe_str(out_bytes)
 

	
 
    @staticmethod
 
    def _check_url(url):
 
        r"""
 
        Raise URLError if url doesn't seem like a valid safe Git URL. We
 
        only allow http, https, git, and ssh URLs.
 

	
 
        For http and https URLs, make a connection and probe to see if it is valid.
 

	
 
        >>> GitRepository._check_url('git://example.com/my%20fine repo')
 

	
 
        >>> GitRepository._check_url('http://example.com:65537/repo')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Error parsing URL: 'http://example.com:65537/repo'>
 
        >>> GitRepository._check_url('foo')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Unsupported protocol in URL 'foo'>
 
        >>> GitRepository._check_url('file:///repo')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Unsupported protocol in URL 'file:///repo'>
 
        >>> GitRepository._check_url('git+http://example.com/repo')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Unsupported protocol in URL 'git+http://example.com/repo'>
 
        >>> GitRepository._check_url('git://example.com/%09')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '%'>
 
        >>> GitRepository._check_url('git://example.com/%x00')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '%'>
 
        >>> GitRepository._check_url(r'git://example.com/\u0009')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '\'>
 
        >>> GitRepository._check_url(r'git://example.com/\t')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '\'>
 
        >>> GitRepository._check_url('git://example.com/\t')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid ...>
 

	
 
        The failure above will be one of, depending on the level of WhatWG support:
 
        urllib.error.URLError: <urlopen error Invalid whitespace character in path: '\t'>
 
        urllib.error.URLError: <urlopen error Invalid url: 'git://example.com/	' normalizes to 'git://example.com/'>
 
        """
 
        try:
 
            parsed_url = urllib.parse.urlparse(url)
 
            parsed_url.port  # trigger netloc parsing which might raise ValueError
 
        except ValueError:
 
            raise urllib.error.URLError("Error parsing URL: %r" % url)
 

	
 
        # check first if it's not an local url
 
        if os.path.isabs(url) and os.path.isdir(url):
 
            return
 

	
 
        unparsed_url = urllib.parse.urlunparse(parsed_url)
 
        if unparsed_url != url:
 
            raise urllib.error.URLError("Invalid url: '%s' normalizes to '%s'" % (url, unparsed_url))
 

	
 
        if parsed_url.scheme == 'git':
 
            # Mitigate problems elsewhere with incorrect handling of encoded paths.
 
            # Don't trust urllib.parse.unquote but be prepared for more flexible implementations elsewhere.
 
            # Space is the only allowed whitespace character - directly or % encoded. No other % or \ is allowed.
 
            for c in parsed_url.path.replace('%20', ' '):
 
                if c in '%\\':
 
                    raise urllib.error.URLError("Invalid escape character in path: '%s'" % c)
 
                if c.isspace() and c != ' ':
 
                    raise urllib.error.URLError("Invalid whitespace character in path: %r" % c)
 
            return
 

	
 
        if parsed_url.scheme not in ['http', 'https']:
 
            raise urllib.error.URLError("Unsupported protocol in URL %r" % url)
 

	
 
        url_obj = mercurial.util.url(safe_bytes(url))
 
        test_uri, handlers = get_urllib_request_handlers(url_obj)
 
        if not test_uri.endswith(b'info/refs'):
 
            test_uri = test_uri.rstrip(b'/') + b'/info/refs'
 

	
 
        url_obj.passwd = b'*****'
 
        cleaned_uri = str(url_obj)
 

	
 
        o = urllib.request.build_opener(*handlers)
 
        o.addheaders = [('User-Agent', 'git/1.7.8.0')]  # fake some git
 

	
 
        req = urllib.request.Request(
 
            "%s?%s" % (
 
                safe_str(test_uri),
 
                urllib.parse.urlencode({"service": 'git-upload-pack'})
 
            ))
 

	
 
        try:
 
            resp = o.open(req)
 
            if resp.code != 200:
 
                raise Exception('Return Code is not 200')
 
        except Exception as e:
 
            # means it cannot be cloned
 
            raise urllib.error.URLError("[%s] org_exc: %s" % (cleaned_uri, e))
 

	
 
        # now detect if it's proper git repo
 
        gitdata = resp.read()
 
        if b'service=git-upload-pack' not in gitdata:
 
            raise urllib.error.URLError(
 
                "url [%s] does not look like an git" % cleaned_uri)
 

	
 
    def _get_repo(self, create, src_url=None, update_after_clone=False,
                  bare=False):
        """
        Open, create or clone the repository at ``self.path`` and return
        the resulting ``Repo`` object.

        :param create: create a new repository instead of opening an existing one
        :param src_url: when given, clone from this URL (requires ``create``)
        :param update_after_clone: passed on to ``self.clone``
        :param bare: passed on to ``self.clone``, or selects a bare init
        :raises RepositoryError: if the location already exists on create,
            if ``src_url`` is given without ``create``, or if the underlying
            operation raises NotGitRepository or OSError
        """
        if create and os.path.exists(self.path):
            raise RepositoryError("Location already exist")
        if src_url and not create:
            raise RepositoryError("Create should be set to True if src_url is "
                                  "given (clone operation creates repository)")
        try:
            # Plain open of an existing repository.
            if not create:
                return Repo(self.path)

            # Clone: validate the source URL first, then open the result.
            if src_url:
                GitRepository._check_url(src_url)
                self.clone(src_url, update_after_clone, bare)
                return Repo(self.path)

            # Fresh repository: make the directory and initialise it.
            os.makedirs(self.path)
            initialise = Repo.init_bare if bare else Repo.init
            return initialise(self.path)
        except (NotGitRepository, OSError) as err:
            raise RepositoryError(err)
 

	
 
    def _get_all_revisions(self):
 
        # we must check if this repo is not empty, since later command
 
        # fails if it is. And it's cheaper to ask than throw the subprocess
 
        # errors
 
        try:
 
            self._repo.head()
 
        except KeyError:
 
            return []
 

	
 
        rev_filter = settings.GIT_REV_FILTER
 
        cmd = ['rev-list', rev_filter, '--reverse', '--date-order']
 
        try:
 
            so = self.run_git_command(cmd)
 
        except RepositoryError:
 
            # Can be raised for empty repositories
 
            return []
 
        return so.splitlines()
 

	
 
    def _get_all_revisions2(self):
 
        # alternate implementation using dulwich
 
        includes = [ascii_str(sha) for key, (sha, type_) in self._parsed_refs.items()
 
                    if type_ != b'T']
 
        return [c.commit.id for c in self._repo.get_walker(include=includes)]
 

	
 
    def _get_revision(self, revision):
 
        """
 
        Given any revision identifier, returns a 40 char string with revision hash.
 
        """
0 comments (0 inline, 0 general)