Changeset - f0fbb0fe4462
[Not reviewed]
default
0 1 0
Mads Kiilerich (mads) - 5 years ago 2021-05-18 00:58:06
mads@kiilerich.com
git: update check for invalid URL characters to work with Python versions that include an attempt at fixing the very same problem

With changes like
https://github.com/python/cpython/commit/76cd81d60310d65d01f9d7b48a8985d8ab89c8b4
making it to Python 3.10 and being backported to previous Python versions, the
approach in a8a51a3bdb61 no longer works when combined with
urllib.parse.urlparse in d2f59de17bef: path will never contain the invalid
characters.

To catch this case anyway, add a new check to verify that the parsed URL can
round-trip back to the original representation with urllib.parse.urlunparse.

The actual exception might vary, but one of them should always fire.

There is a risk that the new check will reject some URLs that somehow aren't
normalized. No such cases have been found yet.
1 file changed with 8 insertions and 0 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/vcs/backends/git/repository.py
Show inline comments
 
@@ -171,60 +171,68 @@ class GitRepository(BaseRepository):
 
        urllib.error.URLError: <urlopen error Unsupported protocol in URL 'file:///repo'>
 
        >>> GitRepository._check_url('git+http://example.com/repo')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Unsupported protocol in URL 'git+http://example.com/repo'>
 
        >>> GitRepository._check_url('git://example.com/%09')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '%'>
 
        >>> GitRepository._check_url('git://example.com/%x00')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '%'>
 
        >>> GitRepository._check_url(r'git://example.com/\u0009')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '\'>
 
        >>> GitRepository._check_url(r'git://example.com/\t')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid escape character in path: '\'>
 
        >>> GitRepository._check_url('git://example.com/\t')
 
        Traceback (most recent call last):
 
        ...
 
        urllib.error.URLError: <urlopen error Invalid ...>
 

	
 
        The failure above will be one of, depending on the level of WhatWG support:
 
        urllib.error.URLError: <urlopen error Invalid whitespace character in path: '\t'>
 
        urllib.error.URLError: <urlopen error Invalid url: 'git://example.com/	' normalizes to 'git://example.com/'>
 
        """
 
        try:
 
            parsed_url = urllib.parse.urlparse(url)
 
            parsed_url.port  # trigger netloc parsing which might raise ValueError
 
        except ValueError:
 
            raise urllib.error.URLError("Error parsing URL: %r" % url)
 

	
 
        # check first if it's not a local url
 
        if os.path.isabs(url) and os.path.isdir(url):
 
            return
 

	
 
        unparsed_url = urllib.parse.urlunparse(parsed_url)
 
        if unparsed_url != url:
 
            raise urllib.error.URLError("Invalid url: '%s' normalizes to '%s'" % (url, unparsed_url))
 

	
 
        if parsed_url.scheme == 'git':
 
            # Mitigate problems elsewhere with incorrect handling of encoded paths.
 
            # Don't trust urllib.parse.unquote but be prepared for more flexible implementations elsewhere.
 
            # Space is the only allowed whitespace character - directly or % encoded. No other % or \ is allowed.
 
            for c in parsed_url.path.replace('%20', ' '):
 
                if c in '%\\':
 
                    raise urllib.error.URLError("Invalid escape character in path: '%s'" % c)
 
                if c.isspace() and c != ' ':
 
                    raise urllib.error.URLError("Invalid whitespace character in path: %r" % c)
 
            return
 

	
 
        if parsed_url.scheme not in ['http', 'https']:
 
            raise urllib.error.URLError("Unsupported protocol in URL %r" % url)
 

	
 
        url_obj = mercurial.util.url(safe_bytes(url))
 
        test_uri, handlers = get_urllib_request_handlers(url_obj)
 
        if not test_uri.endswith(b'info/refs'):
 
            test_uri = test_uri.rstrip(b'/') + b'/info/refs'
 

	
 
        url_obj.passwd = b'*****'
 
        cleaned_uri = str(url_obj)
 

	
 
        o = urllib.request.build_opener(*handlers)
 
        o.addheaders = [('User-Agent', 'git/1.7.8.0')]  # fake some git
0 comments (0 inline, 0 general)