Changeset - fd61f678577f
[Not reviewed]
default
0 2 1
Mads Kiilerich (mads) - 5 years ago 2020-11-14 15:20:40
mads@kiilerich.com
Grafted from: 9ae72340369d
diff: improved handling of Git diffs with " quoting

Kallithea would intentionally and explicitly fail with an ugly exception when
trying to parse Git diffs with quoted filenames.

Improve this by parsing quotes ... and ignore them, as long as they are
matching. The content inside the quotes might be \-escaped ... but that could
potentially also be the case without quoting. We will fix that later.

Adding some test cases that before would have failed to parse and raised an
exception.

Thanks to stypr of Flatt Security for raising this.
3 files changed with 112 insertions and 3 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/diffs.py
Show inline comments
 
@@ -466,109 +466,109 @@ def _escaper(diff_line):
 
    >>> _escaper(' foo\t')
 
    ' foo<u>\t</u><i></i>'
 
    >>> _escaper(' foo ')
 
    ' foo <i></i>'
 
    >>> _escaper(' foo  ')
 
    ' foo  <i></i>'
 
    >>> _escaper(' ')
 
    ' '
 
    >>> _escaper('  ')
 
    '  <i></i>'
 
    >>> _escaper(' \t')
 
    ' <u>\t</u><i></i>'
 
    >>> _escaper(' \t  ')
 
    ' <u>\t</u>  <i></i>'
 
    >>> _escaper('   \t')
 
    '   <u>\t</u><i></i>'
 
    >>> _escaper(' \t\t  ')
 
    ' <u>\t</u><u>\t</u>  <i></i>'
 
    >>> _escaper('   \t\t')
 
    '   <u>\t</u><u>\t</u><i></i>'
 
    >>> _escaper(' foo&bar<baz>  ')
 
    ' foo&amp;bar&lt;baz&gt;  <i></i>'
 
    """
 

	
 
    def substitute(m):
 
        groups = m.groups()
 
        if groups[0]:
 
            return '&amp;'
 
        if groups[1]:
 
            return '&lt;'
 
        if groups[2]:
 
            return '&gt;'
 
        if groups[3]:
 
            if groups[4] is not None:  # end of line
 
                return '<u>\t</u><i></i>'
 
            return '<u>\t</u>'
 
        if groups[5]:
 
            return '<u class="cr"></u>'
 
        if groups[6]:
 
            if m.start() == 0:
 
                return ' '  # first column space shouldn't make empty lines show up as trailing space
 
            return ' <i></i>'
 
        assert False
 

	
 
    return _escape_re.sub(substitute, diff_line)
 

	
 

	
 
_git_header_re = re.compile(br"""
 
    ^diff[ ]--git[ ]a/(?P<a_path>.+?)[ ]b/(?P<b_path>.+?)\n
 
    ^diff[ ]--git[ ](?P<a_path_quote>"?)a/(?P<a_path>.+?)(?P=a_path_quote)[ ](?P<b_path_quote>"?)b/(?P<b_path>.+?)(?P=a_path_quote)\n
 
    (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
 
       ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
 
    (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
 
       ^rename[ ]from[ ](?P<rename_from>.+)\n
 
       ^rename[ ]to[ ](?P<rename_to>.+)(?:\n|$))?
 
    (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
 
    (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
 
    (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
 
        \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
 
    (?:^(?P<bin_patch>GIT[ ]binary[ ]patch)(?:\n|$))?
 
    (?:^---[ ](a/(?P<a_file>.+?)|/dev/null)\t?(?:\n|$))?
 
    (?:^\+\+\+[ ](b/(?P<b_file>.+?)|/dev/null)\t?(?:\n|$))?
 
    (?:^---[ ](?P<a_file_quote>"?)(a/(?P<a_file>.+?)(?P=a_file_quote)|/dev/null)\t?(?:\n|$))?
 
    (?:^\+\+\+[ ](?P<b_file_quote>"?)(b/(?P<b_file>.+?)(?P=b_file_quote)|/dev/null)\t?(?:\n|$))?
 
""", re.VERBOSE | re.MULTILINE)
 

	
 

	
 
_hg_header_re = re.compile(br"""
 
    ^diff[ ]--git[ ]a/(?P<a_path>.+?)[ ]b/(?P<b_path>.+?)\n
 
    (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
 
       ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
 
    (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%(?:\n|$))?
 
    (?:^rename[ ]from[ ](?P<rename_from>.+)\n
 
       ^rename[ ]to[ ](?P<rename_to>.+)(?:\n|$))?
 
    (?:^copy[ ]from[ ](?P<copy_from>.+)\n
 
       ^copy[ ]to[ ](?P<copy_to>.+)(?:\n|$))?
 
    (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
 
    (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
 
    (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
 
        \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
 
    (?:^(?P<bin_patch>GIT[ ]binary[ ]patch)(?:\n|$))?
 
    (?:^---[ ](a/(?P<a_file>.+?)|/dev/null)\t?(?:\n|$))?
 
    (?:^\+\+\+[ ](b/(?P<b_file>.+?)|/dev/null)\t?(?:\n|$))?
 
""", re.VERBOSE | re.MULTILINE)
 

	
 

	
 
_header_next_check = re.compile(br'''(?!@)(?!literal )(?!delta )''')
 

	
 

	
 
def _get_header(vcs, diff_chunk):
 
    """
 
    Parses a Git diff for a single file (header and chunks) and returns a tuple with:
 

	
 
    1. A dict with meta info:
 

	
 
        a_path, b_path, similarity_index, rename_from, rename_to,
 
        old_mode, new_mode, new_file_mode, deleted_file_mode,
 
        a_blob_id, b_blob_id, b_mode, a_file, b_file
 

	
 
    2. An iterator yielding lines with simple HTML markup.
 
    """
 
    match = None
 
    if vcs == 'git':
 
        match = _git_header_re.match(diff_chunk)
 
    elif vcs == 'hg':
 
        match = _hg_header_re.match(diff_chunk)
 
    if match is None:
 
        raise Exception('diff not recognized as valid %s diff: %r' % (vcs, safe_str(bytes(diff_chunk[:1000]))))
 
    meta_info = {k: None if v is None else safe_str(v) for k, v in match.groupdict().items()}
 
    rest = diff_chunk[match.end():]
 
    if rest:
 
        if _header_next_check.match(rest):
kallithea/tests/fixtures/git_diff_quoting.diff
Show inline comments
 
new file 100644
 
diff --git "a/\"foo\"" "b/\"foo\""
 
new file mode 100644
 
index 0000000..8b13789
 
--- /dev/null
 
+++ "b/\"foo\""
 
@@ -0,0 +1 @@
 
+
 
diff --git a/'foo' b/'foo'
 
new file mode 100644
 
index 0000000..8b13789
 
--- /dev/null
 
+++ b/'foo'
 
@@ -0,0 +1 @@
 
+
 
diff --git "a/'foo'\"foo\"" "b/'foo'\"foo\""
 
new file mode 100644
 
index 0000000..8b13789
 
--- /dev/null
 
+++ "b/'foo'\"foo\""
 
@@ -0,0 +1 @@
 
+
 
diff --git "a/a\r\nb" "b/a\r\nb"
 
new file mode 100644
 
index 0000000..30d74d2
 
--- /dev/null
 
+++ "b/a\r\nb"
 
@@ -0,0 +1 @@
 
+test
 
\ No newline at end of file
 
diff --git "a/foo\rfoo" "b/foo\rfoo"
 
new file mode 100644
 
index 0000000..e69de29
 
diff --git a/foo bar b/foo bar
 
new file mode 100644
 
index 0000000..219ea2b
 
--- /dev/null
 
+++ b/foo bar	
 
@@ -0,0 +1 @@
 
+foo  bar
 
\ No newline at end of file
 
diff --git a/test b/test
 
new file mode 100644
 
index 0000000..9daeafb
 
--- /dev/null
 
+++ b/test
 
@@ -0,0 +1 @@
 
+test
 
diff --git "a/esc\033foo" "b/esc\033foo"
 
new file mode 100644
 
index 0000000..e69de29
 
diff --git "a/tab\tfoo" "b/tab\tfoo"
 
new file mode 100644
 
index 0000000..e69de29
kallithea/tests/models/test_diff_parsers.py
Show inline comments
 
@@ -223,92 +223,148 @@ DIFF_FIXTURES = {
 
    ],
 
    'git_diff_modify_binary_file.diff': [
 
        ('file.name', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {MOD_FILENODE: 'modified file',
 
                  BIN_FILENODE: 'binary diff not shown'}})
 
    ],
 
    'hg_diff_copy_file.diff': [
 
        ('file2', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {COPIED_FILENODE: 'file copied from file1 to file2'}}),
 
    ],
 
    'hg_diff_copy_and_modify_file.diff': [
 
        ('file3', 'modified',
 
         {'added': 1,
 
          'deleted': 0,
 
          'binary': False,
 
          'ops': {COPIED_FILENODE: 'file copied from file2 to file3',
 
                  MOD_FILENODE: 'modified file'}}),
 
    ],
 
    'hg_diff_copy_and_chmod_file.diff': [
 
        ('file4', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {COPIED_FILENODE: 'file copied from file3 to file4',
 
                  CHMOD_FILENODE: 'modified file chmod 100644 => 100755'}}),
 
    ],
 
    'hg_diff_copy_chmod_and_edit_file.diff': [
 
        ('file5', 'modified',
 
         {'added': 2,
 
          'deleted': 1,
 
          'binary': False,
 
          'ops': {COPIED_FILENODE: 'file copied from file4 to file5',
 
                  CHMOD_FILENODE: 'modified file chmod 100755 => 100644',
 
                  MOD_FILENODE: 'modified file'}}),
 
    ],
 
    'hg_diff_rename_space_cr.diff': [
 
        ('oh yes', 'renamed',
 
         {'added': 3,
 
          'deleted': 2,
 
          'binary': False,
 
          'ops': {RENAMED_FILENODE: 'file renamed from oh no to oh yes'}}),
 
    ],
 
    'git_diff_quoting.diff': [
 
        (r'\"foo\"',  # TODO: quotes should not be escaped
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        ("'foo'",
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        ("'foo'" r'\"foo\"',  # TODO: quotes should not be escaped
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        (r'a\r\nb',  # TODO: escaped
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        (r'foo\rfoo',  # TODO: escaped
 
         'added',
 
        {'added': 0,
 
         'binary': True,
 
         'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        ('foo bar',
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        ('test',
 
         'added',
 
         {'added': 1,
 
          'binary': False,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        (r'esc\033foo',  # TODO: escaped
 
         'added',
 
         {'added': 0,
 
          'binary': True,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
        (r'tab\tfoo',  # TODO: escaped
 
         'added',
 
         {'added': 0,
 
          'binary': True,
 
          'deleted': 0,
 
          'ops': {1: 'new file 100644'}}),
 
    ],
 
}
 

	
 

	
 
class TestDiffLib(base.TestController):
 

	
 
    @base.parametrize('diff_fixture', DIFF_FIXTURES)
 
    def test_diff(self, diff_fixture):
 
        raw_diff = fixture.load_resource(diff_fixture, strip=False)
 
        vcs = 'hg'
 
        if diff_fixture.startswith('git_'):
 
            vcs = 'git'
 
        diff_processor = DiffProcessor(raw_diff, vcs=vcs)
 
        data = [(x['filename'], x['operation'], x['stats']) for x in diff_processor.parsed]
 
        expected_data = DIFF_FIXTURES[diff_fixture]
 
        assert expected_data == data
 

	
 
    def test_diff_markup(self):
 
        raw_diff = fixture.load_resource('markuptest.diff', strip=False)
 
        diff_processor = DiffProcessor(raw_diff)
 
        chunks = diff_processor.parsed[0]['chunks']
 
        assert not chunks[0]
 
        #from pprint import pprint; pprint(chunks[1])
 
        l = ['\n']
 
        for d in chunks[1]:
 
            l.append('%(action)-7s %(new_lineno)3s %(old_lineno)3s %(line)r\n' % d)
 
        s = ''.join(l)
 
        assert s == r'''
 
context         '@@ -51,6 +51,13 @@'
 
unmod    51  51 '<u>\t</u>begin();'
 
unmod    52  52 '<u>\t</u><i></i>'
 
add      53     '<u>\t</u>int foo;<u class="cr"></u>'
 
add      54     '<u>\t</u>int bar; <u class="cr"></u>'
 
add      55     '<u>\t</u>int baz;<u>\t</u><u class="cr"></u>'
 
add      56     '<u>\t</u>int space; <i></i>'
 
add      57     '<u>\t</u>int tab;<u>\t</u><i></i>'
 
add      58     '<u>\t</u><i></i>'
 
unmod    59  53 ' <i></i>'
 
del          54 '<u>\t</u>#define MAX_STEPS (48)'
 
add      60     '<u>\t</u><u class="cr"></u>'
 
add      61     '<u>\t</u>#define MAX_STEPS (64)<u class="cr"></u>'
 
unmod    62  55 ''
 
del          56 '<u>\t</u>#define MIN_STEPS (<del>48</del>)'
 
add      63     '<u>\t</u>#define MIN_STEPS (<ins>42</ins>)'
 
'''
0 comments (0 inline, 0 general)