Changeset - 53142fd5af4e
[Not reviewed]
default
0 2 0
Thomas De Schampheleire - 5 years ago 2020-10-19 12:47:50
thomas.de_schampheleire@nokia.com
Grafted from: 59f33539b8ea
lib/diffs: make sure that trailing tabs are indicated

Between the initial submission and final version of commit f79c40759d6f,
changes were made that turn out to be incorrect. The changes assume that the
later match on trailing tabs will 'win' over the plain 'tab' match. However,
Python 're' documentation says:

As the target string is scanned, REs separated by '|' are tried from
left to right. When one pattern completely matches, that branch is
accepted. This means that once A matches, B will not be tested further,
even if it would produce a longer overall match. In other words, the '|'
operator is never greedy.
https://docs.python.org/3.8/library/re.html

As a result, a trailing tab is seen as a plain tab and not highlighted in a
special way.

Unify the tab handling to make it unambiguous how they should be parsed.

The change diff mainly shows re group numbers shifting.
2 files changed with 13 insertions and 13 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/diffs.py
Show inline comments
 
@@ -352,255 +352,255 @@ class DiffProcessor(object):
 
                        op = 'added'
 
                        stats['binary'] = True
 
                        stats['ops'][NEW_FILENODE] = 'new file'
 

	
 
                    elif head['a_file'] and not head['b_file']:
 
                        op = 'removed'
 
                        stats['binary'] = True
 
                        stats['ops'][DEL_FILENODE] = 'deleted file'
 

	
 
                # it's not ADD not DELETE
 
                if op is None:
 
                    op = 'modified'
 
                    stats['binary'] = True
 
                    stats['ops'][MOD_FILENODE] = 'modified file'
 

	
 
            # a real non-binary diff
 
            if head['a_file'] or head['b_file']:
 
                chunks, added, deleted = _parse_lines(diff_lines)
 
                stats['binary'] = False
 
                stats['added'] = added
 
                stats['deleted'] = deleted
 
                # explicit mark that it's a modified file
 
                if op == 'modified':
 
                    stats['ops'][MOD_FILENODE] = 'modified file'
 
            else:  # Git binary patch (or empty diff)
 
                # Git binary patch
 
                if head['bin_patch']:
 
                    stats['ops'][BIN_FILENODE] = 'binary diff not shown'
 
                chunks = []
 

	
 
            if op == 'removed' and chunks:
 
                # a way of seeing deleted content could perhaps be nice - but
 
                # not with the current UI
 
                chunks = []
 

	
 
            chunks.insert(0, [{
 
                'old_lineno': '',
 
                'new_lineno': '',
 
                'action':     'context',
 
                'line':       msg,
 
                } for _op, msg in stats['ops'].items()
 
                  if _op not in [MOD_FILENODE]])
 

	
 
            _files.append({
 
                'old_filename':     head['a_path'],
 
                'filename':         head['b_path'],
 
                'old_revision':     head['a_blob_id'],
 
                'new_revision':     head['b_blob_id'],
 
                'chunks':           chunks,
 
                'operation':        op,
 
                'stats':            stats,
 
            })
 

	
 
        if not inline_diff:
 
            return _files
 

	
 
        # highlight inline changes when one del is followed by one add
 
        for diff_data in _files:
 
            for chunk in diff_data['chunks']:
 
                lineiter = iter(chunk)
 
                try:
 
                    peekline = next(lineiter)
 
                    while True:
 
                        # find a first del line
 
                        while peekline['action'] != 'del':
 
                            peekline = next(lineiter)
 
                        delline = peekline
 
                        peekline = next(lineiter)
 
                        # if not followed by add, eat all following del lines
 
                        if peekline['action'] != 'add':
 
                            while peekline['action'] == 'del':
 
                                peekline = next(lineiter)
 
                            continue
 
                        # found an add - make sure it is the only one
 
                        addline = peekline
 
                        try:
 
                            peekline = next(lineiter)
 
                        except StopIteration:
 
                            # add was last line - ok
 
                            _highlight_inline_diff(delline, addline)
 
                            raise
 
                        if peekline['action'] != 'add':
 
                            # there was only one add line - ok
 
                            _highlight_inline_diff(delline, addline)
 
                except StopIteration:
 
                    pass
 

	
 
        return _files
 

	
 
    def stat(self):
        """
        Return the ``(adds, removes)`` pair stored on this instance.
        """
        counts = (self.adds, self.removes)
        return counts
 

	
 

	
 
# Alternatives separated by '|' are tried left to right and the first one that
# matches is taken, even if a later one would match more. A plain tab and a
# trailing tab can therefore not be two separate alternatives: the optional
# '($)' group directly after the tab tells the two cases apart instead.
# Groups: 1 '&', 2 '<', 3 '>', 4 tab, 5 end of line right after a tab,
# 6 carriage return, 7 space at end of line.
_escape_re = re.compile(r'(&)|(<)|(>)|(\t)($)?|(\r)|( $)')


def _escaper(diff_line):
    r"""
    Do HTML escaping/markup of a single diff line (including first +/- column)

    '&', '<' and '>' are replaced by HTML entities, tabs are wrapped in
    '<u>' elements, carriage returns become '<u class="cr"></u>', and a
    trailing tab or trailing space is followed by an empty '<i></i>' marker.

    >>> _escaper('foobar')
    'foobar'
    >>> _escaper('@foo & bar')
    '@foo &amp; bar'
    >>> _escaper('+foo < bar')
    '+foo &lt; bar'
    >>> _escaper('-foo > bar')
    '-foo &gt; bar'
    >>> _escaper(' <foo>')
    ' &lt;foo&gt;'
    >>> _escaper(' foo\tbar')
    ' foo<u>\t</u>bar'
    >>> _escaper(' foo\rbar\r')
    ' foo<u class="cr"></u>bar<u class="cr"></u>'
    >>> _escaper(' foo\t')
    ' foo<u>\t</u><i></i>'
    >>> _escaper(' foo ')
    ' foo <i></i>'
    >>> _escaper(' foo  ')
    ' foo  <i></i>'
    >>> _escaper(' ')
    ' '
    >>> _escaper('  ')
    '  <i></i>'
    >>> _escaper(' \t')
    ' <u>\t</u><i></i>'
    >>> _escaper(' \t  ')
    ' <u>\t</u>  <i></i>'
    >>> _escaper('   \t')
    '   <u>\t</u><i></i>'
    >>> _escaper(' \t\t  ')
    ' <u>\t</u><u>\t</u>  <i></i>'
    >>> _escaper('   \t\t')
    '   <u>\t</u><u>\t</u><i></i>'
    >>> _escaper(' foo&bar<baz>  ')
    ' foo&amp;bar&lt;baz&gt;  <i></i>'
    """

    def substitute(m):
        # Return the markup for whichever alternative of _escape_re matched.
        groups = m.groups()
        if groups[0]:
            return '&amp;'
        if groups[1]:
            return '&lt;'
        if groups[2]:
            return '&gt;'
        if groups[3]:
            # The '($)' group matches the empty string, so test for None
            # rather than for truthiness.
            if groups[4] is not None:  # end of line
                return '<u>\t</u><i></i>'
            return '<u>\t</u>'
        if groups[5]:
            return '<u class="cr"></u>'
        if groups[6]:
            if m.start() == 0:
                return ' '  # first column space shouldn't make empty lines show up as trailing space
            return ' <i></i>'
        # Every alternative is handled above; raise explicitly so the check
        # is not stripped when running with -O.
        raise AssertionError('unhandled match %r' % m.group(0))

    return _escape_re.sub(substitute, diff_line)
 

	
 

	
 
# Header of a single-file Git diff, matched against bytes. After the mandatory
# 'diff --git a/... b/...' line, every part is optional: old/new mode,
# similarity index together with rename from/to, new file mode, deleted file
# mode, index line with blob ids and mode, the 'GIT binary patch' marker, and
# the '---' / '+++' file lines. The named groups are the keys of the meta info
# dict built by _get_header.
_git_header_re = re.compile(br"""
 
    ^diff[ ]--git[ ]a/(?P<a_path>.+?)[ ]b/(?P<b_path>.+?)\n
 
    (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
 
       ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
 
    (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
 
       ^rename[ ]from[ ](?P<rename_from>.+)\n
 
       ^rename[ ]to[ ](?P<rename_to>.+)(?:\n|$))?
 
    (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
 
    (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
 
    (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
 
        \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
 
    (?:^(?P<bin_patch>GIT[ ]binary[ ]patch)(?:\n|$))?
 
    (?:^---[ ](a/(?P<a_file>.+?)|/dev/null)\t?(?:\n|$))?
 
    (?:^\+\+\+[ ](b/(?P<b_file>.+?)|/dev/null)\t?(?:\n|$))?
 
""", re.VERBOSE | re.MULTILINE)
 

	
 

	
 
# Header of a single-file diff in the 'diff --git' format, used by _get_header
# when vcs is 'hg'. Compared to _git_header_re, the similarity index line is
# an optional part of its own (it may appear without rename lines), and an
# optional 'copy from' / 'copy to' pair is recognized as well.
_hg_header_re = re.compile(br"""
 
    ^diff[ ]--git[ ]a/(?P<a_path>.+?)[ ]b/(?P<b_path>.+?)\n
 
    (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
 
       ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
 
    (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%(?:\n|$))?
 
    (?:^rename[ ]from[ ](?P<rename_from>.+)\n
 
       ^rename[ ]to[ ](?P<rename_to>.+)(?:\n|$))?
 
    (?:^copy[ ]from[ ](?P<copy_from>.+)\n
 
       ^copy[ ]to[ ](?P<copy_to>.+)(?:\n|$))?
 
    (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
 
    (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
 
    (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
 
        \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
 
    (?:^(?P<bin_patch>GIT[ ]binary[ ]patch)(?:\n|$))?
 
    (?:^---[ ](a/(?P<a_file>.+?)|/dev/null)\t?(?:\n|$))?
 
    (?:^\+\+\+[ ](b/(?P<b_file>.+?)|/dev/null)\t?(?:\n|$))?
 
""", re.VERBOSE | re.MULTILINE)
 

	
 

	
 
# Zero-width check: matches only when the bytes at the current position do NOT
# start with '@', 'literal ' or 'delta '. _get_header applies it to whatever
# follows the parsed header and raises when it matches.
_header_next_check = re.compile(br'''(?!@)(?!literal )(?!delta )''')
 

	
 

	
 
def _get_header(vcs, diff_chunk):
    """
    Split the diff of a single file (header and chunks) into two parts,
    returned as a tuple:

    1. A dict with the named groups of the header expression selected by
       ``vcs`` ('git' or 'hg'), for instance:

        a_path, b_path, similarity_index, rename_from, rename_to,
        old_mode, new_mode, new_file_mode, deleted_file_mode,
        a_blob_id, b_blob_id, b_mode, a_file, b_file

       Groups that did not take part in the match are None, all others are
       passed through safe_str.

    2. An iterator yielding the lines after the header, each one passed
       through safe_str and _escaper.

    Raises Exception when no header expression matches (which includes any
    other value of ``vcs``), or when the text after the header does not
    start with one of the prefixes accepted by _header_next_check.
    """
    if vcs == 'git':
        header_re = _git_header_re
    elif vcs == 'hg':
        header_re = _hg_header_re
    else:
        header_re = None

    header = None if header_re is None else header_re.match(diff_chunk)
    if header is None:
        raise Exception('diff not recognized as valid %s diff' % vcs)

    meta_info = {}
    for key, value in header.groupdict().items():
        meta_info[key] = safe_str(value) if value is not None else None

    header_end = header.end()
    rest = diff_chunk[header_end:]
    if rest:
        if _header_next_check.match(rest):
            raise Exception('cannot parse %s diff header: %r followed by %r' % (vcs, safe_str(bytes(diff_chunk[:header_end])), safe_str(bytes(rest[:1000]))))
        ends_with_newline = rest[-1:] == b'\n'
        if not ends_with_newline:
            # The diff will generally already have trailing \n (and be a
            # memoryview). It might also be huge so we don't want to allocate
            # it twice. But in this very rare case, we don't care.
            rest = bytes(rest) + b'\n'

    diff_lines = (
        _escaper(safe_str(line_match.group(1)))
        for line_match in re.finditer(br'(.*)\n', rest)
    )
    return meta_info, diff_lines
 

	
 

	
 
# Hunk header line '@@ -A[,B] +C[,D] @@...'. Groups: 1 and 3 are the numbers
# after '-' and '+', 2 and 4 the optional numbers after the commas, and 5 is
# whatever follows the closing '@@'.
_chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
 
# Line starting with the literal text '\ No newline at end of file'.
_newline_marker = re.compile(r'^\\ No newline at end of file')
 

	
 

	
 
def _parse_lines(diff_lines):
 
    """
 
    Given an iterator of diff body lines, parse them and return a dict per
 
    line and added/removed totals.
 
    """
 
    added = deleted = 0
 
    old_line = old_end = new_line = new_end = None
 

	
 
    chunks = []
 
    try:
 
        line = next(diff_lines)
 

	
 
        while True:
 
            lines = []
 
            chunks.append(lines)
 

	
 
            match = _chunk_re.match(line)
kallithea/tests/models/test_diff_parsers.py
Show inline comments
 
@@ -204,111 +204,111 @@ DIFF_FIXTURES = {
 
         {'added': 46,
 
          'deleted': 15,
 
          'binary': False,
 
          'ops': {MOD_FILENODE: 'modified file'}}),
 
        ('vcs/backends/hg.py', 'modified',
 
         {'added': 22,
 
          'deleted': 3,
 
          'binary': False,
 
          'ops': {MOD_FILENODE: 'modified file'}}),
 
        ('vcs/tests/test_git.py', 'modified',
 
         {'added': 5,
 
          'deleted': 5,
 
          'binary': False,
 
          'ops': {MOD_FILENODE: 'modified file'}}),
 
        ('vcs/tests/test_repository.py', 'modified',
 
         {'added': 174,
 
          'deleted': 2,
 
          'binary': False,
 
          'ops': {MOD_FILENODE: 'modified file'}}),
 
    ],
 
    'git_diff_modify_binary_file.diff': [
 
        ('file.name', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {MOD_FILENODE: 'modified file',
 
                  BIN_FILENODE: 'binary diff not shown'}})
 
    ],
 
    'hg_diff_copy_file.diff': [
 
        ('file2', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {COPIED_FILENODE: 'file copied from file1 to file2'}}),
 
    ],
 
    'hg_diff_copy_and_modify_file.diff': [
 
        ('file3', 'modified',
 
         {'added': 1,
 
          'deleted': 0,
 
          'binary': False,
 
          'ops': {COPIED_FILENODE: 'file copied from file2 to file3',
 
                  MOD_FILENODE: 'modified file'}}),
 
    ],
 
    'hg_diff_copy_and_chmod_file.diff': [
 
        ('file4', 'modified',
 
         {'added': 0,
 
          'deleted': 0,
 
          'binary': True,
 
          'ops': {COPIED_FILENODE: 'file copied from file3 to file4',
 
                  CHMOD_FILENODE: 'modified file chmod 100644 => 100755'}}),
 
    ],
 
    'hg_diff_copy_chmod_and_edit_file.diff': [
 
        ('file5', 'modified',
 
         {'added': 2,
 
          'deleted': 1,
 
          'binary': False,
 
          'ops': {COPIED_FILENODE: 'file copied from file4 to file5',
 
                  CHMOD_FILENODE: 'modified file chmod 100755 => 100644',
 
                  MOD_FILENODE: 'modified file'}}),
 
    ],
 
    'hg_diff_rename_space_cr.diff': [
 
        ('oh yes', 'renamed',
 
         {'added': 3,
 
          'deleted': 2,
 
          'binary': False,
 
          'ops': {RENAMED_FILENODE: 'file renamed from oh no to oh yes'}}),
 
    ],
 
}
 

	
 

	
 
class TestDiffLib(base.TestController):
    """Tests of the parsing and HTML markup done by DiffProcessor."""

    @base.parametrize('diff_fixture', DIFF_FIXTURES)
    def test_diff(self, diff_fixture):
        """
        Parse one fixture diff and compare filename, operation and stats of
        every file with the expectations in DIFF_FIXTURES.

        Fixtures whose name starts with 'git_' are parsed with vcs='git', all
        others with vcs='hg'.
        """
        raw_diff = fixture.load_resource(diff_fixture, strip=False)
        vcs = 'hg'
        if diff_fixture.startswith('git_'):
            vcs = 'git'
        diff_processor = DiffProcessor(raw_diff, vcs=vcs)
        data = [(x['filename'], x['operation'], x['stats']) for x in diff_processor.parsed]
        expected_data = DIFF_FIXTURES[diff_fixture]
        assert expected_data == data

    def test_diff_markup(self):
        """
        Check the HTML markup of every line in the second chunk of
        markuptest.diff, in particular that trailing tabs and trailing spaces
        are followed by an '<i></i>' marker.
        """
        raw_diff = fixture.load_resource('markuptest.diff', strip=False)
        diff_processor = DiffProcessor(raw_diff)
        chunks = diff_processor.parsed[0]['chunks']
        assert not chunks[0]
        #from pprint import pprint; pprint(chunks[1])
        # One formatted row per parsed line; the leading '\n' matches the
        # newline right after the opening quotes of the expected string.
        rows = ['\n']
        for d in chunks[1]:
            rows.append('%(action)-7s %(new_lineno)3s %(old_lineno)3s %(line)r\n' % d)
        s = ''.join(rows)
        assert s == r'''
context         '@@ -51,6 +51,13 @@'
unmod    51  51 '<u>\t</u>begin();'
unmod    52  52 '<u>\t</u><i></i>'
add      53     '<u>\t</u>int foo;<u class="cr"></u>'
add      54     '<u>\t</u>int bar; <u class="cr"></u>'
add      55     '<u>\t</u>int baz;<u>\t</u><u class="cr"></u>'
add      56     '<u>\t</u>int space; <i></i>'
add      57     '<u>\t</u>int tab;<u>\t</u><i></i>'
add      58     '<u>\t</u><i></i>'
unmod    59  53 ' <i></i>'
del          54 '<u>\t</u>#define MAX_STEPS (48)'
add      60     '<u>\t</u><u class="cr"></u>'
add      61     '<u>\t</u>#define MAX_STEPS (64)<u class="cr"></u>'
unmod    62  55 ''
del          56 '<u>\t</u>#define MIN_STEPS (<del>48</del>)'
add      63     '<u>\t</u>#define MIN_STEPS (<ins>42</ins>)'
'''
0 comments (0 inline, 0 general)