webapp/codereview/intra_region_diff.py - gerrit - Git at Google

 # Copyright 2008 Google Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Intra-region diff utilities.

 Intra-region diff highlights the blocks of code which have been changed or
 deleted within a region. So instead of highlighting the whole region marked as
 changed, the user can see what exactly was changed within that region.

 Terminology:
   'region' is a list of consecutive code lines.
   'word' is the unit of intra-region diff. Its definition is arbitrary and
    based on what we consider a good unit of difference between two regions.
   'block' is a small section of code within a region. It can span multiple
   lines. There can be multiple non overlapping blocks within a region. A block
   can potentially span the whole region.

 The blocks have two representations. One is of the format (offset1, offset2,
 size) which is returned by the SequenceMatcher to indicate a match of
 length 'size' starting at offset1 in the first/old line and starting at offset2
 in the second/new line. We convert this representation to a pair of tuples i.e.
 (offset1, size) and (offset2, size) for rendering each side of the diff
 separately. This latter representation is also more efficient for doing
 compaction of adjacent blocks which reduces the size of the HTML markup. See
 CompactBlocks for more details.

 SequenceMatcher always returns one special matching block at the end with
 contents (len(line1), len(line2), 0). We retain this special block as it
 simplifies for loops in rendering the last non-matching block. All functions
 which deal with the sequence of blocks assume presence of the special block at
 the end of the sequence and retain it.
 """

 import cgi
 import difflib
 import re

 # Opening tag of a highlighted chunk; '%s' receives the CSS class name.
 BEGIN_TAG = '<span class="%s">'
 # Closing tag of a highlighted chunk.
 END_TAG = '</span>'
 # Markup substituted for the tab marker character when tabs are shown visually.
 TAB_TAG = ('<span class="visualtab" title="Visual tab.">'
            '&raquo;</span>')

 # CSS class names used to render each side of a diff. The outer key is the
 # 'tag' handed to the rendering functions; the inner keys pick the class for
 # matching text ('match'), changed text ('diff') and the padding inserted in
 # front of folded continuation lines ('bckgrnd').
 COLOR_SCHEME = {
   'old': {
     'match': 'oldlight',
     'diff': 'olddark',
     'bckgrnd': 'oldlight',
   },
   'new': {
     'match': 'newlight',
     'diff': 'newdark',
     'bckgrnd': 'newlight',
   },
   'oldmove': {
     'match': 'movelight',
     'diff': 'oldmovedark',
     'bckgrnd': 'movelight',
   },
   'newmove': {
     'match': 'newlight',
     'diff': 'newdark',
     'bckgrnd': 'newlight',
   },
 }
 # Regular expressions that split a line into 'words', keyed by a short id.
 # GetDiffParams uses 'b' unless told otherwise.
 EXPRS = {
   'a': r'(\w+|[^\w\s]+|\s+)',
   'b': r'([A-Za-z0-9]+|[^A-Za-z0-9])',
   'c': r'([A-Za-z0-9_]+|[^A-Za-z0-9_])',
 }
 # Maximum total number of characters, summed over the old and the new lines
 # of a region, for which CanDoIRDiff still reports an intra-region diff as
 # worthwhile. Intra-region diff for larger regions is hard to comprehend and
 # wastes CPU time.
 MAX_TOTAL_LEN = 10000


 def ExpandTabs(text, tabsize=8, tab_marker=None):
   """Expand tab characters in a string into spaces with an optional marker.

   Every tab is replaced by enough fill to reach the next multiple of
   'tabsize', counted from the start of 'text'. A tab therefore always expands
   to between 1 and 'tabsize' characters.

   Args:
     text: a string containing tab characters.
     tabsize: the number of spaces that a tab represents
     tab_marker: a character; if not None (and not empty), we replace the
                 first character of each tab expansion with this.

   Returns:
     'text' with every tab expanded.
   """
   tabpos = text.find("\t")
   while tabpos >= 0:
     # tabpos % tabsize lies in [0, tabsize), so the fill width is always in
     # [1, tabsize]; no special case for a zero width is needed.
     fill = " " * (tabsize - (tabpos % tabsize))
     if tab_marker:
       fill = tab_marker + fill[1:]
     # We avoid str.replace in case tab_marker is \t
     text = text[:tabpos] + fill + text[tabpos + 1:]
     # Resume after the first fill character so that a "\t" marker is not
     # expanded a second time.
     tabpos = text.find("\t", tabpos + 1)
   return text


 def Fold(text, limit=85, indent=5, offset=0, tabsize=8, mark_tabs=False):
   """Break a long string into multiple lines.

   Lines longer than 'limit' are broken up into pieces of at most
   'limit' characters; continuation lines start with 'indent' spaces.

   'offset' is used to indicate if 'text' itself doesn't align with
   the beginning of line e.g. we are trying to Fold a line when we have
   already printed 'offset' number of characters to the output.

   This also translates tabs into 'tabsize' spaces. If 'mark_tabs' is true,
   then we indicate the first character of each expanded tab visually.

   Input and output are assumed to be in UTF-8; the computation is done
   in Unicode.  (Still not good enough if zero-width characters are
   present.) If the input is not valid UTF-8, then the encoding is
   passed through, potentially breaking up multi-byte characters.
   We pass the line through cgi.escape before returning it.

   A trailing newline is always stripped from the input first.

   Args:
     text: the string to fold.
     limit: maximum width of an output line.
     indent: number of spaces placed in front of every continuation line.
             Must be smaller than 'limit' whenever the text has to be wrapped.
     offset: number of characters already printed on the first output line.
     tabsize: the number of spaces that a tab represents; must be positive.
     mark_tabs: if True, mark the first character of each expanded tab
                visually.

   Returns:
     The folded, HTML-escaped text with pieces separated by newlines. It is
     UTF-8 encoded if the input could be decoded as UTF-8.
   """
   assert tabsize > 0, tabsize
   if text.endswith("\n"):
     text = text[:-1]
   try:
     text = unicode(text, "utf-8")
   except (UnicodeError, TypeError):
     # Not valid UTF-8, or already a unicode object: fold the input as given.
     pass
   if "\t" in text:
     # If mark_tabs is true, we retain one \t character as a marker during
     # expansion so that we later replace it with an HTML snippet.
     tab_marker = "\t" if mark_tabs else None
     # NOTE(review): only the text from position indent - offset onwards is
     # tab-expanded. Tabs in front of it stay in place and are turned into
     # TAB_TAG below even when mark_tabs is false -- confirm this is intended.
     split_at = indent - offset
     text = text[:split_at] + ExpandTabs(text[split_at:], tabsize, tab_marker)
   # Perform wrapping.
   if len(text) > limit - offset:
     # Continuation pieces are limit - indent characters wide; with a
     # non-positive width the loop below would never reach the end of text.
     assert limit > indent, (limit, indent)
     parts = []
     prefix = ""
     i = 0
     j = limit - offset
     while i < len(text):
       parts.append(prefix + text[i:j])
       i = j
       j += limit - indent
       prefix = " " * indent
     text = "\n".join(parts)
   # Colorize tab markers (after calling escape)
   text = cgi.escape(text)
   text = text.replace("\t", TAB_TAG)
   if isinstance(text, unicode):
     return text.encode("utf-8", "replace")
   return text


 def CompactBlocks(blocks):
   """Compacts adjacent code blocks.

   In many cases 2 adjacent blocks can be merged into one. This allows
   to do some further processing on those blocks.

   Args:
     blocks: [(offset1, size), ...]

   Returns:
     A new list with the same structure as the input with adjacent blocks
     merged.  However, the last block (which is always assumed to have
     a zero size) is never merged.  For example, the input
     [(0, 2), (2, 8), (10, 5), (15, 0)]
     will produce the output [(0, 15), (15, 0)].
     An empty input produces an empty list.
   """
   # Nothing to merge; still hand back a fresh list so that the caller never
   # receives an alias of its own argument.
   if len(blocks) <= 1:
     return list(blocks)
   result = [blocks[0]]
   for block in blocks[1:-1]:
     last_start, last_len = result[-1]
     curr_start, curr_len = block
     if last_start + last_len == curr_start:
       # The block starts exactly where the previous one ends: extend it.
       result[-1] = (last_start, last_len + curr_len)
     else:
       result.append(block)
   # The trailing special block is carried over untouched.
   result.append(blocks[-1])
   return result


 def FilterBlocks(blocks, filter_func):
   """Drops every block that filter_func rejects, except the last one.

   Args:
     blocks: [(offset1, offset2, size), ...]; must have at least 1 entry
     filter_func: a boolean function taking a single argument of the form
                  (offset1, offset2, size)

   Returns:
     A new list holding, in order, the blocks for which filter_func()
     returned true, followed by the last input block, which is kept
     unconditionally.
   """
   kept = []
   for candidate in blocks[:-1]:
     if filter_func(candidate):
       kept.append(candidate)
   # The 'special' block at the end bypasses the filter.
   kept.append(blocks[-1])
   return kept


 def GetDiffParams(expr='b', min_match_ratio=0.6, min_match_size=2, dbg=False):
   """Bundles the knobs that control an intra region diff into one tuple.

   Args:
     expr: id (a key of EXPRS) of the regular expression used to identify
           'words' in the intra region diff
     min_match_ratio: minimum similarity between regions to qualify for intra
                      region diff; must lie strictly between 0.0 and 1.0
     min_match_size: the smallest matching block size to use. Blocks smaller
                     than this are ignored. Must be one of 1, 2, 3 or 4.
     dbg: to turn on generation of debugging information for the diff

   Returns:
     4 tuple (expr, min_match_ratio, min_match_size, dbg) that can be used to
     customize diff. It can be passed to functions like WordDiff and
     IntraLineDiff.
   """
   assert expr in EXPRS
   assert min_match_size in (1, 2, 3, 4)
   assert 0.0 < min_match_ratio < 1.0
   params = (expr, min_match_ratio, min_match_size, dbg)
   return params


 def CanDoIRDiff(old_lines, new_lines):
   """Decides whether an intra region diff is worth computing.

   Calculating IR diff is costly and is usually helpful only for small regions.
   The heuristic used here: add up the lengths of all old and new lines and
   compare the total against MAX_TOTAL_LEN.

   Args:
     old_lines: an array of strings containing old text
     new_lines: an array of strings containing new text

   Returns:
     True if the combined length of old_lines and new_lines does not exceed
     MAX_TOTAL_LEN, False otherwise.

   TODO: Let GetDiffParams handle MAX_TOTAL_LEN param also.
   """
   old_chars = sum(map(len, old_lines))
   new_chars = sum(map(len, new_lines))
   return old_chars + new_chars <= MAX_TOTAL_LEN


 def WordDiff(line1, line2, diff_params):
   """Returns blocks with positions indicating word level diffs.

   Both lines are split into words with the regular expression selected by
   diff_params, the word sequences are matched with difflib.SequenceMatcher
   and the matches are converted back to character offsets. Offsets are
   computed from the lengths of the UTF-8 encoded words.

   Args:
     line1: string representing the left part of the diff
     line2: string representing the right part of the diff
     diff_params: return value of GetDiffParams

   Returns:
     A tuple (blocks, ratio) where:
       blocks: [(offset1, offset2, size), ...] such that
               line1[offset1:offset1+size] == line2[offset2:offset2+size]
               and the last block is (len(line1), len(line2), 0).
               If the lines are less similar than the minimum match ratio and
               differ in more than one place, blocks is [(0, 0, 0)] instead.
       ratio: a float giving the diff ratio computed by SequenceMatcher.
   """
   # The debug flag carried by diff_params is not used here.
   match_expr, min_match_ratio, min_match_size, _ = diff_params
   exp = EXPRS[match_expr]

   def _Tokenize(line):
     """Splits 'line' into a list of UTF-8 encoded words."""
     # We want to split at proper character boundaries in UTF8 text.
     try:
       line = unicode(line, "utf8")
     except (UnicodeError, TypeError):
       # Not valid UTF-8, or already unicode: tokenize the input as given.
       pass
     words = []
     for word in re.findall(exp, line, re.U):
       if isinstance(word, unicode):
         word = word.encode("utf8")
       words.append(word)
     return words

   def _WordOffsets(words, total_len):
     """Returns the start offset of each word, followed by 'total_len'."""
     offsets = []
     pos = 0
     for word in words:
       offsets.append(pos)
       pos += len(word)
     offsets.append(total_len)
     return offsets

   a = _Tokenize(line1)
   b = _Tokenize(line2)
   s = difflib.SequenceMatcher(None, a, b)
   matching_blocks = s.get_matching_blocks()
   ratio = s.ratio()
   # Don't show intra region diffs if both lines are too different and there is
   # more than one block of difference. If there is only one change then we
   # still show the intra region diff regardless of how different the blocks
   # are.
   # Note: We compare len(matching_blocks) with 3 because one block of change
   # results in 2 matching blocks. We add the one special block and we get 3
   # matching blocks per one block of change.
   if ratio < min_match_ratio and len(matching_blocks) > 3:
     return ([(0, 0, 0)], ratio)
   # For now convert to character level blocks because we already have
   # the code to deal with folding across lines for character blocks.
   # lena and lenb map a word index to the offset at which that word starts;
   # the extra last entry is the offset just past the final word.
   lena = _WordOffsets(a, len(line1))
   lenb = _WordOffsets(b, len(line2))
   # Convert to character blocks
   blocks = []
   for s1, s2, blen in matching_blocks[:-1]:
     apos = lena[s1]
     blocks.append((apos, lenb[s2], lena[s1 + blen] - apos))
   # Recreate the special block.
   blocks.append((len(line1), len(line2), 0))

   def _KeepBlock(block):
     """Tells whether a matching block is big enough to be shown."""
     # We don't remove matching blocks with only a newline character as doing
     # so results in showing the matching newline character as non matching
     # which doesn't look good.
     return (block[2] >= min_match_size or
             line1[block[0]:block[0] + block[2]] == '\n')

   # Filter any matching blocks which are smaller than the desired threshold.
   blocks = FilterBlocks(blocks, _KeepBlock)
   return (blocks, ratio)


 def IntraLineDiff(line1, line2, diff_params, diff_func=WordDiff):
   """Computes intraline diff blocks, split per side.

   Args:
     line1: string representing the left part of the diff
     line2: string representing the right part of the diff
     diff_params: return value of GetDiffParams
     diff_func: a function whose signature matches that of WordDiff() above

   Returns:
     A tuple of (blocks1, blocks2, ratio). blocks1 and blocks2 correspond to
     line1 and line2; each is an array of (start_pos, length) tuples taken
     from the blocks reported by diff_func. ratio is the second value
     returned by diff_func.
   """
   blocks, ratio = diff_func(line1, line2, diff_params)
   blocks1 = []
   blocks2 = []
   for start1, start2, length in blocks:
     blocks1.append((start1, length))
     blocks2.append((start2, length))
   return (blocks1, blocks2, ratio)


 def DumpDiff(blocks, line1, line2):
   """Helper function to debug diff related problems.

   Args:
     blocks: [(offset1, offset2, size), ...]
     line1: string representing the left part of the diff
     line2: string representing the right part of the diff
   """
   for offset1, offset2, size in blocks:
     print offset1, offset2, size
     print offset1, size, ":  ", line1[offset1:offset1+size]
     print offset2, size, ":  ", line2[offset2:offset2+size]


 def RenderIntraLineDiff(blocks, line, tag, dbg_info=None, limit=80, indent=5,
                         tabsize=8, mark_tabs=False):
   """Renders the diff blocks returned by IntraLineDiff function.

   The text between two consecutive blocks is rendered as a 'diff' section and
   the text covered by a block is rendered as a 'match' section.

   Args:
     blocks: [(start_pos,  size), ...]
     line: line of code on which the blocks are to be rendered.
     tag: 'new' or 'old' to control the color scheme.
     dbg_info: a string that holds debugging information header. Debug
               information is rendered only if dbg_info is not empty.
     limit: folding limit to be passed to the Fold function.
     indent: indentation size to be passed to the Fold function.
     tabsize: the number of spaces that a tab represents
     mark_tabs: if True, mark the first character of each expanded tab visually

   Returns:
     A tuple of three elements. First element is the rendered version of
     the input 'line'. Second element tells if the line has a matching
     newline character. Third element is the debug text, i.e. dbg_info
     followed by a description of the blocks, or dbg_info unchanged when it
     is None or empty.
   """
   pieces = []
   prev_end = 0
   has_newline = False
   debug_info = dbg_info
   if dbg_info:
     debug_info += "\nBlock Count: %d\nBlocks: " % (len(blocks) - 1)
   for curr_start, curr_len in blocks:
     curr_end = curr_start + curr_len
     matched = line[curr_start:curr_end]
     if dbg_info and curr_len > 0:
       # tabsize and mark_tabs must be passed by keyword: Fold's fourth
       # positional parameter is 'offset', not 'tabsize'.
       debug_info += Fold("\n(%d, %d):|%s|" % (curr_start, curr_len, matched),
                          limit, indent, tabsize=tabsize, mark_tabs=mark_tabs)
     # Non-matching text between the previous block and this one.
     pieces.append(FoldBlock(line, prev_end, curr_start, limit, indent,
                             tag, 'diff', tabsize, mark_tabs))
     # The matching block itself.
     pieces.append(FoldBlock(line, curr_start, curr_end, limit, indent,
                             tag, 'match', tabsize, mark_tabs))
     # TODO: This test should be out of loop rather than inside. Once we
     # filter out some junk from blocks (e.g. some empty blocks) we should do
     # this test only on the last matching block.
     if matched.endswith('\n'):
       has_newline = True
     prev_end = curr_end
   return ("".join(pieces), has_newline, debug_info)


 def FoldBlock(src, start, end, limit, indent, tag, btype, tabsize=8,
               mark_tabs=False):
   """Folds one block of a line and wraps it in colored markup.

   Args:
     src: line of code
     start: starting position of the block within 'src'.
     end: ending position of the block within 'src'.
     limit: folding limit
     indent: indentation to use for folding.
     tag: 'new' or 'old' to control the color scheme.
     btype: block type i.e. 'match' or 'diff' to control the color scheme.
     tabsize: the number of spaces that a tab represents
     mark_tabs: if True, mark the first character of each expanded tab visually

   Returns:
     A string representing the rendered block; empty if the block is empty
     or consists of a single newline.
   """
   text = src[start:end]
   # Newlines are managed by the caller, so a lone newline renders as nothing.
   # A trailing newline on longer text is stripped by Fold.
   if start >= end or text == '\n':
     return ""
   open_tag, close_tag, line_break = GetTags(tag, btype, indent)
   # Column, relative to the beginning of the output line, where the block
   # starts.
   column = start % limit
   pieces = []
   # A block that opens a folded line (but not the very first line) needs the
   # newline + indent in front of it; the loop below only emits separators
   # between pieces.
   if column == 0 and start != 0:
     pieces.append(line_break)
   folded = Fold(text, limit, 0, column, tabsize, mark_tabs).split("\n")
   last_index = len(folded) - 1
   for index, chunk in enumerate(folded):
     if chunk:
       pieces.append(open_tag + chunk + close_tag)
     if index < last_index:
       pieces.append(line_break)
   return "".join(pieces)


 def GetTags(tag, btype, indent):
   """Builds the markup pieces needed to render a diff block.

   Args:
     tag: a key from COLOR_SCHEME
     btype: 'match' or 'diff'
     indent: indentation to use

   Returns:
     A 3 tuple (begin_tag, end_tag, formatted_indent_block).
     formatted_indent_block is a newline, followed by 'indent' spaces wrapped
     in a span of the background class when indent is positive.
   """
   assert tag in COLOR_SCHEME
   assert btype in ['match', 'diff']
   scheme = COLOR_SCHEME[tag]
   block_open = BEGIN_TAG % scheme[btype]
   background_open = BEGIN_TAG % scheme['bckgrnd']
   if indent > 0:
     line_break = '\n' + background_open + cgi.escape(" " * indent) + END_TAG
   else:
     line_break = '\n'
   return block_open, END_TAG, line_break


 def ConvertToSingleLine(lines):
   """Joins a sequence of strings and records where each one ends.

   The recorded state can be used to reconstruct the original lines with
   the newline separators placed at the original place.

   Args:
     lines: sequence of strings

   Returns:
     Returns (single_line, state) tuple. 'state' shouldn't be modified by the
     caller. It is only used to pass to other functions which will do certain
     operations on this state.

     'state' is an array containing a dictionary for each item in lines. Each
     dictionary has two elements 'pos' and 'blocks'. 'pos' is the end position
     of the line in the final converted string. 'blocks' starts out as an
     empty array; blocks are added to it using MarkBlock function.
   """
   state = []
   end_pos = 0
   for text in lines:
     end_pos += len(text)
     # TODO: Use a tuple instead.
     state.append({'pos': end_pos, 'blocks': []})
   single_line = "".join(lines)
   assert len(state) == len(lines)
   return (single_line, state)


 def MarkBlock(state, begin, end):
   """Marks a block on a region such that it doesn't cross line boundaries.

   It is an operation that can be performed on the single line which was
   returned by the ConvertToSingleLine function. This operation marks arbitrary
   block [begin,end) on the text. If [begin,end) crosses line boundaries in
   the original region, it is split up in 2 or more blocks such that no block
   crosses the boundaries. A part that starts outside of every line is
   dropped.

   Args:
     state: the state returned by ConvertToSingleLine function. The state
            contained is modified by this function.
     begin: Beginning of the block.
     end: End of the block (exclusive).

   Returns:
     None.
   """
   # TODO: Make sure already existing blocks don't overlap
   # Each pass records the part of the block that lies on one line and then
   # continues with whatever is left over.
   while begin != end:
     line_start = 0
     for entry in state:
       line_end = entry['pos']
       if line_start <= begin < line_end:
         break
       line_start = line_end
     else:
       # 'begin' does not fall inside any line.
       return
     if end < line_end:
       # block doesn't cross any line boundary
       entry['blocks'].append((begin, end))
       return
     # block crosses the line boundary
     entry['blocks'].append((begin, line_end))
     begin = line_end

 def GetBlocks(state):
   """Returns all the blocks corresponding to the lines in the region.

   Args:
     state: the state returned by ConvertToSingleLine().

   Returns:
     An array with one entry per line in the region. Each entry is an array
     [(start_pos, length), ..] with start_pos counted from the beginning of
     that line, terminated by an end marker (line_length, 0).
   """
   per_line = []
   line_start = 0
   for entry in state:
     line_end = entry['pos']
     rebased = []
     # Stored blocks are (begin, end) in single-line coordinates; convert
     # them to (offset within the line, length).
     for span in entry['blocks']:
       rebased.append((span[0] - line_start, span[1] - span[0]))
     # Add one end marker block.
     rebased.append((line_end - line_start, 0))
     per_line.append(rebased)
     line_start = line_end
   return per_line


 def IntraRegionDiff(old_lines, new_lines, diff_params):
   """Computes intra region diff.

   Each side is joined into a single line, the two single lines are diffed,
   and the resulting blocks are split back so that no block crosses a line
   boundary.

   Args:
     old_lines: array of strings
     new_lines: array of strings
     diff_params: return value of GetDiffParams

   Returns:
     A tuple (old_blocks, new_blocks, ratio). old_blocks and new_blocks hold
     the matching blocks for the old and new lines, one array of blocks per
     line; ratio is the ratio reported by IntraLineDiff.
   """
   old_line, old_state = ConvertToSingleLine(old_lines)
   new_line, new_state = ConvertToSingleLine(new_lines)
   old_flat, new_flat, ratio = IntraLineDiff(old_line, new_line, diff_params)
   # Distribute the single-line blocks over the lines they belong to.
   for state, flat_blocks in ((old_state, old_flat), (new_state, new_flat)):
     for begin, length in flat_blocks:
       MarkBlock(state, begin, begin + length)
   return (GetBlocks(old_state), GetBlocks(new_state), ratio)


 def NormalizeBlocks(blocks, line):
   """Normalizes block representation of an intra line diff.

   One diff can have multiple representations. Sometimes the diff returned by
   the difflib for similar text sections is different even within same region.
   For example if 2 already indented lines were indented with one additional
   space character, the difflib may return the non matching space character to
   be any of the already existing spaces. So one line may show non matching
   space character as the first space character and the second line may show it
   to be the last space character. This is sometimes confusing. This is the
   side effect of the new regular expression we are using in WordDiff for
   identifying individual words. This regular expression ('b') treats a
   sequence of punctuation and whitespace characters as individual characters.
   It has some visual advantages for showing a character level punctuation
   change as one character change rather than a group of character change.

   Making the normalization too generic can have performance implications. So
   this implementation of normalize blocks intends to handle only one case.
   Let's say S represents the space character and () marks a matching block.
   Then the normalize operation will do following:

      SSSS(SS)(ABCD) => SSSS(SS)(ABCD)
      (SS)SSSS(ABCD) => SSSS(SS)(ABCD)
      (SSSS)SS(ABCD) => SS(SSSS)(ABCD)

      and so on..

   Args:
     blocks: An array of (offset, len) tuples defined on 'line'. These blocks
             mark the matching areas. Anything between these matching blocks is
             considered non-matching.
     line: The text string on which the blocks are defined.

   Returns:
     An array of (offset, len) tuples representing the same diff but in
     normalized form. It has as many entries as 'blocks'.
   """
   normalized = []
   # Walk over every block together with its successor; the last block has no
   # successor and is copied over as is below.
   for (m_start, m_len), (next_start, _) in zip(blocks, blocks[1:]):
     # Length of the non matching text between this block and the next one.
     gap_len = next_start - (m_start + m_len)
     # Matching part followed by the non matching part.
     span = line[m_start:next_start]
     only_spaces = span.count(' ') == len(span)
     if m_len > 0 and gap_len > 0 and only_spaces:
       # Move the matching block towards the end i.e. normalize.
       normalized.append((m_start + gap_len, m_len))
     else:
       # Keep the existing matching block.
       normalized.append((m_start, m_len))
   normalized.append(blocks[-1])
   assert len(normalized) == len(blocks)
   return normalized


 def RenderIntraRegionDiff(lines, diff_blocks, tag, ratio, limit=80, indent=5,
                           tabsize=8, mark_tabs=False, dbg=False):
   """Renders intra region diff for one side.

   The blocks of every line are normalized and compacted before the line is
   rendered.

   Args:
     lines: list of strings representing source code in the region
     diff_blocks: blocks that were returned for this region by IntraRegionDiff()
     tag: 'new' or 'old'
     ratio: similarity ratio returned by the diff computing function
     limit: folding limit
     indent: indentation size
     tabsize: the number of spaces that a tab represents
     mark_tabs: if True, mark the first character of each expanded tab visually
     dbg: indicates if debug information should be rendered

   Returns:
     A list with one entry per item in input 'lines'; each entry is the value
     returned by RenderIntraLineDiff() for that line, whose first element is
     the rendered line.
   """
   dbg_info = ('Ratio: %.1f' % ratio) if dbg else None
   rendered = []
   for line, blocks in zip(lines, diff_blocks):
     compacted = CompactBlocks(NormalizeBlocks(blocks, line))
     rendered.append(RenderIntraLineDiff(compacted,
                                         line,
                                         tag,
                                         dbg_info=dbg_info,
                                         limit=limit,
                                         indent=indent,
                                         tabsize=tabsize,
                                         mark_tabs=mark_tabs))
   assert len(rendered) == len(lines)
   return rendered