eggs/mercurial-1.7.3-py2.6-linux-x86_64.egg/mercurial/similar.py
changeset 69 c6bca38c1cbf
equal deleted inserted replaced
68:5ff1fc726848 69:c6bca38c1cbf
       
     1 # similar.py - mechanisms for finding similar files
       
     2 #
       
     3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
       
     4 #
       
     5 # This software may be used and distributed according to the terms of the
       
     6 # GNU General Public License version 2 or any later version.
       
     7 
       
     8 from i18n import _
       
     9 import util
       
    10 import mdiff
       
    11 import bdiff
       
    12 
       
    13 def _findexactmatches(repo, added, removed):
       
    14     '''find renamed files that have no changes
       
    15 
       
    16     Takes a list of new filectxs and a list of removed filectxs, and yields
       
    17     (before, after) tuples of exact matches.
       
    18     '''
       
    19     numfiles = len(added) + len(removed)
       
    20 
       
    21     # Get hashes of removed files.
       
    22     hashes = {}
       
    23     for i, fctx in enumerate(removed):
       
    24         repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
       
    25         h = util.sha1(fctx.data()).digest()
       
    26         hashes[h] = fctx
       
    27 
       
    28     # For each added file, see if it corresponds to a removed file.
       
    29     for i, fctx in enumerate(added):
       
    30         repo.ui.progress(_('searching for exact renames'), i + len(removed),
       
    31                 total=numfiles)
       
    32         h = util.sha1(fctx.data()).digest()
       
    33         if h in hashes:
       
    34             yield (hashes[h], fctx)
       
    35 
       
    36     # Done
       
    37     repo.ui.progress(_('searching for exact renames'), None)
       
    38 
       
    39 def _findsimilarmatches(repo, added, removed, threshold):
       
    40     '''find potentially renamed files based on similar file content
       
    41 
       
    42     Takes a list of new filectxs and a list of removed filectxs, and yields
       
    43     (before, after, score) tuples of partial matches.
       
    44     '''
       
    45     copies = {}
       
    46     for i, r in enumerate(removed):
       
    47         repo.ui.progress(_('searching for similar files'), i, total=len(removed))
       
    48 
       
    49         # lazily load text
       
    50         @util.cachefunc
       
    51         def data():
       
    52             orig = r.data()
       
    53             return orig, mdiff.splitnewlines(orig)
       
    54 
       
    55         def score(text):
       
    56             orig, lines = data()
       
    57             # bdiff.blocks() returns blocks of matching lines
       
    58             # count the number of bytes in each
       
    59             equal = 0
       
    60             matches = bdiff.blocks(text, orig)
       
    61             for x1, x2, y1, y2 in matches:
       
    62                 for line in lines[y1:y2]:
       
    63                     equal += len(line)
       
    64 
       
    65             lengths = len(text) + len(orig)
       
    66             return equal * 2.0 / lengths
       
    67 
       
    68         for a in added:
       
    69             bestscore = copies.get(a, (None, threshold))[1]
       
    70             myscore = score(a.data())
       
    71             if myscore >= bestscore:
       
    72                 copies[a] = (r, myscore)
       
    73     repo.ui.progress(_('searching'), None)
       
    74 
       
    75     for dest, v in copies.iteritems():
       
    76         source, score = v
       
    77         yield source, dest, score
       
    78 
       
    79 def findrenames(repo, added, removed, threshold):
       
    80     '''find renamed files -- yields (before, after, score) tuples'''
       
    81     parentctx = repo['.']
       
    82     workingctx = repo[None]
       
    83 
       
    84     # Zero length files will be frequently unrelated to each other, and
       
    85     # tracking the deletion/addition of such a file will probably cause more
       
    86     # harm than good. We strip them out here to avoid matching them later on.
       
    87     addedfiles = set([workingctx[fp] for fp in added
       
    88             if workingctx[fp].size() > 0])
       
    89     removedfiles = set([parentctx[fp] for fp in removed
       
    90             if fp in parentctx and parentctx[fp].size() > 0])
       
    91 
       
    92     # Find exact matches.
       
    93     for (a, b) in _findexactmatches(repo,
       
    94             sorted(addedfiles), sorted(removedfiles)):
       
    95         addedfiles.remove(b)
       
    96         yield (a.path(), b.path(), 1.0)
       
    97 
       
    98     # If the user requested similar files to be matched, search for them also.
       
    99     if threshold < 1.0:
       
   100         for (a, b, score) in _findsimilarmatches(repo,
       
   101                 sorted(addedfiles), sorted(removedfiles), threshold):
       
   102             yield (a.path(), b.path(), score)
       
   103