eggs/mercurial-1.7.3-py2.6-linux-x86_64.egg/mercurial/similar.py
changeset 69 c6bca38c1cbf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eggs/mercurial-1.7.3-py2.6-linux-x86_64.egg/mercurial/similar.py	Sat Jan 08 11:20:57 2011 +0530
@@ -0,0 +1,103 @@
+# similar.py - mechanisms for finding similar files
+#
+# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from i18n import _
+import util
+import mdiff
+import bdiff
+
+def _findexactmatches(repo, added, removed):
+    '''find renamed files that have no changes
+
+    Takes a list of new filectxs and a list of removed filectxs, and yields
+    (before, after) tuples of exact matches.
+    '''
+    numfiles = len(added) + len(removed)
+
+    # Get hashes of removed files.
+    hashes = {}
+    for i, fctx in enumerate(removed):
+        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
+        h = util.sha1(fctx.data()).digest()
+        hashes[h] = fctx
+
+    # For each added file, see if it corresponds to a removed file.
+    for i, fctx in enumerate(added):
+        repo.ui.progress(_('searching for exact renames'), i + len(removed),
+                total=numfiles)
+        h = util.sha1(fctx.data()).digest()
+        if h in hashes:
+            yield (hashes[h], fctx)
+
+    # Done
+    repo.ui.progress(_('searching for exact renames'), None)
+
+def _findsimilarmatches(repo, added, removed, threshold):
+    '''find potentially renamed files based on similar file content
+
+    Takes a list of new filectxs and a list of removed filectxs, and yields
+    (before, after, score) tuples of partial matches.
+    '''
+    copies = {}
+    for i, r in enumerate(removed):
+        repo.ui.progress(_('searching for similar files'), i, total=len(removed))
+
+        # lazily load text
+        @util.cachefunc
+        def data():
+            orig = r.data()
+            return orig, mdiff.splitnewlines(orig)
+
+        def score(text):
+            orig, lines = data()
+            # bdiff.blocks() returns blocks of matching lines
+            # count the number of bytes in each
+            equal = 0
+            matches = bdiff.blocks(text, orig)
+            for x1, x2, y1, y2 in matches:
+                for line in lines[y1:y2]:
+                    equal += len(line)
+
+            lengths = len(text) + len(orig)
+            return equal * 2.0 / lengths
+
+        for a in added:
+            bestscore = copies.get(a, (None, threshold))[1]
+            myscore = score(a.data())
+            if myscore >= bestscore:
+                copies[a] = (r, myscore)
+    repo.ui.progress(_('searching'), None)
+
+    for dest, v in copies.iteritems():
+        source, score = v
+        yield source, dest, score
+
+def findrenames(repo, added, removed, threshold):
+    '''find renamed files -- yields (before, after, score) tuples'''
+    parentctx = repo['.']
+    workingctx = repo[None]
+
+    # Zero length files will be frequently unrelated to each other, and
+    # tracking the deletion/addition of such a file will probably cause more
+    # harm than good. We strip them out here to avoid matching them later on.
+    addedfiles = set([workingctx[fp] for fp in added
+            if workingctx[fp].size() > 0])
+    removedfiles = set([parentctx[fp] for fp in removed
+            if fp in parentctx and parentctx[fp].size() > 0])
+
+    # Find exact matches.
+    for (a, b) in _findexactmatches(repo,
+            sorted(addedfiles), sorted(removedfiles)):
+        addedfiles.remove(b)
+        yield (a.path(), b.path(), 1.0)
+
+    # If the user requested similar files to be matched, search for them also.
+    if threshold < 1.0:
+        for (a, b, score) in _findsimilarmatches(repo,
+                sorted(addedfiles), sorted(removedfiles), threshold):
+            yield (a.path(), b.path(), score)
+