eggs/mercurial-1.7.3-py2.6-linux-x86_64.egg/mercurial/similar.py
author Nishanth Amuluru <nishanth@fossee.in>
Tue, 11 Jan 2011 00:23:47 +0530
changeset 134 563fe356947d
parent 69 c6bca38c1cbf
permissions -rw-r--r--
created browse textbooks page
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
69
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     1
# similar.py - mechanisms for finding similar files
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     2
#
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     3
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     4
#
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     5
# This software may be used and distributed according to the terms of the
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     6
# GNU General Public License version 2 or any later version.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     7
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     8
from i18n import _
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
     9
import util
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    10
import mdiff
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    11
import bdiff
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    12
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    13
def _findexactmatches(repo, added, removed):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    14
    '''find renamed files that have no changes
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    15
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    16
    Takes a list of new filectxs and a list of removed filectxs, and yields
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    17
    (before, after) tuples of exact matches.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    18
    '''
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    19
    numfiles = len(added) + len(removed)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    20
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    21
    # Get hashes of removed files.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    22
    hashes = {}
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    23
    for i, fctx in enumerate(removed):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    24
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    25
        h = util.sha1(fctx.data()).digest()
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    26
        hashes[h] = fctx
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    27
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    28
    # For each added file, see if it corresponds to a removed file.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    29
    for i, fctx in enumerate(added):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    30
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    31
                total=numfiles)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    32
        h = util.sha1(fctx.data()).digest()
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    33
        if h in hashes:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    34
            yield (hashes[h], fctx)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    35
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    36
    # Done
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    37
    repo.ui.progress(_('searching for exact renames'), None)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    38
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    39
def _findsimilarmatches(repo, added, removed, threshold):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    40
    '''find potentially renamed files based on similar file content
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    41
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    42
    Takes a list of new filectxs and a list of removed filectxs, and yields
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    43
    (before, after, score) tuples of partial matches.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    44
    '''
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    45
    copies = {}
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    46
    for i, r in enumerate(removed):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    47
        repo.ui.progress(_('searching for similar files'), i, total=len(removed))
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    48
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    49
        # lazily load text
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    50
        @util.cachefunc
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    51
        def data():
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    52
            orig = r.data()
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    53
            return orig, mdiff.splitnewlines(orig)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    54
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    55
        def score(text):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    56
            orig, lines = data()
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    57
            # bdiff.blocks() returns blocks of matching lines
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    58
            # count the number of bytes in each
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    59
            equal = 0
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    60
            matches = bdiff.blocks(text, orig)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    61
            for x1, x2, y1, y2 in matches:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    62
                for line in lines[y1:y2]:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    63
                    equal += len(line)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    64
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    65
            lengths = len(text) + len(orig)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    66
            return equal * 2.0 / lengths
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    67
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    68
        for a in added:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    69
            bestscore = copies.get(a, (None, threshold))[1]
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    70
            myscore = score(a.data())
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    71
            if myscore >= bestscore:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    72
                copies[a] = (r, myscore)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    73
    repo.ui.progress(_('searching'), None)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    74
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    75
    for dest, v in copies.iteritems():
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    76
        source, score = v
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    77
        yield source, dest, score
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    78
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    79
def findrenames(repo, added, removed, threshold):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    80
    '''find renamed files -- yields (before, after, score) tuples'''
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    81
    parentctx = repo['.']
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    82
    workingctx = repo[None]
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    83
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    84
    # Zero length files will be frequently unrelated to each other, and
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    85
    # tracking the deletion/addition of such a file will probably cause more
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    86
    # harm than good. We strip them out here to avoid matching them later on.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    87
    addedfiles = set([workingctx[fp] for fp in added
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    88
            if workingctx[fp].size() > 0])
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    89
    removedfiles = set([parentctx[fp] for fp in removed
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    90
            if fp in parentctx and parentctx[fp].size() > 0])
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    91
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    92
    # Find exact matches.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    93
    for (a, b) in _findexactmatches(repo,
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    94
            sorted(addedfiles), sorted(removedfiles)):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    95
        addedfiles.remove(b)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    96
        yield (a.path(), b.path(), 1.0)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    97
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    98
    # If the user requested similar files to be matched, search for them also.
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
    99
    if threshold < 1.0:
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
   100
        for (a, b, score) in _findsimilarmatches(repo,
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
   101
                sorted(addedfiles), sorted(removedfiles), threshold):
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
   102
            yield (a.path(), b.path(), score)
c6bca38c1cbf Added buildout stuff and made changes accordingly
Nishanth Amuluru <nishanth@fossee.in>
parents:
diff changeset
   103