diff options
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r-- | Lib/difflib.py | 127 |
1 file changed, 66 insertions, 61 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py index 3bbcb76b7ec..e6cc6ee4425 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 """ Module difflib -- helpers for computing deltas between objects. @@ -32,9 +32,9 @@ __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 'unified_diff', 'HtmlDiff', 'Match'] +import warnings import heapq from collections import namedtuple as _namedtuple -from functools import reduce Match = _namedtuple('Match', 'a b size') @@ -80,7 +80,7 @@ class SequenceMatcher: sequences. As a rule of thumb, a .ratio() value over 0.6 means the sequences are close matches: - >>> print round(s.ratio(), 3) + >>> print(round(s.ratio(), 3)) 0.866 >>> @@ -88,7 +88,7 @@ class SequenceMatcher: .get_matching_blocks() is handy: >>> for block in s.get_matching_blocks(): - ... print "a[%d] and b[%d] match for %d elements" % block + ... print("a[%d] and b[%d] match for %d elements" % block) a[0] and b[0] match for 8 elements a[8] and b[17] match for 21 elements a[29] and b[38] match for 0 elements @@ -101,7 +101,7 @@ class SequenceMatcher: use .get_opcodes(): >>> for opcode in s.get_opcodes(): - ... print "%6s a[%d:%d] b[%d:%d]" % opcode + ... print("%6s a[%d:%d] b[%d:%d]" % opcode) equal a[0:8] b[0:8] insert a[8:8] b[8:17] equal a[8:29] b[17:38] @@ -183,7 +183,7 @@ class SequenceMatcher: # we need to do to 'a' to change it into 'b'?" # b2j # for x in b, b2j[x] is a list of the indices (into b) - # at which x appears; junk elements do not appear + # at which x appears; junk and popular elements do not appear # fullbcount # for x in b, fullbcount[x] == the number of times x # appears in b; only materialized if really needed (used @@ -205,15 +205,10 @@ class SequenceMatcher: # subtle but helpful effects on the algorithm, which I'll # get around to writing up someday <0.9 wink>. # DON'T USE! Only __chain_b uses this. 
Use isbjunk. - # isbjunk - # for x in b, isbjunk(x) == isjunk(x) but much faster; - # it's really the __contains__ method of a hidden dict. - # DOES NOT WORK for x in a! - # isbpopular - # for x in b, isbpopular(x) is true iff b is reasonably long - # (at least 200 elements) and x accounts for more than 1 + 1% of - # its elements (when autojunk is enabled). - # DOES NOT WORK for x in a! + # bjunk + # the items in b for which isjunk is True. + # bpopular + # nonjunk items in b treated as junk by the heuristic (if used). self.isjunk = isjunk self.a = self.b = None @@ -322,30 +317,39 @@ class SequenceMatcher: indices.append(i) # Purge junk elements - junk = set() + self.bjunk = junk = set() isjunk = self.isjunk if isjunk: - for elt in list(b2j.keys()): # using list() since b2j is modified + for elt in b2j.keys(): if isjunk(elt): junk.add(elt) - del b2j[elt] + for elt in junk: # separate loop avoids separate list of keys + del b2j[elt] # Purge popular elements that are not junk - popular = set() + self.bpopular = popular = set() n = len(b) if self.autojunk and n >= 200: ntest = n // 100 + 1 - for elt, idxs in list(b2j.items()): + for elt, idxs in b2j.items(): if len(idxs) > ntest: popular.add(elt) - del b2j[elt] - - # Now for x in b, isjunk(x) == x in junk, but the latter is much faster. - # Sicne the number of *unique* junk elements is probably small, the - # memory burden of keeping this set alive is likely trivial compared to - # the size of b2j. - self.isbjunk = junk.__contains__ - self.isbpopular = popular.__contains__ + for elt in popular: # ditto; as fast for 1% deletion + del b2j[elt] + + def isbjunk(self, item): + "Deprecated; use 'item in SequenceMatcher().bjunk'." + warnings.warn("'SequenceMatcher().isbjunk(item)' is deprecated;\n" + "use 'item in SMinstance.bjunk' instead.", + DeprecationWarning, 2) + return item in self.bjunk + + def isbpopular(self, item): + "Deprecated; use 'item in SequenceMatcher().bpopular'." 
+ warnings.warn("'SequenceMatcher().isbpopular(item)' is deprecated;\n" + "use 'item in SMinstance.bpopular' instead.", + DeprecationWarning, 2) + return item in self.bpopular def find_longest_match(self, alo, ahi, blo, bhi): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -403,14 +407,14 @@ class SequenceMatcher: # Windiff ends up at the same place as diff, but by pairing up # the unique 'b's and then matching the first two 'a's. - a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk + a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ besti, bestj, bestsize = alo, blo, 0 # find longest junk-free match # during an iteration of the loop, j2len[j] = length of longest # junk-free match ending with a[i-1] and b[j] j2len = {} nothing = [] - for i in xrange(alo, ahi): + for i in range(alo, ahi): # look at all instances of a[i] in b; note that because # b2j has no junk keys, the loop is skipped if a[i] is junk j2lenget = j2len.get @@ -472,7 +476,7 @@ class SequenceMatcher: triple with n==0. >>> s = SequenceMatcher(None, "abxcd", "abcd") - >>> s.get_matching_blocks() + >>> list(s.get_matching_blocks()) [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] """ @@ -548,8 +552,8 @@ class SequenceMatcher: >>> b = "abycdf" >>> s = SequenceMatcher(None, a, b) >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): - ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % - ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])) + ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % + ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))) delete a[0:1] (q) b[0:0] () equal a[1:3] (ab) b[0:2] (ab) replace a[3:4] (x) b[2:3] (y) @@ -590,7 +594,7 @@ class SequenceMatcher: Each group is in the same format as returned by get_opcodes(). 
>>> from pprint import pprint - >>> a = map(str, range(1,40)) + >>> a = list(map(str, range(1,40))) >>> b = a[:] >>> b[8:8] = ['i'] # Make an insertion >>> b[20] += 'x' # Make a replacement @@ -655,8 +659,7 @@ class SequenceMatcher: 1.0 """ - matches = reduce(lambda sum, triple: sum + triple[-1], - self.get_matching_blocks(), 0) + matches = sum(triple[-1] for triple in self.get_matching_blocks()) return _calculate_ratio(matches, len(self.a) + len(self.b)) def quick_ratio(self): @@ -723,7 +726,7 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): >>> import keyword as _keyword >>> get_close_matches("wheel", _keyword.kwlist) ['while'] - >>> get_close_matches("apple", _keyword.kwlist) + >>> get_close_matches("Apple", _keyword.kwlist) [] >>> get_close_matches("accept", _keyword.kwlist) ['except'] @@ -836,7 +839,7 @@ class Differ: As a single multi-line string it looks like this: - >>> print ''.join(result), + >>> print(''.join(result), end="") 1. Beautiful is better than ugly. - 2. Explicit is better than implicit. - 3. Simple is better than complex. @@ -893,8 +896,9 @@ class Differ: Example: - >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), + >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1))), + ... end="") - one ? ^ + ore @@ -917,14 +921,14 @@ class Differ: elif tag == 'equal': g = self._dump(' ', a, alo, ahi) else: - raise ValueError, 'unknown tag %r' % (tag,) + raise ValueError('unknown tag %r' % (tag,)) for line in g: yield line def _dump(self, tag, x, lo, hi): """Generate comparison results for a same-tagged range.""" - for i in xrange(lo, hi): + for i in range(lo, hi): yield '%s %s' % (tag, x[i]) def _plain_replace(self, a, alo, ahi, b, blo, bhi): @@ -954,7 +958,7 @@ class Differ: >>> d = Differ() >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ... ['abcdefGhijkl\n'], 0, 1) - >>> print ''.join(results), + >>> print(''.join(results), end="") - abcDefghiJkl ? 
^ ^ ^ + abcdefGhijkl @@ -970,10 +974,10 @@ class Differ: # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) - for j in xrange(blo, bhi): + for j in range(blo, bhi): bj = b[j] cruncher.set_seq2(bj) - for i in xrange(alo, ahi): + for i in range(alo, ahi): ai = a[i] if ai == bj: if eqi is None: @@ -1029,7 +1033,7 @@ class Differ: atags += ' ' * la btags += ' ' * lb else: - raise ValueError, 'unknown tag %r' % (tag,) + raise ValueError('unknown tag %r' % (tag,)) for line in self._qformat(aelt, belt, atags, btags): yield line else: @@ -1062,7 +1066,7 @@ class Differ: >>> d = Differ() >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n', ... ' ^ ^ ^ ', ' ^ ^ ^ ') - >>> for line in results: print repr(line) + >>> for line in results: print(repr(line)) ... '- \tabcDefghiJkl\n' '? \t ^ ^ ^\n' @@ -1184,7 +1188,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', ... 'zero one tree four'.split(), 'Original', 'Current', ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', ... lineterm=''): - ... print line # doctest: +NORMALIZE_WHITESPACE + ... print(line) # doctest: +NORMALIZE_WHITESPACE --- Original 2005-01-26 23:30:50 +++ Current 2010-04-02 10:20:52 @@ -1,4 +1,4 @@ @@ -1215,10 +1219,10 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', for line in a[i1:i2]: yield ' ' + line continue - if tag in ('replace', 'delete'): + if tag in {'replace', 'delete'}: for line in a[i1:i2]: yield '-' + line - if tag in ('replace', 'insert'): + if tag in {'replace', 'insert'}: for line in b[j1:j2]: yield '+' + line @@ -1265,8 +1269,9 @@ def context_diff(a, b, fromfile='', tofile='', Example: - >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), + >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')), + ... 
end="") *** Original --- Current *************** @@ -1298,7 +1303,7 @@ def context_diff(a, b, fromfile='', tofile='', file1_range = _format_range_context(first[1], last[2]) yield '*** {} ****{}'.format(file1_range, lineterm) - if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group): + if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group): for tag, i1, i2, _, _ in group: if tag != 'insert': for line in a[i1:i2]: @@ -1307,7 +1312,7 @@ def context_diff(a, b, fromfile='', tofile='', file2_range = _format_range_context(first[3], last[4]) yield '--- {} ----{}'.format(file2_range, lineterm) - if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group): + if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group): for tag, _, _, j1, j2 in group: if tag != 'delete': for line in b[j1:j2]: @@ -1336,7 +1341,7 @@ def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1)) - >>> print ''.join(diff), + >>> print(''.join(diff), end="") - one ? ^ + ore @@ -1469,7 +1474,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, # so we can do some very readable comparisons. 
while len(lines) < 4: try: - lines.append(diff_lines_iterator.next()) + lines.append(next(diff_lines_iterator)) except StopIteration: lines.append('X') s = ''.join([line[0] for line in lines]) @@ -1556,7 +1561,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, while True: # Collecting lines of text until we have a from/to pair while (len(fromlines)==0 or len(tolines)==0): - from_line, to_line, found_diff =line_iterator.next() + from_line, to_line, found_diff = next(line_iterator) if from_line is not None: fromlines.append((from_line,found_diff)) if to_line is not None: @@ -1571,7 +1576,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, line_pair_iterator = _line_pair_iterator() if context is None: while True: - yield line_pair_iterator.next() + yield next(line_pair_iterator) # Handle case where user wants context differencing. We must do some # storage of lines until we know for sure that they are to be yielded. else: @@ -1584,7 +1589,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, index, contextLines = 0, [None]*(context) found_diff = False while(found_diff is False): - from_line, to_line, found_diff = line_pair_iterator.next() + from_line, to_line, found_diff = next(line_pair_iterator) i = index % context contextLines[i] = (from_line, to_line, found_diff) index += 1 @@ -1604,7 +1609,7 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, # Now yield the context lines after the change lines_to_write = context-1 while(lines_to_write): - from_line, to_line, found_diff = line_pair_iterator.next() + from_line, to_line, found_diff = next(line_pair_iterator) # If another change within the context, extend the context if found_diff: lines_to_write = context-1 @@ -2032,11 +2037,11 @@ def restore(delta, which): >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 
'ore\ntree\nemu\n'.splitlines(1)) >>> diff = list(diff) - >>> print ''.join(restore(diff, 1)), + >>> print(''.join(restore(diff, 1)), end="") one two three - >>> print ''.join(restore(diff, 2)), + >>> print(''.join(restore(diff, 2)), end="") ore tree emu @@ -2044,7 +2049,7 @@ def restore(delta, which): try: tag = {1: "- ", 2: "+ "}[int(which)] except KeyError: - raise ValueError, ('unknown delta choice (must be 1 or 2): %r' + raise ValueError('unknown delta choice (must be 1 or 2): %r' % which) prefixes = (" ", tag) for line in delta: |