# coding: utf-8 # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. import operator from collections import defaultdict import difflib from ..diff_format import SequenceDiffBuilder, MappingDiffBuilder, validate_diff from ..diff_utils import count_consumed_symbols from .config import DiffConfig from .sequences import diff_strings_linewise, diff_sequence from .snakes import compute_snakes_multilevel, compute_diff_from_snakes __all__ = ["diff"] def default_predicates(): return defaultdict(lambda: (operator.__eq__,)) def default_differs(): return defaultdict(lambda: diff) def compare_strings_approximate(x, y, threshold=0.7, maxlen=None): "Compare to strings with approximate heuristics." # TODO: Add configuration framework # TODO: Tune threshold with realistic sources # Fast cutoff when one is empty if bool(x) != bool(y): return False # Cutoff on equality: Python has fast hash functions for strings, # and lists of strings also works fine if len(x) == len(y) and x == y: return True # TODO: Investigate performance and quality of this difflib ratio approach, # possibly one of the weakest links of the notebook diffing algorithm. # Alternatives to try are the libraries diff-patch-match and Levenschtein # Informal benchmark normalized to operator ==: # 1.0 operator == # 438.2 real_quick_ratio # 796.5 quick_ratio # 3088.2 ratio # The == cutoff will hit most of the time for long runs of # equal items, at least in the Myers diff algorithm. # Most other comparisons will likely not be very similar, # and the (real_)quick_ratio cutoffs will speed up those. # So the heavy ratio function is only used for close calls. # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False) s = difflib.SequenceMatcher(None, x, y, autojunk=False) # Use only the fast ratio approximations first if s.real_quick_ratio() < threshold: return False if s.quick_ratio() < threshold: return False if maxlen is not None and len(x) > maxlen and len(y) > maxlen: # We know from above that there is not an exact similarity return False return s.ratio() > threshold def diff(a, b, path="", config=None): "Compute the diff of two json-like objects, list or dict or string." if config is None: config = DiffConfig() if isinstance(a, list) and isinstance(b, list): d = diff_lists(a, b, path=path, config=config) elif isinstance(a, dict) and isinstance(b, dict): d = diff_dicts(a, b, path=path, config=config) elif isinstance(a, str) and isinstance(b, str): # Don't pass differs/predicates as the only possible use case is to # use a different character differ within each line or predicates # for comparing lines d = diff_strings_linewise(a, b) else: raise RuntimeError("Can currently only diff list, dict, or str objects.") # We can turn this off for performance after the library has been well tested: validate_diff(d) return d def diff_string_lines(a, b, path="", config=None): """Diff two lists of strings (lines)""" # This is mainly about short-circuiting to avoid full snakes for equal content # since we know we can rely on __eq__ comparison if len(a) == len(b) and a == b: return [] return diff_strings_linewise(a, b) def diff_sequence_multilevel(a, b, path="", config=None): """Compute diff of two lists with configurable behaviour.""" if config is None: config = DiffConfig() # Invoke multilevel snake computation algorithm compares = config.predicates[path or '/'] snakes = compute_snakes_multilevel(a, b, compares) # Convert snakes to diff return compute_diff_from_snakes(a, b, snakes, path=path, config=config) def diff_lists(a, b, path="", config=None, shallow_diff=None): """Compute diff of two lists with configurable behaviour.""" if config is None: config = DiffConfig() # If multiple compares are provided to this path, delegate to multilevel algorithm compares = config.predicates[path or '/'] if len(compares) > 1: assert shallow_diff is None return diff_sequence_multilevel(a, b, path=path, config=config) # First make a shallow sequence diff with custom compare, # unless it's provided for us if shallow_diff is None: shallow_diff = diff_sequence(a, b, compares[0]) # Next we recurse to diff items in sequence that are considered # similar by compares[0] in the loop below subpath = "/".join((path, "*")) diffit = config.differs[subpath] # Count consumed items i,j from a,b, (i="take" in patch_list) i, j = 0, 0 di = SequenceDiffBuilder() M = len(shallow_diff) for ie in range(M+1): if ie < M: # Consume n more unmentioned items before this diff entry # Note that index can be larger than i in the case where items # have been deleted from a and then insertions from b occur. e = shallow_diff[ie] index = e.key n = max(0, index - i) askip, bskip = count_consumed_symbols(e) else: # Consume final items after the last diff entry e = None n = len(a) - i askip, bskip = 0, 0 assert n >= 0, "Sanity check failed: Cannot have negative remaining entries" assert len(b) - j == n, "Sanity check failed: Base/remote indexing mismatch" # Recursively diff the n items that have been deemed similar for k in range(n): aval = a[i + k] bval = b[j + k] if not config.is_atomic(aval, subpath): cd = diffit(aval, bval, path=subpath, config=config) if cd: di.patch(i + k, cd) # FIXME: Not covered in tests, create test situation # Keep count of consumed items i += n + askip j += n + bskip # Insert the diff entry from shallow diff unless past the end # (this either adds or removes items) if ie < M: di.append(e) # Sanity check assert i == len(a), "Sanity check failed: Did not process all entries in a" assert j == len(b), "Sanity check failed: Did not process all entries in b" return di.validated() def diff_dicts(a, b, path="", config=None): """Compute diff of two dicts with configurable behaviour. Keys in both a and b will be handled based on Make a one-level diff of dicts a and b, using given compare operator to specify which items are considered the same. Items not mentioned in diff are items where compare(x, y) return True. For other items the diff will contain delete, insert, or replace entries. """ if config is None: config = DiffConfig() if not isinstance(a, dict) or not isinstance(b, dict): raise TypeError('Arguments to diff_dicts need to be dicts, got %r and %r' % (a, b)) akeys = set(a.keys()) bkeys = set(b.keys()) di = MappingDiffBuilder() # Sorting keys in loops to get a deterministic diff result for key in sorted(akeys - bkeys): di.remove(key) # Handle values for keys in both a and b for key in sorted(akeys & bkeys): avalue = a[key] bvalue = b[key] # If types are the same and nonatomic, recurse subpath = "/".join((path, key)) if type(avalue) is type(bvalue) and not config.is_atomic(avalue, path=subpath): diffit = config.differs[subpath] dd = diffit(avalue, bvalue, path=subpath, config=config) if dd: di.patch(key, dd) else: if (path or '/') in config.predicates: # Could also this a warning, but I think it shouldn't be done raise RuntimeError( "Found predicate(s) for path {} pointing to dict entry.".format( path or '/')) if avalue != bvalue: di.replace(key, bvalue) for key in sorted(bkeys - akeys): di.add(key, b[key]) return di.validated()