# coding: utf-8

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

"""Tools for diffing notebooks.

All diff tools here currently assumes the notebooks have already been
converted to the same format version, currently v4 at time of writing.
Up- and down-conversion is handled by nbformat.
"""

import operator
import re
import copy
from functools import lru_cache

from ..diff_format import MappingDiffBuilder, DiffOp
from ..utils import defaultdict2

from .config import DiffConfig
from .generic import (
    diff, diff_sequence_multilevel, compare_strings_approximate,
    diff_string_lines,
)

__all__ = ["diff_notebooks"]

# A regexp matching base64 encoded data
_base64 = re.compile(r'^(?:[a-z0-9+/]{4})*(?:[a-z0-9+/]{2}==|[a-z0-9+/]{3}=)?$', re.UNICODE | re.IGNORECASE)

# A regexp matching common python repr-style output like
# <module.type at 0xmemoryaddress>
re_repr = re.compile(r"<[a-z0-9._]+ at 0x[a-f0-9]{8,16}>", re.IGNORECASE)

re_pointer = re.compile(r"0x[a-f0-9]{8,16}", re.IGNORECASE)

TEXT_MIMEDATA_MAX_COMPARE_LENGTH = 10000
STREAM_MAX_COMPARE_LENGTH = 1000


# List of mimes we can diff recursively
_split_mimes = (
    'text/',
    'image/svg+xml',
    'application/javascript',
    'application/json',
    )


def _is_base64(test_string, min_len=64):
    """Whether string is base64 data, possibly with newlines in it"""
    if len(test_string) < min_len:
        return False
    return _base64.match(''.join(test_string.splitlines()))


# TODO: Maybe cleaner to make the split between strict/approximate
#       an argument instead of separate functions.


@lru_cache(maxsize=1024, typed=False)
def compare_text_approximate(x, y, maxlen=None):
    # Fast cutoff when one is empty
    if bool(x) != bool(y):
        return False

    if isinstance(x, list):
        x = "".join(x)
    if isinstance(y, list):
        y = "".join(y)

    # TODO: Review whether this is wanted.
    #       The motivation is to align tiny
    #       strings in outputs such as a single number.
    # Allow aligning short strings without comparison
    nx = len(x)
    ny = len(y)
    shortlen = 10  # TODO: Add this to configuration framework
    if nx < shortlen and ny < shortlen:
        return True

    return compare_strings_approximate(x, y, threshold=0.7, maxlen=maxlen)


def compare_text_strict(x, y, maxlen=None):
    # TODO: Doesn't have to be 100% equal here?
    if isinstance(x, list):
        x = "".join(x)
    if isinstance(y, list):
        y = "".join(y)
    if len(x) == len(y) and x == y:
        return True
    return compare_strings_approximate(x, y, threshold=0.95, maxlen=maxlen)


def make_compare_text(maxlen, strict):
    def compare_text(x, y):
        if strict:
            return compare_text_strict(x, y, maxlen=maxlen)
        return compare_text_approximate(x, y, maxlen=maxlen)
    return compare_text


def compare_base64_strict(x, y):
    if len(x) != len(y):
        return False
    # TODO: Handle base64 data another way?
    return x == y


@lru_cache(maxsize=128, typed=False)
def _compare_mimedata_strings(x, y, comp_text, comp_base64):
    # Most likely base64 encoded data
    if _is_base64(x):
        return comp_base64(x, y)
    else:
        return comp_text(x, y)


def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
    mimetype = mimetype.lower()

    # TODO: Test this. Match repr-style oneliners with random pointer
    if mimetype == "text/plain":
        # Allow short texts to only differ by pointer values
        if "\n" not in x and "\n" not in y:
            xsplit = re_pointer.split(x)
            ysplit = re_pointer.split(y)
            if xsplit == ysplit:
                return True

    if mimetype.startswith("text/"):
        return comp_text(x, y)

    # TODO: Compare binary images?
    #if mimetype.startswith("image/"):
    if isinstance(x, str) and isinstance(y, str):
        return _compare_mimedata_strings(x, y, comp_text, comp_base64)
    # Fallback to exactly equal
    return x == y


def compare_mimedata_approximate(mimetype, x, y):
    return _compare_mimedata(
        mimetype, x, y,
        make_compare_text(maxlen=TEXT_MIMEDATA_MAX_COMPARE_LENGTH, strict=False),
        compare_base64_strict)


def compare_mimedata_strict(mimetype, x, y):
    return _compare_mimedata(
        mimetype, x, y,
        make_compare_text(maxlen=TEXT_MIMEDATA_MAX_COMPARE_LENGTH, strict=True),
        compare_base64_strict)


def compare_mimebundle_approximate(x, y):
    # Get the simple and cheap stuff out of the way
    if x is None and y is None:
        return True
    if x is None or y is None:
        return False

    # This only checks that the same mime types are present
    if set(x.keys()) != set(y.keys()):
        return False

    for key in x.keys():
        # Delegate to mimetype specific comparison
        if not compare_mimedata_approximate(key, x[key], y[key]):
            return False

    # Didn't fail up to here it must be equal
    return True


def compare_mimebundle_strict(x, y):
    # Get the simple and cheap stuff out of the way
    if x is None and y is None:
        return True
    if x is None or y is None:
        return False

    # This only checks that the same mime types are present
    if set(x.keys()) != set(y.keys()):
        return False

    for key in x.keys():
        # Delegate to mimetype specific comparison
        if not compare_mimedata_strict(key, x[key], y[key]):
            return False

    # Didn't fail up to here it must be equal
    return True


def compare_tracebacks(xt, yt):
    if len(xt) != len(yt):
        return False
    for x, y in zip(xt, yt):
        if not compare_strings_approximate(x, y):
            return False
    return True


def compare_output_approximate(x, y):
    "Compare type and data of output cells x,y approximately."
    # NB! This is used as a basis by the exact compare_output.

    # Fast cutuff
    ot = x["output_type"]
    if ot != y["output_type"]:
        return False

    # Sanity cutoff
    xkeys = set(x)
    ykeys = set(y)
    if xkeys != ykeys:
        return False

    # Deliberately skipping metadata and execution count here
    handled = set(("output_type", "metadata", "execution_count"))

    if ot == "stream":
        if x["name"] != y["name"]:
            return False
        if not compare_strings_approximate(x["text"], y["text"], maxlen=STREAM_MAX_COMPARE_LENGTH):
            return False
        handled.update(("name", "text"))

    elif ot == "error":
        if x["ename"] != y["ename"]:
            return False
        if x["evalue"] != y["evalue"]:
            return False

        # Compare tracebacks
        xt = x["traceback"]
        yt = y["traceback"]
        if not compare_tracebacks(xt, yt):
            return False

        handled.update(("ename", "evalue", "traceback"))

    elif ot == "display_data" or ot == "execute_result":
        xd = x["data"]
        yd = y["data"]
        if not compare_mimebundle_approximate(xd, yd):
            return False
        handled.update(("data",))

    else:
        # Unknown type
        pass

    # Play safe on unknown keys
    for k in xkeys - handled:
        if x[k] != y[k]:
            return False

    # NB! Ignoring metadata and execution count
    return True


def compare_output_strict(x, y):
    "Compare type and data of output cells x,y to higher accuracy."

    # Fast cutuff
    ot = x["output_type"]
    if ot != y["output_type"]:
        return False

    # Sanity cutoff
    xkeys = set(x)
    ykeys = set(y)
    if xkeys != ykeys:
        return False

    handled = set(("output_type", "data"))

    # Strict match on all keys we do not otherwise handle
    for k in xkeys - handled:
        if x[k] != y[k]:
            return False

    if not compare_mimebundle_strict(x.get("data"), y.get("data")):
        return False

    # NB! Ignoring metadata and execution count
    return True


def compare_cell_approximate(x, y):
    """Compare cells x,y with approximate heuristics.

    This is used to align cells in the /cells list
    in the third and last multilevel diff iteration.
    """
    # Cell types must match
    if x["cell_type"] != y["cell_type"]:
        return False

    # Compare sources
    if not compare_text_approximate(x["source"], y["source"]):
        return False

    # NB! Ignoring metadata, execution_count, outputs
    return True


def compare_outputs_approximate(xoutputs, youtputs):
    """Compare sequences of outputs with approximate heuristics."""
    dd = diff_item_at_path(xoutputs, youtputs, "/cells/*/outputs")
    if any(e.op != DiffOp.PATCH for e in dd):
        # Something added or removed
        return False
    # If nothing was added or removed, that means all
    # items in outputs lists are determined to align
    return True


def compare_cell_moderate(x, y):
    """Compare cells x,y with moderate accuracy heuristics.

    This is used to align cells in the /cells list
    in the second multilevel diff iteration.
    """

    # Cell types must match
    if x["cell_type"] != y["cell_type"]:
        return False

    # Compare sources
    if not compare_text_approximate(x["source"], y["source"]):
        return False

    # Compare outputs for code cells
    if x["cell_type"] == "code":
        xop = x["outputs"] or ()
        yop = y["outputs"] or ()
        if bool(xop) != bool(yop):
            return False
        return compare_outputs_approximate(xop, yop)

    # NB! Ignoring metadata and execution_count
    return True


def compare_cell_strict(x, y):
    """Compare cells x,y with higher accuracy heuristics.

    This is used to align cells in the /cells list
    in the first multilevel diff iteration.
    """

    # Cell types must match
    if x["cell_type"] != y["cell_type"]:
        return False

    # Compare sources
    if not compare_text_strict(x["source"], y["source"]):
        return False

    # Compare outputs for code cells
    if x["cell_type"] == "code":
        xop = x["outputs"] or ()
        yop = y["outputs"] or ()
        # Be strict on number of outputs
        if len(xop) != len(yop):
            return False
        # Be strict on order and content of outputs
        for xo, yo in zip(xop, yop):
            if not compare_output_strict(xo, yo):
                return False

    # NB! Ignoring metadata and execution count
    return True


def compare_cell_by_ids(x, y):
    """Compare cells x,y strictly using cell IDs"""
    # Only consider equal if both have IDs and they match
    return 'id' in x and 'id' in y and x['id'] == y['id']


def diff_single_outputs(a, b, path="/cells/*/outputs/*", config=None):
    """DiffOp a pair of output cells."""
    assert path == "/cells/*/outputs/*", 'Invalid path for ouput: %r' % path
    assert a.output_type == b.output_type, 'cannot diff outputs of different types'

    if a.output_type in ("execute_result", "display_data"):
        di = MappingDiffBuilder()

        # Separate data from output during diffing:
        tmp_data = a.pop('data')
        a_conj = copy.deepcopy(a)  # Output without data
        a.data = tmp_data          # Restore output
        tmp_data = b.pop('data')
        b_conj = copy.deepcopy(b)
        b.data = tmp_data
        # Only diff outputs without data:
        dd_conj = diff(a_conj, b_conj)
        if dd_conj:
            for e in dd_conj:
                di.append(e)

        # Only diff data:
        dd = diff_mime_bundle(a.data, b.data, path=path+"/data", config=config)
        if dd:
            di.patch("data", dd)

        return di.validated()
    else:
        return diff(a, b, config=config)


def add_mime_diff(key, avalue, bvalue, diffbuilder):
    # TODO: Handle output diffing with plugins?
    # I.e. image diff, svg diff, json diff, etc.

    mimetype = key.lower()
    if isinstance(avalue, str) and isinstance(bvalue, str) and avalue == bvalue:
        return
    if any(mimetype.startswith(tm) for tm in _split_mimes):
        dd = diff(avalue, bvalue)
        if dd:
            diffbuilder.patch(key, dd)
    elif avalue != bvalue:
        diffbuilder.replace(key, bvalue)


def diff_attachments(a, b, path="/cells/*/attachments", config=None):
    """Diff a pair of attachment collections"""
    assert path == "/cells/*/attachments", 'Invalid path for attachment: %r' % path

    # Two events can happen:
    #  1: An attachment is added/removed/patched
    #  2: An attachment is renamed (key change)
    # Currently, #2 is handled as two ops (an add and a remove)

    # keys here are 'filenames' of the attachments
    if not isinstance(a, dict) or not isinstance(b, dict):
        raise TypeError('Attachments stores should be dictionaries. Got %r and %r' % (a, b))
    akeys = set(a.keys())
    bkeys = set(b.keys())

    di = MappingDiffBuilder()

    # Sorting keys in loops to get a deterministic diff result
    for key in sorted(akeys - bkeys):
        di.remove(key)

    # Handle values for keys in both a and b
    for key in sorted(akeys & bkeys):
        avalue = a[key]
        bvalue = b[key]

        dd = diff_mime_bundle(avalue, bvalue)
        if dd:
            di.patch(key, dd)

    for key in sorted(bkeys - akeys):
        di.add(key, b[key])
    return di.validated()


def diff_mime_bundle(a, b, path=None, config=None):
    """Diff a MIME bundle.

    A MIME bundle has MIME types as keys, with values that are
    of the corresponding MIME type.

    Very similar to the generic dict differ, except it passes
    the diff builder to the sub-differ.
    """
    # keys here are mime/types
    if not isinstance(a, dict) or not isinstance(b, dict):
        raise TypeError('MIME bundles should be dictionaries. Got %r and %r' % (a, b))
    akeys = set(a.keys())
    bkeys = set(b.keys())

    di = MappingDiffBuilder()

    # Sorting keys in loops to get a deterministic diff result
    for key in sorted(akeys - bkeys):
        di.remove(key)

    # Handle values for keys in both a and b
    for key in sorted(akeys & bkeys):
        avalue = a[key]
        bvalue = b[key]
        add_mime_diff(key, avalue, bvalue, di)

    for key in sorted(bkeys - akeys):
        di.add(key, b[key])
    return di.validated()


def diff_ignore(*args, **kwargs):
    """Always returns an empty diff"""
    return []


def diff_ignore_keys(inner_differ, ignore_keys):
    """Call inner_differ, but filter the resulting diff.

    Will ignore all direct diff values that has a key in
    ignore_keys. I.e. this will not recurse into patch ops.
    """
    def ignored_diff(*args, **kwargs):
        d = inner_differ(*args, **kwargs)
        ret = []
        for e in d:
            if e.key not in ignore_keys:
                ret.append(e)
        return ret
    return ignored_diff


# Sequence diffs should be applied with multilevel
# algorithm for paths with more than one predicate,
# and using operator.__eq__ if no match in there.
notebook_predicates = defaultdict2(lambda: [operator.__eq__], {
    # Predicates to compare cells in order of low-to-high precedence
    "/cells": [
        compare_cell_approximate,
        compare_cell_moderate,
        compare_cell_strict,
        compare_cell_by_ids,
        ],
    # Predicates to compare output cells (within one cell) in order of low-to-high precedence
    "/cells/*/outputs": [
        compare_output_approximate,
        compare_output_strict,
        ]
    })

# Recursive diffing of substructures should pick a rule from here, with diff as fallback
notebook_differs = defaultdict2(lambda: diff, {
    "/cells": diff_sequence_multilevel,
    "/cells/*": diff,
    "/cells/*/source": diff_string_lines,
    "/cells/*/outputs": diff_sequence_multilevel,
    "/cells/*/outputs/*": diff_single_outputs,
    "/cells/*/attachments": diff_attachments,
    })


notebook_config = DiffConfig(
    predicates=notebook_predicates,
    differs=notebook_differs,
    atomic_paths={
        "/cells/*/id": True
    }
)


def reset_notebook_differ():
    """Reset the notebook_differs dictionary to default values."""
    # As it is a defaultdict2, simply clear all set keys to reset:
    for key in tuple(notebook_differs.keys()):
        del notebook_differs[key]


def set_notebook_diff_ignores(ignore_paths):
    """Set/unset notebook differs to ignore.

    Parameters:
        ignore_paths: dict
            Dictionary with path strings (e.g. /cells/*/outputs) as keys.
            For each entry in the dictionary do the following:
            - if value is True, set path to `diff_ignore`.
            - if value is False, reset the differ of path to the default value.
            - if value is set/tuple/list, assume the container is a collection
              of subkeys of path to ignore with `diff_ignore_keys`.
    """
    for path, subkeys in ignore_paths.items():
        if subkeys is True:
            notebook_differs[path] = diff_ignore
        elif subkeys is False:
            if path in notebook_differs:
                del notebook_differs[path]
        elif isinstance(subkeys, (list, tuple, set)):
            notebook_differs[path] = diff_ignore_keys(notebook_differs[path], subkeys)
        else:
            raise ValueError('Invalid ignore config entry: %r: %r' % (path, subkeys))


def set_notebook_diff_targets(sources=True, outputs=True, attachments=True,
                              metadata=True, identifier=True, details=True):
    """Configure the notebook differs to include/ignore various changes."""

    config = {
        '/cells/*/source': not sources,
        '/cells/*/outputs': not outputs,
        '/cells/*/attachments': not attachments,
        '/metadata': not metadata,
        '/cells/*/id': not identifier,
        '/cells/*/metadata': not metadata,
        '/cells/*/outputs/*/metadata': not metadata,
        '/cells/*': False if details else ('execution_count',),
        '/cells/*/outputs/*': False if details else ('execution_count',),
    }
    set_notebook_diff_ignores(config)


def diff_cells(a, b):
    "This is currently just used by some tests."
    path = "/cells"
    return notebook_differs[path](a, b, path=path, config=notebook_config)


def diff_item_at_path(a, b, path):
    """Calculate the diff using the configured notebook differ for path."""
    return notebook_differs[path](a, b, path=path, config=notebook_config)


def diff_notebooks(a, b):
    """Compute the diff of two notebooks using customized heuristics and diff rules.

    The notebooks are expected in the format of nbformat's version 4.
    """
    if not (isinstance(a, dict) and isinstance(b, dict)):
        raise TypeError("Expected inputs to be dicts, got %r and %r" % (a, b))
    return diff(a, b, path="", config=notebook_config)