""" caching.py ----------- Functions and classes that help with tracking changes in `numpy.ndarray` and clearing cached values based on those changes. You should really `pip install xxhash`: ``` In [23]: %timeit int(blake2b(d).hexdigest(), 16) 102 us +/- 684 ns per loop In [24]: %timeit int(sha256(d).hexdigest(), 16) 142 us +/- 3.73 us In [25]: %timeit xxh3_64_intdigest(d) 3.37 us +/- 116 ns per loop ``` """ import os import time import warnings import numpy as np from functools import wraps from .constants import log from .util import is_sequence try: from collections.abc import Mapping except BaseException: from collections import Mapping # sha256 is always available from hashlib import sha256 as _sha256 def sha256(item): return int(_sha256(item).hexdigest(), 16) try: # blake2b is available on Python 3 and from hashlib import blake2b as _blake2b def hash_fallback(item): return int(_blake2b(item).hexdigest(), 16) except BaseException: # fallback to sha256 hash_fallback = sha256 # xxhash is up to 30x faster than sha256: # `pip install xxhash` try: # newest version of algorithm from xxhash import xxh3_64_intdigest as hash_fast except BaseException: try: # older version of the algorithm from xxhash import xxh64_intdigest as hash_fast except BaseException: # use hashlib as a fallback hashing library log.debug('falling back to hashlib ' + 'hashing: `pip install xxhash`' + 'for 50x faster cache checks') hash_fast = hash_fallback def tracked_array(array, dtype=None): """ Properly subclass a numpy ndarray to track changes. Avoids some pitfalls of subclassing by forcing contiguous arrays and does a view into a TrackedArray. Parameters ------------ array : array- like object To be turned into a TrackedArray dtype : np.dtype Which dtype to use for the array Returns ------------ tracked : TrackedArray Contains input array data. """ # if someone passed us None, just create an empty array if array is None: array = [] # make sure it is contiguous then view it as our subclass tracked = np.ascontiguousarray( array, dtype=dtype).view(TrackedArray) # should always be contiguous here assert tracked.flags['C_CONTIGUOUS'] return tracked def cache_decorator(function): """ A decorator for class methods, replaces @property but will store and retrieve function return values in object cache. Parameters ------------ function : method This is used as a decorator: ``` @cache_decorator def foo(self, things): return 'happy days' ``` """ # use wraps to preserve docstring @wraps(function) def get_cached(*args, **kwargs): """ Only execute the function if its value isn't stored in cache already. """ self = args[0] # use function name as key in cache name = function.__name__ # do the dump logic ourselves to avoid # verifying cache twice per call self._cache.verify() # access cache dict to avoid automatic validation # since we already called cache.verify manually if name in self._cache.cache: # already stored so return value return self._cache.cache[name] # value not in cache so execute the function value = function(*args, **kwargs) # store the value if self._cache.force_immutable and hasattr( value, 'flags') and len(value.shape) > 0: value.flags.writeable = False self._cache.cache[name] = value return value # all cached values are also properties # so they can be accessed like value attributes # rather than functions return property(get_cached) class TrackedArray(np.ndarray): """ Subclass of numpy.ndarray that provides hash methods to track changes. 
class TrackedArray(np.ndarray):
    """
    Subclass of numpy.ndarray that provides hash methods
    to track changes.

    The general method is to aggressively set 'modified' flags
    on operations which might (but don't necessarily) alter the
    array, so ideally we sometimes compute hashes when we don't
    need to, but we never return a stale hash.

    We store a boolean modified flag for each hash type to make
    checks fast even for queries of different hashes.

    Methods
    ----------
    __hash__ : int
      Runs the fastest available hash in this order:
      `xxh3_64, xxh_64, blake2b, sha256`
    """

    def __array_finalize__(self, obj):
        """
        Sets a modified flag on every TrackedArray.

        This flag will be set on every change as well as
        during copies and certain types of slicing.
        """
        self._dirty_hash = True
        if isinstance(obj, type(self)):
            obj._dirty_hash = True

    def __array_wrap__(self, out_arr, context=None):
        """
        Return a numpy scalar if array is 0d.

        See https://github.com/numpy/numpy/issues/5819
        """
        if out_arr.ndim:
            return np.ndarray.__array_wrap__(
                self, out_arr, context)
        # match numpy's behavior and return a numpy dtype scalar
        return out_arr[()]

    @property
    def mutable(self):
        return self.flags['WRITEABLE']

    @mutable.setter
    def mutable(self, value):
        self.flags.writeable = value

    def hash(self):
        warnings.warn(
            '`array.hash()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()

    def crc(self):
        warnings.warn(
            '`array.crc()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()

    def md5(self):
        warnings.warn(
            '`array.md5()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()

    def __hash__(self):
        """
        Return a fast hash of the contents of the array.

        Returns
        -------------
        hash : long int
          A hash of the array contents.
        """
        if not self._dirty_hash and hasattr(self, '_hashed'):
            # we have a valid hash without recomputing
            return self._hashed

        if self.flags['C_CONTIGUOUS']:
            hashed = hash_fast(self)
        else:
            # repeat the bookkeeping to get a contiguous array:
            # this is the case where we have sliced our nice
            # contiguous array into a non-contiguous block,
            # for example (note slice *after* track operation):
            # t = tracked_array(np.random.random(10))[::-1]
            hashed = hash_fast(np.ascontiguousarray(self))

        # assign the value and clear the dirty flag
        self._hashed = hashed
        self._dirty_hash = False

        return hashed
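
    # Example of the hashing behavior above (illustrative only): repeated
    # calls reuse the stored digest until the array is marked dirty, and
    # non-contiguous views are hashed through a contiguous copy.
    #
    #   a = tracked_array(np.random.random(10))
    #   h0 = hash(a)   # computed with the fastest available backend
    #   h1 = hash(a)   # h1 == h0, returned from `a._hashed` without rehashing
    #   r = a[::-1]    # non-contiguous view of the tracked array
    #   hash(r)        # hashed via np.ascontiguousarray(r)
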
""" self._dirty_hash = True return super(self.__class__, self).__iadd__( *args, **kwargs) def __isub__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__isub__( *args, **kwargs) def __imul__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__imul__( *args, **kwargs) def __idiv__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__idiv__( *args, **kwargs) def __itruediv__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__itruediv__( *args, **kwargs) def __imatmul__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__imatmul__( *args, **kwargs) def __ipow__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__ipow__( *args, **kwargs) def __imod__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__imod__( *args, **kwargs) def __ifloordiv__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__ifloordiv__( *args, **kwargs) def __ilshift__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__ilshift__( *args, **kwargs) def __irshift__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__irshift__( *args, **kwargs) def __iand__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__iand__( *args, **kwargs) def __ixor__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__ixor__( *args, **kwargs) def __ior__(self, *args, **kwargs): self._dirty_hash = True return super(self.__class__, self).__ior__( *args, **kwargs) def __setitem__(self, *args, **kwargs): self._dirty_hash = True super(self.__class__, self).__setitem__( *args, **kwargs) def __setslice__(self, *args, **kwargs): self._dirty_hash = True super(self.__class__, self).__setslice__( *args, **kwargs) class Cache(object): """ Class to cache values which will be stored until the result of an ID function changes. """ def __init__(self, id_function, force_immutable=False): """ Create a cache object. Parameters ------------ id_function : function Returns hashable value force_immutable : bool If set will make all numpy arrays read-only """ self._id_function = id_function # force stored numpy arrays to have flags.writable=False self.force_immutable = bool(force_immutable) # call the id function for initial value self.id_current = self._id_function() # a counter for locks self._lock = 0 # actuSal store for data self.cache = {} def delete(self, key): """ Remove a key from the cache. """ if key in self.cache: self.cache.pop(key, None) def verify(self): """ Verify that the cached values are still for the same value of id_function and delete all stored items if the value of id_function has changed. """ # if we are in a lock don't check anything if self._lock != 0: return # check the hash of our data id_new = self._id_function() # things changed if id_new != self.id_current: if len(self.cache) > 0: log.debug('%d items cleared from cache: %s', len(self.cache), str(list(self.cache.keys()))) # hash changed, so dump the cache # do it manually rather than calling clear() # as we are internal logic and can avoid function calls self.cache = {} # set the id to the new data hash self.id_current = id_new def clear(self, exclude=None): """ Remove elements in the cache. Parameters ----------- exclude : list List of keys in cache to not clear. 
""" if exclude is None: self.cache = {} else: self.cache = {k: v for k, v in self.cache.items() if k in exclude} def update(self, items): """ Update the cache with a set of key, value pairs without checking id_function. """ self.cache.update(items) if self.force_immutable: for k, v in self.cache.items(): if hasattr(v, 'flags') and len(v.shape) > 0: v.flags.writeable = False self.id_set() def id_set(self): """ Set the current ID to the value of the ID function. """ self.id_current = self._id_function() def __getitem__(self, key): """ Get an item from the cache. If the item is not in the cache, it will return None Parameters ------------- key : hashable Key in dict Returns ------------- cached : object, or None Object that was stored """ self.verify() if key in self.cache: return self.cache[key] return None def __setitem__(self, key, value): """ Add an item to the cache. Parameters ------------ key : hashable Key to reference value value : any Value to store in cache """ # dumpy cache if ID function has changed self.verify() # make numpy arrays read-only if asked to if self.force_immutable and hasattr(value, 'flags') and len(value.shape) > 0: value.flags.writeable = False # assign data to dict self.cache[key] = value return value def __contains__(self, key): self.verify() return key in self.cache def __len__(self): self.verify() return len(self.cache) def __enter__(self): self._lock += 1 def __exit__(self, *args): self._lock -= 1 self.id_current = self._id_function() class DiskCache(object): """ Store results of expensive operations on disk with an option to expire the results. This is used to cache the multi-gigabyte test corpuses in `tests/corpus.py` """ def __init__(self, path, expire_days=30): """ Create a cache on disk for storing expensive results. Parameters -------------- path : str A writeable location on the current file path. expire_days : int or float How old should results be considered expired. """ # store how old we allow results to be self.expire_days = expire_days # store the location for saving results self.path = os.path.abspath( os.path.expanduser(path)) # make sure the specified path exists os.makedirs(self.path, exist_ok=True) def get(self, key, fetch): """ Get a key from the cache or run a calculation. Parameters ----------- key : str Key to reference item with fetch : function If key isn't stored and recent run this function and store its result on disk. """ # hash the key so we have a fixed length string key_hash = _sha256(key.encode('utf-8')).hexdigest() # full path of result on local disk path = os.path.join(self.path, key_hash) # check to see if we can use the cache if os.path.isfile(path): # compute the age of the existing file in days age_days = (time.time() - os.stat(path).st_mtime) / 86400.0 if age_days < self.expire_days: # this nested condition means that # the file both exists and is recent # enough, so just return its contents with open(path, 'rb') as f: return f.read() log.debug('not in cache fetching: `{}`'.format(key)) # since we made it here our data isn't cached # run the expensive function to fetch the file raw = fetch() # write the data so we can save it with open(path, 'wb') as f: f.write(raw) # return the data return raw class DataStore(Mapping): """ A class to store multiple numpy arrays and track them all for changes. 
class DataStore(Mapping):
    """
    A class to store multiple numpy arrays and track
    them all for changes.

    Operates like a dict that only stores numpy.ndarray.
    """

    def __init__(self):
        self.data = {}

    def __iter__(self):
        return iter(self.data)

    def pop(self, key):
        return self.data.pop(key, None)

    def __delitem__(self, key):
        self.data.pop(key, None)

    @property
    def mutable(self):
        """
        Is data allowed to be altered or not.

        Returns
        -----------
        is_mutable : bool
          Can data be altered in the DataStore
        """
        return getattr(self, '_mutable', True)

    @mutable.setter
    def mutable(self, value):
        """
        Is data allowed to be altered or not.

        Parameters
        ------------
        is_mutable : bool
          Should data be allowed to be altered
        """
        # make sure the passed value is a bool
        is_mutable = bool(value)
        # apply the flag to any data stored
        for n, i in self.data.items():
            i.mutable = value
        # save the mutable setting
        self._mutable = is_mutable

    def is_empty(self):
        """
        Is the current DataStore empty or not.

        Returns
        ----------
        empty : bool
          False if there are items in the DataStore
        """
        if len(self.data) == 0:
            return True
        for v in self.data.values():
            if is_sequence(v):
                if len(v) == 0:
                    return True
                else:
                    return False
            elif bool(np.isreal(v)):
                return False
        return True

    def clear(self):
        """
        Remove all data from the DataStore.
        """
        self.data = {}

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, data):
        """
        Store an item in the DataStore.
        """
        # we shouldn't allow setting on immutable datastores
        if not self.mutable:
            raise ValueError('DataStore is configured immutable!')

        if hasattr(data, 'hash'):
            # don't bother to re-track a TrackedArray
            tracked = data
        else:
            # otherwise wrap the data
            tracked = tracked_array(data)

        # apply our mutability setting only if it was explicitly set
        if hasattr(self, '_mutable'):
            tracked.mutable = self.mutable

        # store the data
        self.data[key] = tracked

    def __contains__(self, key):
        return key in self.data

    def __len__(self):
        return len(self.data)

    def update(self, values):
        if not isinstance(values, dict):
            raise ValueError('Update only implemented for dicts')
        for key, value in values.items():
            self[key] = value

    def __hash__(self):
        """
        Get a hash reflecting everything in the DataStore.

        Returns
        ----------
        hash : int
          Hash of the stored data
        """
        return hash_fast(np.array(
            [hash(v) for v in self.data.values()],
            dtype=np.int64).tobytes())

    def crc(self):
        """
        Get a CRC reflecting everything in the DataStore.

        Returns
        ----------
        crc : int
          CRC of data
        """
        warnings.warn(
            '`array.crc()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()

    def fast_hash(self):
        """
        Get a CRC32 or xxhash.xxh64 reflecting the DataStore.

        Returns
        ------------
        hashed : int
          Checksum of data
        """
        warnings.warn(
            '`array.fast_hash()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()

    def hash(self):
        warnings.warn(
            '`array.hash()` is deprecated and will ' +
            'be removed in October 2023: replace ' +
            'with `array.__hash__()` or `hash(array)`',
            DeprecationWarning)
        return self.__hash__()
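
# Example (illustrative sketch): a DataStore wraps everything assigned to it
# in a TrackedArray and hashes the whole collection, so it can serve as the
# `id_function` of a `Cache`. Keys and values below are made up.
#
#   d = DataStore()
#   d['vertices'] = np.random.random((10, 3))   # stored as a TrackedArray
#   d['faces'] = [[0, 1, 2], [1, 2, 3]]         # converted and tracked
#   before = hash(d)
#   d['vertices'][0] += 1.0                     # in-place edit marks it dirty
#   hash(d)                                     # aggregate hash changes
#
#   d.mutable = False                           # make stored arrays read-only
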