""" Fast cryptographic hash of Python objects, with a special case for fast hashing of numpy arrays. """ # Author: Gael Varoquaux # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import warnings import pickle import hashlib import sys import types import struct from ._compat import _bytes_or_unicode import io PY3 = sys.version[0] == '3' if PY3: Pickler = pickle._Pickler else: Pickler = pickle.Pickler class _ConsistentSet(object): """ Class used to ensure the hash of Sets is preserved whatever the order of its items. """ def __init__(self, set_sequence): self._sequence = sorted(set_sequence) class _MyHash(object): """ Class used to hash objects that won't normally pickle """ def __init__(self, *args): self.args = args class Hasher(Pickler): """ A subclass of pickler, to do cryptographic hashing, rather than pickling. """ def __init__(self, hash_name='md5'): self.stream = io.BytesIO() # By default we want a pickle protocol that only changes with # the major python version and not the minor one protocol = (pickle.DEFAULT_PROTOCOL if PY3 else pickle.HIGHEST_PROTOCOL) Pickler.__init__(self, self.stream, protocol=protocol) # Initialise the hash obj self._hash = hashlib.new(hash_name) def hash(self, obj, return_digest=True): try: self.dump(obj) except pickle.PicklingError as e: warnings.warn('PicklingError while hashing %r: %r' % (obj, e)) dumps = self.stream.getvalue() self._hash.update(dumps) if return_digest: return self._hash.hexdigest() def save(self, obj): if isinstance(obj, (types.MethodType, type({}.pop))): # the Pickler cannot pickle instance methods; here we decompose # them into components that make them uniquely identifiable if hasattr(obj, '__func__'): func_name = obj.__func__.__name__ else: func_name = obj.__name__ inst = obj.__self__ if type(inst) == type(pickle): obj = _MyHash(func_name, inst.__name__) elif inst is None: # type(None) or type(module) do not pickle obj = _MyHash(func_name, inst) else: cls = obj.__self__.__class__ obj = _MyHash(func_name, inst, cls) Pickler.save(self, obj) def memoize(self, obj): # We want hashing to be sensitive to value instead of reference. # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] # to hash to the same value and that's why we disable memoization # for strings if isinstance(obj, _bytes_or_unicode): return Pickler.memoize(self, obj) # The dispatch table of the pickler is not accessible in Python # 3, as these lines are only bugware for IPython, we skip them. def save_global(self, obj, name=None, pack=struct.pack): # We have to override this method in order to deal with objects # defined interactively in IPython that are not injected in # __main__ kwargs = dict(name=name, pack=pack) if sys.version_info >= (3, 4): del kwargs['pack'] try: Pickler.save_global(self, obj, **kwargs) except pickle.PicklingError: Pickler.save_global(self, obj, **kwargs) module = getattr(obj, "__module__", None) if module == '__main__': my_name = name if my_name is None: my_name = obj.__name__ mod = sys.modules[module] if not hasattr(mod, my_name): # IPython doesn't inject the variables define # interactively in __main__ setattr(mod, my_name, obj) dispatch = Pickler.dispatch.copy() # builtin dispatch[type(len)] = save_global # type dispatch[type(object)] = save_global # classobj dispatch[type(Pickler)] = save_global # function dispatch[type(pickle.dump)] = save_global def _batch_setitems(self, items): # forces order of keys in dict to ensure consistent hash Pickler._batch_setitems(self, iter(sorted(items))) def save_set(self, set_items): # forces order of items in Set to ensure consistent hash Pickler.save(self, _ConsistentSet(set_items)) dispatch[type(set())] = save_set class NumpyHasher(Hasher): """ Special case the hasher for when numpy is loaded. """ def __init__(self, hash_name='md5', coerce_mmap=False): """ Parameters ---------- hash_name: string The hash algorithm to be used coerce_mmap: boolean Make no difference between np.memmap and np.ndarray objects. """ self.coerce_mmap = coerce_mmap Hasher.__init__(self, hash_name=hash_name) # delayed import of numpy, to avoid tight coupling import numpy as np self.np = np if hasattr(np, 'getbuffer'): self._getbuffer = np.getbuffer else: self._getbuffer = memoryview def save(self, obj): """ Subclass the save method, to hash ndarray subclass, rather than pickling them. Off course, this is a total abuse of the Pickler class. """ if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject: # Compute a hash of the object: try: # memoryview is not supported for some dtypes, # e.g. datetime64, see # https://github.com/numpy/numpy/issues/4983. The # workaround is to view the array as bytes before # taking the memoryview obj_bytes_view = obj.view(self.np.uint8) self._hash.update(self._getbuffer(obj_bytes_view)) # ValueError is raised by .view when the array is not contiguous # BufferError is raised by Python 3 in the hash update if # the array is Fortran rather than C contiguous except (ValueError, BufferError): # Cater for non-single-segment arrays: this creates a # copy, and thus aleviates this issue. # XXX: There might be a more efficient way of doing this obj_bytes_view = obj.flatten().view(self.np.uint8) self._hash.update(self._getbuffer(obj_bytes_view)) # We store the class, to be able to distinguish between # Objects with the same binary content, but different # classes. if self.coerce_mmap and isinstance(obj, self.np.memmap): # We don't make the difference between memmap and # normal ndarrays, to be able to reload previously # computed results with memmap. klass = self.np.ndarray else: klass = obj.__class__ # We also return the dtype and the shape, to distinguish # different views on the same data with different dtypes. # The object will be pickled by the pickler hashed at the end. obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides)) elif isinstance(obj, self.np.dtype): # Atomic dtype objects are interned by their default constructor: # np.dtype('f8') is np.dtype('f8') # This interning is not maintained by a # pickle.loads + pickle.dumps cycle, because __reduce__ # uses copy=True in the dtype constructor. This # non-deterministic behavior causes the internal memoizer # of the hasher to generate different hash values # depending on the history of the dtype object. # To prevent the hash from being sensitive to this, we use # .descr which is a full (and never interned) description of # the array dtype according to the numpy doc. klass = obj.__class__ obj = (klass, ('HASHED', obj.descr)) Hasher.save(self, obj) def hash(obj, hash_name='md5', coerce_mmap=False): """ Quick calculation of a hash to identify uniquely Python objects containing numpy arrays. Parameters ----------- hash_name: 'md5' or 'sha1' Hashing algorithm used. sha1 is supposedly safer, but md5 is faster. coerce_mmap: boolean Make no difference between np.memmap and np.ndarray """ if 'numpy' in sys.modules: hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) else: hasher = Hasher(hash_name=hash_name) return hasher.hash(obj)