"""
intervaltree: A mutable, self-balancing interval tree for Python 2 and 3.
Queries may be by point, by range overlap, or by range envelopment.

Core logic.

Copyright 2013-2015 Chaim-Leib Halbert
Modifications Copyright 2014 Konstantin Tretyakov

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from .interval import Interval
from .node import Node
from numbers import Number
import collections
from pprint import pprint
from sortedcontainers import SortedDict
from warnings import warn

try:
    # ``collections.MutableSet`` was removed in Python 3.10; the ABCs live
    # in ``collections.abc`` on every Python 3 release.
    from collections.abc import MutableSet
except ImportError:  # Python 2
    from collections import MutableSet

try:
    xrange  # Python 2?
except NameError:
    xrange = range


class IntervalTree(MutableSet):
    """
    A binary lookup tree of intervals.
    The intervals contained in the tree are represented using ``Interval(a, b, data)`` objects.
    Each such object represents a half-open interval ``[a, b)`` with optional data.

    Examples:
    ---------

    Initialize a blank tree::

        >>> tree = IntervalTree()
        >>> tree
        IntervalTree()

    Initialize a tree from an iterable set of Intervals in O(n * log n)::

        >>> tree = IntervalTree([Interval(-10, 10), Interval(-20.0, -10.0)])
        >>> tree
        IntervalTree([Interval(-20.0, -10.0), Interval(-10, 10)])
        >>> len(tree)
        2

    Note that this is a set, i.e. repeated intervals are ignored. However,
    Intervals with different data fields are regarded as different::

        >>> tree = IntervalTree([Interval(-10, 10), Interval(-10, 10), Interval(-10, 10, "x")])
        >>> tree
        IntervalTree([Interval(-10, 10), Interval(-10, 10, 'x')])
        >>> len(tree)
        2

    Insertions::

        >>> tree = IntervalTree()
        >>> tree[0:1] = "data"
        >>> tree.add(Interval(10, 20))
        >>> tree.addi(19.9, 20)
        >>> tree
        IntervalTree([Interval(0, 1, 'data'), Interval(10, 20), Interval(19.9, 20)])
        >>> tree.update([Interval(19.9, 20.1), Interval(20.1, 30)])
        >>> len(tree)
        5

    Inserting the same Interval twice does nothing::

        >>> tree = IntervalTree()
        >>> tree[-10:20] = "arbitrary data"
        >>> tree[-10:20] = None  # Note that this is also an insertion
        >>> tree
        IntervalTree([Interval(-10, 20), Interval(-10, 20, 'arbitrary data')])
        >>> tree[-10:20] = None  # This won't change anything
        >>> tree[-10:20] = "arbitrary data" # Neither will this
        >>> len(tree)
        2

    Deletions::

        >>> tree = IntervalTree(Interval(b, e) for b, e in [(-10, 10), (-20, -10), (10, 20)])
        >>> tree
        IntervalTree([Interval(-20, -10), Interval(-10, 10), Interval(10, 20)])
        >>> tree.remove(Interval(-10, 10))
        >>> tree
        IntervalTree([Interval(-20, -10), Interval(10, 20)])
        >>> tree.remove(Interval(-10, 10))
        Traceback (most recent call last):
        ...
        ValueError
        >>> tree.discard(Interval(-10, 10))  # Same as remove, but no exception on failure
        >>> tree
        IntervalTree([Interval(-20, -10), Interval(10, 20)])

    Delete intervals, overlapping a given point::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> tree.remove_overlap(1.1)
        >>> tree
        IntervalTree([Interval(-1.1, 1.1)])

    Delete intervals, overlapping an interval::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> tree.remove_overlap(0, 0.5)
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])
        >>> tree.remove_overlap(1.7, 1.8)
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])
        >>> tree.remove_overlap(1.6, 1.6)  # Null interval does nothing
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])
        >>> tree.remove_overlap(1.6, 1.5)  # Ditto
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])

    Delete intervals, enveloped in the range::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> tree.remove_envelop(-1.0, 1.5)
        >>> tree
        IntervalTree([Interval(-1.1, 1.1), Interval(0.5, 1.7)])
        >>> tree.remove_envelop(-1.1, 1.5)
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])
        >>> tree.remove_envelop(0.5, 1.5)
        >>> tree
        IntervalTree([Interval(0.5, 1.7)])
        >>> tree.remove_envelop(0.5, 1.7)
        >>> tree
        IntervalTree()

    Point/interval overlap queries::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> assert tree[-1.1]         == set([Interval(-1.1, 1.1)])
        >>> assert tree.search(1.1)   == set([Interval(-0.5, 1.5), Interval(0.5, 1.7)])   # Same as tree[1.1]
        >>> assert tree[-0.5:0.5]     == set([Interval(-0.5, 1.5), Interval(-1.1, 1.1)])  # Interval overlap query
        >>> assert tree.search(1.5, 1.5) == set()                                         # Same as tree[1.5:1.5]
        >>> assert tree.search(1.5) == set([Interval(0.5, 1.7)])                          # Same as tree[1.5]
        >>> assert tree.search(1.7, 1.8) == set()

    Envelop queries::

        >>> assert tree.search(-0.5, 0.5, strict=True) == set()
        >>> assert tree.search(-0.4, 1.7, strict=True) == set([Interval(0.5, 1.7)])

    Membership queries::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> Interval(-0.5, 0.5) in tree
        False
        >>> Interval(-1.1, 1.1) in tree
        True
        >>> Interval(-1.1, 1.1, "x") in tree
        False
        >>> tree.overlaps(-1.1)
        True
        >>> tree.overlaps(1.7)
        False
        >>> tree.overlaps(1.7, 1.8)
        False
        >>> tree.overlaps(-1.2, -1.1)
        False
        >>> tree.overlaps(-1.2, -1.0)
        True

    Sizing::

        >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)])
        >>> len(tree)
        3
        >>> tree.is_empty()
        False
        >>> IntervalTree().is_empty()
        True
        >>> not tree
        False
        >>> not IntervalTree()
        True
        >>> print(tree.begin())    # using print() because of floats in Python 2.6
        -1.1
        >>> print(tree.end())      # ditto
        1.7

    Iteration::

        >>> tree = IntervalTree([Interval(-11, 11), Interval(-5, 15), Interval(5, 17)])
        >>> [iv.begin for iv in sorted(tree)]
        [-11, -5, 5]
        >>> assert tree.items() == set([Interval(-5, 15), Interval(-11, 11), Interval(5, 17)])

    Copy- and typecasting, pickling::

        >>> tree0 = IntervalTree([Interval(0, 1, "x"), Interval(1, 2, ["x"])])
        >>> tree1 = IntervalTree(tree0)  # Shares Interval objects
        >>> tree2 = tree0.copy()         # Shallow copy (same as above, as Intervals are singletons)
        >>> import pickle
        >>> tree3 = pickle.loads(pickle.dumps(tree0))  # Deep copy
        >>> list(tree0[1])[0].data[0] = "y"  # affects shallow copies, but not deep copies
        >>> tree0
        IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])])
        >>> tree1
        IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])])
        >>> tree2
        IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])])
        >>> tree3
        IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['x'])])

    Equality testing::

        >>> IntervalTree([Interval(0, 1)]) == IntervalTree([Interval(0, 1)])
        True
        >>> IntervalTree([Interval(0, 1)]) == IntervalTree([Interval(0, 1, "x")])
        False
    """
    @classmethod
    def from_tuples(cls, tups):
        """
        Create a new IntervalTree from an iterable of 2- or 3-tuples,
        where the tuple lists begin, end, and optionally data.

        The tree is built through ``cls``, so calling this on a subclass
        yields an instance of that subclass.
        """
        ivs = [Interval(*t) for t in tups]
        return cls(ivs)

    def __init__(self, intervals=None):
        """
        Set up a tree. If intervals is provided, add all the intervals
        to the tree.

        Raises ValueError if any of the given intervals is null.

        Completes in O(n*log n) time.
        """
        intervals = set(intervals) if intervals is not None else set()
        for iv in intervals:
            if iv.is_null():
                raise ValueError(
                    "IntervalTree: Null Interval objects not allowed in IntervalTree:"
                    " {0}".format(iv)
                )
        self.all_intervals = intervals
        self.top_node = Node.from_intervals(self.all_intervals)
        self.boundary_table = SortedDict()
        for iv in self.all_intervals:
            self._add_boundaries(iv)

    def copy(self):
        """
        Construct a new IntervalTree using shallow copies of the
        intervals in the source tree.

        Completes in O(n*log n) time.
        :rtype: IntervalTree
        """
        return IntervalTree(iv.copy() for iv in self)

    def _add_boundaries(self, interval):
        """
        Records the boundaries of the interval in the boundary table.

        The table maps each boundary value to the number of interval
        endpoints that sit on it.
        """
        begin = interval.begin
        end = interval.end
        if begin in self.boundary_table:
            self.boundary_table[begin] += 1
        else:
            self.boundary_table[begin] = 1

        if end in self.boundary_table:
            self.boundary_table[end] += 1
        else:
            self.boundary_table[end] = 1

    def _remove_boundaries(self, interval):
        """
        Removes the boundaries of the interval from the boundary table.

        A boundary entry is deleted once its endpoint count would drop
        to zero.
        """
        begin = interval.begin
        end = interval.end
        if self.boundary_table[begin] == 1:
            del self.boundary_table[begin]
        else:
            self.boundary_table[begin] -= 1

        if self.boundary_table[end] == 1:
            del self.boundary_table[end]
        else:
            self.boundary_table[end] -= 1

    def add(self, interval):
        """
        Adds an interval to the tree, if not already present.

        Raises ValueError if the interval is null.

        Completes in O(log n) time.
        """
        if interval in self:
            return

        if interval.is_null():
            raise ValueError(
                "IntervalTree: Null Interval objects not allowed in IntervalTree:"
                " {0}".format(interval)
            )

        if not self.top_node:
            self.top_node = Node.from_interval(interval)
        else:
            self.top_node = self.top_node.add(interval)
        self.all_intervals.add(interval)
        self._add_boundaries(interval)
    append = add

    def addi(self, begin, end, data=None):
        """
        Shortcut for add(Interval(begin, end, data)).

        Completes in O(log n) time.
        """
        return self.add(Interval(begin, end, data))
    appendi = addi

    def update(self, intervals):
        """
        Given an iterable of intervals, add them to the tree.

        Completes in O(m*log(n+m)), where m = number of intervals to
        add.
        """
        for iv in intervals:
            self.add(iv)

    def extend(self, intervals):
        """
        Deprecated: Replaced by update().
        """
        warn("IntervalTree.extend() has been deprecated. Consider using update() instead", DeprecationWarning)
        self.update(intervals)

    def remove(self, interval):
        """
        Removes an interval from the tree, if present. If not, raises
        ValueError.

        Completes in O(log n) time.
        """
        if interval not in self:
            raise ValueError
        self.top_node = self.top_node.remove(interval)
        self.all_intervals.remove(interval)
        self._remove_boundaries(interval)

    def removei(self, begin, end, data=None):
        """
        Shortcut for remove(Interval(begin, end, data)).

        Completes in O(log n) time.
        """
        return self.remove(Interval(begin, end, data))

    def discard(self, interval):
        """
        Removes an interval from the tree, if present. If not, does
        nothing.

        Completes in O(log n) time.
        """
        if interval not in self:
            return
        self.all_intervals.discard(interval)
        self.top_node = self.top_node.discard(interval)
        self._remove_boundaries(interval)

    def discardi(self, begin, end, data=None):
        """
        Shortcut for discard(Interval(begin, end, data)).

        Completes in O(log n) time.
        """
        return self.discard(Interval(begin, end, data))

    def difference(self, other):
        """
        Returns a new tree, comprising all intervals in self but not
        in other.
        """
        ivs = set()
        for iv in self:
            if iv not in other:
                ivs.add(iv)
        return IntervalTree(ivs)

    def difference_update(self, other):
        """
        Removes all intervals in other from self.
        """
        for iv in other:
            self.discard(iv)

    def union(self, other):
        """
        Returns a new tree, comprising all intervals from self
        and other.
        """
        return IntervalTree(set(self).union(other))

    def intersection(self, other):
        """
        Returns a new tree of all intervals common to both self and
        other.
        """
        ivs = set()
        # Walk the smaller collection and probe the larger one.
        shorter, longer = sorted([self, other], key=len)
        for iv in shorter:
            if iv in longer:
                ivs.add(iv)
        return IntervalTree(ivs)

    def intersection_update(self, other):
        """
        Removes intervals from self unless they also exist in other.
        """
        # Iterate over a snapshot: remove() mutates the underlying set,
        # which must not change size while it is being iterated.
        for iv in list(self):
            if iv not in other:
                self.remove(iv)

    def symmetric_difference(self, other):
        """
        Return a tree with elements only in self or other but not
        both.
        """
        if not isinstance(other, set):
            other = set(other)
        me = set(self)
        return IntervalTree(me.symmetric_difference(other))

    def symmetric_difference_update(self, other):
        """
        Throws out all intervals except those only in self or other,
        not both.
        """
        other = set(other)
        # Iterate over a snapshot: remove() mutates the underlying set,
        # which must not change size while it is being iterated.
        for iv in list(self):
            if iv in other:
                self.remove(iv)
                other.remove(iv)
        self.update(other)

    def remove_overlap(self, begin, end=None):
        """
        Removes all intervals overlapping the given point or range.

        Completes in O((r+m)*log n) time, where:
          * n = size of the tree
          * m = number of matches
          * r = size of the search range (this is 1 for a point)
        """
        hitlist = self.search(begin, end)
        for iv in hitlist:
            self.remove(iv)

    def remove_envelop(self, begin, end):
        """
        Removes all intervals completely enveloped in the given range.

        Completes in O((r+m)*log n) time, where:
          * n = size of the tree
          * m = number of matches
          * r = size of the search range (this is 1 for a point)
        """
        hitlist = self.search(begin, end, strict=True)
        for iv in hitlist:
            self.remove(iv)

    def chop(self, begin, end, datafunc=None):
        """
        Like remove_envelop(), but trims back Intervals hanging into
        the chopped area so that nothing overlaps.

        If specified, datafunc(interval, islower) is called to produce
        the data field of each trimmed remainder; islower is True for
        the piece left of begin and False for the piece right of end.
        """
        insertions = set()
        begin_hits = [iv for iv in self[begin] if iv.begin < begin]
        end_hits = [iv for iv in self[end] if iv.end > end]

        if datafunc:
            for iv in begin_hits:
                insertions.add(Interval(iv.begin, begin, datafunc(iv, True)))
            for iv in end_hits:
                insertions.add(Interval(end, iv.end, datafunc(iv, False)))
        else:
            for iv in begin_hits:
                insertions.add(Interval(iv.begin, begin, iv.data))
            for iv in end_hits:
                insertions.add(Interval(end, iv.end, iv.data))

        self.remove_envelop(begin, end)
        self.difference_update(begin_hits)
        self.difference_update(end_hits)
        self.update(insertions)

    def slice(self, point, datafunc=None):
        """
        Split Intervals that overlap point into two new Intervals. If
        specified, uses datafunc(interval, islower=True/False) to
        set the data field of the new Intervals.
        :param point: where to slice
        :param datafunc(interval, islower): callable returning a new
        value for the interval's data field
        """
        # Intervals that merely begin at point need no splitting.
        hitlist = set(iv for iv in self[point] if iv.begin < point)
        insertions = set()
        if datafunc:
            for iv in hitlist:
                insertions.add(Interval(iv.begin, point, datafunc(iv, True)))
                insertions.add(Interval(point, iv.end, datafunc(iv, False)))
        else:
            for iv in hitlist:
                insertions.add(Interval(iv.begin, point, iv.data))
                insertions.add(Interval(point, iv.end, iv.data))
        self.difference_update(hitlist)
        self.update(insertions)

    def clear(self):
        """
        Empties the tree.

        Completes in O(1) time.
        """
        self.__init__()

    def find_nested(self):
        """
        Returns a dictionary mapping parent intervals to sets of
        intervals overlapped by and contained in the parent.

        Completes in O(n^2) time.
        :rtype: dict of [Interval, set of Interval]
        """
        result = {}

        def add_if_nested():
            # Reads ``parent`` and ``child`` from the enclosing loops.
            if parent.contains_interval(child):
                if parent not in result:
                    result[parent] = set()
                result[parent].add(child)

        # Longest first, so each interval is only tested against the
        # intervals that are no longer than itself.
        long_ivs = sorted(self.all_intervals, key=Interval.length, reverse=True)
        for i, parent in enumerate(long_ivs):
            for child in long_ivs[i + 1:]:
                add_if_nested()
        return result

    def overlaps(self, begin, end=None):
        """
        Returns whether some interval in the tree overlaps the given
        point or range.

        With end omitted, begin may be either a Number (point query) or
        an object exposing ``begin`` and ``end`` attributes (range
        query).

        Completes in O(r*log n) time, where r is the size of the
        search range.
        :rtype: bool
        """
        if end is not None:
            return self.overlaps_range(begin, end)
        elif isinstance(begin, Number):
            return self.overlaps_point(begin)
        else:
            return self.overlaps_range(begin.begin, begin.end)

    def overlaps_point(self, p):
        """
        Returns whether some interval in the tree overlaps p.

        Completes in O(log n) time.
        :rtype: bool
        """
        if self.is_empty():
            return False
        return bool(self.top_node.contains_point(p))

    def overlaps_range(self, begin, end):
        """
        Returns whether some interval in the tree overlaps the given
        range.

        Completes in O(r*log n) time, where r is the range length and n
        is the table size.
        :rtype: bool
        """
        if self.is_empty():
            return False
        elif self.overlaps_point(begin):
            return True
        return any(
            self.overlaps_point(bound)
            for bound in self.boundary_table
            if begin <= bound < end
        )

    def split_overlaps(self):
        """
        Finds all intervals with overlapping ranges and splits them
        along the range boundaries.

        Completes in worst-case O(n^2*log n) time (many interval
        boundaries are inside many intervals), best-case O(n*log n)
        time (small number of overlaps << n per interval).
        """
        if not self:
            return
        if len(self.boundary_table) == 2:
            return

        bounds = sorted(self.boundary_table)  # get bound locations

        new_ivs = set()
        for lbound, ubound in zip(bounds[:-1], bounds[1:]):
            for iv in self[lbound]:
                new_ivs.add(Interval(lbound, ubound, iv.data))

        self.__init__(new_ivs)

    def items(self):
        """
        Constructs and returns a set of all intervals in the tree.

        Completes in O(n) time.
        :rtype: set of Interval
        """
        return set(self.all_intervals)

    def is_empty(self):
        """
        Returns whether the tree is empty.

        Completes in O(1) time.
        :rtype: bool
        """
        return 0 == len(self)

    def search(self, begin, end=None, strict=False):
        """
        Returns a set of all intervals overlapping the given range. Or,
        if strict is True, returns the set of all intervals fully
        contained in the range [begin, end].

        With end omitted, begin may be either a point or an object
        exposing ``begin`` and ``end`` attributes.

        Completes in O(m + k*log n) time, where:
          * n = size of the tree
          * m = number of matches
          * k = size of the search range (this is 1 for a point)
        :rtype: set of Interval
        """
        root = self.top_node
        if not root:
            return set()
        if end is None:
            # Only the attribute probe is guarded, so that errors raised
            # by the actual range search are not silently swallowed.
            try:
                range_begin, range_end = begin.begin, begin.end
            except AttributeError:
                return root.search_point(begin, set())
            return self.search(range_begin, range_end, strict=strict)
        elif begin >= end:
            return set()
        else:
            result = root.search_point(begin, set())

            boundary_table = self.boundary_table
            bound_begin = boundary_table.bisect_left(begin)
            bound_end = boundary_table.bisect_left(end)  # exclude final end bound
            result.update(root.search_overlap(
                # slice notation is slightly slower
                boundary_table.iloc[index] for index in xrange(bound_begin, bound_end)
            ))

            # TODO: improve strict search to use node info instead of less-efficient filtering
            if strict:
                result = set(
                    iv for iv in result
                    if iv.begin >= begin and iv.end <= end
                )
            return result

    def begin(self):
        """
        Returns the lower bound of the first interval in the tree, or
        0 if the tree is empty.

        Completes in O(n) time.
        :rtype: Number
        """
        if not self.boundary_table:
            return 0
        return min(self.boundary_table)

    def end(self):
        """
        Returns the upper bound of the last interval in the tree, or
        0 if the tree is empty.

        Completes in O(n) time.
        :rtype: Number
        """
        if not self.boundary_table:
            return 0
        return max(self.boundary_table)

    def print_structure(self, tostring=False):
        """
        ## FOR DEBUGGING ONLY ##
        Pretty-prints the structure of the tree.
        If tostring is true, prints nothing and returns a string.
        :rtype: None or str
        """
        if self.top_node:
            return self.top_node.print_structure(tostring=tostring)
        else:
            result = "<empty IntervalTree>"
            if not tostring:
                print(result)
            else:
                return result

    def verify(self):
        """
        ## FOR DEBUGGING ONLY ##
        Checks the table to ensure that the invariants are held.

        Raises AssertionError on the first violated invariant.
        """
        if self.all_intervals:
            ## top_node.all_children() == self.all_intervals
            try:
                assert self.top_node.all_children() == self.all_intervals
            except AssertionError as e:
                print(
                    'Error: the tree and the membership set are out of sync!'
                )
                tivs = set(self.top_node.all_children())
                print('top_node.all_children() - all_intervals:')
                pprint(tivs - self.all_intervals)
                print('all_intervals - top_node.all_children():')
                pprint(self.all_intervals - tivs)
                raise e

            ## All members are Intervals
            for iv in self:
                assert isinstance(iv, Interval), (
                    "Error: Only Interval objects allowed in IntervalTree:"
                    " {0}".format(iv)
                )

            ## No null intervals
            for iv in self:
                assert not iv.is_null(), (
                    "Error: Null Interval objects not allowed in IntervalTree:"
                    " {0}".format(iv)
                )

            ## Reconstruct boundary_table
            bound_check = {}
            for iv in self:
                if iv.begin in bound_check:
                    bound_check[iv.begin] += 1
                else:
                    bound_check[iv.begin] = 1
                if iv.end in bound_check:
                    bound_check[iv.end] += 1
                else:
                    bound_check[iv.end] = 1

            ## Reconstructed boundary table (bound_check) ==? boundary_table
            assert set(self.boundary_table.keys()) == set(bound_check.keys()),\
                'Error: boundary_table is out of sync with ' \
                'the intervals in the tree!'

            # For efficiency reasons this should be iteritems in Py2, but we
            # don't care much for efficiency in debug methods anyway.
            for key, val in self.boundary_table.items():
                assert bound_check[key] == val, \
                    'Error: boundary_table[{0}] should be {1},' \
                    ' but is {2}!'.format(
                        key, bound_check[key], val)

            ## Internal tree structure
            self.top_node.verify(set())
        else:
            ## Verify empty tree
            assert not self.boundary_table, \
                "Error: boundary table should be empty!"
            assert self.top_node is None, \
                "Error: top_node isn't None!"

    def score(self, full_report=False):
        """
        Returns a number between 0 and 1, indicating how suboptimal the tree
        is. The lower, the better. Roughly, this number represents the
        fraction of flawed Intervals in the tree.

        If full_report is true, returns instead a dict holding the
        individual "depth" and "s_center" scores together with their
        maximum under the "_cumulative" key.
        :rtype: float
        """
        if len(self) <= 2:
            return 0.0

        n = len(self)
        m = self.top_node.count_nodes()

        def s_center_score():
            """
            Returns a normalized score, indicating roughly how many times
            intervals share s_center with other intervals. Output is full-scale
            from 0 to 1.
            :rtype: float
            """
            raw = n - m
            maximum = n - 1
            return raw / float(maximum)

        report = {
            "depth": self.top_node.depth_score(n, m),
            "s_center": s_center_score(),
        }
        cumulative = max(report.values())
        report["_cumulative"] = cumulative
        if full_report:
            return report
        return cumulative

    def __getitem__(self, index):
        """
        Returns a set of all intervals overlapping the given index or
        slice.

        An open-ended slice is clamped to the tree's own begin()/end();
        ``tree[:]`` returns every interval.

        Completes in O(k * log(n) + m) time, where:
          * n = size of the tree
          * m = number of matches
          * k = size of the search range (this is 1 for a point)
        :rtype: set of Interval
        """
        try:
            start, stop = index.start, index.stop
            if start is None:
                start = self.begin()
                if stop is None:
                    return set(self)
            if stop is None:
                stop = self.end()
            return self.search(start, stop)
        except AttributeError:
            # Not a slice: treat index as a point (or Interval-like).
            return self.search(index)

    def __setitem__(self, index, value):
        """
        Adds a new interval to the tree. A shortcut for
        add(Interval(index.start, index.stop, value)).

        If an identical Interval object with equal range and data
        already exists, does nothing.

        Completes in O(log n) time.
        """
        self.addi(index.start, index.stop, value)

    def __delitem__(self, point):
        """
        Delete all items overlapping point.
        """
        self.remove_overlap(point)

    def __contains__(self, item):
        """
        Returns whether item exists as an Interval in the tree.
        This method only returns True for exact matches; for
        overlaps, see the overlaps() method.

        Completes in O(1) time.
        :rtype: bool
        """
        # Point-checking is deliberately not supported here; it might
        # trick the user into thinking that it is O(1), which
        # point-checking isn't.
        return item in self.all_intervals

    def containsi(self, begin, end, data=None):
        """
        Shortcut for (Interval(begin, end, data) in tree).

        Completes in O(1) time.
        :rtype: bool
        """
        return Interval(begin, end, data) in self

    def __iter__(self):
        """
        Returns an iterator over all the intervals in the tree.

        Completes in O(1) time.
        :rtype: collections.Iterable[Interval]
        """
        return self.all_intervals.__iter__()
    iter = __iter__

    def __len__(self):
        """
        Returns how many intervals are in the tree.

        Completes in O(1) time.
        :rtype: int
        """
        return len(self.all_intervals)

    def __eq__(self, other):
        """
        Whether two IntervalTrees are equal.

        Completes in O(n) time if sizes are equal; O(1) time otherwise.
        :rtype: bool
        """
        return (
            isinstance(other, IntervalTree) and
            self.all_intervals == other.all_intervals
        )

    def __repr__(self):
        """
        :rtype: str
        """
        ivs = sorted(self)
        if not ivs:
            return "IntervalTree()"
        else:
            return "IntervalTree({0})".format(ivs)

    __str__ = __repr__

    def __reduce__(self):
        """
        For pickle-ing.
        :rtype: tuple
        """
        return IntervalTree, (sorted(self.all_intervals),)