# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Index.py

This module provides a way to create indexes to text files.

Classes:
Index     Dictionary-like class used to store index information.

_ShelveIndex    An Index class based on the shelve module.
_InMemoryIndex  An in-memory Index class.

"""
import os
import array
try:
    import cPickle
except ImportError:
    # Python 3 has no cPickle module; the plain pickle module is the
    # drop-in replacement.  The name cPickle is kept so that existing
    # references to it continue to work.
    import pickle as cPickle
import shelve


class _ShelveIndex(dict):
    """An index file wrapped around shelve.

    The shelf is opened on ``self.data``.  A freshly created shelf is
    stamped with a format version; an existing shelf has its version
    checked on open.

    NOTE(review): this class derives from dict, but the shelf lives on
    ``self.data`` and nothing visible here forwards item access to it, so
    items set through the dict interface appear to stay in memory only.
    Confirm whether that is intended before relying on persistence.
    """
    # Without a good dbm module installed, this is pretty slow and
    # generates large files.  When generating an index on a FASTA-
    # formatted file with 82000 sequences (37Mb), the
    # index 'dat' file is 42Mb and 'dir' file is 8Mb.

    __version = 2
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Open the shelf called indexname, creating it if necessary.

        Arguments:
        indexname -- base filename handed to shelve.open.
        truncate  -- if true, remove any existing database files and
                     start from an empty shelf.

        Raises IOError if an existing shelf carries no version stamp or
        a version different from the one this class writes.
        """
        dict.__init__(self)
        try:
            if truncate:
                # In python 1.52 and before, dumbdbm (under shelve)
                # doesn't clear the old database.
                for suffix in ('.dir', '.dat', '.bak'):
                    path = indexname + suffix
                    if os.path.exists(path):
                        os.unlink(path)
                # Jump to the handler below, which creates the new shelf.
                raise Exception("open a new shelf")
            self.data = shelve.open(indexname, flag='r')
        except Exception:
            # No usable database exists (or truncation was requested).
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not swallowed.
            self.data = shelve.open(indexname, flag='n')
            self.data[self.__version_key] = self.__version
        else:
            # Check to make sure the database is the correct version.
            version = self.data.get(self.__version_key, None)
            if version is None:
                raise IOError("Unrecognized index format")
            elif version != self.__version:
                raise IOError("Version %s doesn't match my version %s"
                              % (version, self.__version))

    def __del__(self):
        """Close the shelf, if __init__ got far enough to open one."""
        if 'data' in self.__dict__:
            self.data.close()


class _InMemoryIndex(dict):
    """This creates an in-memory index file.

    The index is an ordinary dict that is loaded from indexname when the
    object is created and written back to indexname by __del__ if any
    mutating method was called in between.
    """
    # File Format:
    # version
    # key value
    # [...]

    __version = 3
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Load the index stored in indexname, if that file exists.

        Arguments:
        indexname -- path of the index file.
        truncate  -- if true, delete an existing index file first.

        Raises IOError if the file's version line does not match the
        version this class writes.

        NOTE: entries are decoded with pickle, so only index files from
        a trusted source should be opened.
        """
        self._indexname = indexname
        dict.__init__(self)
        self.__changed = 0     # the index hasn't changed

        # Remove the database if truncate is true.
        if truncate and os.path.exists(indexname):
            os.unlink(indexname)
            self.__changed = 1

        # Load the database if it exists
        if os.path.exists(indexname):
            with open(indexname) as handle:
                version = self._toobj(handle.readline().rstrip())
                if version != self.__version:
                    raise IOError("Version %s doesn't match my version %s"
                                  % (version, self.__version))
                for line in handle:
                    key, value = line.split()
                    key, value = self._toobj(key), self._toobj(value)
                    self[key] = value
            # Loading went through __setitem__, which set the flag; the
            # in-memory state matches the file, so clear it again.
            self.__changed = 0

    def update(self, dict):
        """Merge the entries of dict into the index and mark it changed.

        The parameter name shadows the builtin and is kept only for
        backward compatibility, which is why the base implementation is
        reached through super() rather than through the name ``dict``.
        """
        self.__changed = 1
        super(_InMemoryIndex, self).update(dict)

    def __setitem__(self, key, value):
        """Store value under key and mark the index changed."""
        self.__changed = 1
        dict.__setitem__(self, key, value)

    def __delitem__(self, key):
        """Remove key and mark the index changed."""
        self.__changed = 1
        dict.__delitem__(self, key)

    def clear(self):
        """Remove every entry and mark the index changed."""
        self.__changed = 1
        dict.clear(self)

    def __del__(self):
        """Write the index back to its file if it was modified."""
        if not self.__changed:
            return
        with open(self._indexname, 'w') as handle:
            handle.write("%s\n" % self._tostr(self.__version))
            for key, value in self.items():
                handle.write("%s %s\n" %
                             (self._tostr(key), self._tostr(value)))
        # The file is up to date now; a repeated call must not rewrite it.
        self.__changed = 0

    def _tostr(self, obj):
        """Return obj encoded as a whitespace-free string."""
        # I need a representation of the object that's saveable to
        # a file that uses whitespace as delimiters.  Thus, I'm
        # going to pickle the object, and then convert each byte of
        # the pickle to its signed integer value.  Then, I'm going to
        # convert the integers into strings and join them together with
        # commas.  It's not the most efficient way of storing things,
        # but it's relatively fast.
        # Protocol 0 is requested explicitly: it is what a bare
        # cPickle.dumps(obj) produced on Python 2, so the file format
        # stays the same whichever interpreter writes it.
        s = cPickle.dumps(obj, 0)
        intlist = array.array('b', s)
        return ','.join(map(str, intlist))

    def _toobj(self, str):
        """Decode a string produced by _tostr back into an object."""
        intlist = array.array('b', [int(x) for x in str.split(',')])
        # Rebuild the raw pickle bytes straight from the array.  Going
        # through chr() fails for bytes >= 0x80, which are stored as
        # negative numbers in a signed-byte array.
        if hasattr(intlist, 'tobytes'):
            raw = intlist.tobytes()
        else:
            raw = intlist.tostring()    # older array API
        return cPickle.loads(raw)


Index = _InMemoryIndex