# Copyright 2013 by Michiel de Hoon. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """Implementation of frequency (count) matrices, position-weight matrices, and position-specific scoring matrices. """ import math from Bio._py3k import range from Bio.Seq import Seq from Bio.Alphabet import IUPAC #Hack for Python 2.5, isnan and isinf were new in Python 2.6 try: from math import isnan as _isnan except ImportError: def _isnan(value): #This is tricky due to cross platform float differences if str(value).lower() == "nan": return True return value != value try: from math import isinf as _isinf except ImportError: def _isinf(value): #This is tricky due to cross platform float differences if str(value).lower().endswith("inf"): return True return False #Hack for Python 2.5 on Windows: try: _nan = float("nan") except ValueError: _nan = 1e1000 / 1e1000 class GenericPositionMatrix(dict): def __init__(self, alphabet, values): self.length = None for letter in alphabet.letters: if self.length is None: self.length = len(values[letter]) elif self.length!=len(values[letter]): raise Exception("data has inconsistent lengths") self[letter] = list(values[letter]) self.alphabet = alphabet self._letters = sorted(self.alphabet.letters) def __str__(self): words = ["%6d" % i for i in range(self.length)] line = " " + " ".join(words) lines = [line] for letter in self._letters: words = ["%6.2f" % value for value in self[letter]] line = "%c: " % letter + " ".join(words) lines.append(line) text = "\n".join(lines) + "\n" return text def __getitem__(self, key): if isinstance(key, tuple): if len(key)==2: key1, key2 = key if isinstance(key1, slice): start1, stop1, stride1 = key1.indices(len(self._letters)) indices1 = range(start1, stop1, stride1) letters1 = [self._letters[i] for i in indices1] dim1 = 2 elif isinstance(key1, int): letter1 = self._letters[key1] dim1 = 1 elif isinstance(key1, tuple): letters1 = [self._letters[i] for i in key1] dim1 = 2 elif isinstance(key1, str): if len(key1)==1: letter1 = key1 dim1 = 1 else: raise KeyError(key1) else: raise KeyError("Cannot understand key %s", str(key1)) if isinstance(key2, slice): start2, stop2, stride2 = key2.indices(self.length) indices2 = range(start2, stop2, stride2) dim2 = 2 elif isinstance(key2, int): index2 = key2 dim2 = 1 else: raise KeyError("Cannot understand key %s", str(key2)) if dim1==1 and dim2==1: return dict.__getitem__(self, letter1)[index2] elif dim1==1 and dim2==2: values = dict.__getitem__(self, letter1) return tuple(values[index2] for index2 in indices2) elif dim1==2 and dim2==1: d = {} for letter1 in letters1: d[letter1] = dict.__getitem__(self, letter1)[index2] return d else: d = {} for letter1 in letters1: values = dict.__getitem__(self, letter1) d[letter1] = [values[index2] for index2 in indices2] if sorted(letters1)==self._letters: return self.__class__(self.alphabet, d) else: return d elif len(key)==1: key = key[0] else: raise KeyError("keys should be 1- or 2-dimensional") if isinstance(key, slice): start, stop, stride = key.indices(len(self._letters)) indices = range(start, stop, stride) letters = [self._letters[i] for i in indices] dim = 2 elif isinstance(key, int): letter = self._letters[key] dim = 1 elif isinstance(key, tuple): letters = [self._letters[i] for i in key] dim = 2 elif isinstance(key, str): if len(key)==1: letter = key dim = 1 else: raise KeyError(key) else: raise KeyError("Cannot understand key %s", str(key)) if dim==1: return dict.__getitem__(self, letter) elif dim==2: d = {} for letter in letters: d[letter] = dict.__getitem__(self, letter) return d else: raise RuntimeError("Should not get here") @property def consensus(self): """Returns the consensus sequence. """ sequence = "" for i in range(self.length): try: maximum = float("-inf") except ValueError: # On Python 2.5 or older that was handled in C code, # and failed on Windows XP 32bit maximum = - 1E400 for letter in self.alphabet.letters: count = self[letter][i] if count > maximum: maximum = count sequence_letter = letter sequence += sequence_letter return Seq(sequence, self.alphabet) @property def anticonsensus(self): sequence = "" for i in range(self.length): try: minimum = float("inf") except ValueError: # On Python 2.5 or older that was handled in C code, # and failed on Windows XP 32bit minimum = 1E400 for letter in self.alphabet.letters: count = self[letter][i] if count < minimum: minimum = count sequence_letter = letter sequence += sequence_letter return Seq(sequence, self.alphabet) @property def degenerate_consensus(self): # Following the rules adapted from # D. R. Cavener: "Comparison of the consensus sequence flanking # translational start sites in Drosophila and vertebrates." # Nucleic Acids Research 15(4): 1353-1361. (1987). # The same rules are used by TRANSFAC. degenerate_nucleotide = { 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B', 'ACGT': 'N', } sequence = "" for i in range(self.length): def get(nucleotide): return self[nucleotide][i] nucleotides = sorted(self, key=get, reverse=True) counts = [self[c][i] for c in nucleotides] # Follow the Cavener rules: if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]: key = nucleotides[0] elif 4*sum(counts[:2]) > 3*sum(counts): key = "".join(sorted(nucleotides[:2])) elif counts[3]==0: key = "".join(sorted(nucleotides[:3])) else: key = "ACGT" nucleotide = degenerate_nucleotide[key] sequence += nucleotide return Seq(sequence, alphabet = IUPAC.ambiguous_dna) @property def gc_content(self): """ Compute the fraction GC content. """ alphabet = self.alphabet gc_total = 0.0 total = 0.0 for i in range(self.length): for letter in alphabet.letters: if letter in 'CG': gc_total += self[letter][i] total += self[letter][i] return gc_total / total def reverse_complement(self): values = {} values["A"] = self["T"][::-1] values["T"] = self["A"][::-1] values["G"] = self["C"][::-1] values["C"] = self["G"][::-1] alphabet = self.alphabet return self.__class__(alphabet, values) class FrequencyPositionMatrix(GenericPositionMatrix): def normalize(self, pseudocounts=None): """ create and return a position-weight matrix by normalizing the counts matrix. If pseudocounts is None (default), no pseudocounts are added to the counts. If pseudocounts is a number, it is added to the counts before calculating the position-weight matrix. Alternatively, the pseudocounts can be a dictionary with a key for each letter in the alphabet associated with the motif. """ counts = {} if pseudocounts is None: for letter in self.alphabet.letters: counts[letter] = [0.0] * self.length elif isinstance(pseudocounts, dict): for letter in self.alphabet.letters: counts[letter] = [float(pseudocounts[letter])] * self.length else: for letter in self.alphabet.letters: counts[letter] = [float(pseudocounts)] * self.length for i in range(self.length): for letter in self.alphabet.letters: counts[letter][i] += self[letter][i] # Actual normalization is done in the PositionWeightMatrix initializer return PositionWeightMatrix(self.alphabet, counts) class PositionWeightMatrix(GenericPositionMatrix): def __init__(self, alphabet, counts): GenericPositionMatrix.__init__(self, alphabet, counts) for i in range(self.length): total = sum(float(self[letter][i]) for letter in alphabet.letters) for letter in alphabet.letters: self[letter][i] /= total for letter in alphabet.letters: self[letter] = tuple(self[letter]) def log_odds(self, background=None): """ returns the Position-Specific Scoring Matrix. The Position-Specific Scoring Matrix (PSSM) contains the log-odds scores computed from the probability matrix and the background probabilities. If the background is None, a uniform background distribution is assumed. """ values = {} alphabet = self.alphabet if background is None: background = dict.fromkeys(self._letters, 1.0) else: background = dict(background) total = sum(background.values()) for letter in alphabet.letters: background[letter] /= total values[letter] = [] for i in range(self.length): for letter in alphabet.letters: b = background[letter] if b > 0: p = self[letter][i] if p > 0: logodds = math.log(p/b, 2) else: #TODO - Ensure this has unittest coverage! try: logodds = float("-inf") except ValueError: # On Python 2.5 or older that was handled in C code, # and failed on Windows XP 32bit logodds = - 1E400 else: p = self[letter][i] if p > 0: logodds = float("inf") else: logodds = _nan values[letter].append(logodds) pssm = PositionSpecificScoringMatrix(alphabet, values) return pssm class PositionSpecificScoringMatrix(GenericPositionMatrix): def calculate(self, sequence): """ returns the PWM score for a given sequence for all positions. - the sequence can only be a DNA sequence - the search is performed only on one strand - if the sequence and the motif have the same length, a single number is returned - otherwise, the result is a one-dimensional list or numpy array """ #TODO - Code itself tolerates ambiguous bases (as NaN). if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" \ % self.alphabet) if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" \ % sequence.alphabet) #TODO - Force uppercase here and optimise switch statement in C #by assuming upper case? sequence = str(sequence) m = self.length n = len(sequence) scores = [] # check if the fast C code can be used try: import _pwm except ImportError: # use the slower Python code otherwise #The C code handles mixed case so Python version must too: sequence = sequence.upper() for i in range(n-m+1): score = 0.0 for position in range(m): letter = sequence[i+position] try: score += self[letter][position] except KeyError: score = _nan break scores.append(score) else: # get the log-odds matrix into a proper shape # (each row contains sorted (ACGT) log-odds values) logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] scores = _pwm.calculate(sequence, logodds) if len(scores)==1: return scores[0] else: return scores def search(self, sequence, threshold=0.0, both=True): """ a generator function, returning found hits in a given sequence with the pwm score higher than the threshold """ sequence = sequence.upper() n = len(sequence) m = self.length if both: rc = self.reverse_complement() for position in range(0, n-m+1): s = sequence[position:position+m] score = self.calculate(s) if score > threshold: yield (position, score) if both: score = rc.calculate(s) if score > threshold: yield (position-n, score) @property def max(self): """Maximal possible score for this motif. returns the score computed for the consensus sequence. """ score = 0.0 letters = self._letters for position in range(0, self.length): score += max(self[letter][position] for letter in letters) return score @property def min(self): """Minimal possible score for this motif. returns the score computed for the anticonsensus sequence. """ score = 0.0 letters = self._letters for position in range(0, self.length): score += min(self[letter][position] for letter in letters) return score @property def gc_content(self): raise Exception("Cannot compute the %GC composition of a PSSM") def mean(self, background=None): """Expected value of the score of a motif.""" if background is None: background = dict.fromkeys(self._letters, 1.0) else: background = dict(background) total = sum(background.values()) for letter in self._letters: background[letter] /= total sx = 0.0 for i in range(self.length): for letter in self._letters: logodds = self[letter, i] if _isnan(logodds): continue if _isinf(logodds) and logodds < 0: continue b = background[letter] p = b * math.pow(2, logodds) sx += p * logodds return sx def std(self, background=None): """Standard deviation of the score of a motif.""" if background is None: background = dict.fromkeys(self._letters, 1.0) else: background = dict(background) total = sum(background.values()) for letter in self._letters: background[letter] /= total variance = 0.0 for i in range(self.length): sx = 0.0 sxx = 0.0 for letter in self._letters: logodds = self[letter, i] if _isnan(logodds): continue if _isinf(logodds) and logodds < 0: continue b = background[letter] p = b * math.pow(2, logodds) sx += p*logodds sxx += p*logodds*logodds sxx -= sx*sx variance += sxx variance = max(variance, 0) # to avoid roundoff problems return math.sqrt(variance) def dist_pearson(self, other): """ return the similarity score based on pearson correlation for the given motif against self. We use the Pearson's correlation of the respective probabilities. """ if self.alphabet != other.alphabet: raise ValueError("Cannot compare motifs with different alphabets") max_p=-2 for offset in range(-self.length+1, other.length): if offset<0: p = self.dist_pearson_at(other, -offset) else: # offset>=0 p = other.dist_pearson_at(self, offset) if max_p