"""
SYNOPSIS: This module contains the Motif and MotifList classes. Motif stores
and processes motif information. MotifList is a simple list wrapper for
lists--it contains some helper functions that operate over a list of motifs.
"""
import os
import sys
import re
import math
import json
from xml.dom.minidom import parse, parseString
#NOTE: when seqscan is modified to take pssms that aren't numpy arrays, then i can remove this!
import numpy
try:
import _seq
except:
import mdseqpos._seq as _seq
import count
#from database import MotifParser
import MotifParser
import Prob
def calc_pssm(sequences):
"""Given a list of sequences, e.g.
[['A','C','C','A','A','T','T'],
['A','A','C','C','T','A','T'],
...
]
this function returns the pssm describing the sequences.
"""
#Step 1: For each COLUMN, count the number of A's in that pos, C's etc
pssm = []
if sequences and (len(sequences) > 0):
for col in range(len(sequences[0])):
tmp = [0, 0, 0, 0]
for row in range(len(sequences)):
if sequences[row][col] == 'A':
tmp[0] += 1
elif sequences[row][col] == 'C':
tmp[1] += 1
elif sequences[row][col] == 'G':
tmp[2] += 1
else: #T
tmp[3] += 1
#Step 2: normalize the counts
hit_sum = sum(tmp)
pssm.append(map(lambda ct: float(ct)/float(hit_sum), tmp))
return pssm
def revcomp(s):
"""Returns the reverse compliment of a sequence"""
t = s[:]
t.reverse()
rc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'N':'N'}
t = [rc[x] for x in t]
return t
class DuplicateMotif(Exception):
pass
class BadPssm(Exception):
pass
class Motif:
"""Class for sequence motifs.
(NOT SO) Required Attributes:
NOTE: in ying's old Motif class, these could be None
self.id (string): A string containing a unique ID identifying the motif.
self.pssm (float array, N x 4 cols):
position specific scoring matrix (PSSM) for the motif. The array
should contain N rows, each corresponding to a single position in
the PSSM, and 4 columns, each corresponding to the 4 DNA basepairs
A, C, G, T, in order. Each element in the matrix should be a
float, and each row of the matrix should sum to 1.0.
self.seqpos_results (dict): {'numhits','cutoff','zscore','meanposition',
'pvalue}
NOTE: we have convenience fns, e.g. self.getpvalue() returns
self.seqpos_results['pvalue']
self.antisense (bool): whether the motif is on the antisense strand
Optional Attributes (descriptive):
self.source (string): the source of the motif, e.g. 'Transfac' or 'PBM'
self.sourcefile (string): the path to the source file of the motif
self.species (list of string): list of species names, eg ['mouse', 'human']
self.fullname (string): fullname of the motif
self.pmid (integer): the Pubmed ID of the article in which the motif was
published
self.numseqs(integer): number of sequences used to derive the motif's
position specific scoring matrix (PSSM).
self.factors (list of string): list of the transcription factors that
bind to the motif, e.g. ['ERa', 'Runx2'].
self.entrezs (list of integers): list of integers representing the
Entrez Gene IDs of the transcription factor that bind to the motif,
such as [6908, 2099].
self.refseqs (list of strings): list of the refseq strings, e.g.
['NM_001141919', 'NM_013464']
self.dbd (string): the full name of the DNA binding domain of the
transcription factor that binds to the motif, such as
'estrogen receptor region C'.
"""
_ATTRIBUTES = ['id', 'pssm', 'seqpos_results', 'antisense',
'source', 'sourcefile', 'species', 'fullname', 'pmid',
'numseqs', 'factors', 'entrezs', 'refseqs', 'dbd']
_results_fields = ['numhits','cutoff','zscore','meanposition','pvalue']
#maybe i'll make pssm the only required attribute
def __init__(self):
"""initializes all attributes to None"""
for attr in Motif._ATTRIBUTES:
setattr(self, attr, None)
#set the seqpos_result field to None
#and convenience accessors/setters, e.g. self.getpvalue
self.seqpos_results = {}
for attr in Motif._results_fields:
self.seqpos_results[attr] = None
def curry(x):
return lambda : self.seqpos_results[x]
#setattr(self, "get"+attr, lambda : self.seqpos_results[attr])
setattr(self, "get"+attr, curry(attr))
#setattr(self, "set"+attr, lambda x: self.seqpos_results[attr] = x)
@staticmethod
def _validpssm(pssm):
"""Checks to see if its a valid pssm:
1. each row must have 4 cols of floats
2. each row must sum to 1.0
"""
def feq(f1, f2):
epsilon = 1e-6
return abs(f1 - f2) < epsilon
if pssm is None: raise BadPssm("None is not a valid pssm")
if len(pssm) == 0: raise BadPssm("Empty pssm")
for i, row in enumerate(pssm):
if len(row) != 4:
raise BadPssm("row %s is not length 4" % i)
else:
if not feq(sum(row), 1.0):
raise BadPssm("row %s sums to %s NOT 1.0" % (i, sum(row)))
return True
def setpssm(self, pssm):
"""The preferred method to set the motif's pssm. does error checking"""
if Motif._validpssm(pssm):
self.pssm = pssm
def equals(self, other):
"""Returns true if self.id == other.id"""
return (self.id == other.id)
def seqpos_stat(self, start, end, score, seq_length):
"""Calculates the statistics (and other things) that go into
seqpos_results.
numhits - number of cumulative hits
meanposition - mean of the positions where the motif was found
cutoff - ???
zscore - motif's zscore
pvalue - motif's pvalue:= max(normal cum dist(zscore), 1e-30)
"""
plotPosWin = 200 #sliding window used to calculate mean pos plot
numBins = 50
MINHITS = 20
MUALPHA0 = 0.296
MUALPHA1 = 0.641
SDALPHA0 = 0.943
SDALPHA1 = -0.111
mu_bias = 0.5
offset = len(self.pssm)
#calculate relative positions of where the motif is found in sequence
frac = [abs((float((s + e - offset))/(seq_length - offset)) - 1.0)
for (s,e) in zip(start, end)]
#generate list of indices sorted by largest score, e.g. idx[0] is
#the index of the largest score and score[idx[0]] is the largest score
#print frac
idx = [i for (i,s) in
sorted(enumerate(score), key=lambda x: x[1], reverse=True)]
#print idx
cumfrac = 0
zscore_min = 99 #infinity
kmin, numhits, cuthits, meanpos = -1, 0, 0, 0
#calculate the sums of the relative positions of sites (cumfrac)
#starting w/ highest score on downward
#NOTE: this is NOT an error, numhits is the number of sites seen
#so far--ie. simply the index, and i is the score/fracpos index
for numhits, i in enumerate(idx):
if frac[i] > 1.0 or frac[i] < 0.0:
raise(ValueError,"Fraction out of bounds %4.2f" % frac[i] )
cumfrac += frac[i]
if numhits > MINHITS: #a valid number of sites have been computed
#NOTE: 12.0 is the magic number according to cliff, its
#the expected std dev, and it "comes from the variance of a
#convolution of uniform distributions"
zscore = (cumfrac - numhits*mu_bias)/(math.sqrt(numhits/12.0))
#take most significan site
if zscore < zscore_min:
zscore_min = zscore
kmin = score[i]
cuthits = numhits
meanpos = cumfrac/numhits - 0.5
frac_sort = [frac[i] for i in idx]
#calculate a mean sliding average using plotPosWin as the window size
#normalize using (seq_length-offset)*0.5
meanSldAvg=[sum(frac_sort[i:i+plotPosWin])/plotPosWin*((seq_length-offset)*0.5)\
for i in range(len(frac_sort) - plotPosWin)]
#NOTE: downscale the number of means (in meanSldAvg) b/c plotting
#the full thing will just kill highcharts.js on the front end
#sample numBins = 50 at regular intervals along meanSldAvg
chunk = len(meanSldAvg)/numBins
binAvg = [meanSldAvg[chunk*i] for i in range(numBins-1) if len(meanSldAvg) > 0]
#last one:
if len(meanSldAvg) > 0:
binAvg.append(meanSldAvg[-1])
if cuthits > MINHITS:
# NOTE: w/ the pvalue, we are assuming the normal distribution
# get the raw p-value score
#BUG? z_mean_adj = - (MUALPHA0 + MUALPHA1*math.log(math.log(numhits)))
#BUG? z_sd_adj = SDALPHA0 + SDALPHA1*math.log(math.log(numhits))
z_mean_adj = - (MUALPHA0 + MUALPHA1*math.log(math.log(cuthits)))
z_sd_adj = SDALPHA0 + SDALPHA1*math.log(math.log(cuthits))
zscore_min_adjusted = ( zscore_min - z_mean_adj )/z_sd_adj
pvalue = max(Prob.normal_cdf(zscore_min_adjusted), 1e-30)
else:
zscore_min_adjusted = 0
pvalue = 1.0
self.seqpos_results = {'numhits': cuthits,
'meanposition': meanpos,
'cutoff': kmin,
'zscore': zscore_min_adjusted,
'pvalue': pvalue,
'plot': {'chunk':chunk,
'width':seq_length,
'bin_avg': binAvg}}
#import func
#func.debug(locals())
def seqpos(self, chip_regions, width=600, margin=50):
"""Score motif on how centrally located they are within each ChIP
region.
ChIP regions should be given as a ChipRegions object.
The results of SeqPos will be stored as properties of
self.seqpos_results.
Adapted from Cliff Meyer's ChIP_region.CentralMotifScan() method."""
ANTISENSE = 1
MOTIFMIN = 1e-3
if not chip_regions.preprocessed_regions:
chip_regions.preprocess(width, margin)
#process the chip-regions
chip_regions.read_sequence()
bgseqprob_mat = count.count(chip_regions.sequence)
markov_order = 2
prob_option = _seq.MAX_OPTION
#GRR: THIS IS THE ONLY numpy dependency, and it is b/c seqscan
#expects self.pssm to be a numpy array!
pssm = numpy.array(self.pssm, float)
s_idx, start, end, orient, score = \
_seq.seqscan(chip_regions.sequence, pssm, bgseqprob_mat,
markov_order, prob_option)
#adjust score
adj_score = map(lambda s: math.log(s + MOTIFMIN), score)
#calculate the seqpos_results (stats)
self.seqpos_stat(start, end, adj_score, width + margin)
#generating the observed pssm
#fracpos is the fractional position of each site/hit
fracpos = [abs(0.5*(s + e) - (margin + width)/2.0) / (width/2.0)
for (s,e) in zip(start, end)]
#retrieve sequences whose fracposition is in (0.0, 1.0]
seq = []
for j,elem in enumerate(fracpos):
if elem <= 1.0:
t = list(chip_regions.sequence[int(s_idx[j])])
t = t[int(start[j]):int(end[j])]
if orient[j] == ANTISENSE:
seq.append(revcomp(t))
else:
seq.append(t)
self.seqpos_results['pssm'] = calc_pssm(seq)
self.seqpos_results['seq'] = ["".join(t) for t in seq]
#if self.id == "MA0104":
# import func
# func.debug(locals())
@staticmethod
def from_flat_file(path):
"""Read in a motif from a text file which describes the pssm, and
returns a Motif object- the pssm is in the following format, example:
[[0.1, 0.1, 0.7, 0.1],
[0.1, 0.7, 0.1, 0.1],
[0.7, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.1, 0.7]]
"""
#NOTE: this inner fn is ugly, maybe i'll just lambda it
def pre_process(input_txt):
"""Converts weird literals, e.g. '__ob__' --> '[' according to a
dictionary of symbol translations. For example, galaxy converts
'[' --> '__ob__', this fn will convert them back"""
sym_dict = [("__ob__", "["), ("__cb__", "]")]
for s in sym_dict:
input_txt = input_txt.replace(s[0], s[1])
return input_txt
tmp = Motif()
f = open(path)
try:
#flatten file into a string
pssm_txt = "\n".join([line for line in f])
pssm_txt = pre_process(pssm_txt)
# use regular expression to pick out the rows
row_pattern = '\[\s*(0\.\d+)\s*\,\s*(0\.\d+)\s*\,\s*(0\.\d+)\s*\,\s*(0\\.\d+)\s*\]'
#function that converts a tuple of float strings to float objs
convert_tpl = lambda x: [float(i) for i in x]
pssm = [convert_tpl(r) for r in re.findall(row_pattern, pssm_txt)]
tmp.setpssm(pssm)
finally:
f.close()
return tmp
@staticmethod
def pssm_from_xml(dom_node):
"""Reads in the xml data, and returns a Nx4 col matrix (hopefully).
...
"""
pos = dom_node.getElementsByTagName("pos")
tags = ["A", "C", "T", "G"]
pssm = []
for p in pos:
num = p.getAttribute("num")
row = [float(Motif.getText(p.getElementsByTagName(tag)[0].childNodes))
for tag in tags]
pssm.append((num, row))
#can pos-s be out of order?--should we order by "num"??
pssm = sorted(pssm, key=lambda p: p[0])
return [p[1] for p in pssm]
@staticmethod #???
def getText(nodelist):
"""RIPPED from: http://docs.python.org/library/xml.dom.minidom.html
"""
rc = []
for node in nodelist:
if node.nodeType == node.TEXT_NODE:
rc.append(node.data)
return (''.join(rc)).strip()
#NOTE: the following method is obsolete b/c we should use MotifParser
#module to parse db xml!
@staticmethod
def from_xml(dom_node):
"""
Tries to create a new motif object that is defined in the node xml
tree. EXPECTED XML struct:
..............................
"""
#NOTE: these are DUPLICATED in to_xml--should move them to
#Motif.literals EXCEPT for antisense
literals = ['source', 'sourcefile', 'fullname', 'pmid', 'numseqs',
'dbd', 'antisense']
lists = ['symbols', 'entrezs', 'refseqs', 'species']
tmp = Motif()
tmp.id = dom_node.getAttribute("id")
for attr in literals:
val = dom_node.getElementsByTagName(attr)
if val:
setattr(tmp, attr, Motif.getText(val[0].childNodes))
for attr in lists:
singular_name = attr[:-1] if attr != 'species' else attr
ls = dom_node.getElementsByTagName(singular_name+"list")
#these lists are all optional -- so ls might not be found
elms_list = ls[0].getElementsByTagName(singular_name) if ls else None
if elms_list:
setattr(tmp, attr,
map(Motif.getText, [e.childNodes for e in elms_list]))
pssm_node = dom_node.getElementsByTagName("pssm")
if pssm_node:
tmp.pssm = Motif.pssm_from_xml(pssm_node[0])
return tmp
@staticmethod
def from_dict(d):
"""Returns a new motif with the values specified in the dictionary.
The dictionary is expected to follow the format specified by the
MotifParser
ref: http://cistrome.dfci.harvard.edu/trac/wiki/MotifParser
"""
fields = ['id', 'status', 'source', 'sourcefile', 'dbd', 'pmid', ]
lists = ['entrezs', 'refseqs', 'symbols', 'synonyms',]
#special lists: species, numseqs
#missing: fullname, curators
tmp = Motif()
for f in fields:
#EVERY item in d is a list, so we have to make them into values
val = ','.join(d[f])
setattr(tmp, f, val)
for l in lists:
#we want to set the value, chop off the s for the dictionary
setattr(tmp, l, d[l[:-1]])
tmp.pssm = numpy.array(d['pssm'][0])
tmp.species = d['species']
tmp.numseqs = d['numseqs']
#NOTE: symbols is a synonym for factors...and until jian changes this
#we will have to do the following
tmp.factors = tmp.symbols
return tmp
def __str__(self):
"""How motifs should be printed out"""
divider = "".join(["-" for x in range(79)])
lines = ["%s: %s" % (attr,getattr(self, attr)) \
for attr in Motif._ATTRIBUTES]
return "%s\n%s\n%s" % (divider, "\n".join(lines), divider)
def pssm_to_xml(self):
"""returns a string representation of the pssm:
0.10.30.30.3
"""
try:
if Motif._validpssm(self.pssm):
tmp = ""
for i, row in enumerate(self.pssm):
A, C, T, G = row
tmp += "%s%s%s%s\n" % (i, A, C, G, T)
return "\n%s\n" % tmp
except BadPssm:
return ""
def to_xml(self, print_non_schema=False):
"""Returns the xml representation of the motif. see Motif.from_xml
to see the xml structure we are assuming
"""
literals = ['source', 'sourcefile', 'fullname', 'pmid', 'numseqs',
'dbd']
lists = ['symbols', 'entrezs', 'refseqs', 'species']
#attributes that aren't defined in the xml schema, but might be useful
#anyway
non_schema = ['antisense', ]#seqpos_results]
def print_lit(attr):
if getattr(self, attr):
return "<%s>%s%s>\n" % (attr, getattr(self, attr), attr)
return ""
def print_lists(attr):
try:
val = getattr(self, attr)
except:
val = None
if val and len(val) > 0:
name = attr[:-1] if attr != 'species' else attr #entrezs --> entrez
tmp = ''.join(["<%s>%s%s>\n" % (name,v,name) for v in val])
return "<%s>\n%s%s>\n" % (name+"list", tmp, name+"list")
return ""
lits = ''.join(map(print_lit, literals))
lists = ''.join(map(print_lists, lists))
non_s = ''
if print_non_schema:
non_s = ''.join(map(print_lit, non_schema))
pssm = self.pssm_to_xml()
id = "id=\"%s\"" % self.id if self.id else ""
return "\n%s" % (id, pssm + lits + lists + non_s)
def to_json(self):
"""Returns the json representation of the motif.
"""
_aNumpyAr = numpy.array([])
#GRAB all of the _ATTRIBUTES and dump them into a dictionary
tmp = {}
for attr in Motif._ATTRIBUTES:
val = getattr(self, attr)
if type(val) == type(_aNumpyAr):
tmp[attr] = val.tolist()
else:
tmp[attr] = val
#CHECK for logoImg
if 'logoImg' in vars(self):
tmp['logoImg'] = self.logoImg
#Control the floating point precision, unless it is pvalue --we want to preserve
#the precision
def printFloats(f):
return "%.4f" % f if abs(f) > 0.0001 else f.__str__()
json.encoder.FLOAT_REPR = printFloats
return json.dumps(tmp)
#NOTE: moving Motif.logo --> motiflogo.make_logo (fn)
class MotifList(list):
"""Class to hold a list of unique motifs, and perform operations over them
"""
def append(self, motif_obj):
"""Tries to append motif_obj to the end of the list;
IF duplicate: raises DuplicateMotif exception
"""
if motif_obj not in self:
#note if you do self.append, that's infinite recursion
unbnd_app = list.append
unbnd_app(self, motif_obj)
else:
raise DuplicateMotif
def cull(self, pvalcutoff=0.001, maxmotifs=100):
"""Filter the MotifList based on the pvalue and the max number
of motifs allowed in the list. NOTE: this is non-destructive!
"""
sig_motifs = filter(lambda m: m.getpvalue() <= pvalcutoff, self)
if (maxmotifs != -1) and (len(sig_motifs) > maxmotifs):
#sort and return top maxmotifs
sorted_motifs = sorted(self, key = lambda elem: elem.getpvalue())
del sorted_motifs[maxmotifs:]
return MotifList(sorted_motifs)
else:
return MotifList(sig_motifs)
def from_xml_file(self, path):
"""Tries to parse the xml file specified by 'path', and load in the
motifs to the MotifList.
Expected XML file struct:...
"""
#NOTE: here is where we rely on jian's MotifParser class
p = MotifParser.MotifParser(path)
motifs = [Motif.from_dict(p.motifs[m]) for m in p.motifs]
self.extend(MotifList(motifs))
def to_xml(self):
"""Prints out the motif list in xml format, i.e.:
...
"""
tmp = '\n'.join([x.to_xml() for x in self])
#yea this might be overkill to convert it to a dom object first
dom = parseString("\n%s\n\n" % tmp)
return dom.toprettyxml()
def to_json(self):
"""Prints out the motif list as json data
"""
return "[%s]" % ',\n'.join([m.to_json() for m in self])
def save_to_xml(self, path):
"""writes the motif list as an xml file"""
#if _DEBUG: print "writing new motifs to file..."
dir_name = os.path.dirname(path)
if not os.path.exists(dir_name):
os.makedirs(dir_name)
motifs_file = open(path, 'w')
try:
motifs_file.write(self.to_xml())
finally:
motifs_file.close()
def filterBySpecies(self, species_list):
"""Filter the list by the species in species_list"""
mapSpecies = {'hs' : 'Homo sapiens', 'mm' : 'Mus musculus',
'dm' : 'Drosophila melanogaster',
'ce' : 'Caenorhabditis elegans'}
#MAP short names to long names
sl = map(lambda s: mapSpecies[s], species_list)
motifs = []
for m in self:
#take the motif IFF species = [], OR an element in species is
#in species_list
if m.species:
#print m.species
found = False
i = 0
while (i < len(m.species) and not found):
if (m.species[i] in sl): found = True
i += 1
if found: motifs.append(m)
else:
motifs.append(m)
return MotifList(motifs)