# Time-stamp: <2011-06-21 10:59:08 Tao Liu> """Module Description Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the Artistic License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ import os from math import log as mathlog from array import array from MACS14.OutputWriter import zwig_write,zbdg_write from MACS14.IO.FeatIO import PeakIO,WigTrackI from MACS14.Prob import poisson_cdf,poisson_cdf_inv from MACS14.Constants import * class PeakDetect: """Class to do the peak calling. e.g: >>> from MACS14.PeakDetect import PeakDetect >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, scan_window=200, gsize=3000000000) >>> pd.call_peaks() >>> print pd.toxls() """ def __init__ (self,opt=None,treat=None, control=None): """Initialize the PeakDetect object. """ self.opt = opt self.info = opt.info self.debug = opt.debug self.warn = opt.warn self.treat = treat self.control = control self.ratio_treat2control = None self.peaks = None self.final_peaks = None self.final_negative_peaks = None self.femax = opt.femax self.femin = opt.femin self.festep = opt.festep self.pvalue = opt.log_pvalue self.d = opt.d self.shift_size = self.d/2 self.scan_window = opt.scanwindow self.gsize = opt.gsize self.nolambda = opt.nolambda self.sregion = opt.smalllocal self.lregion = opt.largelocal if (self.nolambda): self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!") self.diag = opt.diag #self.save_score = opt.store_score self.zwig_tr = opt.zwig_tr self.zwig_ctl= opt.zwig_ctl def call_peaks (self): """Call peaks function. Scan the whole genome for peaks. RESULTS WILL BE SAVED IN self.final_peaks and self.final_negative_peaks. """ if self.control: # w/ control self.peaks = self.__call_peaks_w_control () else: # w/o control self.peaks = self.__call_peaks_wo_control () return None def diag_result (self): """Run the diagnosis process on sequencing saturation. """ if not self.diag: return None if self.control: # w/ control return self.__diag_w_control() else: # w/o control return self.__diag_wo_control() def toxls (self): """Save the peak results in a tab-delimited plain text file with suffix .xls. """ text = "" if self.control and self.peaks: text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)", "fold_enrichment", "FDR(%)"))+"\n" elif self.peaks: text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)", "fold_enrichment"))+"\n" else: return None chrs = self.peaks.keys() chrs.sort() for chrom in chrs: for peak in self.peaks[chrom]: text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2]) peak_summit_relative_pos = peak[3]-peak[0] text += "\t%d" % (peak_summit_relative_pos) text += "\t%d\t%.2f" % (peak[5],peak[6]) text += "\t%.2f" % (peak[7]) if self.control: if peak[8]>=100: text += "\t100" else: text += "\t%.2f" % (peak[8]) text+= "\n" return text def neg_toxls (self): text = "" text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)","fold_enrichment"))+"\n" chrs = self.final_negative_peaks.keys() chrs.sort() for chrom in chrs: for peak in self.final_negative_peaks[chrom]: text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2]) peak_summit_relative_pos = peak[3]-peak[0]+1 text += "\t%d" % (peak_summit_relative_pos) text += "\t%d\t%.2f" % (peak[5],peak[6]) text += "\t%.2f" % (peak[7]) text+= "\n" return text def tobed (self): text = "" chrs = self.peaks.keys() chrs.sort() n = 0 for chrom in chrs: for peak in self.peaks[chrom]: n += 1 text+= "%s\t%d\t%d\tMACS_peak_%d\t%.2f\n" % (chrom,peak[0],peak[1],n,peak[6]) return text def summitsToBED (self): text = "" chrs = self.peaks.keys() chrs.sort() n = 0 for chrom in chrs: for peak in self.peaks[chrom]: n += 1 text+= "%s\t%d\t%d\tMACS_peak_%d\t%.2f\n" % (chrom,peak[3]-1,peak[3],n,peak[4]) return text def __add_fdr (self, final, negative): """ A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment, fdr) <-- tuple type """ pvalue2fdr = {} pvalues_final = [] pvalues_negative = [] chrs = final.keys() a = pvalues_final.append for chrom in chrs: for i in final[chrom]: a(i[6]) # i[6] is pvalue in peak info pvalue2fdr[i[6]]=None chrs = negative.keys() a = pvalues_negative.append for chrom in chrs: for i in negative[chrom]: a(i[6]) pvalues_final.sort(reverse=True) pvalues_final_l = len(pvalues_final) pvalues_negative.sort(reverse=True) pvalues_negative_l = len(pvalues_negative) pvalues = pvalue2fdr.keys() pvalues.sort(reverse=True) index_p2f_pos = 0 index_p2f_neg = 0 for p in pvalues: while index_p2f_pos 2 or self.ratio_treat2control < 0.5: self.warn("Treatment tags and Control tags are uneven! FDR may be wrong!") self.info("#3 shift treatment data") self.__shift_trackI(self.treat) self.info("#3 merge +/- strand of treatment data") self.treat.merge_plus_minus_locations_naive () self.debug("#3 after shift and merging, tags: %d" % (self.treat.total)) if self.opt.store_bdg: self.info("#3 save the shifted and merged tag counts into bedGraph file...") zbdg_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.d,log=self.info,single=self.opt.single_profile) elif self.opt.store_wig: self.info("#3 save the shifted and merged tag counts into wiggle file...") #if self.opt.wigextend: # zwig_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.opt.wigextend,log=self.info,space=self.opt.space,single=self.opt.single_profile) #else: zwig_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.d,log=self.info,space=self.opt.space,single=self.opt.single_profile) self.info("#3 call peak candidates") peak_candidates = self.__call_peaks_from_trackI (self.treat) self.info("#3 shift control data") self.info("#3 merge +/- strand of control data") self.__shift_trackI(self.control) self.control.merge_plus_minus_locations_naive () self.debug("#3 after shift and merging, tags: %d" % (self.control.total)) if self.opt.store_bdg: self.info("#3 save the shifted and merged tag counts into bedGraph file...") zbdg_write(self.control,self.opt.wig_dir_ctl,self.zwig_ctl,self.d,log=self.info,single=self.opt.single_profile) if self.opt.store_wig: self.info("#3 save the shifted and merged tag counts into wiggle file...") #if self.opt.wigextend: # zwig_write(self.control,self.opt.wig_dir_ctl,self.zwig_ctl,self.opt.wigextend,log=self.info,space=self.opt.space,single=self.opt.single_profile) #else: zwig_write(self.control,self.opt.wig_dir_ctl,self.zwig_ctl,self.d,log=self.info,space=self.opt.space,single=self.opt.single_profile) self.info("#3 call negative peak candidates") negative_peak_candidates = self.__call_peaks_from_trackI (self.control) # build score #if self.save_score: # self.info("#4 build scores") # scoreswig = self.__build_score_wigtrackI(treatwig,controlbkI,self.d,space=self.opt.space,bglambda=self.lambda_bg) # zwigfile = file(self.opt.name+".score.wig","w") # self.info("#4.1 save scores to wiggle file") # scoreswig.write_wig(zwigfile,"score") # self.info("compress the wiggle file using gzip...") # os.system("gzip "+self.opt.name+".score.wig") self.info("#3 use control data to filter peak candidates...") #self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,self.control, self.ratio_treat2control,fake_when_missing=True) self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,self.control, fake_when_missing=True, to_small_sample=self.opt.tosmall) self.info("#3 find negative peaks by swapping treat and control") #self.final_negative_peaks = self.__filter_w_control(negative_peak_candidates,self.control,self.treat, 1.0/self.ratio_treat2control,fake_when_missing=True) self.final_negative_peaks = self.__filter_w_control(negative_peak_candidates,self.control,self.treat, fake_when_missing=True, to_small_sample=self.opt.tosmall) return self.__add_fdr (self.final_peaks, self.final_negative_peaks) def __call_peaks_wo_control (self): """To call peaks w/o control data. """ #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize #self.debug("#3 background lambda: %.2f " % (self.lambda_bg)) #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1 #self.debug("#3 min tags: %d" % (self.min_tags)) self.info("#3 shift treatment data") self.__shift_trackI(self.treat) self.info("#3 merge +/- strand of treatment data") self.treat.merge_plus_minus_locations_naive () self.debug("#3 after shift and merging, tags: %d" % (self.treat.total)) if self.opt.store_bdg: self.info("#3 save the shifted and merged tag counts into bedGraph file...") zbdg_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.d,log=self.info,single=self.opt.single_profile) elif self.opt.store_wig: self.info("#3 save the shifted and merged tag counts into wiggle file...") #if self.opt.wigextend: # zwig_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.opt.wigextend,log=self.info,space=self.opt.space,single=self.opt.single_profile) #else: zwig_write(self.treat,self.opt.wig_dir_tr,self.zwig_tr,self.d,log=self.info,space=self.opt.space,single=self.opt.single_profile) self.info("#3 call peak candidates") peak_candidates = self.__call_peaks_from_trackI (self.treat) self.info("#3 use self to calculate local lambda and filter peak candidates...") self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) return self.final_peaks def __print_peak_info (self, peak_info): """Print out peak information. A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment) <-- tuple type """ chrs = peak_info.keys() chrs.sort() for chrom in chrs: peak_list = peak_info[chrom] for peak in peak_list: print ( chrom+"\t"+"\t".join(map(str,peak)) ) def __filter_w_control (self, peak_info, treatment, control, pass_sregion=False, write2wig= False, fake_when_missing=False, to_small_sample=False ): """Use control data to calculate several lambda values around 1k, 5k and 10k region around peak summit. Choose the highest one as local lambda, then calculate p-value in poisson distribution. Parameters: 1. pass_sregion: If set True, the slocal lambda will be ignored. Use this when the control is not available. 2. write2wig: obselete 3. fake_when_missing: when a chromosome is missing in control but existing in IP or vice versa, MACS will fake a tag to pass the process. 4. to_small_sample: when set as True, balance the number of tags by linearly scaling larger sample to smaller sample. The default behaviour is to linearly scale smaller to larger one. Return value type in this format: a dictionary key value : chromosome items : array of (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_fold_enrichment) """ lambda_bg0 = float(self.scan_window)*treatment.total/self.gsize # bug fixed... if treatment.total>control.total: t_ratio = 1.00 c_ratio = float(treatment.total)/control.total else: t_ratio = float(control.total)/treatment.total c_ratio = 1.00 if to_small_sample: tmp = t_ratio t_ratio = 1/c_ratio c_ratio = 1/tmp final_peak_info = {} chrs = peak_info.keys() chrs.sort() total = 0 for chrom in chrs: self.debug("#3 Chromosome %s" % (chrom)) n_chrom = 0 final_peak_info[chrom] = [] peak_list = peak_info[chrom] try: (ctags,tmp) = control.get_locations_by_chr(chrom) except: self.warn("Missing %s data, skip it..." % (chrom)) if fake_when_missing: ctags = [-1,] self.warn("Fake a tag at %s:%d" % (chrom,-1)) tmp=[] else: continue try: (ttags,tmp) = treatment.get_locations_by_chr(chrom) except: self.warn("Missing %s data, skip it..." % (chrom)) if fake_when_missing: ttags = [-1,] self.warn("Fake a tag at %s:%d" % (chrom,-1)) tmp=[] else: continue index_ctag = 0 # index for control tags index_ttag = 0 # index for treatment tags flag_find_ctag_locally = False flag_find_ttag_locally = False prev_index_ctag = 0 prev_index_ttag = 0 len_ctags =len(ctags) len_ttags =len(ttags) for i in range(len(peak_list)): (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags) = peak_list[i] #window_size_4_lambda = min(self.first_lambda_region,max(peak_length,self.scan_window)) window_size_4_lambda = max(peak_length,self.scan_window) lambda_bg = lambda_bg0/self.scan_window*window_size_4_lambda*t_ratio if self.nolambda: # skip local lambda local_lambda = lambda_bg tlambda_peak = float(peak_num_tags)/peak_length*window_size_4_lambda else: left_peak = peak_start+self.shift_size # go to middle point of the first fragment right_peak = peak_end-self.shift_size # go to middle point of the last fragment left_lregion = peak_summit-self.lregion/2 left_sregion = peak_summit-self.sregion/2 right_lregion = peak_summit+self.lregion/2 right_sregion = peak_summit+self.sregion/2 #(cnum_10k,cnum_5k,cnum_1k,cnum_peak) = (0,0,0,0) #(tnum_10k,tnum_5k,tnum_1k,tnum_peak) = (0,0,0,0) (cnum_sregion, cnum_lregion, cnum_peak, tnum_sregion, tnum_lregion, tnum_peak) = (0,0,0,0,0,0) #smallest = min(left_peak,left_10k,left_5k,left_1k) #largest = max(right_peak,right_10k,right_5k,right_1k) while index_ctag < len_ctags: if ctags[index_ctag] < left_lregion: # go to next control tag index_ctag+=1 elif index_ctag+1 >= len_ctags or right_lregion < ctags[index_ctag]: # If move outof the lregion or reach the chromosome end # finalize and go to next peak region # Thanks to Jake Biesinger flag_find_ctag_locally = False index_ctag = prev_index_ctag break else: if not flag_find_ctag_locally: flag_find_ctag_locally = True prev_index_ctag = index_ctag p = ctags[index_ctag] if left_peak <= p <= right_peak: cnum_peak +=1 if left_sregion <= p <= right_sregion: cnum_sregion +=1 cnum_lregion +=1 else: cnum_lregion += 1 index_ctag += 1 # go to next tag while index_ttag < len_ttags: if ttags[index_ttag] < left_lregion: # go to next treatment tag index_ttag+=1 elif index_ttag+1 >= len_ttags or right_lregion < ttags[index_ttag]: # If move outof the lregion or reach the chromosome end # finalize and go to next peak region # Thanks to Jake Biesinger flag_find_ttag_locally = False index_ttag = prev_index_ttag break else: if not flag_find_ttag_locally: flag_find_ttag_locally = True prev_index_ttag = index_ttag p = ttags[index_ttag] if left_peak <= p <= right_peak: tnum_peak +=1 if left_sregion <= p <= right_sregion: tnum_sregion +=1 tnum_lregion += 1 else: tnum_lregion += 1 index_ttag += 1 # go to next tag clambda_peak = float(cnum_peak)/peak_length*c_ratio*window_size_4_lambda clambda_lregion = float(cnum_lregion)/self.lregion*c_ratio*window_size_4_lambda clambda_sregion = float(cnum_sregion)/self.sregion*c_ratio*window_size_4_lambda tlambda_peak = float(tnum_peak)/peak_length*t_ratio*window_size_4_lambda tlambda_lregion = float(tnum_lregion)/self.lregion*t_ratio*window_size_4_lambda tlambda_sregion = float(tnum_sregion)/self.sregion*t_ratio*window_size_4_lambda if pass_sregion: # for experiment w/o control, peak region lambda and sregion region lambda are ignored! local_lambda = max(lambda_bg,tlambda_lregion) else: # for experiment w/ control local_lambda = max(lambda_bg,clambda_peak,clambda_lregion,clambda_sregion) p_tmp = poisson_cdf(tlambda_peak,local_lambda,lower=False) if p_tmp <= 0: peak_pvalue = 3100 else: peak_pvalue = mathlog(p_tmp,10) * -10 if peak_pvalue > self.pvalue: n_chrom += 1 total += 1 peak_fold_enrichment = float(peak_height)/local_lambda*window_size_4_lambda/self.d final_peak_info[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_fold_enrichment)) # uncomment the following two lines, MACS will report the peaks been rejected. #else: # #self.debug("Reject the peak at %s:%d-%d with local_lambda: %.2f and -log10pvalue: %.2f" % (chrom,peak_start,peak_end,local_lambda,peak_pvalue)) self.debug("#3 peaks whose pvalue < cutoff: %d" % (n_chrom)) self.info("#3 Finally, %d peaks are called!" % (total)) return final_peak_info def __call_peaks_from_trackI (self, trackI): """Call peak candidates from trackI data. Using every tag as step and scan the self.scan_window region around the tag. If tag number is greater than self.min_tags, then the position is recorded. Return: data in this format. (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags) """ lambda_bg0 = float(self.scan_window)*trackI.total/self.gsize # bug fixed... min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),lambda_bg0)+1 peak_candidates = {} self.debug("#3 search peak condidates...") chrs = trackI.get_chr_names() total = 0 for chrom in chrs: self.debug("#3 Chromosome %s" % (chrom)) n_chrom = 0 peak_candidates[chrom] = [] (tags,tmp) = trackI.get_locations_by_chr(chrom) len_t = len(tags) cpr_tags = [] # Candidate Peak Region tags cpr_tags.extend(tags[:min_tags-1]) number_cpr_tags = min_tags-1 p = min_tags-1 # Next Tag Index while p < len_t: if number_cpr_tags >= min_tags: if tags[p] - cpr_tags[-1*min_tags+1] <= self.scan_window: # add next tag, if the new tag is less than self.scan_window away from previous no. min_tags tag cpr_tags.append(tags[p]) number_cpr_tags += 1 p+=1 else: # candidate peak region is ready, call peak... (peak_start,peak_end,peak_length,peak_summit,peak_height) = self.__tags_call_peak (cpr_tags) peak_candidates[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)) cpr_tags = [tags[p]] # reset p += 1 # bug fixed by Jake Biesinger number_cpr_tags = 1 total += 1 n_chrom += 1 else: # add next tag, but if the first one in cpr_tags # is more than self.scan_window away from this new # tag, remove the first one from the list if tags[p] - cpr_tags[0] >= self.scan_window: cpr_tags.pop(0) number_cpr_tags -= 1 cpr_tags.append(tags[p]) number_cpr_tags += 1 p+=1 self.debug("#3 peak candidates: %d" % (n_chrom)) self.debug("#3 Total number of candidates: %d" % (total)) return self.__remove_overlapping_peaks(peak_candidates) def __tags_call_peak (self, tags ): """Project tags to a line. Then find the highest point. """ start = tags[0]-self.d/2 end = tags[-1]+self.d/2 # +1 or not? region_length = end - start line= [0]*region_length for tag in tags: tag_projected_start = tag-start-self.d/2 tag_projected_end = tag-start+self.d/2 for i in range(tag_projected_start,tag_projected_end): line[i] += 1 tops = [] top_height = 0 for i in range(len(line)): if line[i] > top_height: top_height = line[i] tops = [i] elif line[i] == top_height: tops.append(i) peak_summit = tops[len(tops)/2]+start return (start,end,region_length,peak_summit,top_height) def __shift_trackI (self, trackI): """Shift trackI data to right (for plus strand) or left (for minus strand). """ chrs = trackI.get_chr_names() number_removed_tags = 0 for chrom in chrs: tags = trackI.get_locations_by_chr(chrom) # plus for i in range(len(tags[0])): tags[0][i]+=self.shift_size # minus for i in range(len(tags[1])): tags[1][i]-=self.shift_size # remove the tags extended outside of chromosome start while tags[1]: if tags[1][0]-self.shift_size<0: number_removed_tags += 1 tags[1].pop(0) else: break self.debug("# %d tag(s) extended outside of chromosome start are removed!" % number_removed_tags) return def __build_wigtrackI (self, trackI, space=10): """Shift trackI then build a wigTrackI object. """ chrs = trackI.get_chr_names() wigtrack = WigTrackI() wigtrack.span = space d = self.d step = 10000000 + 2*d for chrom in chrs: tags = trackI.get_locations_by_chr(chrom)[0] l = len(tags) window_counts = array(BYTE4,[0]*step) startp = -1*d endp = startp+step index_tag = 0 while index_tag prev_peak[4]: summit_new_peak = peaks_chr[i][3] h_new_peak = peaks_chr[i][4] else: summit_new_peak = prev_peak[3] h_new_peak = prev_peak[4] prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5]) else: n_append(prev_peak) prev_peak = peaks_chr[i] if prev_peak: n_append(prev_peak) return new_peaks