#!/usr/bin/python2 # """Module Description Copyright (c) 2009 H. Gene Shin This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). site.py gives an average enrichment profile of given regions of interest (eg, binding sites or motifs). @status: experimental @version: $Revision$ @author: H. Gene Shin @contact: shin@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import time import subprocess import string import logging import re import itertools from optparse import OptionParser # ----------------------------------- # my modules # ----------------------------------- import CEAS.inout as inout import CEAS.corelib as corelib import CEAS.R as R # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # classes # ------------------------------------ class WigProfilerwBed: """WigProfiler for regions in Bed""" def __init__(self, span=1000, step=20, dir=False): """Constructor""" # parameters self.span = span self.step = step self.dir = dir # output values self.names = [] self.profiles = [] self.xaxis = [] def set_params(self, span, step, dir): """Set parameters Parameters: 1. wig: an Wig object in Cistrome.Assoc.inout 2. bed: a Bed object in Cistrome.Assoc.inout 3. span: span size from the center of each region 4. dir: if set True, the start and end of '-' will be regarded as the end and start. """ self.span=span self.step=step self.dir=dir def capture_regions(self, wig, bed): """Capture the regions that will be profiled""" wigint = self.estimate_wig_interval(wig) # estimate the wig interval for where2 step = self.step hf = 1.0*step/2 binned = [] chroms = list(set(wig.get_chroms()).intersection(set(bed.get_chroms()))) chroms = corelib.sort_chroms(chroms) for chrom in chroms: x=wig[chrom][0] y=wig[chrom][1] # if the direction is considered, include strand info from the BED if self.dir: try: bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], bed[chrom]['strand']) except KeyError: dummy=['' for i in xrange(len(bed[chrom]['start']))] bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], dummy) else: dummy=['' for i in xrange(len(bed[chrom]['start']))] bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], dummy) # to reduce the search time, the initial search point is updated everytime. #init = 0 for begin, cease, strand in bediter: # get the center, right edge and left edge for search center = (begin+cease) /2 left = center -self.span - hf right = center + self.span + hf n = int(right-left)/step # get the region from the wig, binning #start, end= corelib.where2(left, right, x[init:], wigint) #start, end = corelib.where(left, right, x[init:]) #regionx = x[init+start:init+end] #regiony = y[init+start:init+end] # test if updating the initial search point everytime might have caused a problem start, end = corelib.where(left, right, x) regionx = x[start:end] regiony = y[start:end] if regionx and regiony: bins = corelib.linspace(left, right, n+1) this = corelib.binxy_equibin(bins, regionx, regiony, binfunc='middle', NaN=False) if strand == '-': this.reverse() else: #this = [0] * n this = [] # save the binned signal binned.append(this) # update initial search point #init+=start return binned def estimate_wig_interval(self, wig): """Estimate the interval between two consecutive points. This method is exactly the same as estimate_wig_interval function in inout.py. This methods select randomly 10 regions in each chromosome and take the median of two consecutive intervals. """ chroms = wig.get_chroms() if not chroms: return None n_random_positions = 10 intervals = [] for chrom in chroms: len_this_chr = len(wig[chrom][0]) a = corelib.randints(0, len_this_chr - 2, 2 * n_random_positions) # we need at least two element array to get difference a.sort() starts = [a[i] for i in xrange(len(a)) if i%2 == 0] ends = [a[i] + 2 for i in xrange(len(a)) if i%2 == 1]# we need at least two element array to get difference for start, end in itertools.izip(starts, ends): intervals.append(corelib.median(corelib.diff(wig[chrom][0][start:end]))) return corelib.median(intervals) def get_breaks(self, start, end): """Return breaks for bins Parameters: 1. start: the start value 2. end: the end value. This end value is included in the resulting breaks. """ step = self.step n = (end-start) / step + 1 breaks = map(lambda x: int(round(x)), corelib.linspace(start, end, n)) return breaks def profile(self, wig, bed): """Wrapper function of WigProfilewBed. Through this function, the user can set the parameters for profiling and get a list of profiles of the regions. """ start = -1 * self.span end = self.span return self.capture_regions(wig, bed) # ------------------------------------ # functions # ------------------------------------ def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog <-w wig -b bed> [options]" description = "sitepro -- Average profile around given genomic sites" # option processor optparser = OptionParser(version="%prog 0.6.6 (package version 1.0.2)",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-w","--wig",dest="wig",type="string", action="append",\ help="input WIG file. WARNING: both fixedStep and variableStep WIG formats are accepted. Multiple WIG files can be given via -w (--wig) individually (eg -w WIG1.wig, -w WIG2.wig). WARNING! multiple wig and bed files are not allowed.") optparser.add_option("-b","--bed",dest="bed",type="string", action="append",\ help="BED file of regions of interest. (eg, binding sites or motif locations) Multiple BED files can be given via -b (--bed) individually (eg -b BED1.bed -b BED2.bed). WARNING! multiple wig and bed files are not allowed.") optparser.add_option("--span",dest="span",type="int",\ help="Span from the center of each BED region in both directions(+/-) (eg, [c - span, c + span], where c is the center of a region), default:1000 bp", default=1000) optparser.add_option("--pf-res", dest="pf_res", type="int",\ help="Profiling resolution, default: 50 bp", default=50) optparser.add_option("--dir",action="store_true",dest="dir",\ help="If set, the direction (+/-) is considered in profiling. If no strand info given in the BED, this option is ignored.",default=False) optparser.add_option("--dump",action="store_true",dest="dump",\ help="If set, profiles are dumped as a TXT file",default=False) optparser.add_option("--name",dest="name",type="string", help="Name of this run. If not given, the body of the bed file name will be used,") optparser.add_option("-l","--label",dest="label",type="string", action="append",\ help="Labels of the wig files. If given, they are used as the legends of the plot and in naming the TXT files of profile dumps; otherwise, the WIG file names will be used as the labels. Multiple labels can be given via -l (--label) individually (eg, -l LABEL1 -l LABEL2). WARNING! The number and order of the labels must be the same as the WIG files.", default=None) #optparser.add_option("--log",action="store_true",dest="log",\ # help="If set, a log file is recorded in the current working directory.",default=False) return optparser def opt_validate (optparser): """Validate options from a OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # input BED file and GDB must be given if not (options.wig and options.bed): optparser.print_help() sys.exit(1) else: if len(options.wig) > 1 and len(options.bed) > 1: error("Either a single BED file and multiple WIG files or multiple BED files and a single WIG file are allowed.") sys.exit(1) # split the wig file names if options.wig: for wig in options.wig: if not os.path.isfile(wig): error("Check -w (--wig). No such file as '%s'" %wig) sys.exit(1) if options.bed: for bed in options.bed: if not os.path.isfile(bed): error('Check -b (--bed). No such file exists:%s' %bed) sys.exit(1) # get namename if not options.name: #options.name=os.path.split(options.bed)[-1].rsplit('.bed',2)[0] options.name="sitepro_%s" %(time.strftime("%Y.%b.%d.%H-%M-%S", time.localtime())) # get the aliases if len(options.wig) > 1: if options.label: if len(options.label) != len(options.wig): error("The number and order of the labels must be the same as the WIG files. Check -w and -l options.") sys.exit(1) else: options.label = map(lambda x: os.path.split(x)[1].rsplit('.wig')[0], options.wig) elif len(options.bed) > 1: if options.label: if len(options.label) != len(options.bed): error("The number and order of the labels must be the same as the BED files. Check -b and -l options.") sys.exit(1) else: options.label = map(lambda x: os.path.split(x)[1].rsplit('.bed')[0], options.bed) else: # when given very standard inputs, one bed and one wig if options.label: if len(options.label) != 1: error("Only one label must be given with one BED and one WIG. Check -l option.") sys.exit(1) else: options.label = [os.path.split(options.wig[0])[1].rsplit('.wig')[0]] # print arguments options.argtxt = "# ARGUMENTS:\n" options.argtxt += "\n".join(("# name: %s" %options.name, "# BED file(s): %s" %str(options.bed)[1:-1], "# WIG file(s): %s" %str(options.wig)[1:-1], "# span: %s bp" %str(options.span), "# resolution: %s bp" %str(options.pf_res), "# direction (+/-): %s" %("ON"*options.dir + "OFF"*(not options.dir)))) # if logfile is written. #if options.log: # logf = open(options.name + ".log", 'w') # logf.write(options.argtxt + "\n") # logf.close() return options def draw_siteprofiles(sitebreaks, avg_siteprof): """Return a R script that draws the average profile on the given sites""" #comment R.comment('') R.comment('Draw wig profiles around binding sites') R.comment('') if avg_siteprof == None: trim_siteprof = [0]*len(sitebreaks) else: trim_siteprof = avg_siteprof minlen = min([len(sitebreaks), len(trim_siteprof)]) rscript=inout.draw_single_profile(sitebreaks[:minlen], trim_siteprof[:minlen],col=["red"],main='Average Profile around the Center of Sites',xlab='Relative Distance from the Center (bp)',ylab='Average Profile',ylim=[],v=0) return rscript def draw_multiple_siteprofiles(sitebreaks, avg_siteprofs, legends): """Return a R script that draws multiple average profiles on the given sites""" #comment R.comment('') R.comment('Draw multiple wig profiles around binding sites') R.comment('') # handle if None is in the avg_siteprofs trim_siteprofs = [] for siteprof in avg_siteprofs: if siteprof == None: temp = [0]*len(sitebreaks) else: temp = siteprof trim_siteprofs.append(temp) minlen = min([len(sitebreaks), min(map(len, trim_siteprofs))]) trim_siteprofs = map(lambda x: x[:minlen], trim_siteprofs) rscript=inout.draw_multiple_profiles2(sitebreaks[:minlen], trim_siteprofs, cols=[], main='Average Profiles around the Center of Sites',xlab='Relative Distance from the Center (bp)',ylab='Average Profile',ylim=[],v=0, legends=legends) return rscript def dump(chrom, sites, siteprofs): """Dump the sites and their profiles in a long string """ starts = sites['start'] ends = sites['end'] # if no names are allowed, put no names for the output try: names = sites['name'] except KeyError: names = [''] * len(starts) txt = '' for start, end, name, siteprof in itertools.izip(starts, ends, names, siteprofs): s = map(str, siteprof) # print out the regions if name == '': txt += "%s\t%d\t%d\t%s\n" %(chrom, start, end, ','.join(s)+',') else: txt += "%s\t%d\t%d\t\t%s\t%s\n" %(chrom, start, end, name, ','.join(s)+',') return txt def catBEDs(BEDlist): """mergeBEDs merges multiple BED files in BEDlist into a one BED object. In this case, a unique ID is given to the regions of each BED object to be merged """ cBED = inout.Bed() cBED.bed = {} IDs = xrange(len(BEDlist)) for BED, ID in itertools.izip(BEDlist, IDs) : chroms = BED.get_chroms() for chrom in chroms: # if the first bed to this chromosome, initialize if not cBED.has_key(chrom): cBED[chrom] = {'start': array('l', []), 'end': array('l', []), 'name': [], \ 'score': array('d', []), 'strand':[]} # chromosome length chrom_len = len(BED[chrom]['start']) # copy into the merged BED object cBED[chrom]['start'].extend(BED[chrom]['start']) cBED[chrom]['end'].extend(BED[chrom]['end']) cBED[chrom]['name'].extend([str(ID)] * chrom_len) cBED[chrom]['score'].extend([0] * chrom_len) try: cBED[chrom]['strand'].extend(BED[chrom]['strand']) except KeyError: cBED[chrom]['strand'].extend([''] * chrom_len) return cBED # ------------------------------------ # Main function # ------------------------------------ def main(): # read the options and validate them options=opt_validate(prepare_optparser()) info ("\n" + options.argtxt) # reading a gene annotation table jobcount=1 # read regions of interest (bed file) info("#%d read the bed file(s) of regions of interest..." %jobcount) Siteslist = [] for bed in options.bed: BED = inout.Bed() BED.read(bed) BED.sort() Siteslist.append(BED) # cat the BEd files to one and give a unique ID to the regions of each BED #Sites = catBEDs(Siteslist) # delete the individual BEDs #del Siteslist # sort for profiling #Sites.sort() jobcount += 1 # create a profiler object profwbed=WigProfilerwBed(span=options.span, step=options.pf_res, dir=options.dir) # get the breaks sitebreaks = profwbed.get_breaks(-1*options.span, options.span) # initialize avg_siteprofs with empty list super_avg_siteprofs = [] # run multiple wig files wigl = 0 for wig in options.wig: # Doing profiling chrom='' chrcount=1 FIRST=True avg_siteprofs = map(lambda x: [], Siteslist) avg_spcounts = map(lambda x: [], Siteslist) fixedStep = False info( "#%d run sitepro on %s..." %(jobcount, os.path.split(wig)[-1])) for line in open(wig,'r').xreadlines(): if not line: continue # read a chromosome if re.search(r'track',line): try: description=re.search(r'description="(\w+)"\s',line).group(1) except AttributeError: pass continue # check if fixedStep or variableStep if re.search(r'fixedStep', line): fixedStep = True step = int(re.search(r'step=(\S+)\s', line).group(1)) position = int(re.search(r'start=(\S+)\s', line).group(1)) if re.search(r'chrom=(\S+)\s',line): newchrom=re.search(r'chrom=(\S+)\s',line).group(1) try: newchrom=inout.standard_chroms[newchrom] except KeyError: pass continue l=line.strip().split() # the beginning if chrom=='' and chrom!=newchrom: chrom=newchrom # if this chromosome is not in chromosome list of the BED, just ignore it info("#%d-%d read and process %s..." %(jobcount, chrcount, chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the new line to the Wig object input.add_line(chrom, row) chrcount += 1 elif chrom!='' and chrom!=newchrom: # new chromosome #for Sites, avg_siteprof, avg_spcount in itertools.izip(Siteslist, avg_siteprofs, avg_spcounts): for i in xrange(len(Siteslist)): chroms_bed = Siteslist[i].get_chroms() if chrom in chroms_bed: # wig profiling for given regions of interest siteprofs=profwbed.profile(input, Siteslist[i]) avg_sp, spcount=corelib.mean_col_by_col(siteprofs, counts=True) if not FIRST: # if not first chromosome # average site profiles avg_siteprofs[i],avg_spcounts[i]=corelib.weight_mean_col_by_col([avg_siteprofs[i],avg_sp],[avg_spcounts[i],spcount],counts=True) # if --dump, dump the profiles along with their corresponding BED regions if options.dump: nm = options.label[wigl + i] + '_dump.txt' dfhd = open(nm, 'a') dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs)) dfhd.close() del avg_sp,spcount else: # if first chromosome avg_siteprofs[i]=avg_sp avg_spcounts[i]=spcount # open a txt file to dump profiles if --dump is set if options.dump: nm = options.label[wigl + i] + '_dump.txt' dfhd = open(nm, 'w') dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs)) dfhd.close() # de1lete unnucessary variables to maximize usable memory del siteprofs # set chrom to the new chromosome if FIRST: FIRST = False chrom=newchrom info("#%d-%d read and process %s..." %(jobcount, chrcount, chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the new line to the Wig object input.add_line(chrom, row) chrcount+=1 else: # in the middle of chromosome # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the new line to the Wig object input.add_line(chrom, row) # the last chromosome #for Sites, avg_siteprof, avg_spcount in xrange(len(Siteslist)): #itertools.izip(Siteslist, avg_siteprofs, avg_spcounts): for i in xrange(len(Siteslist)): #itertools.izip(Siteslist, avg_siteprofs, avg_spcounts): chroms_bed = Siteslist[i].get_chroms() if chrom in chroms_bed: # doing profiling! siteprofs=profwbed.profile(input, Siteslist[i]) avg_sp,spcount=corelib.mean_col_by_col(siteprofs,counts=True) if not FIRST: # the first chromosome profiling avg_siteprofs[i],avg_spcounts[i] = corelib.weight_mean_col_by_col([avg_siteprofs[i],avg_sp],[avg_spcounts[i],spcount],counts=True) # if --dump, dump the profiles along with their corresponding BED regions if options.dump: nm = options.label[wigl+i] + '_dump.txt' dfhd = open(nm, 'a') dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs)) dfhd.close() del avg_sp,spcount else: # average site profiles avg_siteprofs[i]=avg_sp avg_spcounts[i]=spcount # if --dump, dump the profiles along with their corresponding BED regions if options.dump: nm = options.label[wigl+i] + '_dump.txt' dfhd = open(nm, 'w') dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs)) dfhd.close() # delete unnecessary variables del siteprofs #if options.dump: # dfhd.close() #print avg_siteprofs # save the site profile of current wig file super_avg_siteprofs.extend(avg_siteprofs) jobcount+=1 wigl += 1 # write the R script info('#%d writing R script of profiling...' %jobcount) ofhd=open(options.name+'.R','w') pdfname=options.name+'.pdf' rscript=R.pdf(pdfname, height=6, width=8.5) # if single profile; otherwise multiple profiles if len(options.wig)==1 and len(options.bed) == 1: rscript += draw_siteprofiles(sitebreaks, super_avg_siteprofs[0]) else: #print super_avg_siteprofs rscript += draw_multiple_siteprofiles(sitebreaks, super_avg_siteprofs, options.label) ofhd.write(rscript) # write wig profiling ofhd.write(R.devoff()) ofhd.close() jobcount+=1 # Run R directly - if any exceptions, just pass try: p = subprocess.Popen("R" + " --vanilla < %s" %(options.name+'.R'), shell=True) sts = os.waitpid(p.pid, 0) if options.dump: info ('#... cong! See %s for the graphical result of sitepro and %s for the dumped profiles!' %(options.name+'.pdf', options.name+'.txt')) else: info ('#... cong! See %s for the graphical result of sitepro!' % (options.name+'.pdf')) except: info ('#... cong! Run %s using R for the graphical result of sitepro! sitepro could not run R directly.' %(options.name+'.R')) # program running if __name__ == '__main__': try: main() except KeyboardInterrupt: warn("User interrupts me! ;-) See you!") sys.exit(0)