#!/usr/bin/python2 """Module Description Copyright (c) 2009 H. Gene Shin This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: H. Gene Shin @contact: shin@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import re import logging import operator import itertools import subprocess import warnings from optparse import OptionParser import CEAS.inout as inout import CEAS.R as R import CEAS.annotator as annotator import CEAS.sampler as sampler import CEAS.profiler as profiler import CEAS.tables as tables import CEAS.corelib as corelib #from CEAS.inout import MYSQL # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Main function # ------------------------------------ def main(): # read the options and validate them options=opt_validate(prepare_optparser()) # print out the options info("\n" + options.argtxt) # read the gene annotation table jobcount=1 info("#%d read the gene table..." %jobcount) GeneT = inout.GeneTable() try: GeneT.read(Host = options.Host, User= options.User, Db=options.gdb, annotation='GeneTable', \ columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds', 'name2')) except Exception, e: # if 'name2' does not exist and the user wants to use 'name2', error; otherwise, if re.search(r'column.*name2', str(e)): if options.name2: error("The gene annotation table does not have 'name2.' Only gene IDs of 'name' can be used in wig profiling of the sub-groups of genes given though --gn-groups.") sys.exit(1) else: GeneT.read(Host = options.Host, User= options.User, Db=options.gdb, annotation='GeneTable', \ columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds')) else: raise GeneT.sort() chroms_GeneT=GeneT.get_chroms() chroms_GeneT=filter_chroms(chroms_GeneT,'_[A-Za-z0-9]*') # determine the metagenesize, concated exon size, concatenated intron size if options.metagene_size: options.catexon_size, options.catintron_size = corelib.find_nearest_multiple(options.metagene_size/2, options.pf_res), corelib.find_nearest_multiple(options.metagene_size/2, options.pf_res) else: options.metagene_size = return_med_gene(GeneT, options.pf_res) options.catexon_size, options.catintron_size = return_med_catexon_catintron(GeneT, options.pf_res) # get the exon and intron sizes to consider in the average profiling options = determine_exon_intron_sizes(GeneT, options) jobcount+=1 if options.chipannot: # read ChIP regions info("#%d read the bed file of ChIP regions..." %jobcount) ChIP=inout.Bed() ChIP.read(options.bed) ChIP.sort() # test if the ChIP BED is valid or not. test_result = inout.test_if_valid_BED( ChIP ) if test_result != None: error( 'ChIP BED file format is not valid! %s' %test_result ) sys.exit(1) Csampler=sampler.ChIPSampler() ChIPSamp=Csampler.sample(ChIP,resolution=options.chip_res) jobcount+=1 # do gene-centered annotation info('#%d perform gene-centered annotation...' %jobcount) GAnnotator=annotator.GeneAnnotator() GAnnotator.annotate(GeneT, ChIP, u=options.span, d=options.span) GAnnotator.write(options.name, description=True) jobcount+=1 info('#%d See %s for gene-centered annotation!' %(jobcount, options.name+'.xls')) jobcount+=1 # read regions of interest if it is given if options.chipannot and options.ebedannot: info("#%d read the bed file of regions of interest..." %jobcount) roi=inout.Bed() roi.read(options.ebed) jobcount+=1 else: roi=None # if background annotation is not being run. if options.chipannot and not options.rebg: # iterate through chromosomes of the gene table info("#%d read the pre-computed genome bg annotation..." %jobcount) try: GenomeBGS=tables.SummaryGBG(name='GenomeBGS') GenomeBGS.readdb(Db=options.gdb) GenomePieP = tables.PieP( name='GenomePieP' ) GenomePieP.readdb( Db=options.gdb ) except: error("the pre-computed genome bg annotation is required for ChIP annotation. Use the gene annotation table files provided by CEAS or add a WIG file and set --bg.") sys.exit(1) GP=_interpoloate_gbg(options.gdb,options.sizes,options.bisizes) # interpolation chroms_bg=GP.get_chroms() jobcount+=1 # if any regions of interest are given if options.ebedannot: GP=_get_bgroi(GP,GenomeBGS,roi=roi,bg_res=options.bg_res) # annotate ChIP regions info('#%d perform ChIP region annotation...' %jobcount) Annot=annotator.Annotator() ChIPA=Annot.annotate(genome_coordinates=ChIPSamp,gene_table=GeneT,roi=roi,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5) ) CS,CP=Annot.summarize(ChIPA) CES, CEP = Annot.obtain_distribution_of_sites(ChIPA) # make the table complete with missing chromsomes, if there are annotator.make_table_complete(CS, chroms_bg) annotator.make_table_complete(CP, chroms_bg) # get the pvalues CPval=annotator.estimate_pvals(GP,CS,CP) jobcount+=1 # open outfile info('#%d write a R script of ChIP region annotation...' %jobcount) ofhd=open(options.name+'.R','w') pdfname=options.name+'.pdf' # the first part of CEAS R script. Because wig profiling is not run, just terminate rscript = options.argtxt rscript += "\n" rscript += R.pdf(pdfname,height=11.5,width=8.5) rscript += inout.draw_CEAS(GP,CP,CPval,bg_res=options.bg_res,chip_res=options.chip_res,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5)) ofhd.write(rscript) # write ChIP region annotation # write the pie chart GEP = GenomePieP.export2dic() rscript = inout.draw_pie_distribution_of_elements(GEP, CEP, gprom=(1000, 2000, 3000), gdown=(1000, 2000, 3000), prom=options.sizes, down=options.sizes) ofhd.write(rscript) # draw ChIP regions over chromosomes if no wig profiling is going on if not options.pf: # if no scores are given, just put 1 as dummy scores if not inout.check_if_yes_score(ChIP): inout.fill_up_scores_w_val(ChIP, val = 1) rscript = inout.draw_ChIP_over_genome_mono_col(ChIP, None, n_peaks=options.n_peaks) ofhd.write(rscript) jobcount+=1 # when wig profiling is running if options.pf: if options.rebg: GenomeBGS=tables.Summary(name='GenomeBGS') GenomePieS= tables.PieSummary( name='GenomePieS' ) # if gene groups are give if options.gn_groups: #subsets=inout.read_gene_subsets(options.gn_groups) subsets = inout.read_gene_subsets2(options.gn_groups) chrom='' chrcount=1 prof=profiler.WigProfiler2(rel_dist=options.rel_dist, step=options.pf_res, metagenesize=options.metagene_size, \ catexonsize=options.catexon_size, catintronsize=options.catintron_size, metaexonsizes=options.metaexonsizes, \ metaintronsizes=options.metaintronsizes, elowers=options.elowers, euppers=options.euppers, ilowers=options.ilowers, \ iuppers=options.iuppers) wigsize = {} swig=inout.Wig() ws = sampler.WigSamplerFast() FIRST=True fixedStep = False # variableStep == True: variableStep Wig, False: fixedStep for line in open(options.wig,'r').xreadlines(): if not line: continue # when meet 'track', continue to the next line if re.search(r'track',line): try: description=re.search(r'description="(\w+)"\s',line).group(1) except AttributeError: pass continue # check if fixedStep or variableStep if re.search(r'fixedStep', line): fixedStep = True step = int(re.search(r'step=(\S+)\s', line).group(1)) position = int(re.search(r'start=(\S+)\s', line).group(1)) # get the chromosome if re.search(r'chrom=(\S+)\s', line): newchrom=re.search(r'chrom=(\S+)\s',line).group(1) try: newchrom=inout.standard_chroms[newchrom] except KeyError: pass continue # split the line l=line.strip().split() # the beginning if chrom=='' and chrom!=newchrom: # if the chromosome is not in gene table, continue chrom=newchrom if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, a wig object is initiated. if options.rebg: info("#%d-%d run wig profiling and genome bg annotation of %s..." %(jobcount,chrcount,chrom)) else: info("#%d-%d run wig profiling of %s..." %(jobcount,chrcount,chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the new line to the Wig object input.add_line(chrom, row) chrcount+=1 elif chrom!='' and chrom!=newchrom: # new chromosome if chrom in chroms_GeneT: # do genome BG annotation if options.rebg: Sampler=sampler.GenomeSampler() Annotator=annotator.Annotator() GA=Annotator.annotate(Sampler.sample(input,resolution=options.bg_res),GeneT,roi=roi,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5)) tempS,tempP=Annotator.summarize(GA) GenomeBGS.add_row(chrom,tempS.get_row(chrom)) tempPieSt = Annotator.obtain_distribution_of_sites_per_chrom(GA) tempPieS = tables.PieSummary() tempPieS.import2tb( tempPieSt ) GenomePieS.add_row( chrom, tempPieS.get_row( chrom ) ) # update ChIP regions with WIG scores if options.chipannot: profiler.scan_scores_in_wig(ChIP, input) try: wigsize[chrom] = (input[chrom][0][0], input[chrom][0][-1]) except IndexError: wigsize[chrom] = (0, 0) # wig profiling profiles = prof.profile(input, GeneT) # get average of this chromosome avg_up,upcount=corelib.mean_col_by_col(profiles['upstreams'], counts=True) avg_down,downcount=corelib.mean_col_by_col(profiles['downstreams'], counts=True) avg_mg,genecount=corelib.mean_col_by_col(profiles['genes'], counts=True) avg_mce,cexoncount=corelib.mean_col_by_col(profiles['catexons'], counts=True) avg_mci,cintroncount=corelib.mean_col_by_col(profiles['catintrons'],counts=True) # get average of exon and introns of this chromosome n_metas = len(profiles['exons']) outs = map(corelib.mean_col_by_col, map(corelib.extend_list_series, profiles['exons']), [True] * len(profiles['exons'])) avg_me = map(operator.itemgetter(0), outs) exoncount = map(operator.itemgetter(1), outs) outs = map(corelib.mean_col_by_col, map(corelib.extend_list_series, profiles['introns']), [True] * len(profiles['introns'])) avg_mi = map(operator.itemgetter(0), outs) introncount = map(operator.itemgetter(1), outs) del outs if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: options.out['upstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'])) options.out['downstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreams'])) options.out['genes'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'])) else: options.out['upstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'])) options.out['downstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'])) options.out['genes'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'])) if not FIRST: # if not first chromosome avg_upstream,avg_upcount=corelib.weight_mean_col_by_col([avg_upstream,avg_up],[avg_upcount,upcount],counts=True) avg_downstream,avg_downcount=corelib.weight_mean_col_by_col([avg_downstream,avg_down],[avg_downcount,upcount],counts=True) avg_metagene,avg_genecount=corelib.weight_mean_col_by_col([avg_metagene,avg_mg],[avg_genecount,genecount],counts=True) avg_metacatexon,avg_cexoncount=corelib.weight_mean_col_by_col([avg_metacatexon,avg_mce],[avg_cexoncount,cexoncount],counts=True) avg_metacatintron,avg_cintroncount=corelib.weight_mean_col_by_col([avg_metacatintron,avg_mci],[avg_cintroncount,cintroncount],counts=True) outs = map(corelib.weight_mean_col_by_col, map(lambda x, y: [x, y], avg_metaexon, avg_me), map(lambda x, y: [x, y], avg_exoncount, exoncount), [True] * n_metas) avg_metaexon = map(operator.itemgetter(0), outs) avg_exoncount = map(operator.itemgetter(1), outs) outs = map(corelib.weight_mean_col_by_col, map(lambda x, y: [x, y], avg_metaintron, avg_mi), map(lambda x, y: [x, y], avg_introncount, introncount), [True] * n_metas) avg_metaintron = map(operator.itemgetter(0), outs) avg_introncount = map(operator.itemgetter(1), outs) del outs, avg_me, avg_mi, exoncount, introncount del avg_up,avg_down,avg_mg,avg_mce,avg_mci,upcount,downcount,genecount,cexoncount,cintroncount if options.gn_groups: # when gene sub-gropus are given if options.name2: ixs, subsets = profiler.get_gene_indicies(profiles['name2'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name2'], subsets) else: ixs, subsets = profiler.get_gene_indicies(profiles['name'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name'], subsets) avg_ups, upcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['upstreams']) avg_downs, downcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['downstreams']) avg_mgs, gcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['genes']) avg_mces, cecs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catexons']) avg_mcis, cics = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catintrons']) # exon and intron profile avg_mes, ecs = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['exons']) avg_mis, ics = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['introns']) avg_upstreams,avg_upcounts=profiler.weight_mean_profiles_chr_by_chr(avg_upstreams,avg_upcounts,avg_ups,upcs) avg_downstreams,avg_downcounts=profiler.weight_mean_profiles_chr_by_chr(avg_downstreams,avg_downcounts,avg_downs,downcs) avg_metagenes,avg_genecounts=profiler.weight_mean_profiles_chr_by_chr(avg_metagenes,avg_genecounts,avg_mgs,gcs) avg_metacatexons,avg_cexoncounts=profiler.weight_mean_profiles_chr_by_chr(avg_metacatexons,avg_cexoncounts,avg_mces,cecs) avg_metacatintrons,avg_cintroncounts=profiler.weight_mean_profiles_chr_by_chr(avg_metacatintrons,avg_cintroncounts,avg_mcis,cics) # exon and intron profiling avg_metaexons, avg_exoncounts = profiler.weight_mean_profiles_chr_by_chr_meta(avg_metaexons, avg_exoncounts, avg_mes, ecs) avg_metaintrons, avg_introncounts = profiler.weight_mean_profiles_chr_by_chr_meta(avg_metaintrons, avg_introncounts, avg_mis, ics) del avg_mes,avg_mis, ecs,ics del avg_ups,avg_downs,avg_mgs,avg_mces,avg_mcis,upcs,downcs,gcs,cecs,cics if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreamss'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: # if first chromosome avg_upstream = avg_up avg_downstream = avg_down avg_metagene = avg_mg avg_metacatexon = avg_mce avg_metacatintron = avg_mci avg_upcount = upcount avg_downcount = downcount avg_genecount = genecount avg_cexoncount = cexoncount avg_cintroncount = cintroncount avg_metaexon = avg_me avg_metaintron = avg_mi avg_exoncount=exoncount avg_introncount=introncount if options.gn_groups: # if name2 is used instead of name if options.name2: ixs, subsets = profiler.get_gene_indicies(profiles['name2'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name2'], subsets) else: ixs, subsets = profiler.get_gene_indicies(profiles['name'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name'], subsets) avg_upstreams, avg_upcounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['upstreams']) avg_downstreams, avg_downcounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['downstreams']) avg_metagenes, avg_genecounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['genes']) avg_metacatexons, avg_cexoncounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catexons']) avg_metacatintrons, avg_cintroncounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catintrons']) avg_metaexons, avg_exoncounts = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['exons']) avg_metaintrons, avg_introncounts = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['introns']) if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) FIRST=False ##del profiles # set chrom to the new chromosome chrom=newchrom if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, a wig object is initiated. if options.rebg: info("#%d-%d run wig profiling and genome bg annotation of %s..." %(jobcount,chrcount,chrom)) else: info("#%d-%d run wig profiling of %s..." %(jobcount,chrcount,chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the line to the Wig object input.add_line(chrom, row) chrcount+=1 else: # in the middle of chromosome if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, the wig object is updated. # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the line to the Wig object input.add_line(chrom, row) # do profiling for the last chromosome if chrom in chroms_GeneT: if options.rebg: Sampler=sampler.GenomeSampler() Annotator=annotator.Annotator() GA=Annotator.annotate(Sampler.sample(input,resolution=options.bg_res),GeneT,roi=roi,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5)) tempS,tempP=Annotator.summarize(GA) GenomeBGS.add_row(chrom,tempS.get_row(chrom)) tempPieSt = Annotator.obtain_distribution_of_sites_per_chrom(GA) tempPieS = tables.PieSummary() tempPieS.import2tb( tempPieSt ) GenomePieS.add_row( chrom, tempPieS.get_row( chrom ) ) # if extra bed file exists, get the bg statistics of the regions if options.chipannot and options.ebedannot: GP=_get_bgroi(GP,GenomeBGS,roi=roi,bg_res=options.bg_res) # update ChIP regions with wig scores if options.chipannot: profiler.scan_scores_in_wig(ChIP, input) try: wigsize[chrom] = (input[chrom][0][0], input[chrom][0][-1]) except IndexError: wigsize[chrom] = (0, 0) # profiling profiles = prof.profile(input, GeneT) del input # get average of this chromosome avg_up,upcount = corelib.mean_col_by_col(profiles['upstreams'], counts=True) avg_down,downcount = corelib.mean_col_by_col(profiles['downstreams'], counts=True) avg_mg,genecount = corelib.mean_col_by_col(profiles['genes'], counts=True) avg_mce,cexoncount = corelib.mean_col_by_col(profiles['catexons'], counts=True) avg_mci,cintroncount = corelib.mean_col_by_col(profiles['catintrons'],counts=True) # get average of exons and introns of this chromosome n_metas = len(profiles['exons']) outs = map(corelib.mean_col_by_col, map(corelib.extend_list_series, profiles['exons']), [True] * len(profiles['exons'])) avg_me = map(operator.itemgetter(0), outs) exoncount = map(operator.itemgetter(1), outs) outs = map(corelib.mean_col_by_col, map(corelib.extend_list_series, profiles['introns']), [True] * len(profiles['introns'])) avg_mi = map(operator.itemgetter(0), outs) introncount = map(operator.itemgetter(1), outs) if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: options.out['upstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'])) options.out['downstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreams'])) options.out['genes'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'])) else: options.out['upstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'])) options.out['downstreams'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'])) options.out['genes'].write(profiler.dump(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'])) if not FIRST: # the first chromosome profiling avg_upstream,avg_upcount=corelib.weight_mean_col_by_col([avg_upstream,avg_up],[avg_upcount,upcount],counts=True) avg_downstream,avg_downcount=corelib.weight_mean_col_by_col([avg_downstream,avg_down],[avg_downcount,upcount],counts=True) avg_metagene,avg_genecount=corelib.weight_mean_col_by_col([avg_metagene,avg_mg],[avg_genecount,genecount],counts=True) avg_metacatexon,avg_cexoncount=corelib.weight_mean_col_by_col([avg_metacatexon,avg_mce],[avg_cexoncount,cexoncount],counts=True) avg_metacatintron,avg_cintroncount=corelib.weight_mean_col_by_col([avg_metacatintron,avg_mci],[avg_cintroncount,cintroncount],counts=True) # get the average of exon and introns of all together outs = map(corelib.weight_mean_col_by_col, map(lambda x, y: [x, y], avg_metaexon, avg_me), map(lambda x, y: [x, y], avg_exoncount, exoncount), [True] * n_metas) avg_metaexon = map(operator.itemgetter(0), outs) avg_exoncount = map(operator.itemgetter(1), outs) outs = map(corelib.weight_mean_col_by_col, map(lambda x, y: [x, y], avg_metaintron, avg_mi), map(lambda x, y: [x, y], avg_introncount, introncount), [True] * n_metas) avg_metaintron = map(operator.itemgetter(0), outs) avg_introncount = map(operator.itemgetter(1), outs) del outs,avg_me,avg_mi,exoncount,introncount del avg_up,avg_down,avg_mg,avg_mce,avg_mci,upcount,downcount,genecount,cexoncount,cintroncount if options.gn_groups: # when name2 is used instead of name of the gene annotation table if options.name2: ixs, subsets = profiler.get_gene_indicies(profiles['name2'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name2'], subsets) else: ixs, subsets = profiler.get_gene_indicies(profiles['name'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name'], subsets) # take an average of each profile (upstream, downstream, gene, exon, intron, cat-exon, cat-intron avg_ups, upcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['upstreams']) avg_downs, downcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['downstreams']) avg_mgs, gcs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['genes']) avg_mces, cecs = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catexons']) avg_mcis, cics = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catintrons']) # take averages of exons and introns of all the chromosomes avg_mes, ecs = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['exons']) avg_mis, ics = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['introns']) avg_upstreams,avg_upcounts=profiler.weight_mean_profiles_chr_by_chr(avg_upstreams,avg_upcounts,avg_ups,upcs) avg_downstreams,avg_downcounts=profiler.weight_mean_profiles_chr_by_chr(avg_downstreams,avg_downcounts,avg_downs,downcs) avg_metagenes,avg_genecounts=profiler.weight_mean_profiles_chr_by_chr(avg_metagenes,avg_genecounts,avg_mgs,gcs) avg_metacatexons,avg_cexoncounts=profiler.weight_mean_profiles_chr_by_chr(avg_metacatexons,avg_cexoncounts,avg_mces,cecs) avg_metacatintrons,avg_cintroncounts=profiler.weight_mean_profiles_chr_by_chr(avg_metacatintrons,avg_cintroncounts,avg_mcis,cics) avg_metaexons, avg_exoncounts = profiler.weight_mean_profiles_chr_by_chr_meta(avg_metaexons, avg_exoncounts, avg_mes, ecs) avg_metaintrons, avg_introncounts = profiler.weight_mean_profiles_chr_by_chr_meta(avg_metaintrons, avg_introncounts, avg_mis, ics) del avg_mes,avg_mis,ecs,ics del avg_ups,avg_downs,avg_mgs,avg_mces,avg_mcis,upcs,downcs,gcs,cecs,cics if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: avg_upstream=avg_up avg_downstream=avg_down avg_metagene=avg_mg avg_metacatexon=avg_mce avg_metacatintron=avg_mci avg_upcount=upcount avg_downcount=downcount avg_genecount=genecount avg_cexoncount=cexoncount avg_cintroncount=cintroncount avg_metaexon=avg_me avg_metaintron=avg_mi avg_exoncount=exoncount avg_introncount=introncount if options.gn_groups: if options.name2: ixs, subsets = profiler.get_gene_indicies(profiles['name2'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name2'], subsets) else: ixs, subsets = profiler.get_gene_indicies(profiles['name'], subsets) #ixs = profiler.get_gene_indicies2(profiles['name'], subsets) avg_upstreams, avg_upcounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['upstreams']) avg_downstreams, avg_downcounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['downstreams']) avg_metagenes, avg_genecounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['genes']) avg_metacatexons, avg_cexoncounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catexons']) avg_metacatintrons, avg_cintroncounts = profiler.select_take_average_profiles_chr_by_chr(ixs, profiles['catintrons']) avg_metaexons, avg_exoncounts = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['exons']) avg_metaintrons, avg_introncounts = profiler.select_take_average_profiles_chr_by_chr_meta(ixs, profiles['introns']) if options.dump: # if dump is on, dump upstream, downstream and genebody if options.name2: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name2'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) else: txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['upstreams'], ixs) write_txts(options.outs['upstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['downstreams'], ixs) write_txts(options.outs['downstreams'], txts) txts = profiler.dump_mult(chrom, profiles['start'], profiles['end'], profiles['name'], profiles['strand'], profiles['genes'], ixs) write_txts(options.outs['genes'], txts) # close the file handlers for dumping the profiles if options.dump: close_dump_files(options.out['upstreams']) close_dump_files(options.out['downstreams']) close_dump_files(options.out['genes']) if options.gn_groups: close_dump_mult_files(options.outs['upstreams']) close_dump_mult_files(options.outs['downstreams']) close_dump_mult_files(options.outs['genes']) jobcount+=1 if options.chipannot and options.rebg: info('#%d perform ChIP region annotation...' %jobcount) # summarize the genome annotation GenomeBGS.summarize() GP=GenomeBGS.get_p() GP.set_name('GenomeBGP') GenomePieS.summarize() GenomePieP = GenomePieS.get_p() GenomePieP.set_name('GenomePieP') # do ChIP annotation Annot=annotator.Annotator() ChIPA=Annot.annotate(genome_coordinates=ChIPSamp,gene_table=GeneT,roi=roi,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5)) CS,CP=Annot.summarize(ChIPA) CPval=annotator.estimate_pvals(GP,CS,CP) CES, CEP = Annot.obtain_distribution_of_sites(ChIPA) jobcount+=1 info('#%d write R script of ChIP region annotation and wig profiling...' %jobcount) ofhd=open(options.name+'.R','w') pdfname=options.name+'.pdf' # the first part of CEAS R script. Because wig profiling is not run, just terminate rscript = options.argtxt rscript += "\n" rscript += R.pdf(pdfname,height=11.5,width=8.5) rscript+=inout.draw_CEAS(GP,CP,CPval,bg_res=options.bg_res,chip_res=options.chip_res,prom=options.sizes,biprom=options.bisizes,down=options.sizes,gene_div=(3,5)) ofhd.write(rscript) # writing ChIP region annotation # write the pie chart GEP = GenomePieP.export2dic() rscript = inout.draw_pie_distribution_of_elements(GEP, CEP, gprom=(1000, 2000, 3000), gdown=(1000, 2000, 3000), prom=options.sizes, down=options.sizes) ofhd.write(rscript) jobcount+=1 # # writing R script for wig profiling # if options.chipannot: # write the ChIP regions over chromosomes rscript = inout.draw_ChIP_over_genome_mono_col(ChIP, wigsize, n_peaks=options.n_peaks) ofhd.write(rscript) info('#%d append an R script of wig profiling...' %jobcount) else: info('#%d write an R script of wig profiling...' %jobcount) ofhd = open(options.name+'.R', 'w') pdfname = options.name + '.pdf' rscript = options.argtxt rscript += "\n" rscript += R.pdf(pdfname, height=11.5, width=8.5) ofhd.write(rscript) # # write a R script of drawing profiles near genes # # get breaks for the plots breaks = profiles['breaks'] metagene_breaks = profiles['genebreaks'] metacatexon_breaks = profiles['catexonbreaks'] metacatintron_breaks = profiles['catintronbreaks'] metaexon_breaks = profiles['exonbreaks'] metaintron_breaks = profiles['intronbreaks'] # write R script if options.gn_groups: # when multiple gene groups are given # append the profiles of all genes avg_upstreams.append(avg_upstream) avg_downstreams.append(avg_downstream) avg_metagenes.append(avg_metagene) avg_metacatexons.append(avg_metacatexon) avg_metacatintrons.append(avg_metacatintron) rscript=inout.draw_profile_plots(breaks,avg_upstreams,avg_downstreams,metagene_breaks,avg_metagenes,metacatexon_breaks,avg_metacatexons,metacatintron_breaks,avg_metacatintrons,metagene_breaks_lim=[-1000,1000],legends=options.gn_names) # exon intron profiling map(lambda x, y: x.append(y), avg_metaexons, avg_metaexon) map(lambda x, y: x.append(y), avg_metaintrons, avg_metaintron) rscript+=inout.draw_exon_intron_profile_plots(metaexon_breaks, avg_metaexons,metaintron_breaks,avg_metaintrons, options.elowers, options.euppers, options.ilowers, options.iuppers, legends=options.gn_names) else: # only when a single master profiling is obtianed rscript=inout.draw_profile_plot(breaks,avg_upstream,avg_downstream,metagene_breaks,avg_metagene,metacatexon_breaks,avg_metacatexon,metacatintron_breaks,avg_metacatintron,metagene_breaks_lim=[-1000,1000]) rscript+=inout.draw_exon_intron_profile_plot(metaexon_breaks,avg_metaexon,metaintron_breaks,avg_metaintron, options.elowers, options.euppers, options.ilowers, options.iuppers) ofhd.write(rscript) # write wig profiling # write to the file and close it ofhd.write(R.devoff()) ofhd.close() # Run R directly - if any exceptions, just pass try: p = subprocess.Popen("R" + " --vanilla < %s" %(options.name+'.R'), shell=True) #sts = os.waitpid(p.pid, 0) p.wait() info ('#... cong! See %s for the graphical results of CEAS!' %(options.name+'.pdf')) except: info ('#... cong! Run %s using R for the graphical results of CEAS! CEAS could not run R directly.' %(options.name+'.R')) # ------------------------------------ # functions # ------------------------------------ def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog < input files > [options]" description = "CEAS (Cis-regulatory Element Annotation System)" optparser = OptionParser(version="%prog -- 0.9.9.7 (package version 1.0.2)",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-b","--bed",dest="bed",type="string", help="BED file of ChIP regions.") optparser.add_option("-w","--wig",dest="wig",type="string", help="WIG file for either wig profiling or genome background annotation. WARNING: --bg flag must be set for genome background re-annotation.") optparser.add_option("-e","--ebed",dest="ebed",type="string", help="BED file of extra regions of interest (eg, non-coding regions)") optparser.add_option("-g","--gt",dest="gdb",type="string", help="Gene annotation table (eg, a refGene table in sqlite3 db format provided through the CEAS web, http://liulab.dfci.harvard.edu/CEAS/download.html).") optparser.add_option("--name",dest="name",\ help="Experiment name. This will be used to name the output files. If an experiment name is not given, the stem of the input BED file name will be used instead (eg, if 'peaks.bed', 'peaks' will be used as a name.)") optparser.add_option("--sizes",dest="sizes",type="str", help="Promoter (also dowsntream) sizes for ChIP region annotation. Comma-separated three values or a single value can be given. If a single value is given, it will be segmented into three equal fractions (ie, 3000 is equivalent to 1000,2000,3000), DEFAULT: 1000,2000,3000. WARNING: Values > 10000bp are automatically set to 10000bp.", default='1000,2000,3000') optparser.add_option("--bisizes",dest="bisizes",type="str", help="Bidirectional-promoter sizes for ChIP region annotation Comma-separated two values or a single value can be given. If a single value is given, it will be segmented into two equal fractions (ie, 5000 is equivalent to 2500,5000) DEFAULT: 2500,5000bp. WARNING: Values > 20000bp are automatically set to 20000bp.", default='2500,5000') optparser.add_option("--bg",action="store_true",dest="bg",\ help="Run genome BG annotation again. WARNING: This flag is effective only if a WIG file is given through -w (--wig). Otherwise, ignored.",default=False) optparser.add_option("--span", dest="span", type="int",\ help="Span from TSS and TTS in the gene-centered annotation. ChIP regions within this range from TSS and TTS are considered when calculating the coverage rates in promoter and downstream, DEFAULT=3000bp", default=3000) optparser.add_option("--pf-res", dest="pf_res", type="int",\ help="Wig profiling resolution, DEFAULT: 50bp. WARNING: Value smaller than the wig interval (resolution) may cause aliasing error.", default=50) optparser.add_option("--rel-dist",dest="rel_dist",type="int", help="Relative distance to TSS/TTS in wig profiling, DEFAULT: 3000bp", default=3000) optparser.add_option("--gn-groups",dest="gn_groups",type="string",\ help="Gene-groups of particular interest in wig profiling. Each gene group file must have gene names in the 1st column. The file names are separated by commas w/ no space (eg, --gn-groups=top10.txt,bottom10.txt)") optparser.add_option("--gn-group-names", dest="gn_names",type="string",\ help="The names of the gene groups in --gn-groups. The gene group names are separated by commas. (eg, --gn-group-names='top 10%,bottom 10%'). These group names appear in the legends of the wig profiling plots. If no group names given, the groups are represented as 'Group 1, Group2,...Group n'.") optparser.add_option("--gname2", action="store_true", dest="name2",\ help="Whether or not use the 'name2' column of the gene annotation table when reading the gene IDs in the files given through --gn-groups. This flag is meaningful only with --gn-groups.",default=False) optparser.add_option("--dump", action="store_true", dest="dump",\ help="Whether to save the raw profiles of near TSS, TTS, and gene body. The file names have a suffix of 'TSS', 'TTS', and 'gene' after the name.",default=False) return optparser def opt_validate (optparser): """Validate options from a OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # if gdb not given, print help, either BED or WIG must be given if not options.gdb and not options.bed and not options.wig: optparser.print_help() sys.exit(1) elif not options.gdb: error('A gene table file must be given through -g (--gt).') sys.exit(1) elif options.gdb and not options.bed and not options.wig: error('Either a BED file or a WIG file must be given.') sys.exit(1) ## # check what inputs are given and determine which modules will operate ## # # check gene annotation table database # #quotes bug fix if options.gdb and'"' in options.gdb: options.gdb = options.gdb.replace('"','') if options.bed and '"' in options.bed: options.bed = options.bed.replace('"','') if options.wig and '"' in options.wig: options.wig = options.wig.replace('"','') if options.ebed and '"' in options.ebed: options.ebed = options.ebed.replace('"','') if options.name and '"' in options.name: options.name = options.name.replace('"','') if options.gn_groups and '"' in options.gn_groups: options.gn_groups = options.gn_groups.replace('"','') if options.gn_names and '"' in options.gn_names: options.gn_names = options.gn_names.replace('"','') HAVELOCALGDB = os.path.isfile(options.gdb) if not HAVELOCALGDB: error("No such gene table file as '%s'" %options.gdb) sys.exit(1) else: options.gdbtype = 'localdb' options.Host = None options.User = None # #check the ChIP bed file # if options.bed: HAVEBED = os.path.isfile(options.bed) if not HAVEBED: error("Check -b (--bed). No such bed file as '%s'" %options.bed) sys.exit(1) if os.path.getsize(options.bed) > 5000000: warnings.warn("ChIP bed file size may be too large to run CEAS with. Make sure it is a 'peak' file!") #error("ChIP bed file size is too big to handle! The file size is limmited to 5M bytes.") #sys.exit(1) else: HAVEBED = False # # check the wig file # if options.wig: HAVEWIG=os.path.isfile(options.wig) if not HAVEWIG: error("Check -w (--wig). No such wig file as '%s'" %options.wig) sys.exit(1) else: HAVEWIG=False # check background annotation BG = options.bg REBG = False PF = False CHIPANNOT = False # # determine do ChIP annotation and re-do genome background annotation # if HAVEBED: CHIPANNOT = True if BG and HAVEWIG: REBG = True # # determine do wig profiling # if HAVEWIG: if HAVELOCALGDB: PF = True # # non-coding regions # EBEDANNOT = False if options.ebed and CHIPANNOT: if not os.path.isfile(options.ebed): error("Check -e (--ebed). No such file as '%s'" %options.ebed) sys.exit(0) else: EBEDANNOT = True # # set options to activate CEAS modules # options.chipannot = CHIPANNOT options.rebg = REBG options.pf = PF options.ebedannot = EBEDANNOT ## # handle other options ## # # get the experiment name # # if options.name is not given, BED and WIG file names will be used in order if not options.name: if HAVEBED: options.name=os.path.split(options.bed)[-1].rsplit('.bed',2)[0] elif HAVEWIG: options.name=os.path.split(options.wig)[-1].rsplit('.wig',2)[0] # # ChIP annotation related parameters # options.bg_res=100 options.chip_res = 1000 # promoter downstream lengths and bidirectional promoter lengths try: options.sizes= map(int, options.sizes.rsplit(',')) options.bisizes= map(int, options.bisizes.rsplit(',')) except ValueError: error('Only integer values are accepted for --sizes or --bisizes and numbers must be comma-separated w/o space') sys.exit(0) # only three values or one value can be given if len(options.sizes) !=3 and len(options.sizes) !=1: error('Three comma-separated numbers or a single number can be given for --sizes') sys.exit(0) # only two values or one value can be given if len(options.bisizes) !=2 and len(options.bisizes) !=1: error('Two comma-separated numbers or a single number can be given for --bisizes') sys.exit(0) # saturate the values with 1000 for promoter and downstream and 20000 for bidirectional promoter options.sizes = map(min, [10000]*len(options.sizes), options.sizes) options.bisizes = map(min, [20000]*len(options.bisizes), options.bisizes) # if a single value is given, split into three equal fractions if len(options.sizes) == 1: n = 3 options.sizes = [options.sizes[0]*i/n for i in range(1, n+1)] options.sizes.sort() options.sizes = tuple(options.sizes) if len(options.bisizes) == 1: n = 2 options.bisizes = [options.bisizes[0]*i/n for i in range(1, n+1)] options.bisizes.sort() options.bisizes = tuple(options.bisizes) # # gene-centered annotatino related parameters # options.span = max(1, options.span) # # Average profiling related parameters # #check if name2 is going to be used instead of name if options.name2 and options.pf and options.gn_groups: options.name2 = True else: options.name2 = False # check the gene group files if options.pf and options.gn_groups: parsed=options.gn_groups.rsplit(',') for p in parsed: if not os.path.isfile(p): error("Check --gn-groups. No such file as '%s'" %p) sys.exit(0) options.gn_groups=parsed # gene group names. If not given, Group 1, Group 2, ... Group n will be used if options.gn_names: parsed_names=options.gn_names.rsplit(',') if len(parsed_names) < len(options.gn_groups): error('There must be the equal or more group names to or than gene groups') sys.exit(0) options.gn_names=parsed_names else: options.gn_names=[] for i in range(len(options.gn_groups)): options.gn_names.append('Group %d' %(i+1)) # profiling resolution options.pf_res = max(1, options.pf_res) # relative distance options.rel_dist = max(options.rel_dist, options.pf_res) # metagene_size options.metagene_size = 3000 options.catexon_size = options.metagene_size/2 options.catintron_size = options.metagene_size/2 # exon length and intron lengths options.epercentlowers=[10, 35, 65] # in percent options.epercentuppers=[35, 65, 90] # in percent options.ipercentlowers=[10, 35, 65] # in percent options.ipercentuppers=[35, 65, 90] # in percent # # dummy parameters # options.elowers = None options.euppers = None options.ilowers = None options.iuppers = None options.metaexonsizes = None options.metaintronsizes = None options.n_peaks = None # the number of top n peaks to plot in chromosomal view of the peaks: Default None, which means only downsampling will be applied. # # dump # if options.dump: options.out = {} options.out['upstreams'] = open(options.name + '_dump_TSS.txt', 'w') options.out['downstreams'] = open(options.name + '_dump_TTS.txt', 'w') options.out['genes'] = open(options.name + '_dump_gene.txt', 'w') if options.gn_groups: options.outs = {} # group names gn_names = [re.sub(r' ', '_', gn_name) for gn_name in options.gn_names] options.outs['upstreams'] = [open(options.name + "_" + gn_name + "_dump_TSS.txt", "w") for gn_name in gn_names] options.outs['downstreams'] = [open(options.name + "_" + gn_name + "_dump_TTS.txt", "w") for gn_name in gn_names] options.outs['genes'] = [open(options.name + "_" + gn_name + "_dump_gene.txt", "w") for gn_name in gn_names] dumpprof = 'On' else: dumpprof = 'Off' # # make a txt of parameter setting # # basic parameters if options.chipannot: chipannot = 'On' else: chipannot = 'Off' if options.pf: avpf = 'On' else: avpf = 'Off' options.argtxt = "\n".join(( "# ARGUMENTS: ", \ "# name = %s" % (options.name),\ "# gene annotation table = %s" % (options.gdb),\ "# BED file = %s" % (options.bed),\ "# WIG file = %s" % (options.wig),\ "# extra BED file = %s" % (options.ebed),\ "# ChIP annotation = %s" % (chipannot),\ "# gene-centered annotation = %s" %(chipannot),\ "# average profiling = %s" %(avpf),\ "# dump profiles = %s" %(dumpprof))) # if ChIP region annotation is running if options.chipannot: options.argtxt += "\n" options.argtxt += "\n".join(("# re-annotation for genome background (ChIP region annotation) = %s" %(str(options.rebg)),\ "# promoter sizes (ChIP region annotation) = " + ",".join(["%d" %p for p in options.sizes]) + " bp",\ "# downstream sizes (ChIP region annotation) = " + ",".join(["%d" %d for d in options.sizes]) + " bp",\ "# bidrectional promoter sizes (ChIP region annotation) = " + ",".join(["%d" %d for d in options.bisizes]) + " bp",\ "# span size (gene-centered annotation) = %d bp" %(options.span))) # if average profilng is running if options.pf: options.argtxt += "\n" options.argtxt += "\n".join(("# profiling resolution (average profiling) = %d bp" %(options.pf_res),\ "# relative distance wrt TSS and TTS (average profiling) = %d bp" % (options.rel_dist))) if options.gn_groups: options.argtxt += "\n" options.argtxt += "# gene groups (average profiling) = %s" %", ".join(options.gn_groups) return options def get_min_max_lengths(GeneT, epercentlimit, ipercentlimit, minexonlen, minintronlen): """Return the minimum and maximum lengths of exons or introns in the gene annotation table Parameters: 1. GeneT: gene annotation table 2. epercentlimit: [lower limit for exon length, upper limit for exon length] to consider. lower limit and upper limit must be percentage values (0-100) 3. ipercentlimit: [lower limit for intron length, upper limit for exon lentht] to consider. 4. minexonlen: mininum exon length to consider 5. minintronlen: minimum intron length to consider """ exonLens, intronLens=GeneT.get_exon_intron_lens() exon_enough_long, intron_enough_long = corelib.get_certain_part(exonLens, percentlimit=epercentlimit), corelib.get_certain_part(intronLens, percentlimit=ipercentlimit) exonLenLim, intronLenLim = [max(minexonlen, exon_enough_long[0]), exon_enough_long[-1]], [max(minintronlen, intron_enough_long[0]), intron_enough_long[-1]] return exonLenLim, intronLenLim def return_med_gene(GeneT, pf_res): """Return the median gene length of the given gene annotation table. The nearest multiple of pf_res to the median gene length will be returned. Parameters: 1. GeneT: the gene annotation table 2. pf_res: the profiling resolution """ gLens = GeneT.get_gene_lens() medgLen = corelib.median(gLens) return corelib.find_nearest_multiple(medgLen, pf_res) def return_med_catexon_catintron(GeneT, pf_res): """Return the concatenated exons and introns sizes for scaling Parameters: 1. GeneT: the gene annotation table to consider 2. pf_res: profiling resolution """ catexonLens, catintronLens =GeneT.get_cat_exon_intron_lens() medcatexonLen, medcatintronLen = corelib.median(catexonLens), corelib.median(catintronLens) medcatexonLen, medcatintronLen = corelib.find_nearest_multiple(medcatexonLen, pf_res), corelib.find_nearest_multiple(medcatintronLen, pf_res) return medcatexonLen, medcatintronLen def return_med_exons_introns(GeneT, eplowers, epuppers, iplowers, ipuppers): """Return the median exon lengths and intron lengths Parameters: 1. GeneT: genome annotation table 2. eplowers: a list of numbers (1-100) that indicate lower percentage limits for exon length 3. epuppers: a list of numbers (1-100) that indicate upper percentage limits for exon length 4. iplowers: a list of numbers (1-100) that indicate lower percentage limits for intron length 5. ipuppers: a list of numbers (1-100) that indicate upper percentage limits for intron length """ exonLens, intronLens = GeneT.get_exon_intron_lens() elowers, medexonsizes, euppers = corelib.get_boundaries_medians(exonLens, lowers = eplowers, uppers = epuppers) ilowers, medintronsizes, iuppers = corelib.get_boundaries_medians(intronLens, lowers = iplowers, uppers = ipuppers) return elowers, medexonsizes, euppers, ilowers, medintronsizes, iuppers def determine_exon_intron_sizes(GeneT, options): """Determine the exon and intron sizes to consider in exon-intron average profiling""" # get the exon lengths and intron lengths when either elowers or ilowers is not given. if not (options.elowers and options.ilowers): elowers, medexonsizes, euppers, ilowers, medintronsizes, iuppers = return_med_exons_introns(GeneT, options.epercentlowers, options.epercentuppers, options.ipercentlowers, options.ipercentuppers) # update the options with the exonsizes and intronsizes if options.elowers and not options.ilowers: options.ilowers = ilowers options.iuppers = iuppers medexonsizes = map(lambda x,y: (x+y)/2, options.elowers, options.euppers) elif not options.elowers and options.ilowers: options.elowers = elowers options.euppers = euppers medintronsizes = map(lambda x,y: (x+y)/2, options.ilowers, options.iuppers) elif not options.elowers and not options.ilowers: options.elowers = elowers options.euppers = euppers options.ilowers = ilowers options.iuppers = iuppers else: medexonsizes = map(lambda x,y: (x+y)/2, options.elowers, options.euppers) medintronsizes = map(lambda x,y: (x+y)/2, options.ilowers, options.iuppers) # approximate the exonsizes to the nearest the multiple of the profiling resolution n_ranges_e = len(options.elowers) n_ranges_i = len(options.ilowers) options.metaexonsizes = map(corelib.find_nearest_multiple, map(max, medexonsizes, [2*options.pf_res]*n_ranges_e), [options.pf_res]*n_ranges_e) options.metaintronsizes = map(corelib.find_nearest_multiple, map(max, medintronsizes, [2*options.pf_res]*n_ranges_i), [options.pf_res]*n_ranges_i) return options def filter_chroms(chroms,regex): """Get rid of chromosome names with a user-specified re Parameters: 1. chroms: chromosome names 2. re: regular expression as a raw string Return: filtered_chrom: chromosome names after filtering """ filtered_chroms=[] for chrom in chroms: if not re.search(regex, chrom): filtered_chroms.append(chrom) return filtered_chroms def write_txts(outs, txts): """Write multiple txts to multiple files Arguments 1. outs: a list of file handlers 2. txts: a list of texts to write out """ for out, txt in itertools.izip(outs, txts): out.write(txt) def close_dump_files(out): """Close the file handlers of the dump files """ out.close() def close_dump_mult_files(outs): """Close the file handers of multiple dump files """ for out in outs: out.close() def _get_bgroi(GenomeBGP,GenomeBGS,roi,bg_res=100): """Get the background annotation for regions of interest given through -e (or --ebed) option Parameters: 1. GenomeGBP: a P object (see inout.py) of genome background annotation. This will be modified by this function and returned. 2. GenomeGBS: a SummaryGBG object (see inout.py) of genome background annotation 2. roi: a Bed object of regions of interest 3. bg_res: genome background annotation resolution (default=100bp) """ # take the union of the roi regions, just in case these regions overlap each other u_roi = _take_union_of_bed(roi) # sampler Sampler=sampler.ChIPSampler() roisamp=Sampler.sample(bed=u_roi,resolution=bg_res) chroms=set(GenomeBGP.get_chroms()).intersection(roisamp.keys()) bgroi={} whole=0 for chrom in chroms: num_this_chr=len(roisamp[chrom]) bgroi[chrom]=num_this_chr whole+=num_this_chr bgroi['whole']=whole for chrom in bgroi.keys(): try: GenomeBGP[chrom]['roi']=1.0*bgroi[chrom]/GenomeBGS[chrom]['Ns'] except ZeroDivisionError: pass except KeyError: pass return GenomeBGP def _take_union_of_bed(bed): """Take the union of the bed regions and return it as a new Bed object. Only chr, start, and end will remain in the new Bed object. Arguments: 1. bed: a Bed object """ chroms = bed.get_chroms() union = inout.Bed() union.bed = {} for chrom in chroms: u = corelib.union_intervals([bed[chrom]['start'], bed[chrom]['end']]) union.bed[chrom] = {'start': u[0][:], 'end': u[1][:]} return union def _interpoloate_gbg(gdb,promoter,bipromoter): """In using the pre-computed genome bg model, this function performs linear interpolation of genome-wide enrichments of promoter, bidirectional promoter, and downstream. Parameters: 1. gdb: sqlite3 db file. This file must have GenomeBGS and GenomeBGP tables 2. promoter: promoter length given through options.sizes 3. bipromoter: bidirectional promoter length given through options.bisizes 4. downstream: downstream length given through options.sizes Return GP: a P object (see tables.py). This object contains genome bg annotation """ GenomeBGP=tables.PGBG(name='GenomeBGP',numprom=11,numbiprom=21,numdown=11) GenomeBGP.readdb(Db=gdb) GP=tables.P(name='GP') # the given promoter, bipromoter, and downstream lengths new={} new['promoter'] = promoter new['bipromoter'] = bipromoter new['downstream'] = promoter # the model promoter, bipromoter, and downstream lengths mod={} mod['promoter']=[0, 500]+corelib.seq(fr=1000,to=10000,by=1000) mod['bipromoter']=[0, 500]+corelib.seq(fr=1000,to=20000,by=1000) mod['downstream']=[0, 500]+corelib.seq(fr=1000,to=10000,by=1000) for chrom in GenomeBGP.get_chroms(): GP.init_table(chrom) for column in GenomeBGP.columns[1:]: if column not in ['promoter','bipromoter','downstream']: GP[chrom][column]=GenomeBGP[chrom][column] else: vals=[0.0]+GenomeBGP[chrom][column] interpol=[] for x in new[column]: # first check if the value is in the genome background list try: i = mod[column].index(x) interpol.append(vals[i]) except ValueError: # if cannot find, do linear interpolation. i=corelib.findbin(x, mod[column]) try: interpol.append(corelib.lininterpol([mod[column][i],vals[i]], [mod[column][i+1],vals[i+1]],x)) except IndexError: # in case that x is equal to or larger than the upper bound (e.g. >= 10000 in promoter or downstream) interpol.append(corelib.lininterpol([mod[column][i],vals[i]], [mod[column][i],vals[i]],x)) GP[chrom][column]=interpol return GP if __name__ == '__main__': try: main() except KeyboardInterrupt: warn("User interrupts me! ;-) See you!") sys.exit(0)