#!/usr/bin/python2 """Module Description Copyright (c) 2009 H. Gene Shin This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: H. Gene Shin @contact: shin@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import re import logging from optparse import OptionParser import CEAS.inout as inout import sqlite3 from CEAS.inout import MYSQL import CEAS.corelib as corelib import CEAS.annotator as annotator import CEAS.tables as tables import CEAS.sampler as sampler # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info # ------------------------------------ # Main function # ------------------------------------ def main(): # read the options and validate them options=opt_validate(prepare_optparser()) # roi is just dummy parameter roi=None # CEAS run # read the gene annotation table jobcount=1 info("#%d read the gene annotation table..." %(jobcount)) # read GeneT = inout.GeneTable() # to detect if 'name2' exists in the gene table. # if not exist, just read the remaining columns. try: GeneT.read(Host = options.Host, User= options.User, Db=options.Db, annotation=options.annotation, \ columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds', 'name2')) except Exception, e: if re.search(r'column.*name2', str(e)): # if 'name2' does not exist, 'name2' is not ignored. GeneT.read(Host = options.Host, User= options.User, Db=options.Db, annotation=options.annotation, columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds')) else: raise GeneT.sort() chroms_GeneT=GeneT.get_chroms() chroms_GeneT=filter_chroms(chroms_GeneT,'_[A-Za-z0-9]*') jobcount+=1 # start bg annotation chrom='' chrcount=1 # set the summary table GenomeBGS=tables.SummaryGBG(name='GenomeBGS') GenomePieS = tables.PieSummary( name='GenomePieS' ) FIRST=True fixedStep = False for line in open(options.wig,'r').xreadlines(): if not line or line.startswith('#'): continue # handling the track if re.search(r'track',line): try: description=re.search(r'description="(\S+)"\s',line).group(1) except AttributeError: pass continue # check if fixedStep or variableStep if re.search(r'fixedStep', line): fixedStep = True step = int(re.search(r'step=(\S+)\s', line).group(1)) position = int(re.search(r'start=(\S+)\s', line).group(1)) # read the chromosome if re.search(r'chrom=(\S+)\s',line): newchrom=re.search(r'chrom=(\S+)\s',line).group(1) try: newchrom=inout.standard_chroms[newchrom] except KeyError: pass continue l=line.strip().split() # the beginning if chrom=='' and chrom!=newchrom: # if the chromosome is not in gene table, continue chrom=newchrom if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, a wig object is initiated. info("#%d-%d annotate %s..." %(jobcount,chrcount,chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the new line to the Wig object input.add_line(chrom, row) chrcount+=1 elif chrom!='' and chrom!=newchrom: # new chromosome if chrom in chroms_GeneT: # do genome BG annotation Sampler=sampler.GenomeSampler() Annotator=annotator.AnnotatorGBG() GA=Annotator.annotate(Sampler.sample(input,resolution=options.bg_res),GeneT,roi=roi,prom=options.promoter,biprom=options.bipromoter,down=options.downstream,gene_div=(3,5),quantize=False) tempS,tempP=Annotator.summarize(GA,options.promoter,options.bipromoter,options.downstream,options.binsize) GenomeBGS.set_dim(tempS.numprom,tempS.numbiprom,tempS.numdown) GenomeBGS.add_row(chrom,tempS.get_row(chrom)) tempPieSt = Annotator.obtain_distribution_of_sites_for_genome(GA) tempPieS = tables.PieSummary() tempPieS.import2tb( tempPieSt ) GenomePieS.add_row( chrom, tempPieS.get_row( chrom ) ) ### DEBUG #print GenomeBGS.table #print GenomePieS.table # set chrom to the new chromosome chrom=newchrom if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, a wig object is initiated. info("#%d-%d annotate %s..." %(jobcount,chrcount,chrom)) input=inout.Wig() # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the line to the Wig object input.add_line(chrom, row) chrcount+=1 else: # in the middle of chromosome if chrom in chroms_GeneT: # only if the new chromosome is in the chroms of gene table, the wig object is updated. # if fixedStep, calculate the position from start and step if fixedStep: row = [position, l[-1]] position += step else: row = l # add the line to the Wig object input.add_line(chrom, row) # the last chromosome if chrom in chroms_GeneT: # do genome bg annotation Sampler=sampler.GenomeSampler() Annotator=annotator.AnnotatorGBG() GA=Annotator.annotate(Sampler.sample(input,resolution=options.bg_res),GeneT,roi=roi,prom=options.promoter,biprom=options.bipromoter,down=options.downstream,gene_div=(3,5),quantize=False) tempS,tempP=Annotator.summarize(GA,options.promoter,options.bipromoter,options.downstream,options.binsize) GenomeBGS.set_dim(tempS.numprom,tempS.numbiprom,tempS.numdown) GenomeBGS.add_row(chrom,tempS.get_row(chrom)) tempPieSt = Annotator.obtain_distribution_of_sites_for_genome(GA) tempPieS = tables.PieSummary() tempPieS.import2tb( tempPieSt ) GenomePieS.add_row( chrom, tempPieS.get_row( chrom ) ) ### DEBUG #print GenomeBGS.table #print GenomePieS.table jobcount+=1 # get the probability table GenomeBGS.summarize() GenomeBGP=GenomeBGS.get_p() GenomeBGP.set_name('GenomeBGP') GenomePieS.summarize() GenomePieP = GenomePieS.get_p() GenomePieP.set_name('GenomePieP') #print GenomeBGS #print GenomeBGP #print GenomePieS #print GenomePieP # save the GeneTable and GenomeBG summary and probabilities GeneT.savedb(Db=options.ot, annotation='GeneTable', overwrite=True) GenomeBGS.savedb(Db=options.ot, overwrite=True) GenomeBGP.savedb(Db=options.ot, overwrite=True) GenomePieS.savedb(Db=options.ot, overwrite=True) GenomePieP.savedb(Db=options.ot, overwrite=True) info ('#... cong! See sqlite3 db %s!' %(options.ot)) # ------------------------------------ # functions # ------------------------------------ def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog <-g gt -w wig> [options]" description = "build_genomeBG, do genome bg annotation and save it for CEAS" optparser = OptionParser(version="%prog 0.1.6 (package version 1.0.2)",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-d","--db",dest="db",type="string", help="Genome of UCSC (eg hg18). If -d (--db) is not given, this script searches for a local sqlite3 referenced by -g (--gt). WARNING: MySQLdb must be installed to use the tables of UCSC.", default=None) optparser.add_option("-g","--gt",dest="gt",type="string", help="Name of the gene annotation table (or local sqlite3 file) (eg refGene or knownGene). If -d (--db) is given, build_genomeBG will connect to UCSC and download the specified gene table. Otherwise, build_genomeBG search for a local sqlite3 file with the name.") optparser.add_option("-w","--wig",dest="wig",type="string", help="WIG file needed to obtain genome locations in BG annotation. VariableStep and fixedWig files are accepted.") optparser.add_option("-o", "--ot", dest="ot",type="string",\ help="Output sqlite3 db file name. The gene annotation table read from the local sqlite3 file or UCSC DB will be saved in a table named as 'GeneTable' and the computed genome bg annotation will be saved in two tables named as 'GenomeBGS' and 'GenomeBGP. If this option is not given, this script generates a sqlite3 file with the same name as given through -g (--gt). WARNING! When an existing local sqlite3 file is opened and saved as the same name, the tables in the file will be overwritten.", default=None) optparser.add_option("--promoter",dest="promoter",type="int", help="Maximum promoter size to consider for genome bg annotation. This must be >= 1000bp. Any value less than 1000bp will be set to 1000bp. DEFAULT: 10000bp", default=10000) optparser.add_option("--bipromoter",dest="bipromoter",type="int", help="Maximum Bidirectional promoter size to consider for genome bg annotation. This must be >= 1000bp. Any value less than 1000bp will be set to 1000bp. DEFAULT: 20000bp", default=20000) optparser.add_option("--downstream",dest="downstream",type="int", help="Maximum immediate downstream size to consider for genome bg annotation. This must be >= 1000bp. Any value less than 1000bp will be set to 1000bp. DEFAULT: 10000bp", default=10000) optparser.add_option("--binsize",dest="binsize",type="int",\ help="Binsize with which to bin promoter, bidirectional promoter, and immediate downstream sizes. In each bin, the percentage of genome will be calculated. DEFAULT=1000bp", default=1000) return optparser def opt_validate (optparser): """Validate options from a OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # input BED file and GDB must be given if not (options.gt or options.wig): optparser.print_help() sys.exit(1) # get gdb lower case if options.db: if MYSQL: options.Host="genome-mysql.cse.ucsc.edu" options.User="genome" options.Db = options.db options.annotation=os.path.split(options.gt)[-1] else: error('MySQLdb package needs to be installed to use UCSC or a local sqlite3 db file must exist.') error("Check -g (--gdb). No such file or species as '%s'" %options.gt) sys.exit(1) else: if os.path.isfile(options.gt): options.Host=None options.User=None options.Db=options.gt options.annotation='GeneTable' else: error("No such local sqlite3 db file as '%s'" %options.gt) sys.exit(1) # check if the wig file exists if not os.path.isfile(options.wig): error("No such wig file as '%s'" %options.wig) sys.exit(1) # when no output file name is given if not options.ot: options.ot = os.path.split(options.gt)[-1] # bg background resolution is set to 100 options.bg_res=100 # set the maximum sizes of promoter, bidirectional promoter and downstream options.promoter=max(1000,options.promoter) options.bipromoter=max(1000,options.bipromoter) options.downstream=max(1000,options.downstream) return options def filter_chroms(chroms,regex): """Get rid of chromosome names with a user-specified re Parameters: 1. chroms: chromosome names 2. re: regular expression as a raw string Return: filtered_chrom: chromosome names after filtering """ filtered_chroms=[] for chrom in chroms: if not re.search(regex, chrom): filtered_chroms.append(chrom) return filtered_chroms ### ### Some functions that were temporarily used for genome background annotation. These functions were ### modifed and reused by AnnotatorGBG class as its methods. def caculate_gbg(gbgtable,maxprom=10000,maxbiprom=20000,maxdown=10000,by=1000): """Calculate the genome bg. """ prombin=[1, 500]+range(by,maxprom+by,by) downbin=[1, 500]+range(by,maxdown+by,by) biprombin=[1, 500]+range(by,maxbiprom+by,by) allbp = [0] * (len(prombin)+1) allbd = [0] * (len(downbin)+1) allbbp = [0] * (len(biprombin)+1) for chrom in gbgtable.get_chroms(): prom=gbgtable[chrom]['promoter'] down=gbgtable[chrom]['downstream'] biprom=gbgtable[chrom]['bipromoter'] bp=corelib.bin(prom, prombin) bd=corelib.bin(down, downbin) bbp=corelib.bin(biprom, biprombin) allbp, c = corelib.array_adder(allbp, bp) allbd, c = corelib.array_adder(allbd, bd) allbbp, c = corelib.array_adder(allbbp, bbp) # cumsum from 500 to maxprom (or maxbiprom or maxdown) cumbp = corelib.cumsum(allbp[1:-1]) cumbd = corelib.cumsum(allbd[1:-1]) cumbbp = corelib.cumsum(allbbp[1:-1]) return cumbp, cumbbp, cumbd def sum_gbg(binned,max=1000,by=1000): """Sum the counts of bins across chromosomes. Parameters: 1. binned: a dictionary of chromosome-wise bins. bins['chr1]=[n1,n2,...nn], where ni is the counts within the ith bin Return: 1. binsum: an array of the bin sum across chromosomes """ bins=[1, 500]+range(by,max+by,by) binsum=[0]*len(bins) for chrom in bins.keys(): binsum=corelib.array_adder(binsum,bins[chrom]) return bins,binsum def sum_dict(dict): """Return the sum of the elements in a dictionary Parameters: 1. dict: a dictionary. The element referred by a key must be a single number. Return: 1. summation: the sum """ s=0 for key in dict.keys(): s+=dict[key] return s if __name__ == '__main__': try: main() except KeyboardInterrupt: warn("User interrupts me! ;-) See you!") sys.exit(0)