#!/usr/bin/python2 """Module Description Copyright (c) 2009 H. Gene Shin This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: H. Gene Shin @contact: shin@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import re import logging import operator import itertools from array import array from optparse import OptionParser # ------------------------------------ # my modules # ------------------------------------ import CEAS.inout as inout import CEAS.annotator as annotator # ------------------------------------ # constants # ------------------------------------ logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', datefmt='%a, %d %b %Y %H:%M:%S', stream=sys.stderr, filemode="w" ) # ------------------------------------ # Misc functions # ------------------------------------ error = logging.critical # function alias warn = logging.warning debug = logging.debug info = logging.info def prepare_optparser (): """Prepare optparser object. New options will be added in this function first. """ usage = "usage: %prog <-g gdb -b bed> [options]" description = "GCA -- Gene-centered Annotation" optparser = OptionParser(version="%prog 0.1.7 (package version 1.0.2)",description=description,usage=usage,add_help_option=False) optparser.add_option("-h","--help",action="help",help="Show this help message and exit.") optparser.add_option("-b","--bed",dest="bed",type="string", help="BED file of ChIP regions.") optparser.add_option("-g","--gt",dest="gdb",type="string", help="Gene annotation table. This can be a sqlite3 local db file, BED file or genome version of UCSC. The BED file must have an extension of '.bed'") optparser.add_option("--span", dest="span", type="int",\ help="Span in search of ChIP regions from TSS and TTS, DEFAULT=3000bp", default=3000) optparser.add_option("--name",dest="name",\ help="Experiment name. This will be used to name the output file. If an experiment name is not given, input BED file name will be used instead.") optparser.add_option("--gn-group",dest="gn_group",\ help="A particular group of genes of interest. If a txt file with one column of gene names (eg RefSeq IDs in case of using a refGene table) is given, gca returns the gene-centered annotation of this particular gene group.", default=None) optparser.add_option("--gname2",dest="name2",\ help="The gene names of --gn-group will be regarded as 'name2.' See the schema of the gene annotation table.", default=False) return optparser def opt_validate (optparser): """Validate options from a OptParser object. Ret: Validated options object. """ (options,args) = optparser.parse_args() # if gdb not given, print help, either BED or WIG must be given if not options.gdb or not options.bed: optparser.print_help() sys.exit(1) if not os.path.isfile(options.gdb): error("No such file as '%s'" %options.gdb) sys.exit(1) else: options.Host = None options.User = None options.Db = options.gdb if not os.path.isfile(options.bed): error("No such file as %s" %options.bed) sys.exit(1) # check the sub-group genes if options.gn_group: if not os.path.isfile(options.gn_group): error("No such file as '%s'" %options.gn_group) options.name2 = False sys.exit(1) else: options.name2 = False if not options.name: options.name=os.path.split(options.bed)[-1].rsplit('.bed',2)[0] return options def filter_chroms(chroms,regex): """Get rid of chromosome names with a user-specified re Parameters: 1. chroms: chromosome names 2. re: regular expression as a raw string Return: filtered_chrom: chromosome names after filtering """ filtered_chroms=[] for chrom in chroms: if not re.search(regex, chrom): filtered_chroms.append(chrom) return filtered_chroms def convert_BED2GeneTable(Bed): """Convert a BED-formatted gene annotation table into a GeneTable object Note that the Bed must strictly follow the BED format not to generate error conversion error. """ # make a GeneTable object GeneT = inout.GeneTable() GeneT.reset() # set the column names of the gene table GeneT.columns=('name','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds') Bed_columns = ('name', 'strand', 'start','end','thickStart','thickEnd', 'blockCount','blockSizes','blockStarts') for chrom in Bed.get_chroms(): GeneT[chrom] = {} # copy the columns into the GeneTable for gc, bc in itertools.izip(GeneT.columns, Bed_columns): GeneT[chrom][gc] = Bed[chrom][bc] return GeneT # ------------------------------------ # Main function # ------------------------------------ def main(): # read the options and validate them options=opt_validate(prepare_optparser()) jobcount=1 # if sub-group of genes exist, read them first if options.gn_group: info("#%d read the sub-group of genes to do gene-centered annotation for..." %jobcount) (where,) = inout.read_gene_subsets((options.gn_group,)) if options.name2: which = 'name2' else: which = 'name' jobcount += 1 else: which = '' where = () # read the gene annotation table info("#%d read the gene annotation table..." %jobcount) GeneT = inout.GeneTable() try: GeneT.read(Host = options.Host, User= options.User, Db=options.Db, annotation='GeneTable', \ columns=('name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts', 'exonEnds'), which=which, where=where) except Exception, e: # if 'name2' does not exist and the user wants to use 'name2', error; otherwise, if re.search(r'column.*name2', str(e)): error("The gene annotation table does not have 'name2.' Only gene IDs of 'name' can be used to indicate the sub-group of genes given via --gn-group.") sys.exit(1) else: raise GeneT.sort() chroms_GeneT=GeneT.get_chroms() chroms_GeneT=filter_chroms(chroms_GeneT,'_[A-Za-z0-9]*') jobcount+=1 info("#%d read the ChIP region BED file..." %jobcount) ChIP = inout.Bed() ChIP.read(options.bed) jobcount+=1 info("#%d perform gene-centered annotation" %jobcount) GAnnotator=annotator.GeneAnnotator() GAnnotator.annotate(GeneT, ChIP, u=options.span, d=options.span, name2=options.name2) GAnnotator.map.set_name(options.name) GAnnotator.map.write() jobcount+=1 info("# ...cong! See %s!" %(options.name+'.xls')) if __name__=="__main__": try: main() except KeyboardInterrupt: warn("User interrupts me! ;-) See you!") sys.exit(0)