Conduct GO

""" exptf.py A galaxy tool to find the top-n highest expressed TFs """ import subprocess import os import sys import tempfile #import htmlParser import csv galhtmlprefix = """ Conduct GO

""" galhtmlpostfix = """""" rprog="""rgo=function(tdir, title, txtfname, logfpath, anntIdentifier, rUtilName) { sink(file=file(logfpath, "at"), type="message") sink(file=file(logfpath, "at"), type="output") if (anntIdentifier == "fly.db0.db"){anntIdentifier = "fly.db0"} newfiles = c() newnames = c() newtypes = c() cdir=getwd() setwd(tdir) pkgTest <- function(x) { if (!require(x,character.only = TRUE)) { #install.packages(x,dep=TRUE) source("http://bioconductor.org/biocLite.R") biocLite(x) if(!require(x,character.only = TRUE)) stop("Package not found") } } #source("http://bioconductor.org/biocLite.R") #biocLite("GO.db") #source("http://bioconductor.org/biocLite.R") #biocLite("GOstats") #library("GOstats") pkgTest("GOstats") #library(GO.db) pkgTest("GO.db") pkgTest(anntIdentifier) library(xtable) #source(rUtilName) print(txtfname) dummy <- read.table(txtfname, sep = "\t", header = TRUE, as.is = TRUE, comment.char = "#", quote = "", check.names = FALSE) selGenes <- unique(as.character(dummy[, "Gene"])) selGenes <- as.character(dummy[, 1]) selGenes <- selGenes[!is.na(selGenes)] selGenes <- selGenes[!selGenes == ""] selGenes <- selGenes[!selGenes == "\n"] selGenes <- selGenes[!selGenes == "\r"] universeGenes <- NULL # GO analysis, Cellular Component ontology #annt <- "hgu133a" #print(paste("THIS IS FROM OUTSIDE starting GO",annt)) #cellular component outfname = "CC_RESULT" outfname = paste(title, outfname, sep="_") outfname = paste(outfname, "txt", sep=".") outffname = file.path(tdir, outfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "CC", pvalueCutoff = 0.05,conditional = FALSE, testDirection = "over") #print(paste("annotation:", annotation) GOResultCC <- hyperGTest(params) #Filename <- checkExistantFile(outffname) AllTerms <- summary(GOResultCC, pvalue = 2) write.table(AllTerms, file = outffname, sep = "\t", row.names = FALSE, quote = FALSE) #for output newnames = c(newnames, outfname) newfiles = c(newfiles, outffname) newtypes = c(newtypes, 'txt') #biological process bpoutfname = "BP_RESULT" bpoutfname = paste(title, bpoutfname, sep="_") bpoutfname = paste(bpoutfname, "txt", sep=".") bpoutffname = file.path(tdir, bpoutfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "BP", pvalueCutoff = 0.05,conditional = FALSE, testDirection = "over") GOResultBP <- hyperGTest(params) #Filename <- checkExistantFile("GO-Mapping-resultBP.txt") AllTerms <- summary(GOResultBP, pvalue = 2) write.table(AllTerms, file = bpoutffname, sep = "\t", row.names = FALSE, quote = FALSE) #for output newnames = c(newnames, bpoutfname) newfiles = c(newfiles, bpoutffname) newtypes = c(newtypes, 'txt') #Molecular Function mfoutfname = "MF_RESULT" mfoutfname = paste(title, mfoutfname, sep="_") mfoutfname = paste(mfoutfname, "txt", sep=".") mfoutffname = file.path(tdir, mfoutfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "MF", pvalueCutoff = 0.05, conditional = FALSE, testDirection = "over") GOResultMF <- hyperGTest(params) #Saving the file of GO analysis, Molecular Function ontology AllTerms <- summary(GOResultMF, pvalue = 2) write.table(AllTerms, file = mfoutffname, sep = "\t", row.names = FALSE,quote = FALSE) #for output newnames = c(newnames, mfoutfname) newfiles = c(newfiles, mfoutffname) newtypes = c(newtypes, 'txt') #for output f1 = "Conduct_GO_using_David" htmlname = paste(title, f1, sep="_") htmlname = paste(htmlname, "html", sep=".") htmlpath = file.path(tdir, htmlname) #provides full file path newnames = c(newnames, htmlname) newfiles = c(newfiles, htmlpath) newtypes = c(newtypes, 'html') sink() return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes)) } """ mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory def getEntrezGeneIds(inputExprFilePath): #read expression index file f = file(inputExprFilePath, 'r') exprFilelines = f.readlines() f.close() #entrezIdPosn = 3 entrezIdPosn = 4 #drop header exprFilelines = exprFilelines[1:] entrezIdStr = "" count = 1 #when reference genes are ALL for (line) in exprFilelines: columns = line.strip().split("\t") if(len(columns)>entrezIdPosn): value = columns[entrezIdPosn] if(entrezIdStr.find(value))==-1: if(count>200): break entrezIdStr = entrezIdStr +","+ value count = count+1 if(len(entrezIdStr)>0): entrezIdStr = entrezIdStr[1:] return entrezIdStr def getDisplayGeneList(geneLinkList): gnDisplayStr = "" gnDisplayStrList = geneLinkList.split(',') for k in range(len(gnDisplayStrList)): entrezId = gnDisplayStrList[k] if(k%20==0): gnDisplayStr = gnDisplayStr+",
"+entrezId else: gnDisplayStr = gnDisplayStr+","+entrezId if(len(gnDisplayStr)>0): gnDisplayStr = gnDisplayStr[1:] return gnDisplayStr def createGoGeneFormat(inFilename, outFilename): inFile = csv.reader(open(inFilename, 'rb'), delimiter="\t") outFile = file(outFilename, 'w') #geneCol = 0 geneCol = 4 for rowNum, row in enumerate(inFile): if rowNum == 0: hasGeneHead = False for colNum, header in enumerate(row): if header.lower()=="gene": geneCol = colNum outFile.write(header+'\n') hasGeneHead = True break if not hasGeneHead: outFile.write("Gene"+'\n') else: if len(row) == 0: continue if row[geneCol] != "": outFile.write(row[geneCol]+'\n') outFile.close() def main(): """called as go_analysis.py '$title' '$diff_expr_file' '$logmeta' '$diff_expr_file.dbkey' """ nparm = 5 appname = sys.argv[0] if (len(sys.argv) < nparm): print "Conduct GO 1.0" print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv)) print 'usage: %s title diff_expression_value out_file annotationKey' % (appname) sys.exit(1) # short command line error title = sys.argv[1].strip() inputExprFilePath = sys.argv[2].strip() logFileName = sys.argv[3].strip() #dbKey = sys.argv[4].strip() # dbKey has been replaced by compulsory annotation selection dbKey = sys.argv[5].strip() #outhtml = sys.argv[5].strip() gnStrList = getEntrezGeneIds(inputExprFilePath) #utilRscriptName=os.path.dirname(sys.argv[0]) + "/utils.R" utilRscriptName=os.path.dirname(sys.argv[0]) + "/utils.R" utilRscriptName = utilRscriptName.replace('\\', '/') print 'input anntkey:%s'% (dbKey) if dbKey == None or dbKey == "" or dbKey == "?": dbKey = "hgu133a" #create temp directory title = title.strip(' ').replace(' ',' ').replace(' ','_') #tdir = tempfile.mkdtemp(prefix=title) #tdir = tdir.replace('\\', '/') tdir = os.path.dirname(inputExprFilePath) print 'title:%s'% (title) print 'ExpressionFilePath:%s'% (inputExprFilePath) print 'logfilename:%s'% (logFileName) print 'anntkey:%s'% (dbKey) print 'tmp dir: %s'% (tdir) #print 'outhtml:%s'% (outhtml) # call R function & read return array tempExprFilePath = os.path.dirname(inputExprFilePath) + "/tempExprFile.txt" print 'tempExprFilePath: %s'% (tempExprFilePath) createGoGeneFormat(inputExprFilePath, tempExprFilePath) rprogname = 'rgo' if dbKey == 'fly.db0.db' : dbKey = 'fly.db0' rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,tempExprFilePath,logFileName,dbKey,utilRscriptName) print '##rcall=',rcall #p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE) p = subprocess.Popen("R --vanilla", shell=True, stdin=subprocess.PIPE) p.communicate(rprog + "\n" + rcall) makenew = getFilenames(tdir, title) newfiles = makenew[0] newnames = makenew[1] newtypes = makenew[2] #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process() logf = file(logFileName,'a') s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall) logf.write(s) try: outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))] except: outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1 tmpString = '\n%s\t%s\n' % (tmp, tdir) logf.write(tmpString) logf.write('\n'.join(outlist)) logf.write('\n') logf.close() for j in range(len(newfiles)): htmlfile = newfiles[j] ft = newtypes[j] if(ft=="html"): html = [] html.append(galhtmlprefix) #html.append('') #html.append('

') gnDisplayStr = getDisplayGeneList(gnStrList) html.append('
Conduct GO using DAVID

') html.append('
Target Entrez Gene Id List:
'+gnDisplayStr+'

') html.append('Functional Annotation Summary Link

') html.append('Functional Annotation Chart Link

') html.append('Functional Annotation Table Link

') html.append('Functional Annotation Clustering Link

') html.append('Gene Full Report Link

') html.append('Gene Report Link

') html.append('Show Gene List Names in Batch Link

') html.append('Gene Functional Classfication Link

') html.append("
Note:
Please be aware that maximum of 200 genes can be sent due to DAVID on-line internal limitation
") #html.append('

') html.append(galhtmlpostfix) outf = file(htmlfile,'w') outf.write('\n'.join(html)) outf.close() def getFilenames(tdir, title): titles = ["_CC_RESULT.txt", "_BP_RESULT.txt", "_MF_RESULT.txt", "_Conduct_GO_using_David.html"] titles = [title + x for x in titles] paths = [os.path.join(tdir, x) for x in titles] return [ paths, titles, ['txt', 'txt', 'txt', 'html'] ] if __name__ == "__main__": main()

Conduct GO using DAVID