""" exptf.py A galaxy tool to find the top-n highest expressed TFs """ import subprocess import os import sys import tempfile #import htmlParser import csv galhtmlprefix = """ Conduct GO

""" galhtmlpostfix = """""" rprog="""rgo=function(tdir, title, txtfname, logfpath, anntIdentifier, rUtilName) { sink(file=file(logfpath, "at"), type="message") sink(file=file(logfpath, "at"), type="output") if (anntIdentifier == "fly.db0.db"){anntIdentifier = "fly.db0"} newfiles = c() newnames = c() newtypes = c() cdir=getwd() setwd(tdir) pkgTest <- function(x) { if (!require(x,character.only = TRUE)) { #install.packages(x,dep=TRUE) source("http://bioconductor.org/biocLite.R") biocLite(x) if(!require(x,character.only = TRUE)) stop("Package not found") } } #source("http://bioconductor.org/biocLite.R") #biocLite("GO.db") #source("http://bioconductor.org/biocLite.R") #biocLite("GOstats") #library("GOstats") pkgTest("GOstats") #library(GO.db) pkgTest("GO.db") pkgTest(anntIdentifier) library(xtable) #source(rUtilName) print(txtfname) dummy <- read.table(txtfname, sep = "\t", header = TRUE, as.is = TRUE, comment.char = "#", quote = "", check.names = FALSE) selGenes <- unique(as.character(dummy[, "Gene"])) selGenes <- as.character(dummy[, 1]) selGenes <- selGenes[!is.na(selGenes)] selGenes <- selGenes[!selGenes == ""] selGenes <- selGenes[!selGenes == "\n"] selGenes <- selGenes[!selGenes == "\r"] universeGenes <- NULL # GO analysis, Cellular Component ontology #annt <- "hgu133a" #print(paste("THIS IS FROM OUTSIDE starting GO",annt)) #cellular component outfname = "CC_RESULT" outfname = paste(title, outfname, sep="_") outfname = paste(outfname, "txt", sep=".") outffname = file.path(tdir, outfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "CC", pvalueCutoff = 0.05,conditional = FALSE, testDirection = "over") #print(paste("annotation:", annotation) GOResultCC <- hyperGTest(params) #Filename <- checkExistantFile(outffname) AllTerms <- summary(GOResultCC, pvalue = 2) write.table(AllTerms, file = outffname, sep = "\t", row.names = FALSE, quote = FALSE) #for output newnames = c(newnames, outfname) newfiles = c(newfiles, outffname) newtypes = c(newtypes, 'txt') #biological process bpoutfname = "BP_RESULT" bpoutfname = paste(title, bpoutfname, sep="_") bpoutfname = paste(bpoutfname, "txt", sep=".") bpoutffname = file.path(tdir, bpoutfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "BP", pvalueCutoff = 0.05,conditional = FALSE, testDirection = "over") GOResultBP <- hyperGTest(params) #Filename <- checkExistantFile("GO-Mapping-resultBP.txt") AllTerms <- summary(GOResultBP, pvalue = 2) write.table(AllTerms, file = bpoutffname, sep = "\t", row.names = FALSE, quote = FALSE) #for output newnames = c(newnames, bpoutfname) newfiles = c(newfiles, bpoutffname) newtypes = c(newtypes, 'txt') #Molecular Function mfoutfname = "MF_RESULT" mfoutfname = paste(title, mfoutfname, sep="_") mfoutfname = paste(mfoutfname, "txt", sep=".") mfoutffname = file.path(tdir, mfoutfname) #provides full file path params <- new("GOHyperGParams", geneIds = selGenes, universeGeneIds = universeGenes, annotation = anntIdentifier , ontology = "MF", pvalueCutoff = 0.05, conditional = FALSE, testDirection = "over") GOResultMF <- hyperGTest(params) #Saving the file of GO analysis, Molecular Function ontology AllTerms <- summary(GOResultMF, pvalue = 2) write.table(AllTerms, file = mfoutffname, sep = "\t", row.names = FALSE,quote = FALSE) #for output newnames = c(newnames, mfoutfname) newfiles = c(newfiles, mfoutffname) newtypes = c(newtypes, 'txt') #for output f1 = "Conduct_GO_using_David" htmlname = paste(title, f1, sep="_") htmlname = paste(htmlname, "html", sep=".") htmlpath = file.path(tdir, htmlname) #provides full file path newnames = c(newnames, htmlname) newfiles = c(newfiles, htmlpath) newtypes = c(newtypes, 'html') sink() return(list(newfiles=newfiles,newnames=newnames,newtypes=newtypes)) } """ mng = '### makenewgalaxy' # used by exec_after_process hook to parse out new file paths/names tmp = '### tmp' # used by exec_after_process hook to clean up tmp directory def getEntrezGeneIds(inputExprFilePath): #read expression index file f = file(inputExprFilePath, 'r') exprFilelines = f.readlines() f.close() #entrezIdPosn = 3 entrezIdPosn = 4 #drop header exprFilelines = exprFilelines[1:] entrezIdStr = "" count = 1 #when reference genes are ALL for (line) in exprFilelines: columns = line.strip().split("\t") if(len(columns)>entrezIdPosn): value = columns[entrezIdPosn] if(entrezIdStr.find(value))==-1: if(count>200): break entrezIdStr = entrezIdStr +","+ value count = count+1 if(len(entrezIdStr)>0): entrezIdStr = entrezIdStr[1:] return entrezIdStr def getDisplayGeneList(geneLinkList): gnDisplayStr = "" gnDisplayStrList = geneLinkList.split(',') for k in range(len(gnDisplayStrList)): entrezId = gnDisplayStrList[k] if(k%20==0): gnDisplayStr = gnDisplayStr+",
"+entrezId else: gnDisplayStr = gnDisplayStr+","+entrezId if(len(gnDisplayStr)>0): gnDisplayStr = gnDisplayStr[1:] return gnDisplayStr def createGoGeneFormat(inFilename, outFilename): inFile = csv.reader(open(inFilename, 'rb'), delimiter="\t") outFile = file(outFilename, 'w') #geneCol = 0 geneCol = 4 for rowNum, row in enumerate(inFile): if rowNum == 0: hasGeneHead = False for colNum, header in enumerate(row): if header.lower()=="gene": geneCol = colNum outFile.write(header+'\n') hasGeneHead = True break if not hasGeneHead: outFile.write("Gene"+'\n') else: if len(row) == 0: continue if row[geneCol] != "": outFile.write(row[geneCol]+'\n') outFile.close() def main(): """called as go_analysis.py '$title' '$diff_expr_file' '$logmeta' '$diff_expr_file.dbkey' """ nparm = 5 appname = sys.argv[0] if (len(sys.argv) < nparm): print "Conduct GO 1.0" print '%s needs %d parameters - given %d as %s' % (appname,nparm,len(sys.argv),';'.join(sys.argv)) print 'usage: %s title diff_expression_value out_file annotationKey' % (appname) sys.exit(1) # short command line error title = sys.argv[1].strip() inputExprFilePath = sys.argv[2].strip() logFileName = sys.argv[3].strip() #dbKey = sys.argv[4].strip() # dbKey has been replaced by compulsory annotation selection dbKey = sys.argv[5].strip() #outhtml = sys.argv[5].strip() gnStrList = getEntrezGeneIds(inputExprFilePath) #utilRscriptName=os.path.dirname(sys.argv[0]) + "/utils.R" utilRscriptName=os.path.dirname(sys.argv[0]) + "/utils.R" utilRscriptName = utilRscriptName.replace('\\', '/') print 'input anntkey:%s'% (dbKey) if dbKey == None or dbKey == "" or dbKey == "?": dbKey = "hgu133a" #create temp directory title = title.strip(' ').replace(' ',' ').replace(' ','_') #tdir = tempfile.mkdtemp(prefix=title) #tdir = tdir.replace('\\', '/') tdir = os.path.dirname(inputExprFilePath) print 'title:%s'% (title) print 'ExpressionFilePath:%s'% (inputExprFilePath) print 'logfilename:%s'% (logFileName) print 'anntkey:%s'% (dbKey) print 'tmp dir: %s'% (tdir) #print 'outhtml:%s'% (outhtml) # call R function & read return array tempExprFilePath = os.path.dirname(inputExprFilePath) + "/tempExprFile.txt" print 'tempExprFilePath: %s'% (tempExprFilePath) createGoGeneFormat(inputExprFilePath, tempExprFilePath) rprogname = 'rgo' if dbKey == 'fly.db0.db' : dbKey = 'fly.db0' rcall = "%s('%s','%s','%s','%s','%s','%s')" % (rprogname,tdir,title,tempExprFilePath,logFileName,dbKey,utilRscriptName) print '##rcall=',rcall #p = subprocess.Popen("R --vanilla", shell=True, executable="/bin/bash", stdin=subprocess.PIPE) p = subprocess.Popen("R --vanilla", shell=True, stdin=subprocess.PIPE) p.communicate(rprog + "\n" + rcall) makenew = getFilenames(tdir, title) newfiles = makenew[0] newnames = makenew[1] newtypes = makenew[2] #Write PDF file path,name and type to Log file for plot_code reading. i.e, exec_after_process() logf = file(logFileName,'a') s = 'R code and call\n%s\n\n\nCalled as\n%s\n' % (rprog, rcall) logf.write(s) try: outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles[i],newnames[i],newtypes[i]) for i in range(len(newfiles))] except: outlist = ['%s\t%s\t%s\t%s' % (mng,newfiles,newnames,newtypes),] # only 1 tmpString = '\n%s\t%s\n' % (tmp, tdir) logf.write(tmpString) logf.write('\n'.join(outlist)) logf.write('\n') logf.close() for j in range(len(newfiles)): htmlfile = newfiles[j] ft = newtypes[j] if(ft=="html"): html = [] html.append(galhtmlprefix) #html.append('
') #html.append('') #html.append('') #html.append('') #html.append('
') #html.append('') html.append(galhtmlpostfix) outf = file(htmlfile,'w') outf.write('\n'.join(html)) outf.close() def getFilenames(tdir, title): titles = ["_CC_RESULT.txt", "_BP_RESULT.txt", "_MF_RESULT.txt", "_Conduct_GO_using_David.html"] titles = [title + x for x in titles] paths = [os.path.join(tdir, x) for x in titles] return [ paths, titles, ['txt', 'txt', 'txt', 'html'] ] if __name__ == "__main__": main()