#!/usr/bin/env python ######################### # KEGG locus tag mapper # ######################### # SYNOPSIS # This program determines which locus tags in a GenBANK file belong # to a given KEGG pathway. The program does this by mapping the EC # numbers for each locus tag in a GenBANK file to the EC numbers in # a KEGG pathway. NOTE: incomplete EC numbers are skipped. # # USAGE # kegg_mapper.py # # Examples: # kegg_mapper.py map00010 ecoli_k12_genome.gb # # INPUT FILES # This program takes a KEGG pathway ID and one GenBANK file as input. # Only the locus tags with EC numbers in the GenBANK file will be # comprared. Any incomplete EC numbers (e.g. 1.-.-.-, etc.) will be # skipped (unless the KEGG pathway also contains the exact same # incomplete number - which seems unlikely). # # PREREQUISITES # This program requires the following modules (version numbers in # parenthesis are the version numbers I used, NOT the minimum or # maximum version numbers): # # Python (2.6.1) -- the below are probably installed by default # CSV (1.0) # URL-lib (1.17) # URL-lib 2 (2.5) # BioPython (1.6.0) # To install BioPython, visit: http://biopython.org/wiki/Download # # OUTPUT # The program output consists of the locus tags in the GenBANK file # which belong to the KEGG pathway. Each line of output consists of # a locus tag name from the GenBANK file. # # Example (E. coli K12 w/map00010): # # b0356 # b1241 # b1478 # b1779 # .... (continued) # ################################### # THIS IS OPEN SOURCE CODE (YAY!) # ################################### # LICENSE # This code is licensed under the Creative Commons 3.0 # Attribution + ShareAlike license - for details see: # http://creativecommons.org/licenses/by-sa/3.0/ # # PROGRAM AUTHOR # Graham Alvare - home.cc.umanitoba.ca/~alvare # # ACKNOWLEDGEMENTS/CO-AUTHORS # Dr. Brian Fristensky - my work supervisor, and the man who introduced # me to the wonderful field of bioinformatics. # Justin Zhang - for asking me to design such a tool to help with his work # MGCB2 project - for providing me with the funding to build this script # Manitoba Government (Innovation, Energy and Mines) - MGCB2 co-funding # Richard Sparling and David Levin - MGCB2 Project leads # # QUESTIONS & COMMENTS # If you have any questions, please contact me: alvare@cc.umanitoba.ca # I usually get back to people within 1-2 weekdays (weekends, I am slower) # # P.S. please also let me know of any bugs, or if you have any suggestions # I am generally happy to help create new tools, or modify my existing # tools to make them more useful. # # Happy usage! import os, os.path, sys, csv, collections sys.path.append('/home/u4/zhangju/script/Python/lib') from Bio import GenBank from Bio import SeqIO import urllib from urllib.request import urlopen import re from parse_genbank import * import glob #import urllib2 ######## # locus_to_ec (filename) ######## # SYNOPSIS # -------- # Creates a map between ec numbers and locus tags. The map is specified # as a dict object containing a list object for each value. Each key in # the dict object is an ec number read from the GenBANK file. Each value # is a list of locus tags which correspond to that EC number. # # PARAMETERS # ---------- # 'filename' - the GenBANK file to read ######## def locus_to_ec (filename): # dictionary for converting locus tags to EC numbers ec2tag = dict() gene_product=dict() input_handle = open(filename) BASE = SeqIO.parse(input_handle, "genbank") # iterate through every record in the GenBANK file for record in BASE: ckey = record.name # iterate through feature in the GenBANK record for feature in record.features: if (('locus_tag' in feature.qualifiers.keys()) and ('EC_number' in feature.qualifiers.keys())): # ensure that there is only one locus tag # for the current GenBank feature if len(feature.qualifiers['locus_tag']) > 1: print( "INVALID LOCUS TAG: " + str(feature.qualifiers['locus_tag'])) sys.exit(1) # extract the feature data tag = feature.qualifiers['locus_tag'][0] # extract the EC number information ecnums = feature.qualifiers['EC_number'] #extract protein function if("product" in feature.qualifiers.keys()): gene_product[tag]=feature.qualifiers["product"][0] else: gene_product[tag]="NA" # add the locus tags to the for num in ecnums: if not num in ec2tag.keys(): ec2tag[num] = list() ec2tag[num].append(tag) # extract the DNA sequence # seq = feature.extract(record.seq) input_handle.close() return [ec2tag, gene_product] #Find cluster of genes: assume locus tag increasing by 5 def locus_cluster (filename): IMGF = open(filename,"r") line=IMGF.readline() line=IMGF.readline() cluster_tag=line.split('\t')[0] last_tag_index=cluster_tag.split('_')[1] ClusterF=open(filename+"_cluster","w") while(line): arr1=line.split('\t') arr2=arr1[0].split('_') if((arr2[1][0:2]=="0r") or (arr2[1][0:2]=="0t")): print(arr2) elif(int(arr2[1])==int(last_tag_index)+5): cluster_tag=cluster_tag+','+arr1[0] last_tag_index=arr2[1] else: if(len(cluster_tag.split(','))>1): ClusterF.write(str(len(cluster_tag.split(',')))+"\t"+cluster_tag+"\n") cluster_tag=arr1[0] last_tag_index=arr2[1] line=IMGF.readline() IMGF.close() #Mapping locus tag to COG number def locus_to_COG (filename): # dictionary for converting locus tags to EC numbers cog2tag = dict() #gene_product=dict() cog=dict() IMGF = open(filename,"r") line=IMGF.readline() while(line): if re.search(r'PPUTLS46_\d+\tCOG_category',line): arr1=line.split('\t') tag=arr1[1] # add the locus tags to the if not tag in cog2tag.keys(): cog2tag[tag] = list() cog2tag[tag].append(arr1[3]) if not arr1[3] in cog.keys(): cog[arr1[3]]=list() cog[arr1[3]].append(tag) line=IMGF.readline() IMGF.close() CogFile=open(filename+"_cog.txt","w") for cog_key in cog.keys(): CogFile.write(cog_key+"\t"+str(len(cog[cog_key]))+"\t"+":".join(cog[cog_key])+"\n") CogFile.close() #return cog2tag #return [cog2tag, gene_product] def IMGTag2COG(IMG_file): tag2cog=dict() IMGF = open(IMG_file,"r") line=IMGF.readline() while(line): if re.search(r'SMa.+\tCOG_category',line):#if re.search(r'PPUTLS46_.+\tCOG_category',line): arr1=line.rstrip().split('\t') if(arr1[1] in tag2cog.keys()): tag2cog[arr1[1]]=tag2cog[arr1[1]]+":"+arr1[3] else: tag2cog[arr1[1]]=arr1[3] line=IMGF.readline() IMGF.close() return tag2cog def Sm_IMGTag2Product(IMG_file): tag2cog=dict() tag2length=dict() IMGF = open(IMG_file,"r") line=IMGF.readline() while(line): if re.search(r'.+\tProduct_name',line): arr1=line.rstrip().split('\t') if(len(arr1)<4): tag2cog[arr1[1]]="N/A" else: tag2cog[arr1[1]]=arr1[4] elif re.search(r'.+\tDNA_length',line): arr1=line.rstrip().split('\t') tag2length[arr1[1]]=arr1[4] line=IMGF.readline() IMGF.close() return (tag2cog,tag2length) def IMGTag2Product(IMG_file): tag2cog=dict() tag2length=dict() IMGF = open(IMG_file,"r") line=IMGF.readline() while(line): if re.search(r'PPUTLS46_.+\tProduct_name',line): arr1=line.rstrip().split('\t') tag2cog[arr1[1]]=arr1[4] elif re.search(r'PPUTLS46_.+\tDNA_length',line): arr1=line.rstrip().split('\t') tag2length[arr1[1]]=arr1[4] line=IMGF.readline() IMGF.close() return (tag2cog,tag2length) def FPKM_addIMGCOG(): files_in_dir=glob.glob('*withLocusTag') tag2cog=IMGTag2COG("/local/workspace01/zhangju/PutidaLS46_RNASeq/KEGG/IMG_gene.txt") for file_fpkm in files_in_dir: CogFile=open(file_fpkm+"_multi_cog.txt","w") FPKM=open(file_fpkm,"r") line=FPKM.readline() CogFile.write("COG\t"+line) line=FPKM.readline() while(line): arr1=line.split('\t') #if(arr1[1]=="PPUTLS46_000390"): #print("390") #else: #print(arr1[1]) if(arr1[0] in tag2cog.keys()): cog_array=tag2cog[arr1[0]].split(':') for cog in cog_array: CogFile.write(cog+"\t"+line) else: CogFile.write("None\t"+line) line=FPKM.readline() CogFile.close() FPKM.close() def Cuffdiff_addKEGG(): files_in_dir=glob.glob('*.txt') #files_in_dir=glob.glob('*_cuffdiff') tag2kegg=IMGTag2KEGG('/local/workspace01/zhangju/PutidaLS46_RNASeq/KEGG/kegg_ko_gene_set.txt') #tag2cog=IMGTag2COG("/local/workspace01/zhangju/PutidaLS46_RNASeq/KEGG/IMG_gene.txt") for file_fpkm in files_in_dir: CogFile=open(file_fpkm+"_KEGG","w") FPKM=open(file_fpkm,"r") line=FPKM.readline() CogFile.write("KEGG\t"+line) line=FPKM.readline() while(line): arr1=line.split('\t') if(arr1[0] in tag2kegg.keys()): kegg_array=tag2kegg[arr1[0]].split(':') for cog in kegg_array: CogFile.write(cog+"\t"+line) else: CogFile.write("None\t"+line) line=FPKM.readline() CogFile.close() FPKM.close() def IMGTag2KEGG(filename): tag2kegg=dict() IMGF = open(filename,"r") line=IMGF.readline() while(line): arr1=line.rstrip().split('\t') kegg=arr1[0] for tag in arr1[1:]: if(tag in tag2kegg): tag2kegg[tag]=tag2kegg[tag]+":"+kegg else: tag2kegg[tag]=kegg line=IMGF.readline() IMGF.close() return tag2kegg def Cuffdiff_addIMGCOG(): files_in_dir=glob.glob('*diff_product.txt')#files_in_dir=glob.glob('*.txt') #files_in_dir=glob.glob('*_cuffdiff') #tag2cog=IMGTag2COG("/local/workspace01/zhangju/PutidaLS46_RNASeq/KEGG/IMG_gene.txt") tag2cog=IMGTag2COG("/local/workspace01/zhangju/sm1021/IMG/IMG_gene.txt") for file_fpkm in files_in_dir: CogFile=open(file_fpkm+"_COG","w") FPKM=open(file_fpkm,"r") line=FPKM.readline() line=FPKM.readline() CogFile.write("COG\t"+line) line=FPKM.readline() while(line): arr1=line.split('\t') if(arr1[1] in tag2cog.keys()): #if(arr1[0] in tag2cog.keys()): cog_array=tag2cog[arr1[1]].split(':')#cog_array=tag2cog[arr1[0]].split(':') for cog in cog_array: CogFile.write(cog+"\t"+line) else: CogFile.write("None\t"+line) line=FPKM.readline() CogFile.close() FPKM.close() def FPKM_or_Cuffdiff_addIMGProduct(): #files_in_dir=glob.glob('*.fpkm_tracking_withLocusTag') #files_in_dir=glob.glob('*.diff') #files_in_dir=glob.glob('*') #files_in_dir=glob.glob('*ATCC27405*tracking') #files_in_dir=glob.glob('*DSM1313_genes.fpkm_tracking') #files_in_dir=glob.glob('*refDSM1313*') files_in_dir=glob.glob('*refATCC27405*') #(tag2cog,tag2length)=IMGTag2Product("/local/workspace01/zhangju/PutidaLS46_RNASeq/KEGG/IMG_gene.txt") #(tag2cog,tag2length)=Sm_IMGTag2Product("/local/workspace01/zhangju/sm1021/KEGG/IMG_gene.txt") #(tag2cog,tag2length)=Sm_IMGTag2Product("/local/workspace01/zhangju/Caldi_sac/IMGGeneCaldSac.xls") #(tag2cog,tag2length)=Sm_IMGTag2Product("/local/workspace01/zhangju/Thermo_petrophila/IMGGene_ThermoPetro.xls") (tag2cog,tag2length)=Sm_IMGTag2Product("/local/workspace01/zhangju/Cthermocellum/ATCC27405_IMGGene.xls") #(tag2cog,tag2length)=Sm_IMGTag2Product("/local/workspace01/zhangju/Cthermocellum/DSM1313_IMGGene.xls") #(tag2cog,tag2length)=IMGTag2Product("/local/workspace01/zhangju/sm1021/KEGG/IMG_gene.txt") for file_fpkm in files_in_dir: #CogFile=open(file_fpkm+"_product.txt","w") #CogFile=open("/local/workspace01/zhangju/Caldi_sac/FPKM_withIMGFeature/"+file_fpkm,"w") #CogFile=open("/local/workspace01/zhangju/Thermo_petrophila/FPKM_withIMGFeature/"+file_fpkm,"w") CogFile=open("/local/workspace01/zhangju/Cthermocellum/FPKM_withFeature/"+file_fpkm,"w") FPKM=open(file_fpkm,"r") line=FPKM.readline() #CogFile.write("Product\t"+line)#.rstrip()+"\tDNA length\n") CogFile.write("Product\t"+line)#.rstrip()+"\tDNA length\n") line=FPKM.readline() while(line): arr1=line.split('\t') #CogFile.write(tag2cog[arr1[0]]+"\t"+line.rstrip()+"\t"+tag2length[arr1[0]][:-2]+"\n") CogFile.write(tag2cog[arr1[0]]+"\t"+line)#.rstrip()+"\t"+tag2length[arr1[0]][:-2]+"\n") line=FPKM.readline() CogFile.close() FPKM.close() #Mapping locus tag to ko number def locus_to_ko (filename): # dictionary for converting locus tags to EC numbers ko2tag = dict() gene_product=dict() IMGF = open(filename,"r") line=IMGF.readline() while(line): if re.search(r'PPUTLS46_\d+\tKO:K\d+',line): arr1=line.split() tag=arr1[1] num=arr1[2].split(':')[1] # add the locus tags to the if not num in ko2tag.keys(): ko2tag[num] = list() ko2tag[num].append(tag) line=IMGF.readline() IMGF.close() return [ko2tag, gene_product] ''' input_handle = open(filename) BASE = SeqIO.parse(input_handle, "genbank") # iterate through every record in the GenBANK file for record in BASE: ckey = record.name # iterate through feature in the GenBANK record for feature in record.features: if (('locus_tag' in feature.qualifiers.keys()) and ('EC_number' in feature.qualifiers.keys())): # ensure that there is only one locus tag # for the current GenBank feature if len(feature.qualifiers['locus_tag']) > 1: print( "INVALID LOCUS TAG: " + str(feature.qualifiers['locus_tag'])) sys.exit(1) # extract the feature data tag = feature.qualifiers['locus_tag'][0] # extract the EC number information ecnums = feature.qualifiers['EC_number'] #extract protein function if("product" in feature.qualifiers.keys()): gene_product[tag]=feature.qualifiers["product"][0] else: gene_product[tag]="NA" # add the locus tags to the for num in ecnums: if not num in ec2tag.keys(): ec2tag[num] = list() ec2tag[num].append(tag) # extract the DNA sequence # seq = feature.extract(record.seq) input_handle.close() ''' ######## # get_keggpw_ecs(pathway) ######## # SYNOPSIS # -------- # Obtains (via the KEGG API - http://www.kegg.jp/kegg/rest/keggapi.html) # the list of EC numbers associated with a given pathway. # # PARAMETERS # ---------- # 'pathway' - the KEGG pathway ID to obtain the EC numbers for. ######## def get_keggpw_ecs(pathway): url = 'http://rest.kegg.jp/link/ec/' + pathway #url='http://www.genome.jp/dbget-bin/get_linkdb?-t+orthology+path:'+pathway params = dict() handle = urlopen(url) #, params) reader=handle.read() #prog = re.compile(r"ec:\d+\.\d+\.\d+\.\d+") prog = re.compile(r"EC:\d+\.\d+\.\d+\.\d+") result = re.findall(prog,str(reader)) #reader = csv.reader(handle, dialect='excel-tab') #result = [line[1][3:] for line in reader if len(str(line)) > 1 and line[1].lower().startswith('ec:')] #handle.close() return result def get_keggpw_kos(pathway): #url = 'http://rest.kegg.jp/link/ec/' + pathway url='http://www.genome.jp/dbget-bin/get_linkdb?-t+orthology+path:'+pathway params = dict() handle = urlopen(url) #, params) reader=handle.read() #prog = re.compile(r"ec:\d+\.\d+\.\d+\.\d+") prog = re.compile(r">(K\d+)<") result = re.findall(prog,str(reader)) #reader = csv.reader(handle, dialect='excel-tab') #result = [line[1][3:] for line in reader if len(str(line)) > 1 and line[1].lower().startswith('ec:')] #handle.close() return result ######## # get_keggpath(imgLS46_list, kegg_allPutida) ######## # -------- # imgLS46_list has pathway category and names of each pathway # kegg_allPutida has all pathways of Putida with pathway id and name # Obtain the list of path id for IMG LS46 ########## def get_keggid(imgLS46_list, kegg_allPathway_file): IMGF = open(imgLS46_list,"r") kegg_name=list() line=IMGF.readline() while(line): if re.search("\(\d",line): arr1 = re.split('\(\d',line) kegg_name.append(arr1[0].strip().rstrip()) line=IMGF.readline() IMGF.close() kegg_id = dict() PutidaF = open(kegg_allPathway_file,"r") line=PutidaF.readline() while(line): arr1=line.rstrip().split(' ') kname="" if(len(arr1)==4): kname=arr1[3] if(kname in kegg_name): #KEGG pathway options - ec, ko, general #ko path has more items than map path #For example map02020 (two-component system has no entries # but ko02020 has entries # here we use ko to include map_id="ko"+arr1[2].rstrip() kegg_id[map_id.replace(" ","")]=kname line=PutidaF.readline() PutidaF.close() return kegg_id ''' arr1=re.split(r' ',line.rstrip()) pre_size=3 if(len(arr1)==1): arr1=re.split(r' ',line.rstrip()) pre_size=4 kid="map"+arr1[0][pre_size:] kname=arr1[1].split("-")[0].rstrip() if(kname in kegg_name): kegg_id[kid]=kname ''' def get_ec_geneset(): img_kegg_file = "LS46KEGG_fromIMG.txt" kegg_allPathway_file="kegg_pathway.txt" gbk_file = "Nov20_IMG25372.gbk" #"ALPV02.gbf" gbk_file_convert=fixing_img_gbk(gbk_file) kegg_list = get_keggid(img_kegg_file,kegg_allPathway_file) [ec2tag,gene_product] = locus_to_ec(gbk_file_convert) # GenBANK img_gene_file="IMG_gene.txt" # downloaed from IMG gene information KEGGF = open("kegg_gene_set.txt","w") count=0 for keggid in kegg_list: count=count+1 #KEGGF.write("set_"+str(count)+"\t"+kegg_list[keggid]) KEGGF.write(kegg_list[keggid]) # print(kegg_list[keggid]+"\n======\n") # map the EC numbers in the pathway to locus tags in the genome ecnums = get_keggpw_ecs(keggid) # KEGG ID result = list() for ec in ecnums: #if ec.split(':')[1] in ec2tag.keys(): # result.extend(ec2tag[ec.split(':')[1]]) if ec.split(':')[1] in ec2tag.keys(): result.extend(ec2tag[ec.split(':')[1]]) # print the results KEGGF.write("("+str(len(result))+")") for tag in result: # print(tag+'\t'+gene_product[tag]) KEGGF.write("\t"+tag) KEGGF.write("\n") KEGGF.close() def get_ko_geneset(): img_kegg_file = "LS46KEGG_fromIMG.txt" kegg_allPathway_file="kegg_pathway.txt" gbk_file = "Nov20_IMG25372.gbk" #"ALPV02.gbf" #gbk_file_convert=fixing_img_gbk(gbk_file) kegg_list = get_keggid(img_kegg_file,kegg_allPathway_file) #[ec2tag,gene_product] = locus_to_ec(gbk_file_convert) # GenBANK img_gene_file="IMG_gene.txt" # downloaed from IMG gene information [ko2tag,gene_product] = locus_to_ko(img_gene_file) # GenBANK KEGGF = open("kegg_gene_set.txt","w") count=0 for keggid in kegg_list: count=count+1 #KEGGF.write("set_"+str(count)+"\t"+kegg_list[keggid]) KEGGF.write(kegg_list[keggid]) # print(kegg_list[keggid]+"\n======\n") # map the EC numbers in the pathway to locus tags in the genome kos = get_keggpw_kos(keggid) # KEGG ID result = list() for ko in kos: #if ec.split(':')[1] in ec2tag.keys(): # result.extend(ec2tag[ec.split(':')[1]]) if ko in ko2tag.keys(): result.extend(ko2tag[ko]) # print the results KEGGF.write("("+str(len(result))+")") for tag in result: # print(tag+'\t'+gene_product[tag]) KEGGF.write("\t"+tag) KEGGF.write("\n") KEGGF.close() ######## # main - kegg_mapper.py ######## # SYNOPSIS # -------- # Reads in a GenBANK file and determines which locus tags (genes) are # involved in a given KEGG pathway. The tags are read from the GenBANK # file using the method 'locus_to_ec'. The pathway is read from KEGG # using the method 'get_keggpw_ecs'. All comparison procedures are # currently done in the body of the main method (below). ######## if __name__ == "__main__": #locus_to_COG("IMG_gene.txt") #FPKM_addIMGCOG() #Cuffdiff_addIMGCOG() FPKM_or_Cuffdiff_addIMGProduct() #get_ko_geneset() #Cuffdiff_addKEGG() #locus_cluster(sys.argv[1]) ''' # Ensure that the proper command line structure is used # for calling the command. (i.e. ensure that a pathway, # and a genome (GenBank file) are supplied). if len(sys.argv) < 2: print("Usage: kegg_mapper.py ") sys.exit() else: # initialize the results list result = list() # TODO (?): handle dashes!!! #print( >> sys.stderr, "NOTE: EC #'s with dashes are dropped!") # read the GenBANK file and load the KEGG pathway ecnums = get_keggpw_ecs(sys.argv[1]) # KEGG ID [ec2tag,gene_product] = locus_to_ec(sys.argv[2]) # GenBANK # map the EC numbers in the pathway to locus tags in the genome for ec in ecnums: if ec.split(':')[1] in ec2tag.keys(): result.extend(ec2tag[ec.split(':')[1]]) # print the results for tag in result: print(tag+'\t'+gene_product[tag]) '''