import sys sys.path.append('/home/psgendb/local/biopython-1.55') import Bio from Bio import SeqIO, SeqFeature from Bio.SeqRecord import SeqRecord import os import glob import re #modifying to loop genbank with multiple records # Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming # Released under Biopython license. http://www.biopython.org/DIST/LICENSE # Do not remove this comment def merge_gbk(): #original IMG genbank could not read by Biopython - sequence 1 column further than the standard ncbi format #parse by bioperl and rewrite that is fine input_handle = open("rewrite_gbk.gbk", "rU") count_rec=0 combined_record="empty" for record in SeqIO.parse(input_handle, "genbank") : if(count_rec==0): combined_record=record else: combined_record=combined_record+("N"*4)+record count_rec=count_rec+1 print(record) input_handle.close() SeqIO.write(combined_record, "combined.gbk", "genbank") #The format downloaded from IMG site is not support for Biopython #Sequence lines, there is one space more than NCBI format def fixing_img_gbk(img_gbk_file): # if if re.search(r'\s+\d+',str1): IMGF = open(img_gbk_file,"r") ConvertF=open(img_gbk_file+"_converted.gbk","w") line=IMGF.readline() origin=False while(line): if(line.strip()=="ORIGIN"): origin=True if(line.strip()=="//"): origin=False #if re.search(r'^\s+\d+',line): if(origin): ConvertF.write(line[1:]) else: ConvertF.write(line) line=IMGF.readline() IMGF.close() ConvertF.close() return img_gbk_file+"_converted.gbk" def get_interregions(genbank_path,intergene_length=1): outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta" for seq_record in SeqIO.parse(open(genbank_path), "genbank"): cds_list_plus = [] cds_list_minus = [] intergenic_records = [] # Loop over the genome file, get the CDS features on each of the strands for feature in seq_record.features: if feature.type == 'CDS': mystart = feature.location._start.position myend = feature.location._end.position if feature.strand == -1: cds_list_minus.append((mystart,myend,-1)) elif feature.strand == 1: cds_list_plus.append((mystart,myend,1)) else: sys.stderr.write("No strand indicated %d-%d. Assuming +\n" % (mystart, myend)) cds_list_plus.append((mystart,myend,1)) for i,pospair in enumerate(cds_list_plus[1:]): # Compare current start position to previous end position last_end = cds_list_plus[i][1] this_start = pospair[0] strand = pospair[2] if this_start - last_end >= intergene_length: intergene_seq = seq_record.seq[last_end:this_start] strand_string = "+" intergenic_records.append( SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i), description="%s %d-%d %s" % (seq_record.name, last_end+1, this_start,strand_string))) for i,pospair in enumerate(cds_list_minus[1:]): last_end = cds_list_minus[i][1] this_start = pospair[0] strand = pospair[2] if this_start - last_end >= intergene_length: intergene_seq = seq_record.seq[last_end:this_start] strand_string = "-" intergenic_records.append( SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i), description="%s %d-%d %s" % (seq_record.name, last_end+1, this_start,strand_string))) SeqIO.write(intergenic_records, open(outpath,"w"), "fasta") if __name__ == '__main__': # merge_gbk() # sys.exit(0) if len(sys.argv) == 2: get_interregions(sys.argv[1]) elif len(sys.argv) == 3: get_interregions(sys.argv[1],int(sys.argv[2])) else: # print "Usage: get_intergenic.py gb_file [intergenic_length]" sys.exit(0)