# To change this template, choose Tools | Templates # and open the template in the editor. import sys import os import re sys.path.append('/home/psgendb/local/biopython-1.55') from Bio import SeqIO from Bio.Seq import Seq #from Bio.Alphabet import generic_dna from Bio.Alphabet import IUPAC from operator import itemgetter class Fasta(): def __init__(self, fastaFile): self.fFile=fastaFile def tab_to_fasta(self, tabFile): f=open(tabFile) outF=open(self.fFile,"w") for line in f: line.rstrip('\n') arr=line.split() outF.write('>'+arr[0]+'\n') outF.write(arr[1]+'\n') f.close() outF.close() def getSingleSeq(self,title): singleF=open(self.fFile+"_"+title+".ffn","w") for record in SeqIO.parse(open(self.fFile,"r"), "fasta") : if record.id==title: singleF.write(">"+title+"\n"+str(record.seq)+"\n") break singleF.close() def getSubSeq(self,title,start,end,c): # c- 1 complimentary; title- sequence id for record in SeqIO.parse(open(self.fFile,"r"), "fasta") : if record.id==title: dna_str=(str(record.seq)[(int(start)-1):int(end)]) if int(c)==-1: my_dna=Seq(str(dna_str),IUPAC.unambiguous_dna) dna_str=str(my_dna.reverse_complement()) return dna_str def getSubSet(self,str_include): outfile=open(self.fFile+"_"+str_include+".ffn","w") for record in SeqIO.parse(open(self.fFile,"r"), "fasta") : print(record.id) if (str_include in record.description): outfile.write(">"+record.description.split()[1]+"\n" \ +str(record.seq)+"\n") ''' if record.id==title: dna_str=(str(record.seq)[(int(start)-1):int(end)]) if int(c)==-1: my_dna=Seq(str(dna_str),IUPAC.unambiguous_dna) dna_str=str(my_dna.reverse_complement()) return dna_str ''' def NtoT(self): outF=open(self.fFile+"_NtoT.ffn","w") outS=open(self.fFile+"_NtoT.stat","w") # for record in SeqIO.parse(open(self.fFile,"r"), "fasta") : def getuniquegene(self): self.uniF=open(self.fFile+"_unique.ffn","w") self.shortF=open(self.fFile+"_short.ffn","w") self.dupF=open(self.fFile+"_dup.desc","w"); self.dupIdF=open(self.fFile+"_dup.id","w"); genes={}; for record in SeqIO.parse(open(self.fFile,"r"), "fasta") : genes[record.id]={} genes[record.id]["seq"]=str(record.seq) genes[record.id]["dup"]="no" if len(record.seq)<60: genes[record.id]["len"]="short" else: genes[record.id]["len"]="long" genes[record.id]["title"]='|'.join(record.description.split("\t")) dupgenes={} dupId=[] sortedgenes=list(genes.keys()) sortedgenes.sort() for key in sortedgenes: dup=0 for dkey in dupgenes: if genes[key]["seq"] == dupgenes[dkey]["seq"]: if dupgenes[dkey]["target"] =="": dupgenes[dkey]["target"]=key else: dupgenes[dkey]["target"]=dupgenes[dkey]["target"]+"\t"+key dup=1 dupId.append(key) break if dup==0: dupgenes[key]={} dupgenes[key]["seq"]=genes[key]["seq"] dupgenes[key]["target"]="" dupF=open(self.fFile+"_dup.desc","w"); dupIdF=open(self.fFile+"_dup.id","w"); dup_count=0 total=0 for key in dupgenes: if dupgenes[key]["target"]=="": continue dup_count=dup_count+1 ids=dupgenes[key]["target"].split() dupF.write("------------------------------------------\n\n") dupF.write("- No. "+str(dup_count)+" list of genes with identical sequence\n") dupF.write("------------------------------------------\n\n") dupIdF.write("\n---------------------------\n") dupF.write(key+"\t"+genes[key]["title"]+"\n") dupIdF.write(key) for item in ids: total=total+1 dupF.write(item+"\t"+genes[item]["title"]+"\n") dupIdF.write("\t"+item) dupIdF.write("\n") dupF.write("total duplicate genes - "+str(total)+"\n") dupF.close() dupIdF.close() uniF=open(self.fFile+"_unique.ffn","w") shortF=open(self.fFile+"_short.ffn","w") for key in sortedgenes: if ((not (key in dupId)) | (genes[key]["len"]=="short")): uniF.write(">"+genes[key]["title"]+"\n") uniF.write(genes[key]["seq"]+"\n") if genes[key]["len"]=="short": shortF.write(">"+key+"\n") shortF.write(genes[key]["seq"]+"\n") uniF.close() shortF.close() def eArrayProbeToFasta(self,probeFile): fastaFile=self.fFile probeF=open(probeFile,'r') FFile=open(fastaFile,'w') locus="Tpe" for line in probeF: line=line.rstrip('\n') if line[0:3]!=locus: continue arrs=line.split('\t') FFile.write(">"+arrs[0]+"\n") FFile.write(arrs[2]+"\n") probeF.close() FFile.close() def geneNotInNewAssembly(self): ls=[] noId=set(ls) f=open("IMGOldGene_on_newAssembly.m8","r") o1=open("OldGeneNotInNewAssembly_9595.fna","w") curContig="" hit=0 for line in f: arr=line.rstrip().split() if arr[0].split("_")[0]=="2506431805": print(arr[0]) if curContig!=arr[0].split("_")[0]: if hit==0: noId.add(curContig) curContig=arr[0].split("_")[0] hit=0 # continue if abs(float(arr[3])/(float(arr[0].split("_")[1])+1))>0.95 and float(arr[2])>95: hit=1 f.close() for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): # len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0])) if record.id in noId: o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n") def subsetMerge(self): ls=[] hitId=set(ls) f=open("mergeContig_on_notInContig.m8","r") o1=open("subSetMerge.ffn","w") curContig="" for line in f: arr=line.rstrip().split() if abs(float(arr[3]))/float(arr[0].split("_")[1])>0.95 and float(arr[2])>95: curContig=arr[0].split("_")[0] hitId.add(curContig) f.close() for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): # len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0])) if record.id not in hitId: o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n") def seqNotInFasta(self): ls=[] noId=set(ls) id=90 align=0.8 blastm8="allContig_on_Scaffold.m8" f=open(blastm8,"r") o1=open(blastm8.split(".")[0]+"Not_id"+str(id)+"_align"+str(align)+".ffn","w") curContig="" hit=0 for line in f: arr=line.rstrip().split() if arr[0].split("_")[0]=="2505393056": print(line) if curContig!=arr[0].split("_")[0]: if hit==0: noId.add(curContig) curContig=arr[0].split("_")[0] hit=0 # continue if abs(float(arr[3]))/float(arr[0].split("_")[1])>align and float(arr[2])>id: hit=1 f.close() for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): # len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0])) if record.id in noId: o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n") def contig_len(self): FFile=open(self.fFile+"_len.ffn","w") for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): # len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0])) FFile.write(">"+str(record.id)+"_"+str(len(record.seq))+"\n") FFile.write(str(record.seq)+"\n") FFile.close() def concatenation(self,gap): FFile=open(self.fFile+"_concat.ffn","w") FFile.write(">"+self.fFile+" concatenation\n") SFile=open(self.fFile+"_sort.ffn","w") dseq={} dlen={} for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): dseq[str(record.id)]=str(record.seq) dlen[str(record.id)]=len(str(record.seq)) items=sorted(dlen, key=dlen.__getitem__,reverse=True) for seq_id in items: print(seq_id+"\n") FFile.write(dseq[str(seq_id)]+gap*'N') SFile.write(">"+str(seq_id)+"_"+str(len(dseq[str(seq_id)]))+"\n"+dseq[str(seq_id)]+"\n") FFile.close() SFile.close() def fastatosingle(self): fastaFile_dir=self.fFile+"_single" if not os.path.isdir(fastaFile_dir): os.makedirs(fastaFile_dir) for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): FFile=open(fastaFile_dir+"/"+str(record.id)+".ffn","w") FFile.write(">"+str(record.id)+"\n") FFile.write(str(record.seq)+"\n") FFile.close() def GC_content(self): GCF=open(self.fFile+"_GC","w") for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): str_seq=str(str(record.seq)) intGC=int(str_seq.count("g")+str_seq.count("G")+ str_seq.count("c")+str_seq.count("C")) ratio=int(float(intGC)/float(len(str_seq))*100) GCF.write(str(ratio)+"\n") GCF.close() cmd_str="R CMD BATCH '--args df=\""+self.fFile+"_GC\"'"+" /local/workspace01/zhangju/script/R_script/Histogram.r GC_run.out" os.system(cmd_str) def Scaffolds_to_Contigs(self): contigFile=open(self.fFile+"_contigs.fna","w") contig_index=1 title_h={} seq_h={} for record in SeqIO.parse(open(self.fFile,"r"),"fasta"): str_seq=str(str(record.seq)) arr=str_seq.split('N') for item in arr: if(item==""): continue else: title="lanl_contig"+str('%05d' % contig_index)+"_"+str(len(item)) title_h[title]=len(item) seq_h[title]=item contig_index=contig_index+1 reverse_title=list(sorted(title_h,key=title_h.__getitem__,reverse=True)) for c_title in reverse_title: contigFile.write(">"+c_title+"\n"+seq_h[c_title]+"\n") contigFile.close()