# To change this template, choose Tools | Templates
# and open the template in the editor.
import sys
import os
import re
sys.path.append('/home/psgendb/local/biopython-1.55')
from Bio import SeqIO
from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
from Bio.Alphabet import IUPAC
from operator import itemgetter

class Fasta():

    def __init__(self, fastaFile):
        self.fFile=fastaFile
        
    def tab_to_fasta(self, tabFile):
        f=open(tabFile)
        outF=open(self.fFile,"w")
        for line in f:
            line.rstrip('\n')
            arr=line.split()
            outF.write('>'+arr[0]+'\n')
            outF.write(arr[1]+'\n')
        f.close()
        outF.close()

    def getSingleSeq(self,title):
        singleF=open(self.fFile+"_"+title+".ffn","w")
        for record in SeqIO.parse(open(self.fFile,"r"), "fasta") :
            if record.id==title:
                singleF.write(">"+title+"\n"+str(record.seq)+"\n")
                break
        singleF.close()

    def getSubSeq(self,title,start,end,c):
#        c- 1 complimentary; title- sequence id
        for record in SeqIO.parse(open(self.fFile,"r"), "fasta") :
            if record.id==title:
                dna_str=(str(record.seq)[(int(start)-1):int(end)])
                if int(c)==-1:
                    my_dna=Seq(str(dna_str),IUPAC.unambiguous_dna)
                    dna_str=str(my_dna.reverse_complement())
                return dna_str
            
    def getSubSet(self,str_include):
        outfile=open(self.fFile+"_"+str_include+".ffn","w")
        for record in SeqIO.parse(open(self.fFile,"r"), "fasta") :
            print(record.id)
            if (str_include in record.description):
                outfile.write(">"+record.description.split()[1]+"\n" \
                              +str(record.seq)+"\n")
            '''
            if record.id==title:
                dna_str=(str(record.seq)[(int(start)-1):int(end)])
                if int(c)==-1:
                    my_dna=Seq(str(dna_str),IUPAC.unambiguous_dna)
                    dna_str=str(my_dna.reverse_complement())
                return dna_str
            '''
            
    def NtoT(self):
        outF=open(self.fFile+"_NtoT.ffn","w")
        outS=open(self.fFile+"_NtoT.stat","w")
#        for record in SeqIO.parse(open(self.fFile,"r"), "fasta") :

    def getuniquegene(self):
        self.uniF=open(self.fFile+"_unique.ffn","w")
        self.shortF=open(self.fFile+"_short.ffn","w")
        self.dupF=open(self.fFile+"_dup.desc","w");
        self.dupIdF=open(self.fFile+"_dup.id","w");

        genes={};
        for record in SeqIO.parse(open(self.fFile,"r"), "fasta") :
            genes[record.id]={}
            genes[record.id]["seq"]=str(record.seq)
            genes[record.id]["dup"]="no"
            if len(record.seq)<60:
                genes[record.id]["len"]="short"
            else:
                genes[record.id]["len"]="long"
            genes[record.id]["title"]='|'.join(record.description.split("\t"))
        dupgenes={}
        dupId=[]
        sortedgenes=list(genes.keys())
        sortedgenes.sort()
        for key in sortedgenes:
            dup=0
            for dkey in dupgenes:
                if genes[key]["seq"] == dupgenes[dkey]["seq"]:
                    if dupgenes[dkey]["target"] =="":
                        dupgenes[dkey]["target"]=key
                    else:
                        dupgenes[dkey]["target"]=dupgenes[dkey]["target"]+"\t"+key
                    dup=1
                    dupId.append(key)
                    break
            if dup==0:
                dupgenes[key]={}
                dupgenes[key]["seq"]=genes[key]["seq"]
                dupgenes[key]["target"]=""
        dupF=open(self.fFile+"_dup.desc","w");
        dupIdF=open(self.fFile+"_dup.id","w");
        dup_count=0
        total=0
        for key in dupgenes:
            if dupgenes[key]["target"]=="":
                continue
            dup_count=dup_count+1
            ids=dupgenes[key]["target"].split()
            dupF.write("------------------------------------------\n\n")
            dupF.write("- No. "+str(dup_count)+" list of genes with identical sequence\n")
            dupF.write("------------------------------------------\n\n")
            dupIdF.write("\n---------------------------\n")
            dupF.write(key+"\t"+genes[key]["title"]+"\n")
            dupIdF.write(key)
            for item in ids:
                total=total+1
                dupF.write(item+"\t"+genes[item]["title"]+"\n")
                dupIdF.write("\t"+item)
            dupIdF.write("\n")
        dupF.write("total duplicate genes - "+str(total)+"\n")
        dupF.close()
        dupIdF.close()
        uniF=open(self.fFile+"_unique.ffn","w")
        shortF=open(self.fFile+"_short.ffn","w")
        for key in sortedgenes:
            if ((not (key in dupId)) | (genes[key]["len"]=="short")):
                uniF.write(">"+genes[key]["title"]+"\n")
                uniF.write(genes[key]["seq"]+"\n")
            if genes[key]["len"]=="short":
                shortF.write(">"+key+"\n")
                shortF.write(genes[key]["seq"]+"\n")
        uniF.close()
        shortF.close()

    def eArrayProbeToFasta(self,probeFile):

        fastaFile=self.fFile
        probeF=open(probeFile,'r')
        FFile=open(fastaFile,'w')
        locus="Tpe"
        for line in probeF:
            line=line.rstrip('\n')
            if line[0:3]!=locus:
                continue
            arrs=line.split('\t')
            FFile.write(">"+arrs[0]+"\n")
            FFile.write(arrs[2]+"\n")
        probeF.close()
        FFile.close()

    def geneNotInNewAssembly(self):
        ls=[]
        noId=set(ls)
        f=open("IMGOldGene_on_newAssembly.m8","r")
        o1=open("OldGeneNotInNewAssembly_9595.fna","w")
        curContig=""
        hit=0
        for line in f:
            arr=line.rstrip().split()
            if arr[0].split("_")[0]=="2506431805":
                print(arr[0])
  
            if curContig!=arr[0].split("_")[0]:
                if hit==0:
                    noId.add(curContig)
                curContig=arr[0].split("_")[0]
                hit=0
#                continue
            if abs(float(arr[3])/(float(arr[0].split("_")[1])+1))>0.95 and float(arr[2])>95:
                hit=1
        f.close()
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
#            len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0]))
            if record.id in noId:
                o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n")

    def subsetMerge(self):
        ls=[]
        hitId=set(ls)
        f=open("mergeContig_on_notInContig.m8","r")
        o1=open("subSetMerge.ffn","w")
        curContig=""

        for line in f:
            arr=line.rstrip().split()
            if abs(float(arr[3]))/float(arr[0].split("_")[1])>0.95 and float(arr[2])>95:

                curContig=arr[0].split("_")[0]
                hitId.add(curContig)
        f.close()
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
#            len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0]))
            if record.id not in hitId:
                o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n")

    def seqNotInFasta(self):
        ls=[]
        noId=set(ls)
        id=90
        align=0.8
        blastm8="allContig_on_Scaffold.m8"
        f=open(blastm8,"r")
        o1=open(blastm8.split(".")[0]+"Not_id"+str(id)+"_align"+str(align)+".ffn","w")
        curContig=""
        hit=0
        for line in f:
            arr=line.rstrip().split()
            if arr[0].split("_")[0]=="2505393056":
                print(line)
            if curContig!=arr[0].split("_")[0]:
                if hit==0:
                    noId.add(curContig)
                curContig=arr[0].split("_")[0]
                hit=0
#                continue
            if abs(float(arr[3]))/float(arr[0].split("_")[1])>align and float(arr[2])>id:
                hit=1
        f.close()
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
#            len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0]))
            if record.id in noId:
                o1.write(">"+str(record.description)+"\n"+str(record.seq)+"\n")
    def contig_len(self):
        FFile=open(self.fFile+"_len.ffn","w")
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
#            len=abs(int(record.description.split()[2].split('(')[0].split('.')[2])-int(record.description.split()[2].split('(')[0].split('.')[0]))
            FFile.write(">"+str(record.id)+"_"+str(len(record.seq))+"\n")
            FFile.write(str(record.seq)+"\n")
        FFile.close()
    
    def concatenation(self,gap):
      
        FFile=open(self.fFile+"_concat.ffn","w")
        FFile.write(">"+self.fFile+" concatenation\n")
        SFile=open(self.fFile+"_sort.ffn","w")
        dseq={}
        dlen={}
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
            dseq[str(record.id)]=str(record.seq)
            dlen[str(record.id)]=len(str(record.seq))
        
        items=sorted(dlen, key=dlen.__getitem__,reverse=True)
        for seq_id in items:
            print(seq_id+"\n")
            FFile.write(dseq[str(seq_id)]+gap*'N')
            SFile.write(">"+str(seq_id)+"_"+str(len(dseq[str(seq_id)]))+"\n"+dseq[str(seq_id)]+"\n")

        FFile.close()
        SFile.close()

    def fastatosingle(self):

        fastaFile_dir=self.fFile+"_single"
        if not os.path.isdir(fastaFile_dir):
            os.makedirs(fastaFile_dir)
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
            FFile=open(fastaFile_dir+"/"+str(record.id)+".ffn","w")
            FFile.write(">"+str(record.id)+"\n")
            FFile.write(str(record.seq)+"\n")
            FFile.close()
 
    def GC_content(self):

        GCF=open(self.fFile+"_GC","w")
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
            str_seq=str(str(record.seq))
            intGC=int(str_seq.count("g")+str_seq.count("G")+ str_seq.count("c")+str_seq.count("C"))
            ratio=int(float(intGC)/float(len(str_seq))*100)
            GCF.write(str(ratio)+"\n")
        GCF.close()

        cmd_str="R CMD BATCH '--args df=\""+self.fFile+"_GC\"'"+" /local/workspace01/zhangju/script/R_script/Histogram.r GC_run.out"
        os.system(cmd_str)
        
    def Scaffolds_to_Contigs(self):
        
        contigFile=open(self.fFile+"_contigs.fna","w")
        contig_index=1
        title_h={}
        seq_h={}
        for record in SeqIO.parse(open(self.fFile,"r"),"fasta"):
            str_seq=str(str(record.seq))
            arr=str_seq.split('N')
            for item in arr:
                if(item==""):
                    continue
                else:
                    title="lanl_contig"+str('%05d' % contig_index)+"_"+str(len(item))
                    title_h[title]=len(item)
                    seq_h[title]=item
                    contig_index=contig_index+1
        reverse_title=list(sorted(title_h,key=title_h.__getitem__,reverse=True))
        for c_title in reverse_title:
            contigFile.write(">"+c_title+"\n"+seq_h[c_title]+"\n")
        contigFile.close()