""" Title: Fasta file sequence concatenation; Author:Justin Zhang; Date: Sep 28, 2010 Usage: python concat_seq.py --infile= --outfile=(optional) --seqorderfile=(optional) --numNs=(optional 1-1000) Argument parameters: infile - input fasta file (required); outfile - output file of concatenated sequence (optional) seqorderfile: the list of sequences to be cocatenated, if not provided concatenating all the sequences(optional) numNs - number of Ns (1-1000) between 2 joined sequences, default is 10 (optional) Statistic output file: default created - a file named , listed all the sequence name and length, counted number of sequences, average length Usage cases: 1. Only statistic output - python concat_seq.py --infile=; 2. Concatenating all sequences - python concat_seq.py --infile= --outfile= --numNs= 3. Concatenating sequences listed in seq order file - python concat_seq.py --infile= --outfile= --seqorderfile= --numNs= """ import sys sys.path.append('/home/psgendb/local/biopython-1.55') # append biopython lib path import string import os.path import shutil import re from optparse import OptionParser from Bio import SeqIO def PrintUsage(): print "Usage 1: python concat_seq.py --infile= (for statistics info only)" + '\n' print "Usage 2: python concat_seq.py --infile= --outfile= --numNs=(optional 1-1000 default 10)(concatenate all the sequences in the order)" + '\n' print "Usage 3: python concat_seq.py --infile= --outfile= --seqorderfile= --numNs=(optional 1-1000 default 10)(concatenate list of sequences in seq order file)" + '\n' #*********************************Main Function************************************** parser = OptionParser() parser.add_option("-o", "--outfile", dest="outfile", action="store", type="string") parser.add_option("-i", "--infile", dest="infile", action="store", type="string") parser.add_option("-s", "--seqorderfile", dest="orderfile", action="store", type="string") parser.add_option("-n", "--numNs", dest="numNs", action="store", type="string") parser.add_option("-t", "--fastaheader", dest="title", action="store", type="string") (options, args) = parser.parse_args() if options.infile is None: PrintUsage() sys.exit(2) if options.outfile is None: blnConcate=False else: blnConcate=True if options.orderfile is None: blnOrder=False else: blnOrder=True if not os.path.exists(options.orderfile): print "Sequence order file - " + options.orderfile + " doesnot exist." + '\n' PrintUsage() sys.exit(2) numNs=10 if options.numNs is not None: blnRightNs=True intNs=0 try: intNs=int(options.numNs) if not (intNs>0 and intNs<1001): print "numNs requires an integer in the range (1 to 1000)\n" blnRightNs=False else: numNs=intNs except: print "wrong numNs input\n" blnRightNs=False if not blnRightNs: sys.exit(2) if not os.path.exists(options.infile): print "Input file", str(options.infile), "does not exist." sys.exit(2) else: STATFILE=open(options.infile+"_stat",'w') STATFILE.write("Statistics of sequences in file - " + str(options.infile) + '\n') title="Concatenated sequence in file - " + str(options.infile) if options.title is not None: if str(options.title)=="": print "fasta header input is empty\n" print "continue with default header -" + title + " (enter yes): " proc=str(sys.stdin.readline()) if proc != "yes": sys.exit(2) else: title= str(options.title) try: # if outfile is an appropriate file name if blnConcate: OUTFILE=open(options.outfile,'w') OUTFILE.write(">" + title + '\n') blnFirst=True num_contig=0 total_len=0 SeqHash={'first_id00001':'123456'} #establish hash table intMin=1000000000000000 intMax=0 intShort=0 intLong=0 for seq_record in SeqIO.parse(options.infile, "fasta"): num_contig=num_contig+1 total_len=total_len+len(seq_record) STATFILE.write(seq_record.id + '\t' + str(len(seq_record)) + '\n') if len(seq_record)<100: intShort=intShort+1 if len(seq_record)>=300: intLong=intLong+1 if len(seq_record)intMax: intMax=len(seq_record) if blnConcate: if not blnOrder: # if not blnFirst: # OUTFILE.write('N'*numNs) OUTFILE.write(str(seq_record.seq)+'\n') OUTFILE.write('N'*numNs) blnFirst=False else: SeqHash[seq_record.id]=seq_record.seq if blnConcate: if blnOrder: blnFirst=True SEQFILE=open(options.orderfile,'r') for line in SEQFILE: w=re.search('\w+',line) if SeqHash[w.group(0)]==None: qprint "Sequence - " + w.group(0) + " does not exist.\n" else: if not blnFirst: OUTFILE.write('N'*numNs) blnFirst=False print "writting sequence - " + w.group(0) + '\n' OUTFILE.write(str(SeqHash[w.group(0)])) OUTFILE.close() STATFILE.write("Number sequences: " + str(num_contig)+'\n') STATFILE.write("Average sequence length: " + str(int(total_len/num_contig))+'\n') STATFILE.write("Shortest sequence length: " + str(intMin)+'\n') STATFILE.write("Longest sequence length: " + str(intMax)+'\n') STATFILE.write("Number sequeces shorter than 100: " + str(intShort)+'\n') STATFILE.write("Number sequeces not shorter than 300: " + str(intLong)+'\n') STATFILE.close except IOError: print "The file ", str(options.outfile), "cannot be open to write , exiting gracefully" sys.exit(2)