#!/usr/bin/env python # Copyright 2007, 2008 # Niko Beerenwinkel, # Nicholas Eriksson, # Lukas Geyrhofer, # Osvaldo Zagordi, # ETH Zurich # This file is part of ShoRAH. # ShoRAH is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # ShoRAH is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with ShoRAH. If not, see . from pythonlib import SFFParser, EmbossStandalone from Bio import SeqIO import MyAlignIO import sys,os import time import optparse import random,math needle_exe = 'needle' acclength = 3. amb_thresh = 2 dna_code = ['A','C','T','G'] f_code = {} f_code['R'] = ['G', 'A'] f_code['Y'] = ['T', 'C'] f_code['K'] = ['G', 'T'] f_code['M'] = ['A', 'C'] f_code['S'] = ['G', 'C'] f_code['W'] = ['A', 'T'] f_code['B'] = ['G', 'T', 'C'] f_code['D'] = ['G', 'A', 'T'] f_code['H'] = ['A', 'C', 'T'] f_code['V'] = ['G', 'C', 'A'] f_code['N'] = ['A', 'G', 'C', 'T'] optparser = optparse.OptionParser() optparser.add_option("-f","--readfile",type="string",default="",dest="input") optparser.add_option("-r","--ref",type="string",default="",dest="ref") optparser.add_option("-o","--output",help="output suffix must be '.far' or none for stdout",type="string",dest="o") optparser.add_option("-t","--threshold",help="threshold of errors for throwing reads away... ",type="float",dest="threshold",default=0.4) optparser.add_option("-k","--keep_files",help="dont delete temporary files ",action="store_true",dest="keep") (options,args) = optparser.parse_args() if options.o == None: f_output = sys.stdout else: if(options.o.split('.')[-1] != 'far'): sys.exit('The suffix of output file must be .far') f_output = open(options.o,'w') try: keep_files = options.keep except: keep_files = False # check the reference genome fas_reference = options.ref rh = open(fas_reference, 'rU') ref = list(SeqIO.parse(rh, 'fasta') ) assert len(ref) == 1, 'One and only one sequence must be in the reference file' gen_length = len(ref[0].seq) assert 'N' not in ref[0].seq, "Found an ambiguous position in the reference '%s'" % ref[0].id print 'The reference genome length is', gen_length sff_droppedreads_quality = 0 if options.input.split('.')[-1] == 'sff': sff_reader = SFFParser.SFFReader(options.input) reads = sff_reader.reads f_tmp = open('tmp_reads_f.fas','w') print 'converting sff-file to fasta format...' length = 0. length2 = 0. n =0. readdict = {} for read in reads: seq = ''.join(read.bases).strip('N') if seq.count('N') < amb_thresh: len_seq = len(seq) length += float(len_seq) length2 += float(len_seq*len_seq) readdict[read.name] = [seq,len_seq] n += 1. meanlr = length/n stdlr = math.sqrt((n*length2 - length*length)/(n*n - n)) allowed_length = [meanlr - acclength * stdlr, meanlr + acclength * stdlr] for ID in readdict.iterkeys(): """ print 'bases:\t',read.bases print 'clip_adapter:\t',read.clip_adapter_left,read.clip_adapter_right print 'clip_qual:\t',read.clip_qual_left,read.clip_qual_right print 'eight_byte_padding:\t',read.eight_byte_padding print 'flow_index_per_base:\t',read.flow_index_per_base print 'flowgram_values:\t',read.flowgram_values print 'mydataoffset:\t',read.mydataoffset print 'mytype:\t',read.mytype print 'name:\t',read.name print 'name_length:\t',read.name_length print 'number_of_bases:\t',read.number_of_bases print 'quality_scores:\t',read.quality_scores print 'read_header_length:\t',read.read_header_length sys.exit(1) """ if readdict[ID][1] > allowed_length[0] and readdict[ID][1] < allowed_length[1]: s = '' print >>f_tmp,'>%s_%d'%(ID,readdict[ID][1]) for letter in readdict[ID][0]: if len(s) == 80: print >>f_tmp,s s = '' if letter in dna_code: s += letter elif f_code.has_key(letter): s += random.choice(f_code[letter]) if len(s) != 0: print >>f_tmp,s else: sff_droppedreads_length += 1 f_tmp.close() f_fasta_filename = 'tmp_reads_f.fas' elif options.input.split('.')[-1] in ['fas','fasta']: if os.path.isfile(options.input): f_fasta_filename = options.input else: print 'fasta file "%s" not found...'%options.input sys.exit() else: print 'format of input-file not supported...' sys.exit() f_fasta = open(f_fasta_filename) seqlist = list(SeqIO.parse(f_fasta,'fasta')) for seq in seqlist: seq.seq = seq.seq.reverse_complement() f_fasta.close() countreads = len(seqlist) f_fasta_reverse_filename = 'tmp_reads_r.fas' f_fasta_reverse = open(f_fasta_reverse_filename,'w') SeqIO.write(seqlist,f_fasta_reverse,'fasta') f_fasta_reverse.close() if not os.path.isfile('tmp_align_f.needle'): EmbossStandalone.needle(needle_exe,options.ref,f_fasta_filename,out='tmp_align_f.needle',gapopen=6.0,gapext=3.0,aglobal3='False') else: print >>sys.stderr, 'The alignment file tmp_align_f.needle is already present' statinfo = os.stat('tmp_align_f.needle') age_sec = time.time() - statinfo.st_mtime if age_sec > 3600: print >>sys.stderr, 'Warning: it was modified more than an hour ago' age = time.gmtime(age_sec) print >>sys.stderr, 'If you want to run the alignment again, remove it' print >>sys.stderr, "using existing 'tmp_align_f.needle'..." if not os.path.isfile('tmp_align_r.needle'): EmbossStandalone.needle(needle_exe,options.ref,f_fasta_reverse_filename,out='tmp_align_r.needle',gapopen=6.0,gapext=3.0,aglobal3='False') else: print >>sys.stderr, 'The alignment file tmp_align_r.needle is already present' statinfo = os.stat('tmp_align_f.needle') age_sec = time.time() - statinfo.st_mtime if age_sec > 3600: print >>sys.stderr, 'Warning: it was modified more than an hour ago' age = time.gmtime(age_sec) print >>sys.stderr, 'If you want to run the alignment again, remove it' f_normal = open('tmp_align_f.needle') normalaligniter = iter(MyAlignIO.parse(f_normal,'markx10')) #print dir(normalalign.next()) diff_ident = [] diff_score = [] countreads_afterreverse = 0 count_forward = 0 count_reverse = 0 reads = {} ref = {} ID = 0 maxlen = [0,0] f_reverse = open('tmp_align_r.needle') for reversealign in MyAlignIO.parse(f_reverse,'markx10'): normalalign = normalaligniter.next() assert normalalign.get_all_seqs()[1].id == reversealign.get_all_seqs()[1].id, 'same seq back and forward' #print normalalign.get_all_seqs()[1].id #print reversealign.get_all_seqs()[1].id #print normalalign._annotations['sw_score'] #print reversealign._annotations['sw_score'] #print normalalign._annotations['sw_ident'] #print reversealign._annotations['sw_ident'] #print normalalign.get_seq_by_num(1) #print reversealign.get_seq_by_num(1) # diff_ident.append(float(normalalign._annotations['sw_ident']) - float(reversealign._annotations['sw_ident'])) # diff_score.append(float(normalalign._annotations['sw_score']) - float(reversealign._annotations['sw_score'])) basecount = 0. errorcount = 0. accept_read = False if float(normalalign._annotations['sw_ident']) > float(reversealign._annotations['sw_ident']): tmp = normalalign.get_seq_by_num(1).tostring().upper() refseq = normalalign.get_seq_by_num(0).tostring().upper() count_forward += 1 else: tmp = reversealign.get_seq_by_num(1).tostring().upper() refseq = reversealign.get_seq_by_num(0).tostring().upper() count_reverse += 1 align_start = min([tmp.find('A'),tmp.find('C'),tmp.find('T'),tmp.find('G')]) align_end = max([tmp.rfind('A'),tmp.rfind('C'),tmp.rfind('T'),tmp.rfind('G')]) for i in range(align_start,align_end): if tmp[i] != refseq[i]: errorcount += 1. if tmp[i] in dna_code: basecount += 1. if options.threshold > errorcount/basecount: if len(refseq) > maxlen[0]: maxlen[0] = len(refseq) maxlen[1] = ID reads[ID] = tmp ref[ID] = refseq ID += 1 #tmp = tmp.replace('-','').strip('N') #print >>f_output,">%s"%normalalign.get_all_seqs()[1].id.split('#')[0] #print >>f_output,tmp countreads_afterreverse += 1 print >>sys.stderr,'%d reads were above the threshold, %d reads left'%(countreads - countreads_afterreverse,countreads_afterreverse) try: print >>sys.stderr,'dropped %d reads with wron length'%sff_droppedreads_length except: pass print >>sys.stderr,'forward: %d, reverse: %d'%(count_forward,count_reverse) #assert len(reads) == len(ref) ################################ # propagating the gaps # # from the pairwise alignments # ################################ pos = 0 while pos < maxlen[0]: # list of ref-sequences with no gaps in it at current pos gaplist = ref.keys() removed_from_gaplist = False # list of ref-sequences which already terminated shortend = [] for id in ref.iterkeys(): if len(ref[id]) > maxlen[0]: maxlen[0] = len(ref[id]) maxlen[1] = id try: if ref[id][pos] == '-': gaplist.remove(id) removed_from_gaplist = True except IndexError: shortend.append(id) gaplist.remove(id) # debugging output: #print "\t%d\t%d\t%d\t%d\t%d\t%d"%(pos,gl_size_start,len(gaplist),gl_size_start-len(gaplist),len(shortend),max[0]) # only when there are gaps if removed_from_gaplist: if maxlen[1] in gaplist: maxlen[0] += 1 # go through the list and insert gaps in ref and read for id in gaplist: ref[id] = ref[id][:pos] + '-' + ref[id][pos:] reads[id] = reads[id][:pos] + '-' + reads[id][pos:] # insert gaps in ref and read, when already terminated for id in shortend: ref[id] = ref[id] + '-' reads[id] = reads[id] + '-' pos += 1 # sort according to startpos startpos = {} for ID in reads.iterkeys(): start_align = min([reads[ID].find('A'),reads[ID].find('C'),reads[ID].find('T'),reads[ID].find('G')]) if not startpos.has_key(start_align): startpos[start_align] = [] startpos[start_align].append(ID) for i in range(maxlen[0]): if startpos.has_key(i): for ID in startpos[i]: print >> f_output, ">read#%s"%ID s = '' for letter in reads[ID]: if len(s) == 80: print >> f_output,s s = '' s += letter if len(s) > 0: print >> f_output,s if not keep_files: try: os.remove('tmp_reads_f.fas') except: pass try: os.remove('tmp_reads_r.fas') except: pass try: os.remove('tmp_align_f.needle') except: pass try: os.remove('tmp_align_r.needle') except: pass