#!/usr/bin/env python
# Copyright 2007, 2008
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Lukas Geyrhofer,
# Osvaldo Zagordi,
# ETH Zurich
# This file is part of ShoRAH.
# ShoRAH is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ShoRAH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ShoRAH. If not, see .
from pythonlib import SFFParser, EmbossStandalone
from Bio import SeqIO
import MyAlignIO
import sys,os
import time
import optparse
import random,math
needle_exe = 'needle'
acclength = 3.
amb_thresh = 2
dna_code = ['A','C','T','G']
f_code = {}
f_code['R'] = ['G', 'A']
f_code['Y'] = ['T', 'C']
f_code['K'] = ['G', 'T']
f_code['M'] = ['A', 'C']
f_code['S'] = ['G', 'C']
f_code['W'] = ['A', 'T']
f_code['B'] = ['G', 'T', 'C']
f_code['D'] = ['G', 'A', 'T']
f_code['H'] = ['A', 'C', 'T']
f_code['V'] = ['G', 'C', 'A']
f_code['N'] = ['A', 'G', 'C', 'T']
optparser = optparse.OptionParser()
optparser.add_option("-f","--readfile",type="string",default="",dest="input")
optparser.add_option("-r","--ref",type="string",default="",dest="ref")
optparser.add_option("-o","--output",help="output suffix must be '.far' or none for stdout",type="string",dest="o")
optparser.add_option("-t","--threshold",help="threshold of errors for throwing reads away... ",type="float",dest="threshold",default=0.4)
optparser.add_option("-k","--keep_files",help="dont delete temporary files ",action="store_true",dest="keep")
(options,args) = optparser.parse_args()
if options.o == None:
f_output = sys.stdout
else:
if(options.o.split('.')[-1] != 'far'):
sys.exit('The suffix of output file must be .far')
f_output = open(options.o,'w')
try:
keep_files = options.keep
except:
keep_files = False
# check the reference genome
fas_reference = options.ref
rh = open(fas_reference, 'rU')
ref = list(SeqIO.parse(rh, 'fasta') )
assert len(ref) == 1, 'One and only one sequence must be in the reference file'
gen_length = len(ref[0].seq)
assert 'N' not in ref[0].seq, "Found an ambiguous position in the reference '%s'" % ref[0].id
print 'The reference genome length is', gen_length
sff_droppedreads_quality = 0
if options.input.split('.')[-1] == 'sff':
sff_reader = SFFParser.SFFReader(options.input)
reads = sff_reader.reads
f_tmp = open('tmp_reads_f.fas','w')
print 'converting sff-file to fasta format...'
length = 0.
length2 = 0.
n =0.
readdict = {}
for read in reads:
seq = ''.join(read.bases).strip('N')
if seq.count('N') < amb_thresh:
len_seq = len(seq)
length += float(len_seq)
length2 += float(len_seq*len_seq)
readdict[read.name] = [seq,len_seq]
n += 1.
meanlr = length/n
stdlr = math.sqrt((n*length2 - length*length)/(n*n - n))
allowed_length = [meanlr - acclength * stdlr, meanlr + acclength * stdlr]
for ID in readdict.iterkeys():
"""
print 'bases:\t',read.bases
print 'clip_adapter:\t',read.clip_adapter_left,read.clip_adapter_right
print 'clip_qual:\t',read.clip_qual_left,read.clip_qual_right
print 'eight_byte_padding:\t',read.eight_byte_padding
print 'flow_index_per_base:\t',read.flow_index_per_base
print 'flowgram_values:\t',read.flowgram_values
print 'mydataoffset:\t',read.mydataoffset
print 'mytype:\t',read.mytype
print 'name:\t',read.name
print 'name_length:\t',read.name_length
print 'number_of_bases:\t',read.number_of_bases
print 'quality_scores:\t',read.quality_scores
print 'read_header_length:\t',read.read_header_length
sys.exit(1)
"""
if readdict[ID][1] > allowed_length[0] and readdict[ID][1] < allowed_length[1]:
s = ''
print >>f_tmp,'>%s_%d'%(ID,readdict[ID][1])
for letter in readdict[ID][0]:
if len(s) == 80:
print >>f_tmp,s
s = ''
if letter in dna_code:
s += letter
elif f_code.has_key(letter):
s += random.choice(f_code[letter])
if len(s) != 0:
print >>f_tmp,s
else:
sff_droppedreads_length += 1
f_tmp.close()
f_fasta_filename = 'tmp_reads_f.fas'
elif options.input.split('.')[-1] in ['fas','fasta']:
if os.path.isfile(options.input):
f_fasta_filename = options.input
else:
print 'fasta file "%s" not found...'%options.input
sys.exit()
else:
print 'format of input-file not supported...'
sys.exit()
f_fasta = open(f_fasta_filename)
seqlist = list(SeqIO.parse(f_fasta,'fasta'))
for seq in seqlist:
seq.seq = seq.seq.reverse_complement()
f_fasta.close()
countreads = len(seqlist)
f_fasta_reverse_filename = 'tmp_reads_r.fas'
f_fasta_reverse = open(f_fasta_reverse_filename,'w')
SeqIO.write(seqlist,f_fasta_reverse,'fasta')
f_fasta_reverse.close()
if not os.path.isfile('tmp_align_f.needle'):
EmbossStandalone.needle(needle_exe,options.ref,f_fasta_filename,out='tmp_align_f.needle',gapopen=6.0,gapext=3.0,aglobal3='False')
else:
print >>sys.stderr, 'The alignment file tmp_align_f.needle is already present'
statinfo = os.stat('tmp_align_f.needle')
age_sec = time.time() - statinfo.st_mtime
if age_sec > 3600:
print >>sys.stderr, 'Warning: it was modified more than an hour ago'
age = time.gmtime(age_sec)
print >>sys.stderr, 'If you want to run the alignment again, remove it'
print >>sys.stderr, "using existing 'tmp_align_f.needle'..."
if not os.path.isfile('tmp_align_r.needle'):
EmbossStandalone.needle(needle_exe,options.ref,f_fasta_reverse_filename,out='tmp_align_r.needle',gapopen=6.0,gapext=3.0,aglobal3='False')
else:
print >>sys.stderr, 'The alignment file tmp_align_r.needle is already present'
statinfo = os.stat('tmp_align_f.needle')
age_sec = time.time() - statinfo.st_mtime
if age_sec > 3600:
print >>sys.stderr, 'Warning: it was modified more than an hour ago'
age = time.gmtime(age_sec)
print >>sys.stderr, 'If you want to run the alignment again, remove it'
f_normal = open('tmp_align_f.needle')
normalaligniter = iter(MyAlignIO.parse(f_normal,'markx10'))
#print dir(normalalign.next())
diff_ident = []
diff_score = []
countreads_afterreverse = 0
count_forward = 0
count_reverse = 0
reads = {}
ref = {}
ID = 0
maxlen = [0,0]
f_reverse = open('tmp_align_r.needle')
for reversealign in MyAlignIO.parse(f_reverse,'markx10'):
normalalign = normalaligniter.next()
assert normalalign.get_all_seqs()[1].id == reversealign.get_all_seqs()[1].id, 'same seq back and forward'
#print normalalign.get_all_seqs()[1].id
#print reversealign.get_all_seqs()[1].id
#print normalalign._annotations['sw_score']
#print reversealign._annotations['sw_score']
#print normalalign._annotations['sw_ident']
#print reversealign._annotations['sw_ident']
#print normalalign.get_seq_by_num(1)
#print reversealign.get_seq_by_num(1)
# diff_ident.append(float(normalalign._annotations['sw_ident']) - float(reversealign._annotations['sw_ident']))
# diff_score.append(float(normalalign._annotations['sw_score']) - float(reversealign._annotations['sw_score']))
basecount = 0.
errorcount = 0.
accept_read = False
if float(normalalign._annotations['sw_ident']) > float(reversealign._annotations['sw_ident']):
tmp = normalalign.get_seq_by_num(1).tostring().upper()
refseq = normalalign.get_seq_by_num(0).tostring().upper()
count_forward += 1
else:
tmp = reversealign.get_seq_by_num(1).tostring().upper()
refseq = reversealign.get_seq_by_num(0).tostring().upper()
count_reverse += 1
align_start = min([tmp.find('A'),tmp.find('C'),tmp.find('T'),tmp.find('G')])
align_end = max([tmp.rfind('A'),tmp.rfind('C'),tmp.rfind('T'),tmp.rfind('G')])
for i in range(align_start,align_end):
if tmp[i] != refseq[i]:
errorcount += 1.
if tmp[i] in dna_code:
basecount += 1.
if options.threshold > errorcount/basecount:
if len(refseq) > maxlen[0]:
maxlen[0] = len(refseq)
maxlen[1] = ID
reads[ID] = tmp
ref[ID] = refseq
ID += 1
#tmp = tmp.replace('-','').strip('N')
#print >>f_output,">%s"%normalalign.get_all_seqs()[1].id.split('#')[0]
#print >>f_output,tmp
countreads_afterreverse += 1
print >>sys.stderr,'%d reads were above the threshold, %d reads left'%(countreads - countreads_afterreverse,countreads_afterreverse)
try:
print >>sys.stderr,'dropped %d reads with wron length'%sff_droppedreads_length
except:
pass
print >>sys.stderr,'forward: %d, reverse: %d'%(count_forward,count_reverse)
#assert len(reads) == len(ref)
################################
# propagating the gaps #
# from the pairwise alignments #
################################
pos = 0
while pos < maxlen[0]:
# list of ref-sequences with no gaps in it at current pos
gaplist = ref.keys()
removed_from_gaplist = False
# list of ref-sequences which already terminated
shortend = []
for id in ref.iterkeys():
if len(ref[id]) > maxlen[0]:
maxlen[0] = len(ref[id])
maxlen[1] = id
try:
if ref[id][pos] == '-':
gaplist.remove(id)
removed_from_gaplist = True
except IndexError:
shortend.append(id)
gaplist.remove(id)
# debugging output:
#print "\t%d\t%d\t%d\t%d\t%d\t%d"%(pos,gl_size_start,len(gaplist),gl_size_start-len(gaplist),len(shortend),max[0])
# only when there are gaps
if removed_from_gaplist:
if maxlen[1] in gaplist:
maxlen[0] += 1
# go through the list and insert gaps in ref and read
for id in gaplist:
ref[id] = ref[id][:pos] + '-' + ref[id][pos:]
reads[id] = reads[id][:pos] + '-' + reads[id][pos:]
# insert gaps in ref and read, when already terminated
for id in shortend:
ref[id] = ref[id] + '-'
reads[id] = reads[id] + '-'
pos += 1
# sort according to startpos
startpos = {}
for ID in reads.iterkeys():
start_align = min([reads[ID].find('A'),reads[ID].find('C'),reads[ID].find('T'),reads[ID].find('G')])
if not startpos.has_key(start_align):
startpos[start_align] = []
startpos[start_align].append(ID)
for i in range(maxlen[0]):
if startpos.has_key(i):
for ID in startpos[i]:
print >> f_output, ">read#%s"%ID
s = ''
for letter in reads[ID]:
if len(s) == 80:
print >> f_output,s
s = ''
s += letter
if len(s) > 0:
print >> f_output,s
if not keep_files:
try:
os.remove('tmp_reads_f.fas')
except:
pass
try:
os.remove('tmp_reads_r.fas')
except:
pass
try:
os.remove('tmp_align_f.needle')
except:
pass
try:
os.remove('tmp_align_r.needle')
except:
pass