#!/usr/bin/env python # Copyright 2007, 2008 # Niko Beerenwinkel, # Nicholas Eriksson, # Lukas Geyrhofer, # Osvaldo Zagordi, # ETH Zurich # This file is part of ShoRAH. # ShoRAH is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # ShoRAH is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with ShoRAH. If not, see . import sys import os ################################################# # a common user should not edit above this line # ################################################# # parameters not controlled by command line options fasta_length = 60 # controls line length in fasta files tolerance = 0.1 # the portion of in-dels that are tolerated in a single read go_default = 6.0 # gap_opening penalty in needle alignment ge_default = 3.0 # gap_extension penalty in needle alignment amb_thresh = 2 # reads with more than amb_thresh Ns (ambiguous calls) are discarded win_min_ext = 0.6 # if the read covers at least win_min_ext of the window, fill it with Ns verbose = False # sets verbosity hist_fraction = 0.20 # fraction that goes into the history min_quality = 0.8 # quality under which discard the correction # path to diri_sampler program path_to_shorah = './' ################################################# # a common user should not edit below this line # ################################################# to_correct = {} correction = {} quality = {} count = {} count['A'] = 0 count['C'] = 0 count['G'] = 0 count['T'] = 0 count['-'] = 0 clusters = [ [] ] untouched = [ [] ] win_shifts = 3 keep_all_files = False def solve_amb(seq_list): """Use FASTA format description from NCBI to extract a random base from the possible ones A --> adenosine M --> A C (amino) C --> cytidine S --> G C (strong) G --> guanine W --> A T (weak) T --> thymidine B --> G T C U --> uridine D --> G A T R --> G A (purine) H --> A C T Y --> T C (pyrimidine) V --> G C A K --> G T (keto) N --> A G C T (any) - gap of indeterminate length """ import random solved_list = [] f_code = {} f_code['R'] = ['G', 'A'] f_code['Y'] = ['T', 'C'] f_code['K'] = ['G', 'T'] f_code['M'] = ['A', 'C'] f_code['S'] = ['G', 'C'] f_code['W'] = ['A', 'T'] f_code['B'] = ['G', 'T', 'C'] f_code['D'] = ['G', 'A', 'T'] f_code['H'] = ['A', 'C', 'T'] f_code['V'] = ['G', 'C', 'A'] f_code['N'] = ['A', 'G', 'C', 'T'] for base in seq_list: if base in f_code.keys(): solved_list.append( random.choice(f_code[base]) ) else: solved_list.append(base) return solved_list def parse_aligned_reads(reads_file): """ Parse reads from a file with aligned reads in fasta format """ from Bio import SeqIO import time format = 'fasta' max_read_length = 300 out_reads = {} cp = False if not os.path.isfile(reads_file): print 'There should be a file here:', reads_file sys.exit() else: print 'Using file', reads_file, 'of aligned reads' # statinfo = os.stat(reads_file) # age_sec = time.time() - statinfo.st_mtime # if age_sec > 3600: # print 'Warning: it was modified more than an hour ago' # age = time.gmtime(age_sec) handle = open(reads_file, 'rU') print 'Parsing aligned reads' for this_read in SeqIO.parse(handle, format) : try: old_length = gen_length except: gen_length = len(this_read.seq) old_length = gen_length gen_length = len(this_read.seq) assert gen_length == old_length, 'All reads must have the same length' if not cp: # only the first time allocate cov_prof cov_prof = [0]*(2*gen_length + max_read_length) cp = True #print 'Alignment is', gen_length, 'bases long' mst = this_read.seq.tostring() mls = list( mst ) for c in mls: if c != '-': mstart = mls.index(c) + 1 break mstop = len ( mst.rstrip('-') ) # counts the gaps in the read (no flanking gaps) gaps_match = 0 for c in mst.strip('-'): if c =='-': gaps_match = gaps_match + 1 match_length = len(mst.strip('-')) # if gaps_match > round (tolerance * match_length): # print 'too many indels,', (gaps_query + gaps_match) # continue name = this_read.id out_reads[name] = [ None, None, None, None, [] ] out_reads[name][0] = 0 # out_reads[name][1] = gen_length # out_reads[name][2] = mstart # this is out_reads[name][3] = mstop # this too for i in range(mstart, mstop + 1): try: cov_prof[i] = cov_prof[i] + 1 except IndexError: print 'out of coverage', i this_m = list( mst.strip('-') ) amb_calls = 0 for i in range(len(this_m)): out_reads[name][4].append(this_m[i]) if this_m[i] == 'N': amb_calls = amb_calls + 1 if verbose: print >> sys.stderr, 'Found an N in', match_rec.id if amb_calls > amb_thresh: if verbose: print 'Read', match_rec.id, 'has too many Ns' del out_reads[match_rec.id] cp = open( './%s.covprof' % reads_file.rstrip('_reads.fas'), 'w' ) for i in range(1, gen_length): cp.write('%i\t%i\n' % (i, cov_prof[i]) ) cp.close() return out_reads def print_window(wstart, wend, reads): """ Considers only the read overlapping a given window and prints them aligned """ wind_file = open('./w%d-%d.reads.fas' % (wstart, wend), 'w') rn = 0 min_overlap = round( (wend - wstart + 1) * win_min_ext ) for r in reads: rstart = reads[r][2] k = len(reads[r][4]) # trailing Ns have already been stripped if rstart <= wstart and rstart + k - 1 >= wend: overlap = wend - wstart + 1 elif rstart >= wstart and rstart + k - 1 <= wend: overlap = k elif rstart <= wstart and rstart + k - 1 <= wend: overlap = rstart + k - wstart elif rstart >= wstart and rstart + k - 1 >= wend: overlap = wend - rstart + 1 if overlap > min_overlap: rn = rn + 1 wind_file.write( '>%s %d\n' % (r, reads[r][0]) ) for i in range(wstart, wend+1): if i < rstart or i > rstart + k-1: wind_file.write('N') else: wind_file.write( '%s' % reads[r][4][i-rstart] ) wind_file.flush() wind_file.write('\n') return rn def run_dpm(filein, j, a): """run the dirichlet process clustering """ import subprocess my_prog = "%s/diri_sampler" % path_to_shorah my_arg = " -i %s -j %i -t %i -a %f -o haplo.out -m 20 > ds-out" % (filein, j, int(j*hist_fraction), a) try: #os.remove('./corrected.tmp' ) os.remove('./assignment.tmp') except: pass # runs the gibbs sampler for the dirichlet process mixture try: retcode = subprocess.call(my_prog + my_arg, shell=True) if retcode < 0: print >>sys.stderr, "'%s %s'"%(my_prog,my_arg) print >>sys.stderr, "Child diri_sampler was terminated by signal", -retcode else: print >>sys.stdout, "Child diri_sampler returned %i" % retcode except OSError, ee: print >>sys.stderr, "Execution of diri_sampler failed:", ee assert os.path.isfile('ds-out'), 'File ds-out not found' new_clusters = 0 k = len(clusters)-1 dsh = open('ds-out', 'rU') for line in dsh: if line.startswith('iteration'): lspl = line.split() it = lspl[1] clus = lspl[2] untouch = lspl[3] clusters[k].append(clus) untouched[k].append(untouch) elif line.startswith('#made'): new_clusters = int(line.split()[1]) assert len(clusters[k]) > 1, 'Clusters not appended' assert len(untouched[k]) > 1, 'Untouched not appended' clusters.append([]) untouched.append([]) return new_clusters def seq_2_fas(in_stem): # To be finished """ From Solexa _seq.txt files to fasta format """ fas_handle = open('%s_reads.fas' % in_stem, 'w') print 'Converting files written in', '%s.seq' % in_stem, 'to fasta format' print len( open('%s.seq' % in_stem, 'rU').readlines() ), ' _seq.txt files are to be read' file_names = open('%s.seq' % in_stem, 'rU') for line in file_names: this_seq = line.strip() file_seq = open(this_seq, 'rU') index = 0 for line_seq in file_seq: line_entries = line_seq.split() if not line_entries: continue fas_handle.write( '>%s-read_%i\n' % (this_seq, index) ) index = index + 1 cc = 0 read = line_entries[4] for c in read: fas_handle.write(c) cc = cc + 1 if cc % fasta_length == 0: fas_handle.write('\n') if cc % fasta_length != 0: fas_handle.write('\n') return def correct_reads(wstart, wend, k): """ Parses corrected.tmp (in fasta format) and correct the reads """ # out_reads[match_rec.id][0] = qstart # out_reads[match_rec.id][1] = qstop # out_reads[match_rec.id][2] = mstart # out_reads[match_rec.id][3] = mstop # out_reads[match_rec.id][4] = Sequence... from Bio import SeqIO import os import re read_name_rule = re.compile("(.*)\#\d+") wlen = wend - wstart + 1 try: handle = open('corrected.tmp', 'rU') for seq_record in SeqIO.parse(handle, "fasta"): assert '\0' not in seq_record.seq.tostring(), 'binary file!!!' read_id = seq_record.id # read_name_rule.search(seq_record.id).group(1) try: correction[read_id][wstart] = list( seq_record.seq.tostring() ) quality[read_id][wstart] = float(seq_record.description.split('|')[1]) except: correction[read_id] = {} quality[read_id] = {} correction[read_id][wstart] = list( seq_record.seq.tostring() ) quality[read_id][wstart] = float(seq_record.description.split('|')[1]) handle.close() #os.remove('corrected.tmp') return except IOError: print 'No reads in window %d?' % wstart return def print_prop(sh, ncw, nc, in_stem): """ print a single file with proposed new clusters """ nch = open('%s-%d.proposed' % (in_stem, sh), 'w') nch.write('#Proposed new clusters in each read\n') for nck in ncw: nch.write('%i\t%i\n' % (nck, nc[nck])) nch.close() return def print_clust_unt(sh, clusters, untouched, in_stem, iterations): """ Print clusters and untouched information in two separate files """ del clusters[ len(clusters)-1 ] del untouched[ len(untouched)-1 ] ch = open('%s-%d.clusters' % (in_stem, sh), 'w') for i in range(1, 1 + iterations): ch.write('%d' % i) try: for k in clusters: ch.write('\t%s' % k[i]) except: ch.write('\t0') ch.write('\n') uh = open('%s.%d.untouch' % (in_stem, sh), 'w') for i in range(1, 1 + iterations): uh.write('%d' % i) try: for k in untouched: uh.write('\t%s' % k[i]) except: uh.write('\t0') uh.write('\n') return def base_break(baselist): """ """ import random for c1 in count: count[c1] = 0 for c in baselist: if c.upper() != 'N': count[c.upper()] += 1 max = 0 out = [] for b in count: if count[b] >= max: max = count[b] for b in count: if count[b] == max: out.append(b) rc = random.choice(out) return rc def main(): """ Only called if run not interactively """ import optparse supported_formats = { 'far': 'fasta_aligned_reads' # 'seq': 'solexa_genome_analyzer_seq' } # parse command line optparser = optparse.OptionParser() optparser.add_option("-f", "--readsfile", help="file with reads <.far format>", default="", type="string", dest="f") optparser.add_option("-j", "--iterations", help="iterations in dpm sampling <2000>", default=2000, type="int", dest="j") optparser.add_option("-a", "--alpha", help="alpha in dpm sampling <0.01>", default=0.01, type="float", dest="a") optparser.add_option("-w", "--windowsize", help="window size <201>", default=201, type="int", dest="w") optparser.add_option("-s", "--winshifts", help="number of window shifts <3>", default=3, type="int", dest="s") # window shiftings, such that each base is covered up to win_shifts times optparser.add_option("-k","--keep_all_files",help="keep all files of the diri_sampler output. useful for detailed analysis. ", action="store_true",dest="k") (options, args) = optparser.parse_args() # check the input file is in supported format try: tmp_filename = os.path.split(options.f)[1] [in_stem, in_format] = [tmp_filename.split('.')[0], tmp_filename.split('.')[-1]] t = supported_formats[in_format] except IndexError: print 'The input file must be filestem.format' print 'Supported formats are' for sf in supported_formats: print sf+': ', supported_formats[sf] sys.exit() except KeyError: print in_format, 'format unknown' print 'Supported formats are' for sf in supported_formats: print sf+': ', supported_formats[sf] sys.exit() cwd = os.getcwd() if not os.path.isfile('%s/diri_sampler' % path_to_shorah): print "found no 'diri_sampler'-program in the directory '%s'" % path_to_shorah sys.exit() keep_all_files = False try: if options.k == True: keep_all_files = True except: pass step = options.w win_shifts = options.s if step % win_shifts != 0: sys.exit('Window size must be divisible by win_shifts') if win_min_ext < 1/float(win_shifts): print 'Warning: some bases might not be covered by any window' assert os.path.isfile(options.f), "File '%s' not found" % options.f fas_reads = options.f if in_format == 'far': aligned_reads = parse_aligned_reads(fas_reads) r = aligned_reads.keys()[0] gen_length = aligned_reads[r][1] - aligned_reads[r][0] else: sys.exit('Input file must be a .far file (see the README)') if options.w >= gen_length: sys.exit('The window size must be smaller than the genome length') print len(aligned_reads), 'reads are being considered' print 'The others discarded because of too many in-dels or ambiguous calls (Ns)' print '\n' for k in aligned_reads: to_correct[k] = [None, None, None, None, []] to_correct[k][0] = aligned_reads[k][0] to_correct[k][1] = aligned_reads[k][1] to_correct[k][2] = aligned_reads[k][2] to_correct[k][3] = aligned_reads[k][3] to_correct[k][4] = [] # aligned_reads[k][4][:] del k ############################################ # Now the windows and the error correction # ############################################ # get the new length of the aligned genome (with gaps) # maybe there is a more elegant solution.... gen_length = aligned_reads[aligned_reads.keys()[0]][1] single_sh = step/win_shifts proposed= {} for sh in range(win_shifts): shift = sh * single_sh ncw = [] for s in range(1, gen_length+1, step): rn = print_window(s+shift, s+shift+step-1, aligned_reads) fst = 'w%d-%d.reads.fas' % (s+shift, s+shift+step-1) # print '\x1B[1A\x1B[2K doing from %d to %d %d reads in this window\n' % (s, s+step, rn) # if no reads in the window, go on if rn <= 1: print >>sys.stdout, '\n No reads in window', s+shift, '-', s+shift+step-1 os.remove(fst) continue print >>sys.stdout, '\n window from %d to %d %d reads in this window' % (s+shift, s+shift+step-1, rn) ncw.append(s+shift) proposed[s+shift] = run_dpm(fst, options.j, options.a) correct_reads(s+shift, s+shift+step-1, sh) if keep_all_files: os.rename('ds-out','w%d-%d.ds-out'%(s+shift,s+shift+step-1)) os.rename('corrected.tmp','w%d-%d.corrected.fas'%(s+shift,s+shift+step-1)) os.rename('haplo.out','w%d-%d.haplo.out'%(s+shift,s+shift+step-1)) else: os.remove('ds-out') os.remove('./%s' % fst) os.remove('corrected.tmp') os.remove('haplo.out') try: os.remove('./assignment.tmp') except: pass #print_prop(sh, ncw, nc, in_stem) #del nc #del ncw #print_clust_unt(sh, clusters, untouched, in_stem, options.j) ################################# ## Print the corrected reads ## ################################# for r in to_correct: if r not in correction.keys(): continue rlen = len(aligned_reads[r][4]) rst = aligned_reads[r][2] for rpos in range(rlen): this = [] for cst in correction[r]: tp = rpos + rst - int(cst) if tp >=0 and tp < len(correction[r][cst]) and quality[r][cst] > min_quality: t = correction[r][cst][tp] this.append(t) if len(this) > 0: cb = base_break(this) else: cb = 'x' to_correct[r][4].append(cb) del this fch = open('%s.cor.fas' % in_stem, 'w') for r in to_correct: if to_correct[r][4].count('x') == 0: fch.write('>%s %d\n' % (r, to_correct[r][2]) ) cc = 0 for c in to_correct[r][4]: fch.write(str(c)) fch.flush() cc = cc + 1 if cc % fasta_length == 0: fch.write('\n') if cc % fasta_length != 0: fch.write('\n') fch.close() ################# # gnuplot files # ################# # clusters prop_keys = proposed.keys() prop_keys.sort() pf = open ('%s.proposed' % in_stem, 'w') for k in prop_keys: pf.write('%d\t%f\n' % (k, float(proposed[k])/options.j)) pf.close() set_prop = 'set title \'proposed per step, sample %s\'\nset yra [0:]\n' % (in_stem) title = '' gf = open('proposed.gp', 'w') gf.write(set_prop) gf.write('plot \'%s.proposed\' u 1:2 w l title \'%s\'\n' % (in_stem, title) ) gf.close() if __name__ == "__main__": main()