#!/usr/bin/env python # # Copyright 2009 Illumina, Inc. # # This software is covered by the "Illumina Genome Analyzer Software # License Agreement" and the "Illumina Source Code License Agreement", # and certain third party copyright/licenses, and any user of this # source file is bound by the terms therein (see accompanying files # Illumina_Genome_Analyzer_Software_License_Agreement.pdf and # Illumina_Source_Code_License_Agreement.pdf and third party # copyright/license notices). # # \author Chris Saunders # """ usage: finish_variant_files.py [options] -contigs file indicate contig_file -reads file indicate contig_reads_file -merge attempt to merge contigs (and reads) corresponding to the same indel writes out: contig_file.finished contig_reads_file.finished temporary script to 1) sort variant contigs by start pos of the indel in the reference genome 2) optionally merge contigs describing the same indel in the same location 3) cull and sort the variant reads file such that the reads match the variant contigs file """ # get set working in older versions of python: # try: set() except NameError: from sets import Set as set def strip_cigar(cigar) : is_indel_started=False start=0 stop=0 for i,c in enumerate(cigar) : if c in "0123456789" : pass elif c == 'M' : if not is_indel_started : start = i+1 elif c == 'D' or c == 'I' : is_indel_started=True stop = i+1 else : raise Exception("unsupported cigar string: %s" % cigar) return cigar[start:stop] def is_indel_equiv(cigar1,cigar2) : return (strip_cigar(cigar1) == strip_cigar(cigar2)) def main(): import sys contig_file = contig_read_file = "" is_merge = False arg = sys.argv[1:] i = 0 while i < len(arg) : if arg[i] == "-contigs" : i = i + 1 contig_file = arg[i] elif arg[i] == "-reads" : i = i + 1 contig_read_file = arg[i] elif arg[i] == "-merge" : is_merge = True else : print __doc__ return i = i + 1 # check param if contig_file == "" or contig_read_file == "" : print __doc__ return contigs = [] id="" cigar="" contigbuf = "" startpos = 0 indelstartpos = 0 # read contig file for line in open(contig_file) : if line[0] == '>' : if contigbuf != "" and indelstartpos != 0 : contigs.append((indelstartpos,id,startpos,cigar,contigbuf)) contigbuf = "" w=line[1:].strip().split('|') id=w[0] cigar=w[3] startpos=int(w[2]) indelstartpos=int(w[4]) contigbuf += line if contigbuf != "" and indelstartpos != 0 : contigs.append((indelstartpos,id,startpos,cigar,contigbuf)) contigs.sort() # look for contigs containing equivilent indels, merge them if found: # # when a equiv indels are found, the second indel of the pair is merged back to the id of the first # and the startpos adjustment between the two contigs is recorded: # contig_id_remap = {} contig_id_inverse_remap = {} last_indelstartpos = 0 i=0 if is_merge : while i < len(contigs) : if i != 0 and contigs[i][0] == last_indelstartpos : if is_indel_equiv(contigs[i-1][3],contigs[i][3]) : # equivilent contigs -- merge: startoffset=contigs[i][2]-contigs[i-1][2] contig_id_remap[contigs[i][1]] = (contigs[i-1][1],startoffset) if not contigs[i-1][1] in contig_id_inverse_remap : contig_id_inverse_remap[contigs[i-1][1]] = set() contig_id_inverse_remap[contigs[i-1][1]].add(contigs[i-1][1]) contig_id_inverse_remap[contigs[i-1][1]].add(contigs[i][1]); del contigs[i] continue last_indelstartpos=contigs[i][0] i += 1 id_order = [] contig_file_out = contig_file + ".finished" cfp=open(contig_file_out,"w") # write sorted contig file, record order of id's now that the file has been resorted by indelstartpos for c in contigs : id_order.append(c[1]) cfp.write(c[4]) cfp.close() contigs = [] idset = set(id_order) contig_read_file_out = contig_read_file + ".finished" crfp=open(contig_read_file_out,"w") id_print_no = 0 id_buffer = {} last_orig_id = None n_id=len(idset) printable_id = set() printed_id = set() for line in open(contig_read_file) : w=line.strip().split('\t') id=w[10] # confusing... the 'chromosome' field in the export file is used to store the name of the assembled 'contig' orig_id=id # swap in contig remap if it applies: if id in contig_id_remap : ti=contig_id_remap[id] w[10]=ti[0] id=w[10] w[12]=str(int(w[12])+ti[1]) line='\t'.join(w)+'\n' if id not in idset: continue if id in printed_id: raise Exception("Read ids are not consistent with those in contig set") if last_orig_id != None and last_orig_id != orig_id : if last_orig_id in contig_id_remap : last_id=contig_id_remap[last_orig_id][0] if last_orig_id not in contig_id_inverse_remap[last_id] : raise Exception("failed assertion on contig_id_inverse_remap") contig_id_inverse_remap[last_id].remove(last_orig_id) if len(contig_id_inverse_remap[last_id]) == 0 : printable_id.add(last_id) else : if last_orig_id not in contig_id_inverse_remap : printable_id.add(last_orig_id) else : contig_id_inverse_remap[last_orig_id].remove(last_orig_id) if len(contig_id_inverse_remap[last_orig_id]) == 0 : printable_id.add(last_orig_id) # print anything we can out of the id_buffer early: while True : if id_print_no >= n_id : raise Exception("Read ids are not consistent with those in contig set") next_id=id_order[id_print_no] if next_id not in printable_id : break crfp.write(id_buffer[next_id]) del id_buffer[next_id] printable_id.remove(next_id) printed_id.add(next_id) id_print_no += 1 if id not in id_buffer : id_buffer[id] = "" id_buffer[id] += line last_orig_id = orig_id while id_print_no < len(id_order) : next_id=id_order[id_print_no] if next_id not in id_buffer : raise Exception("unexpected contig read format") crfp.write(id_buffer[next_id]) id_print_no += 1 crfp.close() if __name__ == '__main__': main()