// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// /// note coding convention for all ranges '_pos fields' is: /// XXX_begin_pos is zero-indexed position at the begining of the range /// XXX_end_pos is zero-index position 1 step after the end of the range /// #include "starling/align_path.hh" #include "starling/align_path_bam_util.hh" #include "starling_pos_processor.hh" #include "starling_pos_processor_indel_util.hh" #include "starling_read_align_shared.hh" #include "starling_read_util.hh" #include "grouper_contig_util.hh" #include "blt_util/bam_streamer.hh" #include "blt_util/blt_exception.hh" #include "blt_util/export_stream_reader.hh" #include "blt_util/log.hh" #include "blt_util/read_util.hh" #include "blt_util/seq_util.hh" #include "blt_common/blt_arg_validate.hh" #include "starling/starling_info.hh" #include "starling/starling_shared.hh" #include #include #include #include #include #include #include #include #include #include namespace { const prog_info& pinfo(starling_info::get()); } static void get_next_read_pos(bool& is_next_read, pos_t& next_read_pos, bam_streamer& read_stream){ is_next_read=read_stream.next(); if(is_next_read){ const bam_record& read_rec(*(read_stream.get_record_ptr())); next_read_pos=(read_rec.pos()-1); } else { next_read_pos=0; } } // reads position of and holds next contig from the contig file: // // TODO -- system to test for and filter out very poor contigs // (this should be left to GROUPER, but it might be approriate to // catch extreme cases here??) // static void get_next_contig_pos(bool& is_next_contig, pos_t& next_contig_pos, contig_reader& creader){ is_next_contig=creader.next(); next_contig_pos=creader.get_contig().pos; } // returns true if the read is accepted into the buffer // (read can fail a number of quality checks -- such as // being located too far away from other alignments of the // same read or having an indel that is too large // static std::pair add_export_read_to_buffer(starling_pos_processor& sppr, const export_line_parser& exl, const char* fwd_strand_read, const unsigned read_size, const alignment& al, const READ_ALIGN::index_t rat, const char* chrom_name, const bool is_usable_mapping, bam_record& tmp_key_br, const align_id_t contig_id = 0, const indel_set_t* contig_indels_ptr = NULL) { // set qname: std::string key_name; get_read_key_from_export_line(exl,key_name); tmp_key_br.set_qname(key_name.c_str()); // set paired bit: if(exl.is_partner_strand() != tmp_key_br.is_paired()) { tmp_key_br.toggle_is_paired(); } // set readno: const int readnum(exl.read_number()); assert((readnum == 1) or (readnum == 2)); if(readnum==1) { if(not tmp_key_br.is_first()) tmp_key_br.toggle_is_first(); if(tmp_key_br.is_second()) tmp_key_br.toggle_is_second(); } else { if(tmp_key_br.is_first()) tmp_key_br.toggle_is_first(); if(not tmp_key_br.is_second()) tmp_key_br.toggle_is_second(); } // set strand: if(al.is_fwd_strand!=tmp_key_br.is_fwd_strand()) { tmp_key_br.toggle_is_fwd_strand(); } // set read and quality: const char* const raw_quality(exl.quality()); uint8_t fwd_quality[MAX_READ_SIZE+1]; if(al.is_fwd_strand){ for(unsigned i(0);i(read.se_map_qual())); if(read.is_paired()) { const int pe_mapq(static_cast(read.pe_map_qual())); if(pe_mapq(read.map_qual())); if(read.is_paired()){ if (mapq < client_opt.min_paired_align_score){ is_usable_mapping=false; // paired submap } } else { if (mapq < client_opt.min_single_align_score) { is_usable_mapping=false; // single submap } } } } check_bam_record(read_stream,read); // secondary range filter check: // (removed as part of RNA-Seq modifications) // //if(base_pos+static_cast(read.read_size())+MAX_INDEL_SIZE <= report_begin_pos) return; // extract indels and add to indel map: // // TODO: settle how to handle reads which fail mapping score but could be realigned // // include submapped reads for realignment only if we are writing out the realigned cases // ** feature disabled for now -- unmapped reads are ignored unless they come from grouper // { // if(is_usable_mapping or client_opt.is_realign_submapped_reads){ // or client_opt.is_realigned_read_file){ // if we're seeing an unmapped read, it's still expected to // have a position and chrom assignment // alignment al; al.pos=base_pos; if(not is_unmapped) { al.is_fwd_strand=read.is_fwd_strand(); bam_cigar_to_apath(read.raw_cigar(),read.n_cigar(),al.path); } // // give broken CASAVA output a pass for now... //static const bool is_edge_deletion_error(false); //export_md_to_apath(md,al.is_fwd_strand,al.path,is_edge_deletion_error); const char* chrom_name(read_stream.target_id_to_name(read.target_id())); static const READ_ALIGN::index_t rat(READ_ALIGN::GENOME); try { sppr.insert_read(read,al,rat,chrom_name,is_usable_mapping); } catch (...) { log_os << "\nException caught while inserting read alignment in sppr. Genomic read alignment record:\n"; read_stream.report_state(log_os); throw; } } } // advance to first read aligning to GROUPER contig_id: // static void find_first_read_for_contig(const std::string& contig_id, export_stream_reader& exr) { do { if((exr.exline()) and (contig_id == exr.exline()->match_chromosome())) return; } while(exr.next()); log_os << "ERROR: expected at least one read for GROUPER contig: " << contig_id << "\n"; log_os << " in GROUPER read export stream: " << exr.name() << "\n"; exit(EXIT_FAILURE); } /// Mark any edge insertions in alignment as belonging to breakpoints. /// /// Note this is only intended for assembled contig alignments. /// static void set_candidate_edge_insert_bp(candidate_alignment& cal){ using namespace ALIGNPATH; const path_t& path(cal.al.path); pos_t ref_head_pos(cal.al.pos); const unsigned as(path.size()); for(unsigned i(0);i(ctg); set_candidate_edge_insert_bp(ctg_cal); indel_set_t contig_indels; get_alignment_indels(ctg_cal,max_indel_size,contig_indels); do { const export_line_parser& exl(*(exr.exline())); // stop when we hit the next contig's reads if(ctg.id != exl.match_chromosome()) break; const export_read_parse erp(exr); alignment al; al.pos=(exl.match_position()-1); al.is_fwd_strand=erp.is_fwd_strand; { // read apath will be derived from the contig apath, but // first assert that there are no indels in the read's // alignment to the contig: const char* const md(exl.match_descriptor()); export_md_to_apath(md,al.is_fwd_strand,al.path); assert((al.path.size()==1) and (al.path[0].type == ALIGNPATH::MATCH)); } const bool is_ref_align(map_grouper_contig_read_to_genome(ctg,al)); // this read could not be reference aligned (ie. it is entirely aligned within an insert) // if(not is_ref_align) continue; // TODO:rescue these reads as soft-clips if possible: // if(al.pos < 0) { std::string key; get_read_key_from_export_line(exl,key); log_os << "WARNING: skipping contig alignment for read: " << key << " due to negative alignment position: " << al.pos << "\n"; continue; } // double-check: if(ALIGNPATH::is_apath_invalid(al.path,erp.read_size)) { log_os << "ERROR: Invalid contig read reference alignment: " << al; log_os << "\tread_size:" << erp.read_size << "\n"; exr.report_state(log_os); exit(EXIT_FAILURE); } static const bool is_usable_mapping(false); // this value is ignored for contig reads static const READ_ALIGN::index_t rat(READ_ALIGN::CONTIG); try { add_export_read_to_buffer(sppr,exl,erp.fwd_strand_read,erp.read_size,al,rat, ctg.chrom.c_str(),is_usable_mapping,tmp_key_br, contig_id,&contig_indels); } catch (...) { log_os << "\nException caught in add_export_read_to_buffer() while processing contig read alignment record:\n"; exr.report_state(log_os); throw; } } while (exr.next()); } static void process_contig(const starling_options& client_opt, const std::string&,// ref_seq, const grouper_contig& ctg, starling_pos_processor& sppr){ // TODO -- better tracking of contig_id // contig_id++; #ifdef DEBUG_STARLING log_os << "contig_id: " << contig_id << "\n"; log_os << "contig: " << ctg; #endif if(not is_valid_seq(ctg.seq.c_str())){ log_os << "ERROR: invalid sequence in contig: " << ctg << "\n"; exit(EXIT_FAILURE); } static const INDEL_ALIGN_TYPE::index_t iat(INDEL_ALIGN_TYPE::CONTIG); const string_bam_seq bseq(ctg.seq); try { add_alignment_indels_to_sppr(client_opt.max_indel_size,ctg.pos,ctg.path,bseq,sppr,iat,contig_id); } catch (...) { log_os << "\nException caught in add_alignment_indels_to_sppr() while processing contig: " << ctg << "\n"; throw; } } static void open_ifstream(std::ifstream& ifs, const char* filename){ ifs.open(filename); if(not ifs){ log_os << "ERROR: Can't open file: " << filename << "\n"; exit(EXIT_FAILURE); } } enum event_type { NONE, READ, CONTIG }; static const char* event_type_label(const event_type e){ switch(e){ case NONE : return "NONE"; case READ : return "READ"; case CONTIG : return "CONTIG"; default : log_os << "ERROR: unrecognized event type.\n"; exit(EXIT_FAILURE); } } // read the local-assembly contig and contig read buffer ahead of the // genomic reads: // // \TODO redesign so that we handle the much larger deletions coming // from GROUPER (more) correctly. Idea is (1) to prevent any possible // double counting of a widely separated GROUPER and genomic read and // (2) to handle deletion spanning reads as segmented reads using the // exon handling system. // const pos_t CONTIG_BUFFER_LEAD(1000); void starling_run(starling_options& client_opt, const char* const cmdline){ assert(NULL != cmdline); std::string ref_seq; if(client_opt.is_ref_set){ get_ref_seq(client_opt.ref_seq_file.c_str(),ref_seq); standardize_ref_seq(client_opt.ref_seq_file.c_str(),ref_seq); make_ref_seq_option_adjustments(client_opt,ref_seq.size()); if(not client_opt.is_genome_size) { client_opt.genome_size = get_ref_seq_known_size(ref_seq); client_opt.is_genome_size = true; } } starling_deriv_options client_dopt; get_starling_deriv_options(client_opt,ref_seq.size(),client_dopt); const pos_range& rlimit(client_dopt.report_range_limit); if(client_opt.is_test_indels and ((not client_opt.is_genome_size) or (client_opt.genome_size<1))) { log_os << "ERROR: invalid genome-size for indel calling\n"; exit(EXIT_FAILURE); } std::ifstream indel_contig_is; std::ifstream indel_contig_read_is; if(client_opt.is_test_indels){ if(not client_opt.indel_contig_filename.empty()) { open_ifstream(indel_contig_is,client_opt.indel_contig_filename.c_str()); } if(not client_opt.indel_contig_read_filename.empty()) { open_ifstream(indel_contig_read_is,client_opt.indel_contig_read_filename.c_str()); } } export_stream_reader indel_read_exr(indel_contig_read_is,client_opt.indel_contig_filename.c_str()); contig_reader creader(indel_contig_is); // generalize export_stream_reader to universal_reader: // just worry about a straight BAM replacement first -- generalize what's easy // std::auto_ptr exr_ptr; // const unsigned n_sorted(client_opt.sorted_filenames.size()); // assert((n_sorted == 1) or (n_sorted == 0)); // if(n_sorted == 0) { // exr_ptr.reset(new export_stream_reader()); // set null export reader // } else { // const std::string& sorted_filename(client_opt.sorted_filenames.front()); // if(sorted_filename==STDIN_FILENAME){ // exr_ptr.reset(new export_stream_reader(std::cin,"stdin")); // } else { // exr_ptr.reset(new export_file_reader(sorted_filename.c_str())); // } // } // export_stream_reader& exr(*exr_ptr); assert(not client_opt.bam_filename.empty()); std::ostringstream bam_region_oss; { const int zsize(get_influence_zone_size(client_opt.max_indel_size)); const pos_t begin_pos(std::max(0,client_opt.report_range.begin_pos-zsize)); const pos_t end_pos(client_opt.report_range.end_pos+zsize); bam_region_oss << client_opt.bam_seq_name << ':' << begin_pos+1 << '-' << end_pos; } bam_streamer read_stream(client_opt.bam_filename.c_str(),bam_region_oss.str().c_str()); // Provide a temporary bam record for contig reads to write key // information into. bam_record tmp_key_br; // initiialize this record with values that will be fixed for this run: tmp_key_br.set_target_id(read_stream.target_name_to_id(client_opt.bam_seq_name.c_str())); starling_streams client_io(client_opt,pinfo,cmdline,read_stream.get_header()); starling_pos_processor sppr(client_opt,client_dopt,ref_seq,client_io); starling_read_counts brc; bool is_next_read(false); pos_t next_read_pos(0); bool is_next_contig(false); pos_t next_contig_pos(0); // setup initial values for next read and next indel before entering loop: get_next_read_pos(is_next_read,next_read_pos,read_stream); get_next_contig_pos(is_next_contig,next_contig_pos,creader); next_contig_pos=next_contig_pos-std::min(static_cast(CONTIG_BUFFER_LEAD),next_contig_pos); // loop through all read and indel begin points in order: bool is_last_pos_set(false); pos_t last_pos(0); pos_t buffer_head_pos(0); event_type current_type(NONE); event_type last_type(NONE); while(true) { // advance to next read or indel based on the type of last iteration: if ( current_type == READ ){ get_next_read_pos(is_next_read,next_read_pos,read_stream); } else if ( current_type == CONTIG ) { get_next_contig_pos(is_next_contig,next_contig_pos,creader); next_contig_pos=next_contig_pos-std::min(static_cast(CONTIG_BUFFER_LEAD),next_contig_pos); } // determine what the next event type and pos is: pos_t current_pos(0); if(not (is_next_read or is_next_contig)) { break; } else if(not is_next_contig) { current_pos=next_read_pos; current_type=READ; } else if(not is_next_read) { current_pos=next_contig_pos; current_type=CONTIG; } else { current_pos=std::min(next_read_pos,next_contig_pos); if(next_read_pos= (rlimit.end_pos+max_indel_size))) break; if (current_type == READ){ // handle regular ELAND reads /// get potential bounds of the read based only on current_pos: const known_pos_range any_read_bounds(current_pos-max_indel_size,current_pos+MAX_READ_SIZE+max_indel_size); if( sppr.is_range_outside_report_influence_zone(any_read_bounds) ) continue; // Approximate begin range filter: (removed for RNA-Seq) //if((current_pos+MAX_READ_SIZE+MAX_INDEL_SIZE) <= rlimit.begin_pos) continue; const bam_record& read(*(read_stream.get_record_ptr())); if(read.is_filter()) { brc.primary_filter++; continue; } if(read.is_dup()) { brc.duplicate++; continue; } if(read.is_unmapped()) { brc.unmapped++; continue; } if(client_opt.is_filter_unanchored and read.is_unanchored()) { continue; } process_genomic_read(client_opt,ref_seq,read_stream,read,current_pos,rlimit.begin_pos,brc,sppr); } else if(current_type == CONTIG) { // process local-assembly contig and its reads const grouper_contig& ctg(creader.get_contig()); // skip contigs which are too large: if(not ctg.is_usable) continue; { // skip contigs which will not affect results in the report range: const pos_t contig_span(apath_ref_length(ctg.path)); const pos_t contig_end_pos(current_pos+contig_span); const known_pos_range contig_bounds(current_pos,contig_end_pos); const bool is_skip_contig(sppr.is_range_outside_report_influence_zone(contig_bounds)); if(is_skip_contig) continue; } // skip contigs which contain adjacent insertion/deletion events: if(is_seq_swap(ctg.path)) { log_os << "WARNING: Indel contig: '" << ctg.id << "' contains adjacent insertion/deletion event. Skipping...\n"; continue; } process_contig(client_opt,ref_seq,ctg,sppr); process_contig_reads(ctg,client_opt.max_indel_size,indel_read_exr,sppr,tmp_key_br); } else { log_os << "ERROR: invalid input condition.\n"; exit(EXIT_FAILURE); } } sppr.reset(); // brc.report(client_io.report_os()); }