// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "alignment_util.hh" #include "indel_util.hh" #include "starling_read_align_score_indels.hh" #include "blt_util/log.hh" #include //#define DEBUG_ALIGN typedef std::pair > indel_status_t; typedef std::pair align_info_t; typedef std::map iks_map_t; // check whether the current path has a better score than that stored // and if so add entry to iks map // static void check_and_update_iks(iks_map_t& iks_map, const indel_key& ik_call, const bool is_indel_present, const indel_key& ik_present, const double path_lnp, const candidate_alignment* cal_ptr) { const indel_status_t mkey(std::make_pair(ik_call,std::make_pair(is_indel_present,ik_present))); // check to see if the path score is better than what we already have: const iks_map_t::const_iterator j(iks_map.find(mkey)); if((j!=iks_map.end()) and ((j->second.first) >= path_lnp)) return; iks_map[mkey]=std::make_pair(path_lnp,cal_ptr); } typedef std::map overlap_map_t; static void overlap_map_tick(overlap_map_t& omap, const indel_key& ik1, const indel_key& ik2) { indel_set_t& os(omap[ik1]); if(os.find(ik2) == os.end()) os.insert(ik2); } static bool is_interfering_indel(const indel_set_t& current_indels, const indel_key& new_indel){ if(current_indels.count(new_indel) != 0) return false; typedef indel_set_t::const_iterator siter; siter i(current_indels.begin()), i_end(current_indels.end()); for(;i!=i_end;++i) if(is_indel_conflict(*i,new_indel)) return true; return false; } // by how many bases does the alignment cross each indel breakpoint? // // Note that this function only works correctly for indels that are // present in the alignment -- otherwise there are multiple possible // realignments of the read // static std::pair get_alignment_indel_bp_overlap(const alignment& al, const indel_key& ik) { using namespace ALIGNPATH; // all read positions are expressed in unclipped read coordinates! // pos_t read_head_pos(0); pos_t ref_head_pos(al.pos); // mark the indel breakpoints in read position coordinates: bool is_left_read_pos(false); pos_t left_read_pos(0); bool is_right_read_pos(false); pos_t right_read_pos(0); const unsigned as(al.path.size()); for(unsigned i(0);i > indel_pair_set; // given two alignemnts that have (nearly) the same alignment score, // quickly approximate whether they could be expressing the same set // of indels: // static bool is_equiv_candidate(const candidate_alignment& cal1, const candidate_alignment& cal2, const unsigned max_indel_size, indel_pair_set& equiv_keys) { equiv_keys.clear(); indel_set_t is1,is2; get_alignment_indels(cal1,max_indel_size,is1); get_alignment_indels(cal2,max_indel_size,is2); const unsigned s1(is1.size()); const unsigned s2(is2.size()); if(s1 != s2) return false; indel_set_t::const_iterator it1(is1.begin()), it1_end(is1.end()); indel_set_t::const_iterator it2(is2.begin()), it2_end(is2.end()); for(;it1!=it1_end;++it1,++it2) { if(*it1==*it2) continue; if(it1->type != it2->type) return false; if(it1->length != it2->length) return false; equiv_keys.insert(std::make_pair(*it1,*it2)); } return true; } // written for equiv indel resolution (i.e. assuming ik1 and ik2 are // of the same type and length) // static bool is_first_indel_dominant(const starling_options& client_opt, const depth_buffer& db, const indel_buffer& ibuff, const indel_key& ik1, const indel_key& ik2) { const indel_data* id1_ptr(ibuff.get_indel_data(ik1)); assert(NULL != id1_ptr); const indel_data* id2_ptr(ibuff.get_indel_data(ik2)); assert(NULL != id2_ptr); const bool ic1(is_candidate_indel(client_opt,db,ik1,*id1_ptr)); const bool ic2(is_candidate_indel(client_opt,db,ik2,*id2_ptr)); if(ic2 and (not ic1)) return false; if(ic2==ic1) { return (ik1.pos<=ik2.pos); } return true; } // use the most likely alignment for each indel state for every indel // in indel_status_map to generate data needed in indel calling: // void score_indels(const starling_options& client_opt, const depth_buffer& db, const read_segment& rseg, indel_buffer& ibuff, const std::set& cal_set, const bool is_incomplete_search, const std::vector& cal_set_path_lnp, double max_path_lnp, const candidate_alignment* max_cal_ptr){ static bool is_safe_mode(true); // (1) score candidate alignments -- already done before calling this function // (2) store the highest scoring alignment with each indel present // and absent. // // Note there are up to two 'indel absent' states. The first is // when an interfering indel is not present, and the second is when // an interfering indel is present. The second case will not often // (and the great majority of the time, does not) exist. // // We must control for the case where the indel is absent but // replaced by an equivilent indel. In this case we must not only // determine that an equivilent exists, but establish which case // is dominant so that the indel is only reported once. Dominance // rules should follow the same pattern established for // realignment above. // // The indel normalization here is intended to be a final fallback // corrective measure to prevent indels from disappearing at the // call stage. It is expected (and preferred) that as much // upstream indel normalization will take place as is practical. // // (2alpha) // // go through all scored alignments and search for cases with an // equal score (or within the smooth theshold if this is turned // on). Determine if smooth score matches are actually equivilent // -- in which case remove the non-dominant alignment and mark the // equivilent indel as not participating in this read's indel call // search: // const unsigned cal_set_size(cal_set.size()); // Nonnorm indels contains any indels found to be equivilent to // another indel (ie. not normalized). These indels are not used // in subsequent calculation so as not to penalize the normalized // indel form (otherwise all reads which support the indel would // appear to ambiguously support two different indels). // indel_set_t nonnorm_indels; std::vector cal_set_exclude(cal_set_size,false); // This procedure implements a heuristic late-stage indel // normalization, set is_slip_norm=false to disable: // static const bool is_slip_norm(true); if(is_slip_norm){ const double equiv_lnp_range( client_opt.is_smoothed_alignments ? client_opt.smoothed_lnp_range : 0. ); // go through alignment x alignments in best->worst score // order -- to do so start out with the sort order: std::vector > sorted_path_lnp; std::vector cal_ptr_vec; { std::set::const_iterator si(cal_set.begin()); for(unsigned i(0);i smooth_path_lnp(cal_set_path_lnp); bool is_any_excluded(false); indel_pair_set ips; for(unsigned i(0);ifirst,ip->second)); #ifdef DEBUG_ALIGN log_os << "COWSLIP: indel1: " << ip->first << "\n"; log_os << "COWSLIP: indel2: " << ip->second << "\n"; log_os << "COWSLIP: is_indel1_dominant?: " << is1 << "\n"; #endif if(is1) { nonnorm_indels.insert(ip->second); #ifdef DEBUG_ALIGN log_os << "COWSLIP: marking 2 nonnorm: " << ip->second << "\n"; #endif if(not is_removed) { cal_set_exclude[sortj] = true; is_any_excluded = true; smooth_path_lnp[sorti] = std::max(smooth_path_lnp[sorti],smooth_path_lnp[sortj]); #ifdef DEBUG_ALIGN log_os << "COWSLIP: excluding sortj: " << sortj << "\n"; #endif } } else { nonnorm_indels.insert(ip->first); #ifdef DEBUG_ALIGN log_os << "COWSLIP: marking 1 nonnorm: " << ip->first << "\n"; #endif if(not is_removed) { cal_set_exclude[sorti] = true; is_any_excluded = true; smooth_path_lnp[sortj] = std::max(smooth_path_lnp[sorti],smooth_path_lnp[sortj]); #ifdef DEBUG_ALIGN log_os << "COWSLIP: excluding sorti: " << sorti << "\n"; #endif is_sorti_removed=true; } } is_removed=true; } if(is_sorti_removed) break; } } // recalc new max path in case it was excluded: // // note this loop is designed to take advantage of the high->low path_lnp sort // if(is_any_excluded) { for(unsigned i(0);i ipair(ibuff.pos_range_iter(max_pr.begin_pos,max_pr.end_pos)); for(iiter i(ipair.first);i!=ipair.second;++i){ const indel_key& ik(i->first); indel_data& id(i->second); #ifdef DEBUG_ALIGN log_os << "VARMIT: max path eval indel candidate: " << ik; #endif if(not is_candidate_indel(client_opt,db,ik,id)) continue; #ifdef DEBUG_ALIGN log_os << "VARMIT: max path indel is candidate\n"; #endif const bool is_indel_interfering(is_interfering_indel(max_cal_indels,ik)); #ifdef DEBUG_ALIGN log_os << "VARMIT: max path indel is non-interfering\n"; #endif const bool is_indel_present(max_cal_indels.count(ik)!=0); #ifdef DEBUG_ALIGN log_os << "VARMIT: indel present? " << is_indel_present << "\n"; #endif if(is_indel_present) { const std::pair both_bpo(get_alignment_indel_bp_overlap(max_cal.al,ik)); const int bpo(std::max(both_bpo.first,both_bpo.second)); #ifdef DEBUG_ALIGN log_os << "VARMIT: indel bp_overlap " << bpo << "\n"; #endif if(bpo < client_opt.min_read_bp_flank) { if(bpo>0) id.suboverlap_read_ids.insert(rseg.id()); continue; } } else { // check that the most likely alignment intersects this indel: #ifdef DEBUG_ALIGN log_os << "VARMIT: indel intersects max_path? " << is_range_intersect_indel_breakpoints(strict_max_pr,ik) << "\n"; #endif if(not is_range_intersect_indel_breakpoints(strict_max_pr,ik)) continue; } // all checks passed... indel will be evaluated for indel calling: // max_cal_eval_indels.insert(ik); is_max_cal_eval_indels_interfere[ik] = is_indel_interfering; } } // go through eval indel set and map which indels conflict with // each other // // for now we calc and store this info in a comically inefficient // manner // overlap_map_t indel_overlap_map; { indel_set_t::const_iterator i(max_cal_eval_indels.begin()); const indel_set_t::const_iterator i_end(max_cal_eval_indels.end()); for(;i!=i_end;++i){ indel_set_t::const_iterator j(i); ++j; for(;j!=i_end;++j) { if(is_indel_conflict(*i,*j)) { overlap_map_tick(indel_overlap_map,*i,*j); overlap_map_tick(indel_overlap_map,*j,*i); } } } } // (2b) Create index of highest scoring alignment set for each // evaluation indel (found in step 2a). The set of alignments // include all mutually consistent sets of indels which // interfere with the evalutation indel (including the eval // indel itself) 99% of the time this should just be the // indel and ref paths. 99% of the rest of the time this // will just be the paths of the indel, ref, and one // alternate indel allele. // // // First part of iks_map_t key is the called indel -- second part // of the key is the indel which is toggled on which interferes // with the called indel. When called indel is toggled on the // first and second keys are the same. Ptr is NULL for no-indel // (reference) allele case. // // Effect of this change is now we store maximum scoring alignment // for each interfering indel allele (not just the called indel // and reference). I don't realistically envision that more than // one indel will be required very often -- that's the case where we // would want a full-blown candidate haplotype model anyway. // // iks_map_t iks_max_path_lnp; { { // as a simple acceleration to the full max scoring // alignment path search below -- we go through all indel // states present in the global maximum scoring alignment, // taking advantage of our knowledge that this will be the // highest scoring path already: // indel_set_t::const_iterator i(max_cal_eval_indels.begin()), i_end(max_cal_eval_indels.end()); const align_info_t max_info(std::make_pair(max_path_lnp,&max_cal)); for(;i!=i_end;++i){ const bool is_indel_present(max_cal_indels.count(*i)!=0); if(is_indel_present){ iks_max_path_lnp[std::make_pair(*i,std::make_pair(is_indel_present,*i))] = max_info; // mark this as an alternate indel score for interfering indels: const indel_set_t& os(indel_overlap_map[*i]); indel_set_t::const_iterator j(os.begin()),j_end(os.end()); for(;j!=j_end;++j){ iks_max_path_lnp[std::make_pair(*j,std::make_pair(is_indel_present,*i))] = max_info; } } else { // check that this indel does not interfere with the max-set: if(not is_max_cal_eval_indels_interfere[*i]) { iks_max_path_lnp[std::make_pair(*i,std::make_pair(is_indel_present,*i))] = max_info; } } } } std::set::const_iterator si(cal_set.begin()),si_end(cal_set.end()); for(unsigned c(0);si!=si_end;++si,++c){ const candidate_alignment& ical(*si); const bool is_max_cal(&ical == &max_cal); if(cal_set_exclude[c]) { assert(not is_max_cal); continue; } if(is_max_cal) continue; const double path_lnp(cal_set_path_lnp[c]); indel_set_t ical_indels; get_alignment_indels(ical,client_opt.max_indel_size,ical_indels); indel_set_t::const_iterator i(max_cal_eval_indels.begin()), i_end(max_cal_eval_indels.end()); for(;i!=i_end;++i){ const indel_key& ik(*i); const bool is_indel_present(ical_indels.count(ik)!=0); if(is_indel_present) { const indel_status_t mkey(std::make_pair(ik,std::make_pair(is_indel_present,ik))); check_and_update_iks(iks_max_path_lnp,ik,is_indel_present,ik,path_lnp,&ical); // mark this as an alternate indel score for interfering indels: const indel_set_t& os(indel_overlap_map[ik]); indel_set_t::const_iterator j(os.begin()),j_end(os.end()); for(;j!=j_end;++j){ check_and_update_iks(iks_max_path_lnp,*j,is_indel_present,ik,path_lnp,&ical); } } else { // if indel is not present, we must determine // whether an interfering indel is present in the // alignment to score this alignment in the right // category: // bool is_interference(false); { const indel_set_t& os(indel_overlap_map[ik]); indel_set_t::const_iterator j(os.begin()),j_end(os.end()); for(;j!=j_end;++j){ if(ical_indels.count(*j)!=0) { is_interference=true; continue; } } } if(not is_interference) { check_and_update_iks(iks_max_path_lnp,ik,is_indel_present,ik,path_lnp,&ical); } } } } } // (3) assemble indel calling scores for each indel into the indel buffer: // // const unsigned read_length(rseg.read_size()); // for indel caller calculation we always need the "rest-of-genome" // alignment score, which is independent of alignment: double nonsite_path_lnp(0); { static const double nonsite_match_lnp(std::log(0.25)); const bam_seq bseq(rseg.get_bam_read()); for(unsigned i(0);isecond.second)); const std::pair both_bpo(get_alignment_indel_bp_overlap(alt_cal.al,ik)); const int bpo(std::max(both_bpo.first,both_bpo.second)); #ifdef DEBUG_ALIGN log_os << "VARMIT: called_indel_bp_overlap " << bpo << "\n"; #endif if(bpo < client_opt.min_read_bp_flank) { if(bpo>0) id_ptr->suboverlap_read_ids.insert(rseg.id()); continue; } indel_path_lnp=j->second.first; } #ifdef DEBUG_ALIGN log_os << "VARMIT: called_indel_path_lnp " << indel_path_lnp << "\n"; #endif // 2) indel absent w/o interference: double ref_path_lnp(0); { const iks_map_t::iterator j(iks_max_path_lnp.find(std::make_pair(ik,std::make_pair(false,ik)))); const bool is_found(j!=iks_max_path_lnp.end()); if(is_incomplete_search and (not is_found)) continue; if(not is_found) { if(is_safe_mode) { log_os << "WARNING: "; } else { log_os << "ERROR: "; } log_os << "failed to find reference alignement while evaluating read_segment:\n" << rseg << "\n"; if(is_safe_mode) { continue; } else { exit(EXIT_FAILURE); } } ref_path_lnp=j->second.first; #ifdef DEBUG_ALIGN log_os << "VARMIT: ref_path_lnp " << ref_path_lnp << "\n"; #endif } // assemble basic score data: // read_path_scores rps(ref_path_lnp,indel_path_lnp,nonsite_path_lnp,read_length); // start adding alternate indel alleles, if present: { const indel_set_t& os(indel_overlap_map[ik]); indel_set_t::const_iterator k(os.begin()),k_end(os.end()); for(;k!=k_end;++k){ const iks_map_t::iterator j(iks_max_path_lnp.find(std::make_pair(ik,std::make_pair(true,*k)))); #ifdef DEBUG_ALIGN log_os << "VARMIT: alternate_indel " << *k; #endif // TODO consider a way that the basic stores // could still be used even if we can't get all // alternate paths? // const bool is_found(j!=iks_max_path_lnp.end()); //if(is_incomplete_search and (not is_found)) continue; //assert(is_found); // for now, this case has to be allowed, because it is possible // for no alignment not including the called indel to be in range // of the alternate indel // // for now we continue when the alt case is not found. // // TODO tighten the re-alignment procedure so that we know when the // alt case should be missing -- possibly use ref_path instead of continuing // in this case // if(is_found) { rps.alt_indel[*k] = j->second.first; } else { // TODO look at these cases in more detail before allowing them! continue; //rp.alt_indel[*k] = ref_path_lnp; } #ifdef DEBUG_ALIGN //log_os << "VARMIT: alternate_indel_path_found: " << is_found << "\n"; log_os << "VARMIT: alternate_indel_path_lnp " << rps.alt_indel[*k] << "\n"; #endif } } id_ptr->read_path_lnp[rseg.id()] = rps; #ifdef DEBUG_ALIGN log_os << "VARMIT: modified indel data: " << *(id_ptr); #endif } } }