// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "starling_indel_report_info.hh" #include "blt_util/seq_util.hh" #include "boost/lexical_cast.hpp" #include #include #ifdef DEBUG_REPORT #include "blt_util/log.hh" #endif // switches on the CASAVA 1.8 treatment of alternate overlapping alleles: // const bool is_use_alt_indel(true); /// get the indel cigar and ref and indel strings used in the indel /// summary line output /// static void get_indel_summary_strings(const indel_key& ik, const indel_data& id, const std::string& ref_seq, std::string& indel_desc, std::string& indel_seq, std::string& indel_ref_seq){ indel_desc.clear(); if((ik.type == INDEL::INSERT) or (ik.type == INDEL::DELETE)){ indel_desc += boost::lexical_cast(ik.length); if(ik.type == INDEL::INSERT) indel_desc += 'I'; else indel_desc += 'D'; } else { if (ik.type == INDEL::SV_BP_LEFT) { indel_desc = "BP_LEFT"; } else if(ik.type == INDEL::SV_BP_RIGHT){ indel_desc = "BP_RIGHT"; } else { assert(0); } } if((ik.type == INDEL::INSERT) or (ik.type == INDEL::SV_BP_LEFT) or (ik.type == INDEL::SV_BP_RIGHT)) { indel_seq = id.seq; indel_ref_seq = std::string(indel_seq.size(),'-'); } else { const char* rp(ref_seq.c_str()); const pos_t rs(ref_seq.size()); pos_t pos(ik.pos); const pos_t end_pos(ik.right_pos()); for(;pos(n_alleles)); static const double allele_lnprior(std::log(allele_prior)); read_path_scores pprob; pprob.nonsite = path_lnp.nonsite + client_dopt.nonsite_lnprior; pprob.ref = path_lnp.ref + client_dopt.site_lnprior + allele_lnprior; pprob.indel = path_lnp.indel + client_dopt.site_lnprior + allele_lnprior; if(is_use_alt_indel) { aciter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end()); for(;j!=j_end;++j){ pprob.alt_indel[j->first] = j->second + client_dopt.site_lnprior + allele_lnprior; } } double scale(std::max(pprob.nonsite,std::max(pprob.ref,pprob.indel))); if(is_use_alt_indel) { aiter j(pprob.alt_indel.begin()), j_end(pprob.alt_indel.end()); for(;j!=j_end;++j){ if(scale < j->second) scale = j->second; } } pprob.nonsite = std::exp(pprob.nonsite-scale); pprob.ref = std::exp(pprob.ref-scale); pprob.indel = std::exp(pprob.indel-scale); if(is_use_alt_indel) { aiter j(pprob.alt_indel.begin()), j_end(pprob.alt_indel.end()); for(;j!=j_end;++j){ j->second = std::exp((j->second)-scale); } } double sum(pprob.nonsite+pprob.ref+pprob.indel); if(is_use_alt_indel) { aciter j(pprob.alt_indel.begin()), j_end(pprob.alt_indel.end()); for(;j!=j_end;++j){ sum += j->second; } } pprob.nonsite /= sum; pprob.ref /= sum; pprob.indel /= sum; if(is_use_alt_indel) { aiter j(pprob.alt_indel.begin()), j_end(pprob.alt_indel.end()); for(;j!=j_end;++j){ j->second /= sum; } } return pprob; } void get_starling_indel_report_info(const starling_deriv_options& client_dopt, const indel_key& ik, const indel_data& id, const std::string& ref_seq, starling_indel_report_info& iri){ // indel summary info get_indel_summary_strings(ik,id,ref_seq,iri.desc,iri.indel_seq,iri.ref_seq); iri.it=ik.type; const pos_t indel_begin_pos(ik.pos); const pos_t indel_end_pos(ik.right_pos()); const char* rp(ref_seq.c_str()); const pos_t rs(ref_seq.size()); // context: { static const unsigned INDEL_CONTEXT_SIZE(10); if(ik.type != INDEL::SV_BP_RIGHT) { iri.ref_upstream.clear(); for(pos_t i(indel_begin_pos-static_cast(INDEL_CONTEXT_SIZE));i(INDEL_CONTEXT_SIZE));++i){ iri.ref_downstream += get_seq_base(rp,rs,i); } } else { iri.ref_downstream = "N/A"; } } // repeat analysis: { iri.repeat_unit = "N/A"; iri.ref_repeat_count = 0; iri.indel_repeat_count = 0; if( (iri.it == INDEL::INSERT) or (iri.it == INDEL::DELETE) ){ unsigned indel_local_repeat_count(0); if (iri.it == INDEL::INSERT){ get_seq_repeat_unit(iri.indel_seq,iri.repeat_unit,indel_local_repeat_count); } else if(iri.it == INDEL::DELETE){ get_seq_repeat_unit(iri.ref_seq,iri.repeat_unit,indel_local_repeat_count); } // count repeats in contextual sequence: const int repeat_unit_size(static_cast(iri.repeat_unit.size())); unsigned indel_context_repeat_count(0); // count upstream repeats: for(pos_t i(indel_begin_pos-repeat_unit_size);i>=0;i-=repeat_unit_size){ bool is_repeat(true); for(int j(0);j(repeat_unit_size)-1)second)); if (pprob.ref >= path_pprob_thresh) { iri.n_q30_ref_reads++; } else if(pprob.indel >= path_pprob_thresh) { iri.n_q30_indel_reads++; } else { typedef read_path_scores::alt_indel_t::const_iterator aciter; bool is_alt_found(false); aciter j(pprob.alt_indel.begin()), j_end(pprob.alt_indel.end()); for(;j!=j_end;++j){ if(j->second >= path_pprob_thresh) { iri.n_q30_alt_reads++; is_alt_found=true; break; } } if(not is_alt_found) { n_subscore_reads++; } } } // total number of reads with non-zero, yet insufficient indel // breakpoint overlap const unsigned n_suboverlap_reads(id.suboverlap_read_ids.size()); iri.n_other_reads = (n_subscore_reads+n_suboverlap_reads); } }