// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "pos_processor.hh" #include "blt_util/blt_exception.hh" #include "blt_util/log.hh" #include "blt_common/adjust_joint_eprob.hh" #include "blt_common/report_bacon_calls.hh" #include "blt_common/position_snp_call_lrt.hh" #include "blt_common/position_snp_call_pprob_digt.hh" #include "blt_common/position_snp_call_pprob_monogt.hh" #include "blt_common/position_snp_call_pprob_nploid.hh" #include "blt_common/position_strand_coverage_anomaly.hh" #include "blt_common/position_strand_distro_anomaly.hh" #include "blt_common/position_strand_distro_anomaly_lrt.hh" #include "blt_util/stat_util.hh" #include #include #include #include static void report_counts(const snp_pos_info& pi, const unsigned n_unused_calls, const unsigned output_pos, std::ostream& os) { unsigned base_count[N_BASE]; for(unsigned i(0);i dependent_eprob; for(unsigned b(0);b0. or _client_opt.bsnp_ssd_one_mismatch>0)); } pos_processor:: ~pos_processor() { reset(); pos_range output_report_range(_client_opt.report_range); if((not output_report_range.is_begin_pos) and _is_first_pos_set){ output_report_range.set_begin_pos(_min_pos); } if((not output_report_range.is_end_pos) and _is_first_pos_set){ output_report_range.set_end_pos(_max_pos+1); } std::ostream& report_os(get_report_os()); report_os << std::setprecision(8); report_stream_stat(_ss,"ALLSITES_COVERAGE",output_report_range,report_os); report_stream_stat(_used_ss,"ALLSITES_COVERAGE_USED",output_report_range,report_os); if(_client_opt.is_ref_set) { report_stream_stat(_ssn,"NO_REF_N_COVERAGE",output_report_range,report_os); report_stream_stat(_used_ssn,"NO_REF_N_COVERAGE_USED",output_report_range,report_os); } if(_ws) free(_ws); } void pos_processor:: add_pos_snp_info(const pos_t pos, const char ref, const base_call& bc){ if(not is_pos_reportable(pos)) return; handle_new_pos_value(pos); snp_pos_info& pi(_counts[pos_to_pindex(pos)]); // set pi.ref_base the first time we encounter this position, set // this value from the reference sequence if available, else set // from the first observation: // if(pi.calls.empty()){ if((_client_opt.is_ref_set) and (pos>=0) and (pos < _ref_size)){ pi.ref_base=base_to_id(_ref_seq[pos]); } else { pi.ref_base=ref; } } if(pi.ref_base != ref){ const pos_t output_pos(pos+1); if((pi.ref_base == 'N') or (ref == 'N')){ if(not pi.is_n_ref_warn) { const char prev_ref(pi.ref_base); if(pi.ref_base=='N') pi.ref_base = ref; log_os << "WARNING:: inconsistent reference base at pos: " << output_pos << "\n" << "\tprevious_ref: '" << prev_ref << "' new_ref: '" << ref << "'\n" << "\tProceeding with reference='" << pi.ref_base << "'; ignoring subsequent reference 'N's at this position.\n"; pi.is_n_ref_warn = true; } } else { log_os << "ERROR:: inconsistent reference base at pos: " << output_pos << "\n" << "\tprevious_ref: " << pi.ref_base << " new_ref: " << ref << "\n"; throw blt_exception("Inconsistent reference base implied by match descriptor."); } } pi.calls.push_back(bc); } void pos_processor:: reset() { const int fs(static_cast(_flank_size)); const int bshift(POS_BUFFER_SIZE-1-fs); const pos_range& rr(_client_opt.report_range); if(_is_first_pos_set){ // first finish writing buffered data in the report range and clear // ths from the buffer: pos_t final_in_buffer_report_pos(_max_pos); pos_t final_report_pos(_max_pos); if(rr.is_end_pos){ if( (rr.end_pos-1)<_max_pos) { final_in_buffer_report_pos = rr.end_pos-1; final_report_pos = rr.end_pos-1; } else if((rr.end_pos-1)>_max_pos){ final_report_pos = rr.end_pos-1; } } for(pos_t i(std::max(_min_pos,_max_pos-bshift));i<=final_in_buffer_report_pos;++i){ if((i+fs) > _max_pos) clear_pos(i+fs); process_pos(i); } for(pos_t i(std::max(_min_pos,_max_pos-(bshift+fs) ));i<=_max_pos;++i){ clear_pos(i); } // if the report range extends beyond the buffered data, call process_pos // on the cleared buffer positions to complete the report: for(pos_t i(final_in_buffer_report_pos+1);i<=final_report_pos;++i){ process_pos(i); } } else if(rr.is_begin_pos and rr.is_end_pos){ // never read any data in this case, so we just write out // the approriate range of zeros for consistency: // for(pos_t i(rr.begin_pos);i(_flank_size)); const int bshift(POS_BUFFER_SIZE-1-fs); if(not _is_first_pos_set){ const pos_range& rr(_client_opt.report_range); _max_pos = pos; _min_pos = pos; if(rr.is_begin_pos){ _min_pos = rr.begin_pos; for(pos_t i(_min_pos);i<(pos-bshift);++i){ process_pos(i); clear_pos(i); } } _is_first_pos_set = true; } if(pos < (_max_pos-bshift)){ log_os << "ERROR:: reference sequence position difference too high for pos_processor buffer\n" << "current_position:\t" << (pos+1) << "\n" << "top_position:\t" << (_max_pos+1) << "\n"; exit(EXIT_FAILURE); } if(pos < _min_pos) { _min_pos = pos; } if(pos > _max_pos) { // process older positions: for(pos_t i(std::max(_min_pos,_max_pos-bshift));i<(pos-bshift);++i){ // these positions need to be cleared so that empty // positions "in front" of the buffer range are not // counted as having basecalls from the back of the // buffer: if((i+fs) > _max_pos) clear_pos(i+fs); process_pos(i); } for(pos_t i(std::max(_min_pos,_max_pos-(bshift+fs) ));i<(pos-(bshift+fs));++i){ clear_pos(i); } _max_pos = pos; } } void pos_processor:: process_pos(const pos_t pos){ if(is_pos_reportable(pos)){ try { process_pos_snp(pos); } catch (...) { log_os << "Exception caught in pos_processor.process_pos_snp() while processing read_position: " << (pos+1) << "\n" << "snp_pos_info:\n" << _counts[pos_to_pindex(pos)] << "\n"; throw; } } } void pos_processor:: process_pos_snp(const pos_t pos){ snp_pos_info& pi(_counts[pos_to_pindex(pos)]); const unsigned n_calls(pi.calls.size()); const pos_t output_pos(pos+1); // for all but coverage-tests, we use a high-quality subset of the basecalls: // snp_pos_info good_pi; good_pi.ref_base = pi.ref_base; for(unsigned i(0);i=0) and (pos < _ref_size) and (_ref_seq[pos] != 'N')){ _ssn.update(n_calls); _used_ssn.update(n_used_calls); } } std::vector dependent_eprob; adjust_joint_eprob(_client_opt,pi,dependent_eprob,_is_dependent_eprob); // get fraction of filtered bases: const double filter_fraction(static_cast(n_unused_calls)/static_cast(n_calls)); const bool is_overfilter(filter_fraction > _client_opt.max_basecall_filter_fraction); // delay writing any snpcalls so that anomaly tests can (optionally) be applied as filters: // bacon_info bi; lrt_snp_call lsc; diploid_genotype dgt; monoploid_genotype mgt; std::auto_ptr ngt_ptr; // if(_client_opt.is_bacon_allele or _client_opt.is_bacon_snp){ get_bacon_scores(_client_opt,good_pi,n_unused_calls,bi); } if(pi.calls.empty()){ // report allele-caller output for empty lines if option to do so is set: if(_client_opt.is_bacon_allele and _client_opt.is_bacon_allele_print_empty){ report_bacon_allele_call(_client_io,output_pos,bi); } if(_client_opt.is_bsnp_diploid_allele_file and _client_opt.is_bsnp_diploid_allele_print_empty){ const diploid_genotype& edgt(get_empty_dgt(pi.ref_base)); write_bsnp_diploid_allele(_client_opt,_client_io,output_pos,pi.ref_base,n_used_calls,n_unused_calls,good_pi,edgt); } return; } if(_client_opt.is_counts){ report_counts(good_pi,n_unused_calls,output_pos,*_client_io.counts_osptr()); } if(_client_opt.is_lsnp){ position_snp_call_lrt(_client_opt.lsnp_alpha,good_pi,lsc); } if(_client_opt.is_bsnp_diploid){ position_snp_call_pprob_digt(_client_opt.bsnp_diploid_theta,good_pi, dependent_eprob,dgt,_client_opt.is_bsnp_diploid_allele_file); } if(_client_opt.is_bsnp_monoploid){ position_snp_call_pprob_monogt(_client_opt.bsnp_monoploid_theta,good_pi,mgt); } if(_client_opt.is_bsnp_nploid){ ngt_ptr.reset(new nploid_genotype(*_ninfo)); position_snp_call_pprob_nploid(_client_opt.bsnp_nploid_snp_prob,good_pi,*_ninfo,*ngt_ptr); } const bool is_snp(bi.is_snp or lsc.is_snp or dgt.is_snp or mgt.is_snp or (ngt_ptr.get() and ngt_ptr->is_snp)); // find anomalies: // bool is_pos_adis(false); bool is_pos_acov(false); if((_client_opt.is_adis_table or _client_opt.is_adis_lrt) and is_snp){ if(_client_opt.is_adis_table){ is_pos_adis = (is_pos_adis or position_strand_distro_anomaly(_client_opt.adis_table_alpha,good_pi,_ws)); } if(_client_opt.is_adis_lrt){ is_pos_adis = (is_pos_adis or position_strand_distro_anomaly_lrt(_client_opt.adis_lrt_alpha,good_pi)); } } if(_client_opt.is_adis_win_lrt and is_snp){ const int fs(static_cast(_client_opt.adis_win_lrt_flank_size)); double region_null_loghood(0); double region_alt_loghood(0); unsigned region_df(0); snp_pos_info good_ipi; for(int i(0);i<(1+2*fs);++i){ const pos_t ipos(i+pos-fs); const snp_pos_info& good_ipi_ref((ipos == pos) ? good_pi : good_ipi); if(ipos != pos){ /// tmp fix -- find a more efficient way to do this -- note that casava-bin filter results not included here: get_filtered_pos_info(_counts[pos_to_pindex(ipos)],good_ipi); } double null_loghood(0); double alt_loghood(0); unsigned df(0); position_strand_distro_anomaly_lrt_expert(good_ipi_ref,null_loghood,alt_loghood,df); if(df == 0) continue; region_null_loghood += null_loghood; region_alt_loghood += alt_loghood; region_df += df; } is_pos_adis = (is_pos_adis or is_lrt_reject_null(region_null_loghood, region_alt_loghood, region_df, _client_opt.adis_win_lrt_alpha)); } if(_client_opt.is_acov){ is_pos_acov = position_strand_coverage_anomaly(_client_opt.acov_alpha,pi); } const bool is_anomaly(is_pos_adis or is_pos_acov); const bool is_filter_snp(is_overfilter or (_client_opt.is_filter_anom and is_anomaly)); // in case of anomaly filtering -- still print out the allele caller line, but reduce the BaCON score to zero: // if(_client_opt.is_bacon_allele){ if(is_filter_snp) { bacon_scores& bas(bi.bas); bas.max_score = 0.; bas.max2_score = 0.; bas.max_base_id = base_to_id(pi.ref_base); bas.max2_base_id = base_to_id(pi.ref_base); bi.is_max2_reported = false; } report_bacon_allele_call(_client_io,output_pos,bi); } if(_client_opt.is_bsnp_diploid_allele_file){ const diploid_genotype* dgt_ptr(&dgt); if(is_filter_snp) { dgt_ptr=&get_empty_dgt(pi.ref_base); } write_bsnp_diploid_allele(_client_opt,_client_io,output_pos,pi.ref_base,n_used_calls,n_unused_calls,good_pi,*dgt_ptr); } // report events: // bool is_reported_event(false); std::ostream& report_os(get_report_os()); if(is_snp and not (is_filter_snp)) { if(_client_opt.is_bacon_snp and bi.is_snp) { report_bacon_snp_call(_client_io,output_pos,pi.ref_base,bi); } if(lsc.is_snp) { write_snp_prefix_info("LSNP",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os); report_os << " " << lsc << "\n"; } if(dgt.is_snp){ if(_client_opt.is_bsnp_diploid_file){ std::ostream& bos(*_client_io.bsnp_diploid_osptr()); write_snp_prefix_info_file(output_pos,pi.ref_base,n_used_calls,n_unused_calls,bos); bos << "\t"; write_diploid_genotype_snp(_client_opt,good_pi,dgt,bos); bos << "\n"; } else { write_snp_prefix_info("BSNP2",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os); report_os << " " << dgt << "\n"; } } if(mgt.is_snp) { write_snp_prefix_info("BSNP1",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os); report_os << " " << mgt << "\n"; } if(ngt_ptr.get() and ngt_ptr->is_snp) { write_snp_prefix_info("BSNPN",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os); report_os << " "; nploid_write(*_ninfo,*ngt_ptr,report_os); report_os << "\n"; } is_reported_event = true; } if(is_anomaly and not _client_opt.is_filter_anom){ if(is_pos_adis) report_os << "ANOM_DIS pos: " << output_pos << "\n"; if(is_pos_acov) report_os << "ANOM_COV pos: " << output_pos << "\n"; is_reported_event = true; } if(_client_opt.is_print_all_site_evidence or (_client_opt.is_print_evidence and is_reported_event)){ report_os << "EVIDENCE pos: " << output_pos << "\n" << "is_snp: " << is_snp << "\n" << "is_anomaly: " << is_anomaly << "\n" << pi << "\n"; } } const diploid_genotype& pos_processor:: get_empty_dgt(const char ref) const { if(ref!='N'){ return static_cast(*_empty_dgt[base_to_id(ref)]); } else { static const diploid_genotype n_dgt; return static_cast(n_dgt); } }