// -*- mode: c++; indent-tabs-mode: nil; -*-
//
// Copyright 2009 Illumina, Inc.
//
// This software is covered by the "Illumina Genome Analyzer Software
// License Agreement" and the "Illumina Source Code License Agreement",
// and certain third party copyright/licenses, and any user of this
// source file is bound by the terms therein (see accompanying files
// Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
// Illumina_Source_Code_License_Agreement.pdf and third party
// copyright/license notices).
//
//

/// \file
///
/// \author Chris Saunders
///

#include "starling/align_path.hh"
#include "depth_buffer_util.hh"
#include "indel_util.hh"
#include "starling_indel_call_pprob_digt.hh"
#include "starling_indel_report_info.hh"
#include "starling_pos_processor.hh"
#include "starling_pos_processor_indel_util.hh"
#include "starling_read_align.hh"
#include "starling_read_util.hh"

#include "blt_common/adjust_joint_eprob.hh"
#include "blt_common/position_snp_call_lrt.hh"
#include "blt_common/position_snp_call_pprob_digt.hh"
#include "blt_common/position_snp_call_pprob_monogt.hh"
#include "blt_common/position_snp_call_pprob_nploid.hh"
#include "blt_common/position_strand_coverage_anomaly.hh"
#include "blt_common/position_strand_distro_anomaly.hh"
#include "blt_common/position_strand_distro_anomaly_lrt.hh"
#include "blt_common/report_bacon_calls.hh"

#include "blt_util/blt_exception.hh"
#include "blt_util/log.hh"
#include "blt_util/read_util.hh"

// (the bracketed system include list was lost from the source; the headers
// below cover the std::/C facilities referenced in this file)
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

//#define DEBUG_PPOS


static
void
report_counts(const snp_pos_info& pi,
              const unsigned n_unused_calls,
              const pos_t output_pos,
              std::ostream& os) {

    unsigned base_count[N_BASE];
    for(unsigned i(0);i<N_BASE;++i) base_count[i] = 0;

    // (the remainder of this function was lost from the source; the tally
    // and output below are a minimal best-guess reconstruction)
    const unsigned n_calls(pi.calls.size());
    for(unsigned i(0);i<n_calls;++i){
        base_count[pi.calls[i].base_id]++;
    }

    os << output_pos;
    for(unsigned i(0);i<N_BASE;++i) os << '\t' << base_count[i];
    os << '\t' << n_unused_calls << '\n';
}



// (signature reconstructed -- only the return expression survived)
static
int
get_influence_zone_size(const unsigned max_indel_size) {
    return static_cast<int>(get_read_buffer_size(max_indel_size))-1;
}



////////////////////////////////////////////////////////
// define starling_pos_processor stages:
//
namespace STAGE {

    enum index_t {
        HEAD=-1,
        READ_BUFFER,
        POST_ALIGN,
        POST_CALL,
        SIZE
    };

    // given max_indel_size, provide a vector of circular buffer stage
    // lengths:
    //
    static
    std::vector<unsigned>
    get_stage_size(const unsigned max_indel_size) {

        std::vector<unsigned> stage_size(SIZE,0);

        //
        // HEAD contains everything before the head position in the
        // stage processing pipeline, this should be:
        //
        // 1) GROUPER contigs/reads that were far out-of-order
        // 2) Exon entries that extend past the head position
        //
        // HEAD has no defined size -- it is everything stored before
        // the head position
        //

        //
        // READ_BUFFER is where most normal genomic reads are read in
        // and processed for indels, it needs enough room to collect the full
        // candidate indel map before we start read alignment:
        //
        // occurring at the beginning of stage READ_BUFFER (or before):
        //     collect and buffer read
        //     enter read into estimated depth
        //
        // occurring at the end of stage READ_BUFFER:
        //     read realignment
        //     base-call pos entry (pileup)
        //
        stage_size[READ_BUFFER] = get_read_buffer_size(max_indel_size);

        //
        // POST_ALIGN_STAGE - the end of this stage is where snp and indel
        // calling occur. It needs to be separated from the end of
        // READ_BUFFER_STAGE with enough room to allow for the repositioning
        // of reads to the 'left' during re-alignment.
        //
        // At the end of this stage we conduct:
        //
        //     indel calling
        //     snp-calling
        //     realigned read output
        //
        stage_size[POST_ALIGN] = max_indel_size;
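
        // Illustrative note (inferred from the stage sizes above, not part
        // of the original comments): a position is therefore called roughly
        // once the head position has advanced READ_BUFFER+POST_ALIGN
        // positions past it, and its pileup data survives one further
        // position before being freed.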
        // POST_CALL is used to free up the pileup data. This data is
        // preserved for one extra cycle after snp and indel calling so that
        // the indel caller can look one base 'to the left' of its call
        // location (excepting right breakpoints) to get the depth for the
        // current indel call
        //
        // At the end of this stage we free the position's pileup buffer
        //
        stage_size[POST_CALL] = 1;

        return stage_size;
    }
}



starling_pos_processor::
starling_pos_processor(const starling_options& client_opt,
                       const starling_deriv_options& client_dopt,
                       const std::string& ref_seq,
                       const starling_streams& client_io)
    : base_t(),
      _client_opt(client_opt),
      _client_dopt(client_dopt),
      _ref_seq(ref_seq),
      _client_io(client_io),
      _stage_size(STAGE::get_stage_size(_client_opt.max_indel_size)),
      _msm(_stage_size,client_opt.report_range,*this),
      _bc_buff(_ref_seq),
      _ws(0) {

#ifdef HAVE_FISHER_EXACT_TEST
    if(_client_opt.is_adis_table) {
        _ws = get_exact_test_ws();
    }
#endif

    if(_client_opt.is_bsnp_nploid){
        _ninfo.reset(new nploid_info(_client_opt.bsnp_nploid_ploidy));
    }

    if(_client_opt.is_bsnp_diploid_allele_file){
        snp_pos_info good_pi;
        std::vector<float> dependent_eprob;

        // precompute the 'empty' diploid genotype reported for uncovered
        // sites at each possible reference base (best-guess reconstruction;
        // the original loop body was lost from the source):
        for(unsigned b(0);b<N_BASE;++b){
            good_pi.ref_base = id_to_base(b);
            _empty_dgt[b].reset(new diploid_genotype);
            position_snp_call_pprob_digt(_client_opt.bsnp_diploid_theta,good_pi,dependent_eprob,
                                         *_empty_dgt[b],_client_opt.is_bsnp_diploid_allele_file);
        }
    }

    // (the leading clause of this condition was lost from the source; only
    // the site-specific dependent-error terms below survive)
    _is_dependent_eprob = ((_client_opt.bsnp_ssd_no_mismatch>0. or _client_opt.bsnp_ssd_one_mismatch>0));

    // define an expanded indel influence zone around the report range:
    const int bshift(get_influence_zone_size(_client_opt.max_indel_size));
    pos_range& rir( _report_influence_range);
    rir = _client_dopt.report_range_limit;
    if(rir.is_begin_pos) { rir.begin_pos -= bshift; }
    if(rir.is_end_pos) { rir.end_pos += bshift; }
}



starling_pos_processor::
~starling_pos_processor() {
    if(_ws) free(_ws);
}



void
starling_pos_processor::
reset() {
    _msm.reset();

    pos_range output_report_range(_client_opt.report_range);

    if((not output_report_range.is_begin_pos) and _msm.is_first_pos_set()){
        output_report_range.set_begin_pos(_msm.min_pos());
    }
    if((not output_report_range.is_end_pos) and _msm.is_first_pos_set()){
        output_report_range.set_end_pos(_msm.max_pos()+1);
    }

    std::ostream& report_os(get_report_os());
    report_os << std::setprecision(8);
    report_stream_stat(_ss,"ALLSITES_COVERAGE",output_report_range,report_os);
    report_stream_stat(_used_ss,"ALLSITES_COVERAGE_USED",output_report_range,report_os);
    if(_client_opt.is_ref_set) {
        report_stream_stat(_ssn,"NO_REF_N_COVERAGE",output_report_range,report_os);
        report_stream_stat(_used_ssn,"NO_REF_N_COVERAGE_USED",output_report_range,report_os);
    }
}



bool
starling_pos_processor::
insert_indel(const indel& in){

    //
    // ppr advance is controlled by the start positions of reads and
    // contigs, not indels. The rationale for this is that indels are
    // relatively cheap to store (so long as we aren't including
    // gigantic insert sequences) and do not scale up linearly with
    // increased coverage like reads do. For this reason our strategy
    // is to buffer the indels as far ahead as possible while leaving
    // the read buffer sizes fixed at a smaller value.
    //
    _msm.validate_new_pos_value(in.key.pos,STAGE::READ_BUFFER);

    return _indel_buff.insert_indel(in);
}



std::pair<bool,align_id_t>
starling_pos_processor::
insert_read(const bam_record& br,
            const alignment& al,
            const READ_ALIGN::index_t rat,
            const char* chrom_name,
            const bool is_usable_mapping,
            const align_id_t contig_id,
            const indel_set_t* contig_indels_ptr) {

    if(_chrom_name.empty()) {
        assert(NULL != chrom_name);
        assert(strlen(chrom_name));
        _chrom_name=chrom_name;
    } else {
        if(_chrom_name!=chrom_name){
            log_os << "ERROR: starling_read_buffer.insert_read(): read has unexpected sequence name: '" << chrom_name
                   << "' expecting: '" << _chrom_name << "'\n";
            log_os << "\tread_key: " << read_key(br) << "\n";
            exit(EXIT_FAILURE);
        }
    }

    // check at al.pos rather than buffer pos because we just need to
    // make sure that all candidate indels get entered by the end of
    // HEAD stage -- if the read aligns back past the end of head
    // stage it is ok so long as we know it will not generate any
    // candidate indels in this region:
    //
    if(not _msm.is_new_pos_value_valid(al.pos,STAGE::HEAD)) {
        log_os << "WARNING: skipping alignment for read: " << read_key(br)
               << " which falls outside of the read buffer\n";
        return std::make_pair(false,0);
    }

    const std::pair<bool,align_id_t> res(_read_buff.add_read_alignment(_client_opt.max_indel_size,
                                                                       br,al,is_usable_mapping,
                                                                       rat,contig_id));
    if(not res.first) return res;

    // must initialize the initial genomic read_segments "by-hand":
    //
    // TODO get this streamlined into the pos-processor
    //
    if(READ_ALIGN::GENOME==rat) {
        const starling_read* sread_ptr(_read_buff.get_read(res.second));
        assert(NULL!=sread_ptr);

        const seg_id_t seg_id(sread_ptr->is_segmented() ? 1 : 0 );
        init_read_segment(sread_ptr->get_segment(seg_id));
    }

    // add contig read indels to sppr (genomic reads handled within pos_processor):
    //
    if(READ_ALIGN::CONTIG==rat) {
        if(not (al.empty() or al.is_seq_swap())) {
            // TODO -- check that indels stay within the bounds of the ref_seq
            //
            // TODO -- check that multiple indels on the same read do not add-up to exceed the buffer size
            //
            // TODO -- normalize indels
            //
            static const INDEL_ALIGN_TYPE::index_t iat(INDEL_ALIGN_TYPE::CONTIG_READ);
            const bam_seq bseq(br.get_bam_read());
            try {
                add_alignment_indels_to_sppr(_client_opt.max_indel_size,
                                             al.pos,al.path,bseq,*this,iat,res.second,contig_indels_ptr);
            } catch (...) {
                log_os << "\nException caught in add_alignment_indels_to_sppr() while processing record: " << read_key(br) << "\n";
                throw;
            }
        }
    }

    return res;
}



// // this function is the last chance to check-for/warn-about/clean-out any
// // alignments that won't survive alignment and calling:
// //
// // note that cleared alignments still potentially have ids sitting in
// // the indel buffer, and this would be hard to clean out (except by
// // brute force)
// //
// void
// starling_pos_processor::
// clean_pos(const pos_t pos) {
//     std::vector<align_id_t> dead_meat;
//
//     starling_read_iter ri(_read_buff.get_pos_read_iter(pos));
//     starling_read* srp;
//     while(NULL!=(srp=ri.get_ptr())){
//         if(not srp->is_full_record()) {
//             log_os << "WARNING: incomplete read record must be removed from pipeline: " << srp->key() << "\n";
//             dead_meat.push_back(srp->id);
//         }
//         ri.next();
//     }
//
//     const unsigned ds(dead_meat.size());
//     for(unsigned i(0);i<ds;++i) { /* remove dead_meat[i] from the read buffer (original call lost) */ }
// }



// initialize read segments buffered at this head position for indel
// discovery (signature and loop opening reconstructed -- the originals were
// lost from the source):
void
starling_pos_processor::
init_read_segment_pos(const pos_t pos) {

    read_segment_iter ri(_read_buff.get_pos_read_segment_iter(pos));
    for(read_segment_iter::ret_val r;true;ri.next()){
        r=ri.get_ptr();
        if(NULL==r.first) break;
        init_read_segment(r.first->get_segment(r.second));
    }
}



// For all reads buffered at the current position:
// 1) determine the set of candidate indels that the read overlaps
// 2) determine the set of private indels within the read's discovery alignments
// 3) Find the most likely alignment given both sets of indels
// 4) evaluate the probability that the read supports each candidate indel vs. the reference
// 5) process most-likely alignment for snp-calling
// 5a) insert most-likely alignment into output read buffer (re-link within same data-structure?)
// 6) remove read from input read buffer
//
void
starling_pos_processor::
align_pos(const pos_t pos) {

    read_segment_iter ri(_read_buff.get_pos_read_segment_iter(pos));
    for(read_segment_iter::ret_val r;true;ri.next()){
        r=ri.get_ptr();
        if(NULL==r.first) break;
        read_segment& rseg(r.first->get_segment(r.second));
        if(_client_opt.is_realign_submapped_reads or
           rseg.is_treated_as_usable_mapping()){
            realign_and_score_read(_client_opt,_ref_seq,_estdepth_buff,
                                   rseg,_indel_buff);
        }
    }

#ifdef ESTDEPTH_DUMP
    if(_estdepth_buff.val(pos)>0){
        log_os << "ESTDEPTH: pos,val: " << pos << " " << _estdepth_buff.val(pos) << "\n";
    }
#endif
}



void
starling_pos_processor::
process_pos(const int stage_no,
            const pos_t pos){

    if (stage_no==STAGE::HEAD) {
        init_read_segment_pos(pos);
    } else if (stage_no==STAGE::READ_BUFFER) {
#ifdef DEBUG_PPOS
        _indel_buff.dump_pos(pos,log_os);
        _read_buff.dump_pos(pos,log_os);
#endif
        // clean_pos(pos);
        align_pos(pos);
        pileup_pos_reads(pos);
        // if(_client_opt.is_realigned_read_file) {
        //     rebuffer_pos_reads(pos);
        // }
        if(_client_opt.is_realigned_read_file) {
            write_reads(pos);
        }
        _read_buff.clear_pos(pos);
    } else if (stage_no==STAGE::POST_ALIGN){
        if(is_pos_reportable(pos)){
            process_pos_indel(pos);
            try {
                process_pos_snp(pos);
            } catch (...) {
                log_os << "Exception caught in starling_pos_processor.process_pos_snp() while processing chromosome position: " << (pos+1) << "\n"
                       << "snp_pos_info:\n";
                const snp_pos_info* spi_ptr(_bc_buff.get_pos(pos));
                if(NULL==spi_ptr){
                    static const snp_pos_info spi_null;
                    spi_ptr=&spi_null;
                }
                log_os << *spi_ptr << "\n";
                throw;
            }
        }
        _indel_buff.clear_pos(pos);
    } else if (stage_no==STAGE::POST_CALL) {
        _estdepth_buff.clear_pos(pos);
        _bc_buff.clear_pos(pos);
    } else {
        log_os << "ERROR: unexpected processing stage in starling_pos_processor\n";
        exit(EXIT_FAILURE);
    }
}



void
starling_pos_processor::
insert_pos_basecall(const pos_t pos,
                    const base_call& bc) {

    if(not is_pos_reportable(pos)) return;

    _msm.validate_new_pos_value(pos,STAGE::POST_ALIGN);
    _bc_buff.insert_pos_basecall(pos,bc);
}



static
void
get_indel_error_prob_hpol_len(const unsigned hpol_len,
                              double& insert_error_prob,
                              double& delete_error_prob){

    // indel error model parameters for P(error) = Ax+Bx^C, where x=hpol_len
    // note that fit does not cover length 1 deletions,
    // for which the estimated value is instead provided directly
    //
    // (for example, a length-10 homopolymer gives an insertion rate of
    //  g = 5.04e-6 + 3.31e-10*10^7.0 ~= 3.3e-3, so P(insert error) ~= 0.0033)
    //
    // \todo get these parameters out of the code!
    //
    static const double insert_A(5.03824e-7);
    static const double insert_B(3.30572e-10);
    static const double insert_C(6.99777);

    static const double delete_hpol1_err(3.00057e-6);
    static const double delete_A(1.09814e-5);
    static const double delete_B(5.19742e-10);
    static const double delete_C(6.99256);

    const double insert_g(insert_A*hpol_len+insert_B*std::pow(hpol_len,insert_C));
    insert_error_prob=(1.-std::exp(-insert_g));

    double delete_g(delete_hpol1_err);
    if(hpol_len>1){
        delete_g = delete_A*hpol_len+delete_B*std::pow(hpol_len,delete_C);
    }
    delete_error_prob=(1.-std::exp(-delete_g));
}



//
// "indel_error" is the probability that the read supporting the indel case is an error
// "ref_error" is the probability that the read supporting the ref case is an error
//
static
void
get_indel_error_prob(const starling_options& client_opt,
                     const starling_indel_report_info& iri,
                     double& indel_error_prob,
                     double& ref_error_prob){

    // cache results for any realistic homopolymer length:
    static const unsigned max_hpol_len(40);
    static bool is_init(false);
    static std::pair<double,double> indel_error_prob_len[max_hpol_len];

    if(not is_init) {
        if(not client_opt.is_simple_indel_error) {
            double itmp(0);
            double dtmp(0);
            for(unsigned i(0);i<max_hpol_len;++i){
                get_indel_error_prob_hpol_len(i+1,itmp,dtmp);
                indel_error_prob_len[i] = std::make_pair(itmp,dtmp);
            }
        }
        // NOTE: the simple-error branch, the homopolymer-length lookup that
        // sets indel_error_prob/ref_error_prob from this cache, and the end
        // of this function were lost from the source and are not
        // reconstructed here.
        is_init=true;
    }
}



// NOTE: the opening of this function was lost from the source; the signature
// and indel-buffer iteration below are a best-guess reconstruction so that
// the surviving per-indel calling code remains attached to its function:
void
starling_pos_processor::
process_pos_indel(const pos_t pos){

    const pos_t output_pos(pos+1);
    std::ostream& report_os(get_report_os());

    typedef indel_buffer::const_iterator ciiter;
    ciiter i(_indel_buff.pos_iter_begin(pos)), i_end(_indel_buff.pos_iter_end(pos));
    for(;i!=i_end;++i){
        const indel_key& ik(i->first);
        const indel_data& id(i->second);

        // if(id.is_conflict) continue;
        if(not is_candidate_indel(_client_opt,_estdepth_buff,ik,id)) continue;
        if(id.read_path_lnp.empty()) continue;

        // TODO implement indel overlap resolution
        //
        // punt conflict resolution for now....
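
        // For each candidate indel that passes the filters above, the block
        // below builds repeat-context report info for the error model, looks
        // up depth one base to the left of the call (or at the call position
        // for right breakpoints), converts that context into indel/ref error
        // probabilities, calls the diploid indel genotype, and writes any
        // call to the bindel-diploid file or the report stream.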
        bool is_indel(false);

        if(_client_opt.is_bindel_diploid){
            // indel_report_info needs to be run first now so that
            // local small repeat info is available to the indel
            // caller
            starling_indel_report_info iri;
            get_starling_indel_report_info(_client_dopt,ik,id,_ref_seq,iri);

            { // get depth of indel:
                pos_t depth_pos(pos-1);
                if(ik.type==INDEL::SV_BP_RIGHT) depth_pos=pos;
                const snp_pos_info* spi_ptr(_bc_buff.get_pos(depth_pos));
                if(NULL==spi_ptr) {
                    iri.depth=0;
                } else {
                    iri.depth=spi_ptr->calls.size();
                }
            }

            double indel_error_prob(0);
            double ref_error_prob(0);
            get_indel_error_prob(_client_opt,iri,indel_error_prob,ref_error_prob);

            starling_diploid_indel dindel;
            starling_indel_call_pprob_digt(_client_opt,_client_dopt,
                                           indel_error_prob,ref_error_prob,
                                           ik,id,dindel);

            if(dindel.is_indel) {
                is_indel=true;

                if(_client_opt.is_bindel_diploid_file){
                    std::ostream& bos(*_client_io.bindel_diploid_osptr());
                    bos << _chrom_name << "\t" << output_pos << "\t";
                    write_starling_diploid_indel_file(dindel,iri,bos);
                    bos << "\n";
                } else {
                    report_os << "BINDEL2 pos: " << output_pos << " ";
                    write_starling_diploid_indel(dindel,iri,report_os);
                    report_os << "\n";
                }
            }

            /// \TODO put this option under runtime control...
            /// \TODO setup option so that read keys persist longer when needed for this case...
            ///
            static const bool is_print_indel_evidence(false);

            if(is_print_indel_evidence and is_indel){
                report_os << "INDEL_EVIDENCE " << ik;

                typedef indel_data::score_t::const_iterator siter;
                siter i(id.read_path_lnp.begin()), i_end(id.read_path_lnp.end());
                for(;i!=i_end;++i){
                    const align_id_t read_id(i->first);
                    const read_path_scores& lnp(i->second);
                    const read_path_scores pprob(indel_lnp_to_pprob(_client_dopt,lnp));
                    const starling_read* srptr(_read_buff.get_read(read_id));

                    report_os << "read key: ";
                    if(NULL==srptr) report_os << "UNKNOWN_KEY";
                    else            report_os << srptr->key();
                    report_os << "\n"
                              << "read log_lhoods: " << lnp << "\n"
                              << "read pprobs: " << pprob << "\n";
                }
            }
        }
    }
}



#if 0
static
pos_t
get_new_read_pos(const starling_read& sr) {

    // get the best alignment for the read:
    const read_segment& rseg(sr.get_segment());
    const alignment* best_al_ptr(&(rseg.genome_align));
    if(rseg.is_realigned) best_al_ptr=&(rseg.realignment);

    if(best_al_ptr->empty()) return sr.buffer_pos; // a grouper contig read which was not realigned...
    else return best_al_ptr->pos;
}



// adjust read buffer position so that reads are buffered in sorted
// order after realignment:
//
void
starling_pos_processor::
rebuffer_pos_reads(const pos_t pos) {

    // need to queue up read changes and run at the end so that we
    // don't invalidate read buffer iterators
    //
    typedef std::pair<align_id_t,pos_t> read_pos_t;
    std::vector<read_pos_t> new_read_pos;

    starling_read_iter ri(_read_buff.get_pos_read_iter(pos));
    const starling_read* srp;
    while(NULL!=(srp=ri.get_ptr())){
        const pos_t new_pos(get_new_read_pos(*srp));
        if((new_pos!=pos) and
           (_msm.is_new_pos_value_valid(new_pos,STAGE::POST_ALIGN))){
            new_read_pos.push_back(std::make_pair(srp->id(),new_pos));
        }
        ri.next();
    }

    const unsigned nr(new_read_pos.size());
    for(unsigned i(0);i<nr;++i){
        // (best-guess reconstruction of the lost loop body; the buffer
        // method name is assumed)
        _read_buff.rebuffer_read(new_read_pos[i].first,new_read_pos[i].second);
    }
}
#endif



// write reads buffered at this position to the realigned-read output:
//
void
starling_pos_processor::
write_reads(const pos_t pos) {

    // (acquisition of the BAM writer was lost from the source; the accessor
    // name below is a best guess for the realigned-read output stream)
    bam_dumper& bamd(*_client_io.realign_bam_ptr());

    read_segment_iter ri(_read_buff.get_pos_read_segment_iter(pos));
    read_segment_iter::ret_val r;
    while(true){
        r=ri.get_ptr();
        if(NULL==r.first) break;
        if(r.first->segment_count()==r.second){
            r.first->write_bam(bamd);
        }
        ri.next();
    }
}



// convert reads buffered at position into a position basecall
// "pileup" to allow for downstream depth and snp calculations
//
void
starling_pos_processor::
pileup_pos_reads(const pos_t pos) {

    read_segment_iter ri(_read_buff.get_pos_read_segment_iter(pos));
    read_segment_iter::ret_val r;
    while(true){
        r=ri.get_ptr();
        if(NULL==r.first) break;
        const read_segment& rseg(r.first->get_segment(r.second));
        if(rseg.is_treated_as_usable_mapping()){
            pileup_read_segment(rseg);
        }
        ri.next();
    }
}



void
starling_pos_processor::
pileup_read_segment(const read_segment& rseg) {

    // get the best alignment for the read:
    const alignment* best_al_ptr(&(rseg.genome_align()));
    if(rseg.is_realigned){
        best_al_ptr=&(rseg.realignment);
    } else {
        // detect whether this read has no alignments with indels we can handle:
        if(not rseg.is_any_nonovermax(_client_opt.max_indel_size)) return;
    }

    const alignment& best_al(*best_al_ptr);

    if(best_al.empty()){
        if(not rseg.is_realigned) {
            if(_client_opt.verbosity >= LOG_LEVEL::ALLWARN) {
                log_os << "WARNING: skipping read_segment with no genomic alignment and contig alignment outside of indel.\n";
                log_os << "\tread_name: " << rseg.key();
            }
        } else {
            // this indicates that 2 or more equally likely alignments
            // of the entire read exist in the local realignment
            log_os << "WARNING: skipping read_segment which has multiple equally likely but incompatible alignments: " << rseg.key() << "\n";
        }
        return;
    }
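
    // The checks below are the final per-read filters applied before any
    // basecalls are entered into the pileup: the alignment must still start
    // inside the pileup buffer, its reference span may not exceed the read
    // length by more than max_indel_size, and it must overlap the exact
    // report range.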
    // check that read has not been realigned too far to the left:
    if(not _msm.is_new_pos_value_valid(best_al.pos,STAGE::POST_ALIGN)){
        assert(rseg.is_realigned);
        log_os << "WARNING: read realigned outside bounds of pileup buffer. Skipping...\n"
               << "\tread: " << rseg.key() << "\n";
        return;
    }

    const unsigned read_size(rseg.read_size());
    const bam_seq bseq(rseg.get_bam_read());
    const uint8_t* qual(rseg.qual());
    const uint8_t mapq(rseg.map_qual());
    const bool is_mapq_adjust(mapq<=80);

    // test read against max indel size (this is a backup, should have been taken care of upstream):
    const unsigned read_ref_mapped_size(apath_ref_length(best_al.path));
    if(read_ref_mapped_size > (read_size+_client_opt.max_indel_size)){
        //brc.large_ref_deletion++;
        return;
    }

    // exact begin and end report range filters:
    {
        const pos_range& rlimit(_client_dopt.report_range_limit);
        if(rlimit.is_end_pos and (best_al.pos>=rlimit.end_pos)) return;
        if(rlimit.is_begin_pos) {
            const pos_t al_end_pos(best_al.pos+static_cast<pos_t>(read_ref_mapped_size));
            if(al_end_pos <= rlimit.begin_pos) return;
        }
    }

    // find trimmed sections (as defined by the CASAVA 1.0 caller)
    unsigned fwd_strand_begin_skip(0);
    unsigned fwd_strand_end_skip(0);
    get_read_fwd_strand_skip(bseq,
                             best_al.is_fwd_strand,
                             fwd_strand_begin_skip,
                             fwd_strand_end_skip);

    assert(read_size>=fwd_strand_end_skip);
    const unsigned read_begin(fwd_strand_begin_skip);
    const unsigned read_end(read_size-fwd_strand_end_skip);

#ifdef DEBUG_PPOS
    std::cerr << "best_al: " << best_al;
    std::cerr << "read_size,read_rms: " << read_size << " " << read_ref_mapped_size << "\n";
    std::cerr << "read_begin,read_end: " << read_begin << " " << read_end << "\n";
#endif

    // find mismatch filtration:
    bool mismatch_filter_map[MAX_READ_SIZE];
    int mismatch_count_ns[MAX_READ_SIZE];

    // value used for is_neighbor_mismatch in case it is not measured:
    bool is_neighbor_mismatch(false);

    // precompute mismatch density info for this read:
    if(_client_opt.is_max_win_mismatch){
        const string_bam_seq ref_bseq(_ref_seq);
        create_mismatch_filter_map(_client_opt,best_al,ref_bseq,bseq,read_begin,read_end,
                                   mismatch_filter_map,mismatch_count_ns);
    }

    // alignment walkthrough:
    pos_t ref_head_pos(best_al.pos);
    unsigned read_head_pos(0);

    using namespace ALIGNPATH;
    const unsigned as(best_al.path.size());
    for(unsigned i(0);i<as;++i){
        const path_segment& ps(best_al.path[i]);

        // (the MATCH-segment and per-base loop headers below are a
        // best-guess reconstruction; the originals were lost from the source)
        if(MATCH==ps.type){
            for(unsigned j(0);j<ps.length;++j){
                const unsigned read_pos(read_head_pos+j);
                if((read_pos<read_begin) or (read_pos>=read_end)) continue; // allow for read end trimming

                const pos_t ref_pos(ref_head_pos+static_cast<pos_t>(j));

#ifdef DEBUG_PPOS
                std::cerr << "j,ref,read: " << j << " " << ref_pos << " " << read_pos << "\n";
#endif

                //const char ref(get_seq_base(_ref_seq,ref_pos));
                const uint8_t call_code(bseq.get_code(read_pos));
                uint8_t qscore(qual[read_pos]);
                if(is_mapq_adjust) {
                    qscore = qphred_to_mapped_qphred(qscore,mapq);
                }

                bool is_call_filter((call_code == BAM_BASE::ANY) or
                                    (qscore < _client_opt.min_qscore));

                assert(not _client_opt.is_min_win_qscore);

                if(not is_call_filter and _client_opt.is_max_win_mismatch){
                    is_call_filter = mismatch_filter_map[read_pos];
                }

                unsigned align_strand_read_pos(read_pos);
                unsigned end_trimmed_read_len(read_end);
                if(not best_al.is_fwd_strand){
                    align_strand_read_pos=read_size-(read_pos+1);
                    end_trimmed_read_len=read_size-fwd_strand_begin_skip;
                }

                if(_client_opt.is_max_win_mismatch){
                    is_neighbor_mismatch=(mismatch_count_ns[read_pos]>0);
                }

                try {
                    const uint8_t call_id(bam_seq_code_to_id(call_code));
                    insert_pos_basecall(ref_pos,
                                        base_call(call_id,qscore,best_al.is_fwd_strand,
                                                  align_strand_read_pos,end_trimmed_read_len,
                                                  is_call_filter,is_neighbor_mismatch));
                } catch (...) {
                    log_os << "Exception caught in starling_pos_processor.insert_pos_basecall() "
                           << "while processing read_position: " << (read_pos+1) << "\n";
                    throw;
                }
            }
        }

        if(is_segment_type_read_length(ps.type)) read_head_pos += ps.length;
        if(is_segment_type_ref_length(ps.type)) ref_head_pos += ps.length;
    }

    //    return READ_FATE::USED;
}



void
starling_pos_processor::
process_pos_snp(const pos_t pos){

    snp_pos_info null_pi;
    snp_pos_info* pi_ptr(_bc_buff.get_pos(pos));
    if(NULL==pi_ptr) pi_ptr=&null_pi;
    snp_pos_info& pi(*pi_ptr);

    const unsigned n_calls(pi.calls.size());
    const pos_t output_pos(pos+1);

    pi.ref_base=get_seq_base(_ref_seq,pos);

    // for all but coverage-tests, we use a high-quality subset of the basecalls:
    //
    snp_pos_info good_pi;
    good_pi.ref_base = pi.ref_base;
    for(unsigned i(0);i<n_calls;++i){
        // (best-guess reconstruction of the lost filter loop: keep only
        // unfiltered basecalls in the high-quality subset)
        if(pi.calls[i].is_call_filter) continue;
        good_pi.calls.push_back(pi.calls[i]);
    }

    const unsigned n_used_calls(good_pi.calls.size());
    const unsigned n_unused_calls(n_calls-n_used_calls);

    // (the coverage-statistic updates originally made here were lost from
    // the source)

    std::vector<float> dependent_eprob;
    adjust_joint_eprob(_client_opt,good_pi,dependent_eprob,_is_dependent_eprob);

    // get fraction of filtered bases:
    const double filter_fraction(static_cast<double>(n_unused_calls)/static_cast<double>(n_calls));
    const bool is_overfilter(filter_fraction > _client_opt.max_basecall_filter_fraction);

    // delay writing any snpcalls so that anomaly tests can (optionally) be applied as filters:
    //
    bacon_info bi;
    lrt_snp_call lsc;
    diploid_genotype dgt;
    monoploid_genotype mgt;
    std::auto_ptr<nploid_genotype> ngt_ptr;

    //
    if(_client_opt.is_bacon_allele or _client_opt.is_bacon_snp){
        get_bacon_scores(_client_opt,good_pi,n_unused_calls,bi);
    }

    if(pi.calls.empty()) {
        // report allele-caller output for empty lines if option to do so is set:
        if(_client_opt.is_bacon_allele and _client_opt.is_bacon_allele_print_empty){
            report_bacon_allele_call(_client_io,output_pos,bi);
        }
        if(_client_opt.is_bsnp_diploid_allele_file and _client_opt.is_bsnp_diploid_allele_print_empty){
            const diploid_genotype& edgt(get_empty_dgt(pi.ref_base));
            write_bsnp_diploid_allele(_client_opt,_client_io,_chrom_name,output_pos,pi.ref_base,n_used_calls,n_unused_calls,good_pi,edgt);
        }
        return;
    }

    if(_client_opt.is_counts){
        report_counts(good_pi,n_unused_calls,output_pos,*_client_io.counts_osptr());
    }

    if(_client_opt.is_lsnp){
        position_snp_call_lrt(_client_opt.lsnp_alpha,good_pi,lsc);
    }
    if(_client_opt.is_bsnp_diploid){
        position_snp_call_pprob_digt(_client_opt.bsnp_diploid_theta,good_pi,dependent_eprob,dgt,_client_opt.is_bsnp_diploid_allele_file);
    }
    if(_client_opt.is_bsnp_monoploid){
        position_snp_call_pprob_monogt(_client_opt.bsnp_monoploid_theta,good_pi,mgt);
    }
    if(_client_opt.is_bsnp_nploid){
        ngt_ptr.reset(new nploid_genotype(*_ninfo));
        position_snp_call_pprob_nploid(_client_opt.bsnp_nploid_snp_prob,good_pi,*_ninfo,*ngt_ptr);
    }

    const bool is_snp(bi.is_snp or lsc.is_snp or dgt.is_snp or mgt.is_snp or (ngt_ptr.get() and ngt_ptr->is_snp));

    // find anomalies:
    //
    bool is_pos_adis(false);
    bool is_pos_acov(false);

    if((_client_opt.is_adis_table or _client_opt.is_adis_lrt) and is_snp){
        if(_client_opt.is_adis_table){
            is_pos_adis = (is_pos_adis || position_strand_distro_anomaly(_client_opt.adis_table_alpha,good_pi,_ws));
        }
        if(_client_opt.is_adis_lrt){
            is_pos_adis = (is_pos_adis || position_strand_distro_anomaly_lrt(_client_opt.adis_lrt_alpha,good_pi));
        }
    }
    if(_client_opt.is_acov){
        is_pos_acov = position_strand_coverage_anomaly(_client_opt.acov_alpha,pi);
    }

    const bool is_anomaly(is_pos_adis or is_pos_acov);
    const bool is_filter_snp(is_overfilter or (_client_opt.is_filter_anom and is_anomaly));

    // in case of anomaly filtering -- still print out the allele caller line, but reduce the BaCON score to zero:
    //
    if(_client_opt.is_bacon_allele){
        if(is_filter_snp) {
            bacon_scores& bas(bi.bas);
            bas.max_score = 0.;
            bas.max2_score = 0.;
            bas.max_base_id = base_to_id(pi.ref_base);
            bas.max2_base_id = base_to_id(pi.ref_base);
            bi.is_max2_reported = false;
        }
        report_bacon_allele_call(_client_io,output_pos,bi);
    }

    if(_client_opt.is_bsnp_diploid_allele_file){
        const diploid_genotype* dgt_ptr(&dgt);
        if(is_filter_snp) {
            dgt_ptr=&get_empty_dgt(pi.ref_base);
        }
        write_bsnp_diploid_allele(_client_opt,_client_io,_chrom_name,output_pos,pi.ref_base,n_used_calls,n_unused_calls,good_pi,*dgt_ptr);
    }

    // report events:
    //
    bool is_reported_event(false);

    std::ostream& report_os(get_report_os());

    if(is_snp and not (is_filter_snp)) {
        if(_client_opt.is_bacon_snp and bi.is_snp) {
            report_bacon_snp_call(_client_io,output_pos,pi.ref_base,bi);
        }
        if(lsc.is_snp) {
            write_snp_prefix_info("LSNP",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os);
            report_os << " " << lsc << "\n";
        }
        if(dgt.is_snp){
            if(_client_opt.is_bsnp_diploid_file){
                std::ostream& bos(*_client_io.bsnp_diploid_osptr());
                write_snp_prefix_info_file(_chrom_name,output_pos,pi.ref_base,n_used_calls,n_unused_calls,bos);
                bos << "\t";
                write_diploid_genotype_snp(_client_opt,good_pi,dgt,bos);
                bos << "\n";
            } else {
                write_snp_prefix_info("BSNP2",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os);
                report_os << " " << dgt << "\n";
            }
        }
        if(mgt.is_snp) {
            write_snp_prefix_info("BSNP1",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os);
            report_os << " " << mgt << "\n";
        }
        if(ngt_ptr.get() and ngt_ptr->is_snp) {
            write_snp_prefix_info("BSNPN",output_pos,pi.ref_base,n_used_calls,n_unused_calls,report_os);
            report_os << " ";
            nploid_write(*_ninfo,*ngt_ptr,report_os);
            report_os << "\n";
        }

        is_reported_event = true;
    }

    if(is_anomaly and not _client_opt.is_filter_anom){
        if(is_pos_adis) report_os << "ANOM_DIS pos: " << output_pos << "\n";
        if(is_pos_acov) report_os << "ANOM_COV pos: " << output_pos << "\n";

        is_reported_event = true;
    }

    if(_client_opt.is_print_all_site_evidence or
       (_client_opt.is_print_evidence and is_reported_event)){
        report_os << "EVIDENCE pos: " << output_pos << "\n"
                  << "is_snp: " << is_snp << "\n"
                  << "is_anomaly: " << is_anomaly << "\n"
                  << pi << "\n";
    }
}



const diploid_genotype&
starling_pos_processor::
get_empty_dgt(const char ref) const {
    if(ref!='N'){
        return static_cast<const diploid_genotype&>(*_empty_dgt[base_to_id(ref)]);
    } else {
        static const diploid_genotype n_dgt;
        return static_cast<const diploid_genotype&>(n_dgt);
    }
}