// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "pos_processor.hh" #include #include "blt_common/blt_arg_validate.hh" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace { const prog_info& pinfo(blt_info::get()); } #if 0 static void open_ifstream(std::ifstream& ifs, const char* filename){ ifs.open(filename); if( ! ifs ){ log_os << "ERROR:: Can't open file: " << filename << "\n"; exit(EXIT_FAILURE); } } #endif struct stream_pos { stream_pos(const pos_t p, const unsigned s) : pos(p), stream_no(s) {} // reverse the sort so that this works correctly on a priority_queue: bool operator<(const stream_pos& rhs) const { return pos > rhs.pos; } pos_t pos; unsigned stream_no; }; static void put_next_in_queue(boost::scoped_array >& exr_array, std::priority_queue& stream_queue, const unsigned stream_no){ if(exr_array[stream_no]->next()){ const pos_t base_pos(exr_array[stream_no]->exline()->match_position()-1); stream_queue.push(stream_pos(base_pos,stream_no)); } } /// \brief generate mismatch-filter map in a way that scales /// efficiently for large mismatch density windows: /// // // basic algo (described with 1-based indices): // // S=length(seq) F=length(flank) // // 1. delta list length is DL=max(1,S-2F) // 2. for each position: // 1. if position is mismatch, add 1 to delta list at position max(1,p-2F), and subtract 1 from delta list at position p+1 if p+1 <= DL // 3. sum delta list to match-count list // 4. match(pos) = delta[min(DL,max(1,pos-F))] // //..for window size 1: // // match: M // pos : 1 2 3 4 5 // delta: x 1 0 0 x // void create_mismatch_filter_map(const blt_options& client_opt, const unsigned read_end, const char* const read, const char* const read_ref_copy, const unsigned* const read_ref_map, const unsigned leading_insertion_end, const unsigned trailing_insertion_begin, bool* mismatch_filter_map, int* mismatch_count_ns) { const unsigned fs(client_opt.max_win_mismatch_flank_size); const unsigned fs2(fs*2); const unsigned delta_size(std::max(1+fs2,read_end)-fs2); int delta[MAX_READ_SIZE]; for(unsigned i(0);i=leading_insertion_end) and (i(client_opt.max_win_mismatch)); for(unsigned i(0);i > exr_array(new std::auto_ptr[n_sorted]); for(unsigned i(0);i ppr_ptr(new pos_processor(client_opt,client_dopt,ref_seq.c_str(),static_cast(ref_seq.size()),client_io)); pos_processor& ppr(*ppr_ptr); unsigned candidate_reads(0); unsigned sampled_reads(0); // start out by initiating every input stream (except the first) // on the stream queue. The first stream (current_stream) will be // initialized at the start of the stream-reading loop: // std::priority_queue stream_queue; for(unsigned i(1);i= rlimit.end_pos)) break; // Approximate begin range filter. Exact filter below uses // read_length and match descriptor: if(base_pos+MAX_READ_SIZE+MAX_READ_REF_DELETION_SIZE <= rlimit.begin_pos) continue; // now check various quality filters: if(not exl.is_passed_filter()) { brc.primary_filter++; continue; } const char* const md(exl.match_descriptor()); if((strlen(md) == 0) or (strcmp(md,"-") == 0)) { brc.unmapped++; continue; } if(not exl.is_single_alignment_score()){ log_os << "ERROR:: no single alignment score found in export line.\n"; exr.report_state(log_os); exit(EXIT_FAILURE); } if(exl.is_paired_alignment_score()){ if(exl.paired_alignment_score()0); if(read_size > MAX_READ_SIZE){ log_os << "ERROR:: maximum read size (" << MAX_READ_SIZE << ") exceeded in export line.\n"; exr.report_state(log_os); exit(EXIT_FAILURE); } if(not is_valid_seq(read)){ log_os << "ERROR:: unsupported base(s) in read sequence.\n"; exr.report_state(log_os); exit(EXIT_FAILURE); } unsigned read_ref_mapped_size(0); try { expand_match_descriptor(read,md,read_ref_copy,read_ref_map,read_size,read_ref_mapped_size); } catch (...) { log_os << "Exception caught in expand_match_descriptor()\n"; exr.report_state(log_os); throw; } // exact begin range filter: if(base_pos+static_cast(read_ref_mapped_size) <= rlimit.begin_pos) continue; // mark boundaries of any flanking insertion to reference: unsigned leading_insertion_end(read_size); for(unsigned i(0);i0;--i){ if(read_ref_map[i-1] != 0){ trailing_insertion_begin=i; break; } } // check if read spans too far over the reference sequence due // to deletions: if(read_ref_mapped_size > read_size+MAX_READ_REF_DELETION_SIZE){ brc.large_ref_deletion++; continue; } brc.used++; // don't count continous trailing 'N''s on a read: unsigned read_align_strand_end_skip(0); get_read_align_strand_end_skip(read,read_size,read_align_strand_end_skip); const unsigned read_end(read_size-read_align_strand_end_skip); // precompute mismatch density info for this read: if(client_opt.is_max_win_mismatch){ create_mismatch_filter_map(client_opt,read_end, read,read_ref_copy,read_ref_map, leading_insertion_end, trailing_insertion_begin, mismatch_filter_map, mismatch_count_ns); } const bool is_fwd_strand(exl.match_strand() == 'F'); const char* const quality(exl.quality()); for(unsigned i(0);i(read_ref_map[i])-1); pos_t pos(base_pos+read_ref_shift); char iref(read_ref_copy[i]); char icall(read[i]); if(not is_fwd_strand){ pos=base_pos+static_cast(read_ref_mapped_size)-read_ref_shift-1; iref=comp_base(iref); icall=comp_base(icall); } const int qval(char_to_qval(quality[i])); // do all the read-level filtering we can before we add the basecall to pos-processor. // pos-processor will handle filtration based on assembled read evidence. // bool is_call_filter(false); if((icall == 'N') or (qval < client_opt.min_qscore)){ is_call_filter=true; } // check window quality filters: // // 1. minimum average quality in a window around the current base -- edge handler: preserve window size towards the inside of the read. // 2. minimum number of mismatches in a window arournd the current base -- edge handler: preserve window size towards the inside of the read. // // do the window calculation in a completely naive way to simplify future tweaking: // if(not is_call_filter and client_opt.is_min_win_qscore) { const unsigned fs(client_opt.min_win_qscore_flank_size); const unsigned win_center(std::min(std::max(i,fs),read_end-fs-1)); // if fs is very large or read_end is very small, then just use the whole read: const unsigned win_begin( fs<=win_center ? win_center-fs : 0 ); const unsigned win_end( (win_center+fs)0); ppr.add_pos_snp_info(pos,base_to_id(iref), base_call(base_to_id(icall), qval,is_fwd_strand,i,read_end, is_call_filter,is_neighbor_mismatch)); } catch (...) { log_os << "Exception caught in pos_processor.add_pos_snp_info() while processing read_position: " << (i+1) << "\n"; exr.report_state(log_os); throw; } } } ppr_ptr.reset(); brc.report(report_os); }