/*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #include #include #include #include "common/Exceptions.hh" #include "statistics/Tile_Stats.h" /*****************************************************************************/ namespace cc=casava::common; namespace cs=casava::statistics; const unsigned int Tile_Stats::our_all_reads_read_num = 0; /*****************************************************************************/ Tile_Stats::Tile_Stats(const std::vector &read_list /*const unsigned int num_reads*/, size_t max_patterns_to_store) : my_lane_num(0), my_tile_num(0), my_read_list(read_list), /*my_num_reads(num_reads),*/ my_curr_read_num(0), my_curr_read_num_cycles(0), my_curr_read_num_blanks(0), my_non_blank_filter(my_non_blank_mask, my_read_blank_pattern_str, my_curr_read_num_blanks), my_all_stats_set_ptr(0), my_passed_filter_stats_set_ptr(0) { assert(!my_read_list.empty()); assert(std::find(my_read_list.begin(), my_read_list.end(), our_all_reads_read_num) == my_read_list.end()); // If there are multiple reads, also include all reads (read num 0). //unsigned int low_read_num((num_reads > 1) ? our_all_reads_read_num : 1); my_quality_type_list.push_back(QUALITY_RAW); my_quality_type_list.push_back(QUALITY_FILTERED); std::string key_str; for (Quality_Type_List_CIter quality_type_citer(my_quality_type_list.begin()); quality_type_citer != my_quality_type_list.end(); ++quality_type_citer) { for (std::vector::const_iterator read_num = my_read_list.begin(); my_read_list.end() != read_num; ++read_num) { make_key_str(*quality_type_citer, *read_num, key_str); my_key_stats_set_ptr_map[key_str] = new Stats_Set(max_patterns_to_store); } if (1 < my_read_list.size()) { make_key_str(*quality_type_citer, our_all_reads_read_num, key_str); my_key_stats_set_ptr_map[key_str] = new Stats_Set(max_patterns_to_store); my_key_stats_set_ptr_map[key_str]->mark_as_all_reads(); } } } /*****************************************************************************/ Tile_Stats::~Tile_Stats() { for (Key_Stats_Set_Ptr_Map_Iter pair_iter(my_key_stats_set_ptr_map.begin()); pair_iter != my_key_stats_set_ptr_map.end(); ++pair_iter) { delete pair_iter->second; pair_iter->second = 0; } } /*****************************************************************************/ void Tile_Stats::add_read_align_info(const Read_Align_Info& read_align_info) { if (my_lane_num == 0) { // First read. if ((read_align_info.my_lane_num < MIN_LANE_NUM) || (read_align_info.my_lane_num > MAX_LANE_NUM)) { std::cerr << "ERROR: Supplied lane number " << read_align_info.my_lane_num << " is outside valid range " << MIN_LANE_NUM << " - " << MAX_LANE_NUM << std::endl; throw std::range_error(""); } if (read_align_info.my_tile_num < MIN_TILE_NUM) { std::cerr << "ERROR: Supplied tile number " << read_align_info.my_tile_num << " is less than minimum " << MIN_TILE_NUM << std::endl; throw std::range_error(""); } if (read_align_info.my_read_num < 1) { std::cerr << "ERROR: Supplied read num " << read_align_info.my_read_num << " is less than 1" << std::endl; throw std::range_error(""); } my_lane_num = read_align_info.my_lane_num; my_tile_num = read_align_info.my_tile_num; } else { if (read_align_info.my_lane_num != my_lane_num) { std::cerr << "ERROR: Unexpected lane number " << read_align_info.my_lane_num << " (expected " << my_lane_num << ")" << std::endl; throw std::range_error(""); } if (read_align_info.my_tile_num != my_tile_num) { std::cerr << "ERROR: Unexpected tile number " << read_align_info.my_tile_num << " (expected " << my_tile_num << ")" << std::endl; throw std::range_error(""); } } const std::string& raw_read_str(read_align_info.raw_read_str()); const std::string& read_str(read_align_info.my_read_str); const std::string& ref_str(read_align_info.my_ref_str); const unsigned int read_len(raw_read_str.length()); if (read_align_info.my_read_num != my_curr_read_num) { handle_new_read(read_align_info.my_read_num, read_len); } if (read_len != my_curr_read_num_cycles) { std::cerr << "ERROR: Unexpected read sequence length " << read_len << " (expected " << my_curr_read_num_cycles << ")" << std::endl; throw std::range_error(""); } // Update non-blank totals by cycle. my_non_blank_filter.reset(); std::for_each(raw_read_str.begin(), raw_read_str.end(), my_non_blank_filter); const bool passed_blank_count(my_curr_read_num_blanks <= Stats_Set::MAX_NUM_BLANKS_IN_COMMON_WORDS); my_all_stats_set_ptr->apply_non_blank_mask(my_non_blank_mask); my_all_stats_set_ptr->most_common_blank_patterns().record_pattern(my_read_blank_pattern_str); if (passed_blank_count) { my_all_stats_set_ptr->most_common_words().record_pattern(raw_read_str); } if (read_align_info.passed_filter()) { my_passed_filter_stats_set_ptr->apply_non_blank_mask(my_non_blank_mask); my_passed_filter_stats_set_ptr->most_common_blank_patterns().record_pattern(my_read_blank_pattern_str); if (passed_blank_count) { my_passed_filter_stats_set_ptr->most_common_words().record_pattern(raw_read_str); } } my_all_stats_set_ptr->add_basecalls_stats(read_align_info.my_raw_read_str, read_align_info.my_qual_str); if (read_align_info.passed_filter()) { my_passed_filter_stats_set_ptr->add_basecalls_stats(read_align_info.my_raw_read_str, read_align_info.my_qual_str); } if (!read_align_info.is_aligned()) { // Nothing more to do. return; } // From this point the read is known to be aligned. // For export files, ref is generated from read. // (For align, length mismatches are handled at the point of reading.) assert(read_str.length() == ref_str.length()); const char del_ch(Read_Align_Info::our_del_ch); my_one_read_cum_errs_mask = 0; std::string::const_iterator read_ch_citer(read_str.begin()); std::string::const_iterator ref_ch_citer(ref_str.begin()); unsigned int read_cum_num_errs(0); unsigned int row_start_ele_ind(0); unsigned int num_insertion_bases(0); unsigned int num_deletion_bases(0); for (unsigned int cycle_num(1); cycle_num <= my_curr_read_num_cycles; ++cycle_num, ++read_ch_citer, ++ref_ch_citer, row_start_ele_ind += Cum_Errs::max_to_count) { const char read_ch(*read_ch_citer); const char ref_ch(*ref_ch_citer); // Skip past deletion (the padding char does not count as a cycle). if (read_ch == del_ch) { ++num_deletion_bases; --cycle_num; // OK for uint as 1-offset row_start_ele_ind -= Cum_Errs::max_to_count; // another undo continue; } // Skip past insertion. if (ref_ch == del_ch) { ++num_insertion_bases; continue; } if ((read_ch != the_blank_ch) && (ref_ch != the_blank_ch)) { if (read_ch != ref_ch) { ++read_cum_num_errs; } if (read_cum_num_errs <= Cum_Errs::max_to_count) { const unsigned int start_ind(row_start_ele_ind + read_cum_num_errs); const unsigned int num_eles(Cum_Errs::max_to_count - read_cum_num_errs); my_one_read_cum_errs_mask[std::slice(start_ind, num_eles, 1)] = 1; } } ++(my_all_stats_set_ptr->base_calls()(read_ch, ref_ch, cycle_num)); if (read_align_info.passed_filter()) { ++(my_passed_filter_stats_set_ptr->base_calls()(read_ch, ref_ch, cycle_num)); } } my_all_stats_set_ptr->add_align_stats(read_align_info.my_align_score, read_align_info.my_num_insertions, num_insertion_bases, read_align_info.my_num_deletions, num_deletion_bases); my_all_stats_set_ptr->cum_errs().vals() += my_one_read_cum_errs_mask; if (read_align_info.passed_filter()) { my_passed_filter_stats_set_ptr->add_align_stats(read_align_info.my_align_score, read_align_info.my_num_insertions, num_insertion_bases, read_align_info.my_num_deletions, num_deletion_bases); my_passed_filter_stats_set_ptr->cum_errs().vals() += my_one_read_cum_errs_mask; } } /*****************************************************************************/ void Tile_Stats::derive_stats() { if (my_num_reads() > 1) { derive_all_reads_stats(); } for (Key_Stats_Set_Ptr_Map_Iter pair_iter(my_key_stats_set_ptr_map.begin()); pair_iter != my_key_stats_set_ptr_map.end(); ++pair_iter) { pair_iter->second->derive_stats(); } } /*****************************************************************************/ void Tile_Stats::write_stat_files(Quality_Type quality_type, File_Buffer& score_file_buffer, const std::string &sample_name, const std::string &barcode, cs::AlignStatsXmlWriter &xml_writer) const { // with multi-reads, the set for all reads must be added to the list const size_t end_read_index = (1 == my_num_reads() ? 1 : my_num_reads() + 1); for (size_t read_index = 0; end_read_index > read_index; ++read_index) { const unsigned int read_num = (read_index >= my_num_reads() ? our_all_reads_read_num : my_read_list[read_index]); File_Buffer::Line_Vec line_vec; line_vec.push_back(hdr_str(my_lane_num, my_tile_num, quality_type, read_num)); if (!score_file_buffer.write_some_lines(line_vec)) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Failed to write statistics set to " + score_file_buffer.file_path_str())); } std::string key_str; make_key_str(quality_type, read_num, key_str); Key_Stats_Set_Ptr_Map_CIter ele_citer(my_key_stats_set_ptr_map.find(key_str)); Stats_Set* stats_set_ptr(ele_citer->second); if (our_all_reads_read_num != read_num) { xml_writer.addTile(sample_name, barcode, my_lane_num, read_num, my_tile_num, *stats_set_ptr); } stats_set_ptr->write(score_file_buffer); } } /*****************************************************************************/ const std::string Tile_Stats::hdr_str(unsigned int lane_num, unsigned int tile_num, Quality_Type quality_type, unsigned int read_num) { std::stringstream hdr_ostrm; hdr_ostrm << "# Lane " << lane_num << " : Tile " << tile_num << " : " << qual_str(quality_type) << " : "; if (read_num == 0) { hdr_ostrm << "All Reads"; } else { hdr_ostrm << "Read " << read_num; } return hdr_ostrm.str(); } /*****************************************************************************/ const std::string Tile_Stats::qual_str(Quality_Type quality_type) { std::string str; switch (quality_type) { case QUALITY_RAW: str = "Raw"; break; case QUALITY_FILTERED: str = "Quality Filtered"; break; } return str; } /*****************************************************************************/ void Tile_Stats::make_key_str(Quality_Type quality_type, const unsigned int read_num, std::string& key_str) const { std::ostringstream ostrm; switch (quality_type) { case QUALITY_RAW: ostrm << "raw"; break; case QUALITY_FILTERED: ostrm << "QF"; break; } ostrm << ": " << read_num; key_str = ostrm.str(); } /*****************************************************************************/ void Tile_Stats::handle_new_read(const unsigned int read_num, const unsigned int num_cycles) { if (std::find(my_read_list.begin(), my_read_list.end(), read_num) == my_read_list.end()) { std::cerr << "ERROR: Read number found (" << read_num << ") is outside expected read list (1 - " << my_num_reads() << ")" << std::endl; throw std::range_error(""); } my_curr_read_num = read_num; my_curr_read_num_cycles = num_cycles; my_non_blank_mask.resize(num_cycles); my_read_blank_pattern_str.resize(num_cycles); my_one_read_cum_errs_mask.resize(num_cycles * Cum_Errs::max_to_count); std::string key_str; make_key_str(QUALITY_RAW, read_num, key_str); my_all_stats_set_ptr = my_key_stats_set_ptr_map[key_str]; my_all_stats_set_ptr->set_num_cycles(num_cycles); make_key_str(QUALITY_FILTERED, read_num, key_str); my_passed_filter_stats_set_ptr = my_key_stats_set_ptr_map[key_str]; my_passed_filter_stats_set_ptr->set_num_cycles(num_cycles); } /*****************************************************************************/ void Tile_Stats::derive_all_reads_stats() { assert(my_num_reads() > 1); std::string key_str; for (Quality_Type_List_CIter quality_type_citer(my_quality_type_list.begin()); quality_type_citer != my_quality_type_list.end(); ++quality_type_citer) { make_key_str(*quality_type_citer, our_all_reads_read_num, key_str); Stats_Set* all_reads_stats_set_ptr = my_key_stats_set_ptr_map[key_str]; unsigned int total_num_cycles(0); for (std::vector::const_iterator read_iterator = my_read_list.begin(); my_read_list.end() != read_iterator; ++read_iterator) { make_key_str(*quality_type_citer, *read_iterator, key_str); Stats_Set* read_stats_set_ptr = my_key_stats_set_ptr_map[key_str]; total_num_cycles += read_stats_set_ptr->num_cycles(); } all_reads_stats_set_ptr->set_num_cycles(total_num_cycles); unsigned int lo_cycle_num(1); unsigned int hi_cycle_num(1); for (std::vector::const_iterator read_iterator = my_read_list.begin(); my_read_list.end() != read_iterator; ++read_iterator) { make_key_str(*quality_type_citer, *read_iterator, key_str); Stats_Set* read_stats_set_ptr = my_key_stats_set_ptr_map[key_str]; hi_cycle_num = lo_cycle_num + read_stats_set_ptr->num_cycles() - 1; all_reads_stats_set_ptr->add_align_stats(*read_stats_set_ptr); all_reads_stats_set_ptr-> base_calls().replace_cycles(lo_cycle_num, hi_cycle_num, read_stats_set_ptr->base_calls()); lo_cycle_num = hi_cycle_num + 1; } } } /*****************************************************************************/