/*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #include #include #include #include #include #include #include #include "common/Exceptions.hh" #include "Likelihood_Calc.h" #include "Proportion_Calc.h" #include "statistics/Stats_Set.h" namespace cc=casava::common; /*****************************************************************************/ Stats_Set::Stats_Set(size_t max_patterns_to_store) : my_all_reads_flag(false), my_q20_bases_total(0), my_q30_bases_total(0), my_quality_score_sum(0), my_num_clusters_total(0), my_num_bases_total(0), my_num_bases_aligned(0), my_num_errs(0), my_num_blanks(0), my_num_unique_aligns(0), my_align_score_sum(0), my_num_insertions(0), my_num_insertion_bases(0), my_num_deletions(0), my_num_deletion_bases(0), my_most_common_words(max_patterns_to_store), my_most_common_blank_patterns(max_patterns_to_store) { ; } /*****************************************************************************/ Stats_Set::~Stats_Set() { ; } /*****************************************************************************/ void Stats_Set::set_num_cycles(const unsigned int num_cycles) { my_cycle_non_blank_freqs.resize(num_cycles); my_base_calls.resize(num_cycles); my_cum_errs.set_num_cycles(num_cycles); } /*****************************************************************************/ unsigned int Stats_Set::num_cycles() const { return my_base_calls.size(Uint_Base_Calls::DIM_CYCLE); } /*****************************************************************************/ void Stats_Set::add_align_stats(unsigned int align_score, unsigned int num_insertions, unsigned int num_insertion_bases, unsigned int num_deletions, unsigned int num_deletion_bases) { ++my_num_unique_aligns; // probably should derive this instead my_align_score_sum += align_score; my_num_insertions += num_insertions; my_num_insertion_bases += num_insertion_bases; my_num_deletions += num_deletions; my_num_deletion_bases += num_deletion_bases; } void Stats_Set::add_basecalls_stats(const std::string &read, const std::string &qual) { ++my_num_clusters_total; my_num_bases_total += read.length(); my_q20_bases_total += std::count_if(qual.begin(), qual.end(), boost::bind(&boost::cref, _1) >= (64u + 20u)); my_q30_bases_total += std::count_if(qual.begin(), qual.end(), boost::bind(&boost::cref, _1) >= (64u + 30u)); my_quality_score_sum += std::accumulate(qual.begin(), qual.end(), 0) - qual.length() * 64u; } /*****************************************************************************/ void Stats_Set::add_align_stats(const Stats_Set& stats_set) { my_num_unique_aligns += stats_set.my_num_unique_aligns; my_align_score_sum += stats_set.my_align_score_sum; my_num_insertions += stats_set.my_num_insertions; my_num_insertion_bases += stats_set.my_num_insertion_bases; my_num_deletions += stats_set.my_num_deletions; my_num_deletion_bases += stats_set.my_num_deletion_bases; } /*****************************************************************************/ void Stats_Set::apply_non_blank_mask(const std::valarray& non_blank_mask) { my_cycle_non_blank_freqs += non_blank_mask; } /*****************************************************************************/ void Stats_Set::derive_proportions(const Uint_Base_Calls& base_calls, Float_Base_Calls& pc_base_calls, Uint_Base_Calls::Dimension sum_dimension, bool as_percents, bool exclude_read_blanks, bool exclude_ref_blanks) { pc_base_calls.resize(base_calls.size(Uint_Base_Calls::DIM_CYCLE)); Uint_Base_Calls::Const_Range base_calls_range = base_calls.get_crange(sum_dimension, exclude_read_blanks, exclude_ref_blanks); Float_Base_Calls::Range pc_base_calls_range = pc_base_calls.get_range((Float_Base_Calls::Dimension) sum_dimension, exclude_read_blanks, exclude_ref_blanks); for ( ; base_calls_range(); ++base_calls_range, ++pc_base_calls_range) { float sum = std::accumulate(base_calls_range.begin(), base_calls_range.end(), 0); Proportion_Calc proportion_calc(sum, as_percents); std::transform(base_calls_range.begin(), base_calls_range.end(), pc_base_calls_range.begin(), proportion_calc); } } /*****************************************************************************/ void Stats_Set::derive_likelihoods() { my_likelihoods.resize(my_base_calls.size(Uint_Base_Calls::DIM_CYCLE)); const bool exclude_read_blanks(false); const bool exclude_ref_blanks(true); Uint_Base_Calls::Const_Range base_calls_range = my_base_calls.get_crange(Uint_Base_Calls::DIM_REF, exclude_read_blanks, exclude_ref_blanks); Int_Base_Calls::Range likelihoods_range = my_likelihoods.get_range(Int_Base_Calls::DIM_REF, exclude_read_blanks, exclude_ref_blanks); for ( ; base_calls_range(); ++base_calls_range, ++likelihoods_range) { float sum = std::accumulate(base_calls_range.begin(), base_calls_range.end(), 0); Likelihood_Calc likelihood_calc(sum); std::transform(base_calls_range.begin(), base_calls_range.end(), likelihoods_range.begin(), likelihood_calc); } } /*****************************************************************************/ void Stats_Set::derive_stats() { my_base_calls.sum_cycles(my_sum_over_cycles_base_calls); my_num_bases_aligned = (my_sum_over_cycles_base_calls.read_count(Uint_Base_Calls::READ_ANY)); my_num_errs = (my_sum_over_cycles_base_calls.read_count(Uint_Base_Calls::READ_ERR)); my_num_blanks = (my_sum_over_cycles_base_calls.read_count(Uint_Base_Calls::READ_BLANK)); my_basic_stats.derive(my_base_calls); // derive_proportions : base_calls, pc_base_calls, sum_dimension, // as_percents=true, exclude_read_blanks=false, exclude_ref_blanks=false derive_proportions(my_sum_over_cycles_base_calls, my_pc_reads_by_ref, Uint_Base_Calls::DIM_READ); derive_proportions(my_sum_over_cycles_base_calls, my_pc_reads_excl_blanks_by_ref, Uint_Base_Calls::DIM_READ, true, true); derive_proportions(my_sum_over_cycles_base_calls, my_pc_refs_by_read, Uint_Base_Calls::DIM_REF, true, false, true); derive_proportions(my_base_calls, my_prop_reads_by_ref_cycle, Uint_Base_Calls::DIM_READ, false); if (!is_all_reads()) { my_info_contents.derive(my_base_calls, my_cycle_non_blank_freqs); my_most_common_words.find_most_frequent(); my_most_common_blank_patterns.find_most_frequent(); derive_likelihoods(); } // FIXME : This should not be needed (use external formatter instead). set_formats(); } /*****************************************************************************/ // FIXME : This should not be needed (use external formatter instead). void Stats_Set::set_formats() { // Base_Calls_Format args :- // col_label_suffix, row_prefix_ch, // transpose, display_cycles, suppress_ref_blanks, suppress_read_blanks, // num_dec_places my_pc_reads_by_ref. set_format(Base_Calls_Format("pc", ' ', true, false, true, false, 2)); my_pc_reads_excl_blanks_by_ref. set_format(Base_Calls_Format("pc", ' ', true, false, true, true, 2)); my_pc_refs_by_read. set_format(Base_Calls_Format("pc", ' ', false, false, true, false, 2)); my_sum_over_cycles_base_calls. set_format(Base_Calls_Format("ct", ' ', false, false, true, false, 0)); my_base_calls. set_format(Base_Calls_Format("ct", ' ', false, true, true, false, 0)); my_prop_reads_by_ref_cycle. set_format(Base_Calls_Format("prp", '@', false, true, true, false, 3)); my_info_contents.set_cycle_prefix_ch('~'); my_likelihoods. set_format(Base_Calls_Format("", '>', false, true, true, false, 0)); my_cum_errs.set_cycle_prefix_ch('!'); } /*****************************************************************************/ void add_line(File_Buffer::Line_Vec& line_vec, char* line_buf_ptr, const char* format_cstr, ...) { va_list var_arg_list; va_start(var_arg_list, format_cstr); vsprintf(line_buf_ptr, format_cstr, var_arg_list); va_end(var_arg_list); line_vec.push_back(line_buf_ptr); } /*****************************************************************************/ void Stats_Set::write(File_Buffer& file_buffer) const { File_Buffer::Line_Vec line_vec; char line_buf[1280]; line_vec.push_back("Aligned bases (outside insertions or deletions) :-"); add_line(line_vec, line_buf, "%u bases of sequence found", my_num_bases_aligned); add_line(line_vec, line_buf, "%u were errors", my_num_errs); add_line(line_vec, line_buf, "%u were blanks", my_num_blanks); if ((my_num_bases_aligned > 0) && ((my_num_bases_aligned - my_num_blanks) > 0)) { add_line(line_vec, line_buf, "%5.2f percent error rate", (100.0 * my_num_errs) / (my_num_bases_aligned - my_num_blanks)); add_line(line_vec, line_buf, "%5.2f percent blank rate", (100.0 * my_num_blanks) / my_num_bases_aligned); } line_vec.push_back("\nIndels :-"); add_line(line_vec, line_buf, "%u insertions", my_num_insertions); if (my_num_insertions > 0) { add_line(line_vec, line_buf, "%u insertion bases", my_num_insertion_bases); add_line(line_vec, line_buf, "mean insertion length %6.2f", (1.0 * my_num_insertion_bases) / my_num_insertions); } add_line(line_vec, line_buf, "%u deletions", my_num_deletions); if (my_num_deletions > 0) { add_line(line_vec, line_buf, "%u deletion bases", my_num_deletion_bases); add_line(line_vec, line_buf, "mean deletion length %6.2f", (1.0 * my_num_deletion_bases) / my_num_deletions); } std::ostringstream ostrm; ostrm << std::endl; ostrm << "unique alignments : " << my_num_unique_aligns << " (total score " << my_align_score_sum << ")" << std::endl << "cycles : " << num_cycles() << std::endl << std::endl << "All error breakdowns and patterns below refer to " << "non-indel bases." << std::endl; if (!is_all_reads()) { ostrm << "Breakdown of errors by cycle" << std::endl << my_basic_stats << std::endl; } ostrm << "Error rate relative to reference base (including blanks)" << std::endl; ostrm << "(Given a reference base, what is it sequenced as?)" << std::endl << std::endl; ostrm << my_pc_reads_by_ref << std::endl; ostrm << "Error rate relative to reference base (excluding blanks)" << std::endl; ostrm << "(Given a reference base, what is it sequenced as?)" << std::endl << std::endl; ostrm << my_pc_reads_excl_blanks_by_ref << std::endl; ostrm << "Error rate relative to sequenced base (excluding ref blanks)" << std::endl; ostrm << "(Given a sequenced base, what was it really?)" << std::endl << std::endl; ostrm << my_pc_refs_by_read << std::endl; ostrm << "Breakdown of errors by nucleotide" << std::endl; ostrm << my_sum_over_cycles_base_calls << std::endl << std::endl; if (!is_all_reads()) { ostrm << "Full breakdown of errors by cycle and nucleotide" << std::endl; ostrm << my_base_calls << std::endl; ostrm << "Error rate relative to reference by cycle/nucleotide" << std::endl; ostrm << my_prop_reads_by_ref_cycle << std::endl; ostrm << "Information Content By Cycle" << std::endl; ostrm << my_info_contents << std::endl; ostrm << "The " << my_most_common_words.num_will_list() << " most common words with " << MAX_NUM_BLANKS_IN_COMMON_WORDS << " blanks or less:" << std::endl; ostrm << my_most_common_words << std::endl; ostrm << "The " << my_most_common_blank_patterns.num_will_list() << " most common blank patterns (N=any nonblank character)" << std::endl; ostrm << my_most_common_blank_patterns << std::endl; ostrm << "Log likelihood scores" << std::endl; ostrm << my_likelihoods << std::endl; ostrm << "Cumulative errors by cycle" << std::endl; ostrm << my_cum_errs << std::endl; } line_vec.push_back(ostrm.str()); if (!file_buffer.write_some_lines(line_vec)) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Failed to write statistics set to " + file_buffer.file_path_str())); } } /*****************************************************************************/