// -*- c++ -*- /*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #ifndef BASE_CALLS_H #define BASE_CALLS_H #include #include #include #include #include #include #include #include #include #include "statistics/Add_Paired_Ele.h" /*****************************************************************************/ class Base_Calls_Format { public: Base_Calls_Format(const std::string& col_label_suffix = "ct", char row_prefix_ch = ' ', bool transpose = false, bool display_cycles = true, bool suppress_ref_blanks = true, bool suppress_read_blanks = false, unsigned int num_dec_places = 0) : my_col_label_suffix(col_label_suffix), my_row_prefix_ch(row_prefix_ch), my_transpose_flag(transpose), my_display_cycles_flag(display_cycles), my_suppress_ref_blanks_flag(suppress_ref_blanks), my_suppress_read_blanks_flag(suppress_read_blanks), my_num_dec_places(num_dec_places) { } ~Base_Calls_Format() { } std::string my_col_label_suffix; char my_row_prefix_ch; bool my_transpose_flag; bool my_display_cycles_flag; bool my_suppress_ref_blanks_flag; bool my_suppress_read_blanks_flag; unsigned int my_num_dec_places; }; /*****************************************************************************/ template class Base_Calls_Iter; template class Base_Calls_CIter; template class Base_Calls_Range; template class Base_Calls_CRange; /*****************************************************************************/ template class Base_Calls { friend class Base_Calls_Iter; friend class Base_Calls_CIter; friend class Base_Calls_Range; friend class Base_Calls_CRange; template friend std::ostream& operator<<(std::ostream& ostrm, const Base_Calls&); public: typedef enum { DIM_READ, DIM_REF, DIM_CYCLE } Dimension; typedef enum { READ_ANY, READ_BLANK, READ_ERR } Read_Type; typedef Base_Calls_Iter iterator; typedef Base_Calls_CIter const_iterator; typedef Base_Calls_Range Range; typedef Base_Calls_CRange Const_Range; typedef std::list< Base_Calls > List; // typedef std::list< Base_Calls >::const_iterator List_CIter; Base_Calls(); Base_Calls(const unsigned int num_cycles); ~Base_Calls(); /// This is intended only to set the initial size. It zeros contents. void resize(const unsigned int num_cycles); T& operator()(char read_base, char ref_base, unsigned int cycle_num); const T& operator()(char read_base, char ref_base, unsigned int cycle_num) const; unsigned int size(const Dimension dimension, bool exclude_read_blanks = false, bool exclude_ref_blanks = false) const; unsigned int read_count(const Read_Type read_type) const; const List& get_cycle_base_calls_list(); void sum_cycles(Base_Calls& cum_base_calls) const; iterator begin() { return iterator(*this, 0, 1); } iterator end() { return iterator(*this, my_freqs.size(), 1); } const_iterator begin() const { return const_iterator(*this, 0, 1); } const_iterator end() const { return const_iterator(*this, my_freqs.size(), 1); } Base_Calls& operator+=(const Base_Calls& base_calls); void replace_cycles(unsigned int lo_cycle_num, unsigned int hi_cycle_num, const Base_Calls& base_calls); Range get_range(const Dimension dimension, bool exclude_read_blanks = false, bool exclude_ref_blanks = false); Const_Range get_crange(const Dimension dimension, bool exclude_read_blanks = false, bool exclude_ref_blanks = false) const; void set_format(const Base_Calls_Format& format) { my_format = format; } inline const std::string read_bases_str() const { return "NACGT"; } inline const std::string ref_bases_str() const { return "NACGT"; } Base_Calls_Format my_format; private: void init(); T& operator()(unsigned int ele_ind) { return my_freqs[ele_ind]; } const T& operator()(unsigned int ele_ind) const { return my_freqs[ele_ind]; } size_t ele_ind(char read_base, char ref_base, unsigned int cycle_num) const; void get_range_params(const Dimension dimension, bool exclude_read_blanks, bool exclude_ref_blanks, unsigned int& start_ind, unsigned int& start_stride, unsigned int& num_ranges, unsigned int& num_per_range, unsigned int& stride, unsigned int& range_ind_jump_mod, unsigned int& start_jump) const; inline unsigned int num_read_bases() const { return read_bases_str().size(); } inline unsigned int num_ref_bases() const { return ref_bases_str().size(); } unsigned int my_num_cycles; std::valarray my_read_base_vals; std::valarray my_ref_base_vals; std::valarray my_freqs; List my_cycle_base_calls_list; }; /*****************************************************************************/ template std::ostream& operator<<(std::ostream& ostrm, const Base_Calls& base_calls); /*****************************************************************************/ template Base_Calls::Base_Calls() : my_num_cycles(0) { init(); } /*****************************************************************************/ template Base_Calls::Base_Calls(const unsigned int num_cycles) : my_num_cycles(num_cycles) { init(); my_freqs.resize(num_read_bases() * num_ref_bases() * num_cycles, 0); } /*****************************************************************************/ template Base_Calls::~Base_Calls() { ; } /*****************************************************************************/ template void Base_Calls::resize(const unsigned int num_cycles) { my_num_cycles = num_cycles; my_freqs.resize(num_read_bases() * num_ref_bases() * num_cycles, 0); } /*****************************************************************************/ template T& Base_Calls::operator()(char read_base, char ref_base, unsigned int cycle_num) { return my_freqs[ele_ind(read_base, ref_base, cycle_num)]; } /*****************************************************************************/ template const T& Base_Calls::operator()(char read_base, char ref_base, unsigned int cycle_num) const { return my_freqs[ele_ind(read_base, ref_base, cycle_num)]; } /*****************************************************************************/ template unsigned int Base_Calls::size(const Dimension dimension, bool exclude_read_blanks, bool exclude_ref_blanks) const { unsigned int dim_size(0); switch (dimension) { case DIM_READ: dim_size = num_read_bases() - (exclude_read_blanks ? 1 : 0); break; case DIM_REF: dim_size = num_ref_bases() - (exclude_ref_blanks ? 1 : 0); break; case DIM_CYCLE: dim_size = my_num_cycles; break; } return dim_size; } /*****************************************************************************/ template unsigned int Base_Calls::read_count(const Read_Type read_type) const { unsigned int num_reads(0); switch (read_type) { case READ_ANY: num_reads = my_freqs.sum(); break; case READ_BLANK: { // Assumes Ns all at start. const size_t num_read_blanks(num_ref_bases() * my_num_cycles); num_reads = my_freqs[std::slice(0, num_read_blanks, 1)].sum(); } break; case READ_ERR: { const size_t num_read_blanks(num_ref_bases() * my_num_cycles); const size_t num_ref_blanks_per_read_ch(my_num_cycles); std::valarray err_mask(true, my_freqs.size()); // Read Ns do not count as errors (assumes read Ns all at start). err_mask[std::slice(0, num_read_blanks, 1)] = false; // Mask out blank refs and correct pairs. // (assumes read Ns all at start, ref Ns at start of each read ch) size_t blank_ref_start_pos(num_read_blanks); const size_t blank_ref_size(num_read_bases() - 1); // not N const size_t blank_ref_stride(num_ref_bases() * my_num_cycles); size_t ok_start_pos(num_read_blanks + num_ref_blanks_per_read_ch); const size_t ok_size(num_read_bases() - 1); // not N const size_t ok_stride((num_ref_bases() + 1) * my_num_cycles); for (unsigned int cycle_ind(0); cycle_ind < my_num_cycles; ++cycle_ind, ++blank_ref_start_pos, ++ok_start_pos) { err_mask[std::slice(blank_ref_start_pos, blank_ref_size, blank_ref_stride)] = false; err_mask[std::slice(ok_start_pos, ok_size, ok_stride)] = false; } // Sum masked frequencies. num_reads = my_freqs[err_mask].sum(); } break; } return num_reads; } /*****************************************************************************/ // template const Base_Calls::List & template const std::list< Base_Calls > & Base_Calls::get_cycle_base_calls_list() { if (my_cycle_base_calls_list.empty()) { for (unsigned int cycle_num(1); cycle_num <= my_num_cycles; ++cycle_num) { Base_Calls cycle_base_calls(1); cycle_base_calls.my_freqs = my_freqs[std::slice(cycle_num - 1, (num_read_bases() * num_ref_bases()), my_num_cycles)]; my_cycle_base_calls_list.push_back(cycle_base_calls); } } return my_cycle_base_calls_list; } /*****************************************************************************/ template void Base_Calls::sum_cycles(Base_Calls& cum_base_calls) const { cum_base_calls.resize(1); std::slice all_cycles_slice; const std::string tmp_read_bases_str(read_bases_str()); const std::string tmp_ref_bases_str(ref_bases_str()); for (std::string::const_iterator read_ch_citer(tmp_read_bases_str.begin()); read_ch_citer != tmp_read_bases_str.end(); ++read_ch_citer) { for (std::string::const_iterator ref_ch_citer(tmp_ref_bases_str.begin()); ref_ch_citer != tmp_ref_bases_str.end(); ++ref_ch_citer) { all_cycles_slice = std::slice(ele_ind(*read_ch_citer, *ref_ch_citer, 1), my_num_cycles, 1); cum_base_calls(*read_ch_citer, *ref_ch_citer, 1) = my_freqs[all_cycles_slice].sum(); } } } /*****************************************************************************/ template Base_Calls& Base_Calls::operator+=(const Base_Calls& base_calls) { unsigned int new_num_cycles(base_calls.size(Base_Calls::DIM_CYCLE)); unsigned int num_cycles(size(Base_Calls::DIM_CYCLE)); assert(new_num_cycles > 0); // Allow initialisation but not overwriting. if (num_cycles == 0) { resize((num_cycles = new_num_cycles)); } assert(new_num_cycles == num_cycles); Add_Paired_Ele > add_paired_ele(base_calls.begin()); std::transform(begin(), end(), begin(), add_paired_ele); return *this; } /*****************************************************************************/ template void Base_Calls::replace_cycles(unsigned int lo_cycle_num, unsigned int hi_cycle_num, const Base_Calls& base_calls) { //const size_t new_num_cycles(base_calls.size(Base_Calls::DIM_CYCLE)); //const size_t num_cycles(size(Base_Calls::DIM_CYCLE)); //assert(lo_cycle_num <= hi_cycle_num); //assert((hi_cycle_num - lo_cycle_num + 1) == new_num_cycles); //assert(hi_cycle_num <= num_cycles); unsigned int src_cycle_num(1); const std::string tmp_read_bases_str(read_bases_str()); const std::string tmp_ref_bases_str(ref_bases_str()); for (unsigned int dest_cycle_num(lo_cycle_num); dest_cycle_num <= hi_cycle_num; ++dest_cycle_num, ++src_cycle_num) { for (std::string::const_iterator read_ch_citer(tmp_read_bases_str.begin()); read_ch_citer != tmp_read_bases_str.end(); ++read_ch_citer) { for (std::string::const_iterator ref_ch_citer(tmp_ref_bases_str.begin()); ref_ch_citer != tmp_ref_bases_str.end(); ++ref_ch_citer) { const char read_ch(*read_ch_citer); const char ref_ch(*ref_ch_citer); my_freqs[ele_ind(read_ch, ref_ch, dest_cycle_num)] = base_calls(read_ch, ref_ch, src_cycle_num); } } } } /*****************************************************************************/ // start_stride : The stride to reach the next starting position. // stride : The stride within each iteration. template void Base_Calls::get_range_params(const Dimension dimension, bool exclude_read_blanks, bool exclude_ref_blanks, unsigned int& start_ind, unsigned int& start_stride, unsigned int& num_ranges, unsigned int& num_per_range, unsigned int& stride, unsigned int& range_ind_jump_mod, unsigned int& start_jump) const { // The exclude_read_blanks handling assumes read Ns are all at the start // and ref Ns are at the start of each read char. assert(read_bases_str()[0] == 'N'); assert(ref_bases_str()[0] == 'N'); start_ind = (exclude_read_blanks ? (num_ref_bases() * my_num_cycles) : 0); unsigned int num_eles_used = (my_freqs.size() - start_ind); if (exclude_ref_blanks) { start_ind += my_num_cycles; num_eles_used -= (my_num_cycles * (num_read_bases() - (exclude_read_blanks ? 1 : 0))); } const unsigned int dim_size(size(dimension, exclude_read_blanks, exclude_ref_blanks)); num_ranges = num_eles_used / dim_size; num_per_range = dim_size; // Note that start_jump is in addition to start_stride when applied. range_ind_jump_mod = 0; start_jump = 0; switch (dimension) { case DIM_READ: start_stride = 1; stride = (num_ref_bases() * my_num_cycles); break; case DIM_REF: start_stride = 1; stride = my_num_cycles; range_ind_jump_mod = my_num_cycles; start_jump = (num_ref_bases() - 1) * my_num_cycles; break; case DIM_CYCLE: start_stride = my_num_cycles; stride = 1; range_ind_jump_mod = (exclude_ref_blanks ? (num_ref_bases() - 1) : 0); start_jump = my_num_cycles; break; } } /*****************************************************************************/ template Base_Calls_Range Base_Calls::get_range(const Dimension dimension, bool exclude_read_blanks, bool exclude_ref_blanks) { unsigned int start_ind(0); unsigned int start_stride(0); unsigned int num_ranges(0); unsigned int num_per_range(0); unsigned int stride(0); unsigned int range_ind_jump_mod(0); unsigned int start_jump(0); get_range_params(dimension, exclude_read_blanks, exclude_ref_blanks, start_ind, start_stride, num_ranges, num_per_range, stride, range_ind_jump_mod, start_jump); return Base_Calls_Range(*this, start_ind, start_stride, num_ranges, num_per_range, stride, range_ind_jump_mod, start_jump); } /*****************************************************************************/ template Base_Calls_CRange Base_Calls::get_crange(const Dimension dimension, bool exclude_read_blanks, bool exclude_ref_blanks) const { unsigned int start_ind(0); unsigned int start_stride(0); unsigned int num_ranges(0); unsigned int num_per_range(0); unsigned int stride(0); unsigned int range_ind_jump_mod(0); unsigned int start_jump(0); get_range_params(dimension, exclude_read_blanks, exclude_ref_blanks, start_ind, start_stride, num_ranges, num_per_range, stride, range_ind_jump_mod, start_jump); return Base_Calls_CRange(*this, start_ind, start_stride, num_ranges, num_per_range, stride, range_ind_jump_mod, start_jump); } /*****************************************************************************/ void init_table(const std::string& str, std::valarray& table); /*****************************************************************************/ template void Base_Calls::init() { init_table(read_bases_str(), my_read_base_vals); init_table(ref_bases_str(), my_ref_base_vals); } /*****************************************************************************/ template size_t Base_Calls::ele_ind(char read_base, char ref_base, unsigned int cycle_num) const { return ((((my_read_base_vals[read_base] * num_ref_bases()) + my_ref_base_vals[ref_base]) * my_num_cycles) + cycle_num - 1); } /*****************************************************************************/ template class Base_Calls_Iter { public: typedef T value_type; typedef std::forward_iterator_tag iterator_category; typedef std::ptrdiff_t difference_type; typedef T* pointer; typedef T& reference; Base_Calls_Iter(Base_Calls& base_calls, unsigned int ele_ind, unsigned int stride) : my_base_calls(base_calls), my_ele_ind(ele_ind), my_stride(stride) { } ~Base_Calls_Iter() { } Base_Calls_Iter operator++() { my_ele_ind += my_stride; return *this; } bool operator!=(Base_Calls_Iter iter) { return (iter.my_ele_ind != my_ele_ind); } T& operator*() { return my_base_calls.operator()(my_ele_ind); } // For debug use unsigned int ele_ind() { return my_ele_ind; } private: Base_Calls& my_base_calls; unsigned int my_ele_ind; unsigned int my_stride; }; /*****************************************************************************/ template class Base_Calls_CIter { public: typedef T value_type; typedef std::forward_iterator_tag iterator_category; typedef std::ptrdiff_t difference_type; typedef T* pointer; typedef T& reference; Base_Calls_CIter(const Base_Calls& base_calls, unsigned int ele_ind, unsigned int stride) : my_base_calls(base_calls), my_ele_ind(ele_ind), my_stride(stride) { } ~Base_Calls_CIter() { } Base_Calls_CIter operator++() { my_ele_ind += my_stride; return *this; } bool operator!=(Base_Calls_CIter citer) { return (citer.my_ele_ind != my_ele_ind); } const T& operator*() const { return my_base_calls.operator()(my_ele_ind); } // For debug use unsigned int ele_ind() { return my_ele_ind; } private: const Base_Calls& my_base_calls; unsigned int my_ele_ind; unsigned int my_stride; }; /*****************************************************************************/ template class Base_Calls_Range { public: Base_Calls_Range(Base_Calls& base_calls, unsigned int start_ind, unsigned int start_stride, unsigned int num_ranges, unsigned int num_per_range, unsigned int stride, unsigned int range_ind_jump_mod=0, unsigned int start_jump=0) : my_base_calls(base_calls), my_start_ind(start_ind), my_curr_start_ind(start_ind), my_start_stride(start_stride), my_stride(stride), my_num_ranges(num_ranges), my_range_ind(0), my_num_per_range(num_per_range), my_at_valid_range_flag(my_num_ranges != 0), my_range_ind_jump_mod(range_ind_jump_mod), my_start_jump(start_jump) { } ~Base_Calls_Range() { } Base_Calls_Iter begin() { return Base_Calls_Iter(my_base_calls, my_curr_start_ind, my_stride); } Base_Calls_Iter end() { return Base_Calls_Iter(my_base_calls, my_curr_start_ind + (my_num_per_range * my_stride), my_stride); } bool operator()() { return my_at_valid_range_flag; } Base_Calls_Range& operator++() { if (my_range_ind < (my_num_ranges - 1)) { ++my_range_ind; my_curr_start_ind += my_start_stride; if (my_range_ind_jump_mod != 0) { if ((my_range_ind % my_range_ind_jump_mod) == 0) { my_curr_start_ind += my_start_jump; } } } else { my_at_valid_range_flag = false; } return *this; } void rewind() { my_curr_start_ind = my_start_ind; my_at_valid_range_flag = (my_num_ranges != 0); } size_t size() { return my_num_ranges; } private: Base_Calls& my_base_calls; unsigned int my_start_ind; unsigned int my_curr_start_ind; unsigned int my_start_stride; unsigned int my_stride; unsigned int my_num_ranges; unsigned int my_range_ind; unsigned int my_num_per_range; bool my_at_valid_range_flag; unsigned int my_range_ind_jump_mod; unsigned int my_start_jump; }; /*****************************************************************************/ template class Base_Calls_CRange { public: Base_Calls_CRange(const Base_Calls& base_calls, unsigned int start_ind, unsigned int start_stride, unsigned int num_ranges, unsigned int num_per_range, unsigned int stride, unsigned int range_ind_jump_mod=0, unsigned int start_jump=0) : my_base_calls(base_calls), my_start_ind(start_ind), my_curr_start_ind(start_ind), my_start_stride(start_stride), my_stride(stride), my_num_ranges(num_ranges), my_range_ind(0), my_num_per_range(num_per_range), my_at_valid_range_flag(my_num_ranges != 0), my_range_ind_jump_mod(range_ind_jump_mod), my_start_jump(start_jump) { } ~Base_Calls_CRange() { } Base_Calls_CIter begin() const { return Base_Calls_CIter(my_base_calls, my_curr_start_ind, my_stride); } Base_Calls_CIter end() const { return Base_Calls_CIter(my_base_calls, my_curr_start_ind + (my_num_per_range * my_stride), my_stride); } bool operator()() { return my_at_valid_range_flag; } Base_Calls_CRange& operator++() { if (my_range_ind < (my_num_ranges - 1)) { ++my_range_ind; my_curr_start_ind += my_start_stride; if (my_range_ind_jump_mod != 0) { if ((my_range_ind % my_range_ind_jump_mod) == 0) { my_curr_start_ind += my_start_jump; } } } else { my_at_valid_range_flag = false; } return *this; } void rewind() { my_curr_start_ind = my_start_ind; my_at_valid_range_flag = (my_num_ranges != 0); } size_t size() { return my_num_ranges; } private: const Base_Calls& my_base_calls; unsigned int my_start_ind; unsigned int my_curr_start_ind; unsigned int my_start_stride; unsigned int my_stride; unsigned int my_num_ranges; unsigned int my_range_ind; unsigned int my_num_per_range; bool my_at_valid_range_flag; unsigned int my_range_ind_jump_mod; unsigned int my_start_jump; }; /*****************************************************************************/ template std::ostream& operator<<(std::ostream& ostrm, const Base_Calls& base_calls) { const unsigned int num_cycles(base_calls.size(Base_Calls::DIM_CYCLE)); const std::string col_label_suffix(base_calls.my_format.my_col_label_suffix); const char row_prefix_ch(base_calls.my_format.my_row_prefix_ch); const bool transpose(base_calls.my_format.my_transpose_flag); const bool display_cycles(base_calls.my_format.my_display_cycles_flag); const bool suppress_ref_blanks(base_calls.my_format. my_suppress_ref_blanks_flag); const bool suppress_read_blanks(base_calls.my_format. my_suppress_read_blanks_flag); const unsigned int num_dec_places(base_calls.my_format.my_num_dec_places); // FIXME : uses knowledge that `N' is first base in ref and read bases const std::string& ref_bases_str(suppress_ref_blanks ? base_calls.ref_bases_str().substr(1) : base_calls.ref_bases_str()); const std::string& read_bases_str(suppress_read_blanks ? base_calls.read_bases_str().substr(1) : base_calls.read_bases_str()); const std::string& col_bases_str(transpose ? read_bases_str : ref_bases_str); const std::string& row_bases_str(transpose ? ref_bases_str : read_bases_str); if (display_cycles) { ostrm << "Cycle:\t"; } if (!transpose) { ostrm << "Read As:\tReally:" << std::endl; } else { ostrm << "Really:\tRead As:" << std::endl; } if (display_cycles) { ostrm << "\t"; } for (std::string::const_iterator col_ch_citer(col_bases_str.begin()); col_ch_citer != col_bases_str.end(); ++col_ch_citer) { ostrm << "\t" << *col_ch_citer << " " << col_label_suffix; } ostrm << std::endl; for (unsigned int cycle_num(1); cycle_num <= num_cycles; ++cycle_num) { for (std::string::const_iterator row_ch_citer(row_bases_str.begin()); row_ch_citer != row_bases_str.end(); ++row_ch_citer) { if (row_prefix_ch != ' ') { ostrm << row_prefix_ch; } if (display_cycles) { ostrm << cycle_num << "\t"; } ostrm << *row_ch_citer; for (std::string::const_iterator col_ch_citer(col_bases_str.begin()); col_ch_citer != col_bases_str.end(); ++col_ch_citer) { const char read_ch(transpose ? *col_ch_citer : *row_ch_citer); const char ref_ch(transpose ? *row_ch_citer : *col_ch_citer); ostrm << "\t"; if (num_dec_places != 0) { ostrm << std::fixed << std::setprecision(num_dec_places) << std::setw(3 + num_dec_places); } ostrm << base_calls(read_ch, ref_ch, cycle_num); } ostrm << std::endl; } } return ostrm; } /*****************************************************************************/ #endif // ! BASE_CALLS_H /*****************************************************************************/