/*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #include #include #include "statistics/Read_Patterns.h" /*****************************************************************************/ Read_Patterns::Read_Patterns(size_t max_patterns_to_store) : my_max_patterns_to_store(max_patterns_to_store) , my_storage_limit_reached(false) { } /*****************************************************************************/ Read_Patterns::~Read_Patterns() { } /*****************************************************************************/ void Read_Patterns::record_pattern(const std::string& pattern_str) { Pattern_Freq_Map_Iter pattern_freq_iter(my_pattern_freq_map.find(pattern_str)); if (pattern_freq_iter != my_pattern_freq_map.end()) { ++(pattern_freq_iter->second); } else { // New pattern so need to check if storage limit reached. if (my_pattern_freq_map.size() == my_max_patterns_to_store) { if (!my_storage_limit_reached) { my_storage_limit_reached = true; std::cerr << "Hit limit (" << my_max_patterns_to_store << ") on number of patterns to store." << std::endl; } } else { ++my_pattern_freq_map[pattern_str]; } } } /*****************************************************************************/ bool Read_Patterns::find_most_frequent() { assert(MAX_NUM_RANKS_TO_LIST > 0); const unsigned int max_num_freqs(MAX_NUM_RANKS_TO_LIST); unsigned int min_freq_to_store(0); unsigned int min_freq_stored(0); unsigned int num_unique_freqs_stored(0); for (Pattern_Freq_Map_CIter pattern_freq_citer(my_pattern_freq_map.begin()); pattern_freq_citer != my_pattern_freq_map.end(); ++pattern_freq_citer) { const std::string pattern_str(pattern_freq_citer->first); const unsigned int freq(pattern_freq_citer->second); if (freq >= min_freq_to_store) { if (my_freq_pattern_info_map[freq]. add_pattern(pattern_str).num_patterns() == 1) { if (freq < min_freq_stored) { min_freq_stored = freq; } // First pattern encountered that has frequency freq, // so freq is a new frequency and thus we need to check // whether adding it causes the number of frequencies // stored to exceed the specified limit. ++num_unique_freqs_stored; if (num_unique_freqs_stored > max_num_freqs) { // Since max_num_freqs is assumed to be >= 1, // min_freq_stored should have been set to a valid value // below. assert(min_freq_stored > 0); // FIXME : map ordered by inc freq // -> remove first and use second as new min? Freq_Pattern_Info_Map_Iter freq_pattern_info_iter = my_freq_pattern_info_map.find(min_freq_stored); my_freq_pattern_info_map.erase(freq_pattern_info_iter); --num_unique_freqs_stored; // Update min_freq_to_store by finding new min_freq_stored. min_freq_stored = my_freq_pattern_info_map.begin()->first; for (Freq_Pattern_Info_Map_CIter freq_info_citer(my_freq_pattern_info_map.begin()); freq_info_citer != my_freq_pattern_info_map.end(); ++freq_info_citer) { if (freq_info_citer->first < min_freq_stored) { min_freq_stored = freq_info_citer->first; } } min_freq_to_store = min_freq_stored; } else if (num_unique_freqs_stored == 1) { // First pattern of first frequency, so set initial // real min_freq_stored value. // min_freq_to_store should NOT be set to this value as // lower frequencies may be added to make up the set of // max_num_freqs frequencies. min_freq_stored = freq; } } } } return true; } /*****************************************************************************/ unsigned int Read_Patterns::num_will_list() const { return my_freq_pattern_info_map.size(); } /*****************************************************************************/ std::ostream& operator<<(std::ostream& ostrm, const Read_Patterns& read_patterns) { ostrm << "Ranking\tOccurrences\tWords\n"; if (read_patterns.my_storage_limit_reached) { ostrm << "[Limited to first " << read_patterns.my_max_patterns_to_store << " patterns seen]" << std::endl; } const Read_Patterns::Freq_Pattern_Info_Map& freq_pattern_info_map(read_patterns.my_freq_pattern_info_map); unsigned int rank(1); for (Read_Patterns::Freq_Pattern_Info_Map_CRIter freq_pattern_info_criter(freq_pattern_info_map.rbegin()); freq_pattern_info_criter != freq_pattern_info_map.rend(); ++freq_pattern_info_criter, ++rank) { ostrm << rank << "\t" << freq_pattern_info_criter->first <<"\t" << freq_pattern_info_criter->second << std::endl; } return ostrm; } /*****************************************************************************/