/*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #include #include #include #include #include "String_Types.h" #include "common/StringUtil.h" #include "statistics/Export_File_Data.h" /*****************************************************************************/ // N.B. Unlike StringUtil tokenize_str, this returns empty fields. unsigned int split_on_tab(const std::string str, Str_Vec& str_vec) { const char separ_ch('\t'); std::string::size_type last_pos(0); std::string::size_type pos(0); for (std::string::const_iterator ch_citer(str.begin()); ch_citer != str.end(); ++ch_citer, ++pos) { if (*ch_citer == separ_ch) { str_vec.push_back(str.substr(last_pos, (pos - last_pos))); last_pos = pos + 1; } } if (pos >= last_pos) { str_vec.push_back(str.substr(last_pos, (pos - last_pos))); } return str_vec.size(); } /*****************************************************************************/ // Updates reference string and possibly read string according to the supplied // match descriptor (mismatch_str). // Start with ref as copy of read. // The match descriptor may contain the following :- // 1) a run length of matching bases (e.g. 23) - no action // 2) a base change (e.g. A) - replace copied base in ref // 3) an insertion (^3$) - place X in the reference at each read insert pos // 4) a deletion (^ACG$) - insert each char into the ref // place a corresponding run of Xs in the read // Assumes exactly one indel per ^...$ bracketing. bool apply_mismatch(Read_Align_Info& read_align_info, const std::string& mismatch_str) { unsigned int read_len_left(read_align_info.my_read_str.length()); read_align_info.my_ref_str = read_align_info.my_read_str; std::string::iterator read_ch_iter(read_align_info.my_read_str.begin()); std::string::iterator ref_ch_iter(read_align_info.my_ref_str.begin()); std::string::const_iterator mismatch_ch_citer(mismatch_str.begin()); const char indel_start_ch('^'); const char indel_end_ch('$'); const char del_ch(Read_Align_Info::our_del_ch); bool indel_found(false); bool in_indel(false); bool in_deletion(false); bool in_insertion(false); unsigned int count(0); // shared by match run and deletion bool need_to_check_for_match_run_end(false); for ( ; mismatch_ch_citer != mismatch_str.end(); ++mismatch_ch_citer) { const char mismatch_ch(*mismatch_ch_citer); if (mismatch_ch == indel_start_ch) { if (in_indel) { // Already in indel and trying to start another std::cerr << "ERROR: Found indel start char " << indel_start_ch << " before close (with " << indel_end_ch << ") of previous indel in alignment descriptor " << mismatch_str << std::endl; return false; } in_indel = true; if (!indel_found) { // First indel, so cache the raw read. read_align_info.my_raw_read_str = read_align_info.my_read_str; indel_found = true; } // Deferred continue happens below when this is checked. need_to_check_for_match_run_end = true; } else if (mismatch_ch == indel_end_ch) { if (!in_indel) { // End of indel but was not in indel std::cerr << "ERROR: Found indel end char " << indel_end_ch << " while not in indel in alignment descriptor " << mismatch_str << std::endl; return false; } if (in_insertion) { if (read_len_left < count) { // Reported insertion is longer than remainder of read! std::cerr << "ERROR: Reported insertion of length " << count << " is longer than remainder of read in alignment descriptor " << mismatch_str << std::endl; return false; } // Replace copied chars in ref with del chars, // move iterators past the insertion and update len left. read_align_info.my_ref_str.replace(ref_ch_iter, ref_ch_iter + count, count, del_ch); ref_ch_iter += count; read_ch_iter += count; read_len_left -= count; count = 0; // Update the appropriate count since here the type is known. ++read_align_info.my_num_insertions; } else if (in_deletion) { // Update the appropriate count since here the type is known. ++read_align_info.my_num_deletions; } else { // Indel end char immediately after indel start char - no desc! std::cerr << "ERROR: No indel description between indel start char (" << indel_start_ch << ") and end char (" << indel_end_ch << ") in alignment descriptor " << mismatch_str << std::endl; return false; } in_indel = in_deletion = in_insertion = false; continue; // no further action for indel close } else if (isdigit(mismatch_ch)) { if (in_indel) { if (in_deletion) { // May not mix insertion digits into deletion strings. std::cerr << "ERROR: Found digit " << mismatch_ch << " in deletion string in alignment descriptor " << mismatch_str << std::endl; return false; } in_insertion = true; } (count *= 10) += (mismatch_ch - '0'); continue; // no further action for digit } else { // base character // A mismatch base (but not a deletion base) can close a // match run. need_to_check_for_match_run_end = !in_indel; } if (need_to_check_for_match_run_end) { // Close a match run if necessary. if (count > 0) { if (read_len_left < count) { // Reported match run is longer than remainder of read! std::cerr << "ERROR: Reported match run (" << count << " bases) is longer than remainder of read (" << read_len_left << " bases) in alignment descriptor " << mismatch_str << std::endl; return false; } ref_ch_iter += count; read_ch_iter += count; read_len_left -= count; count = 0; } need_to_check_for_match_run_end = false; // Two ways of getting here. // If due to indel start, no further action (deferred continue). if (in_indel) { continue; } } // Handle base character : insertion or mismatch if (in_indel) { if (in_insertion) { // May not mix deletion chars into insertion numbers. std::cerr << "ERROR: Found non-digit " << mismatch_ch << " in insertion length in alignment descriptor " << mismatch_str << std::endl; return false; } in_deletion = true; // Deletion -> insert actual base into ref and del char into read // (These do not count towards read_len_left.) ref_ch_iter = read_align_info.my_ref_str.insert(ref_ch_iter, mismatch_ch); read_ch_iter = read_align_info.my_read_str.insert(read_ch_iter, del_ch); // Restore iterator positions in updated string. ++ref_ch_iter; ++read_ch_iter; } else { // Reported base mismatch but have run out of read! if (!read_len_left) { std::cerr << "ERROR: Reported base mismatch (" << mismatch_ch << ") but have run out of read in alignment descriptor " << mismatch_str << std::endl; return false; } // Base mismatch -> change base (copied from read) in ref *ref_ch_iter++ = mismatch_ch; ++read_ch_iter; --read_len_left; } } if (count != read_len_left) { std::cerr << "ERROR: Reported final match run (" << count << " bases) differs in length from remainder of read (" << read_len_left << " bases) in alignment descriptor " << mismatch_str << std::endl; return false; } return true; } /*****************************************************************************/ Export_File_Data::Export_File_Data(const std::string& data_file_name, const File_Buffer::Compression_Type compression) : Align_Data(data_file_name, compression) { ; } /*****************************************************************************/ Export_File_Data::~Export_File_Data() { ; } /*****************************************************************************/ bool Export_File_Data::get_next_read_align_info(Read_Align_Info& read_align_info, bool& at_eof) { std::string export_file_line_str; bool data_line_found(false); while (!data_line_found) { if (!my_file_buffer.read_line(export_file_line_str, at_eof)) { return false; } if (export_file_line_str.empty() || (export_file_line_str[0] == '#')) { continue; } data_line_found = true; Str_Vec export_field_str_vec; if (split_on_tab(export_file_line_str, export_field_str_vec) != NumberOfEntries) { std::cerr << "ERROR: Found unexpected number of fields (" << export_field_str_vec.size() << ") in line " << export_file_line_str << std::endl; return false; } read_align_info.my_lane_num = atoi(export_field_str_vec[LANE_NUM_FIELD_IND].c_str()); read_align_info.my_tile_num = atoi(export_field_str_vec[TILE_NUM_FIELD_IND].c_str()); // Single-read export files have an empty read number field. const std::string read_num_str(export_field_str_vec[READ_NUM_FIELD_IND]); read_align_info.my_read_num = (read_num_str.empty() ? 1 : atoi(read_num_str.c_str())); read_align_info.my_passed_filter_flag = (export_field_str_vec[PASSED_FILTER_FIELD_IND] == "Y"); read_align_info.my_read_str = export_field_str_vec[READ_SEQ_FIELD_IND]; read_align_info.my_qual_str = export_field_str_vec[QUAL_STR_FIELD_IND]; read_align_info.my_num_insertions = 0; read_align_info.my_num_deletions = 0; // Aligned? if ((read_align_info.my_aligned_flag = !export_field_str_vec[ALIGN_POS_FIELD_IND].empty())) { if (!apply_mismatch(read_align_info, export_field_str_vec[MISMATCH_FIELD_IND])) { std::cerr << "ERROR: Alignment descriptor '" << export_field_str_vec[MISMATCH_FIELD_IND] << "' is incompatible with read " << read_align_info.my_read_str << std::endl; return false; } read_align_info.my_align_score = atoi(export_field_str_vec[ALIGN_SCORE_FIELD_IND].c_str()); } else { read_align_info.my_raw_read_str = read_align_info.my_read_str; } } return true; } /*****************************************************************************/ char Export_File_Data::blank_base() { return 'N'; } /*****************************************************************************/