/*****************************************************************************/ // Copyright (c) Illumina 2008 // Author: Richard Shaw // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). /*****************************************************************************/ #include #include #include "statistics/Non_Blank_Filter.h" // for the_blank_ch #include "String_Types.h" #include "common/StringUtil.h" #include "statistics/Align_File_Data.h" /*****************************************************************************/ inline char complement(char orig_ch) { switch (orig_ch) { case 'A': return 'T'; case 'C': return 'G'; case 'G': return 'C'; case 'T': return 'A'; } return the_blank_ch; } /*****************************************************************************/ void reverse_complement(std::string& str) { const unsigned int str_len(str.length()); unsigned int num_swaps(str_len / 2); std::string::iterator ch_iter(str.begin()); std::string::reverse_iterator ch_riter(str.rbegin()); for ( ; num_swaps > 0; --num_swaps, ++ch_iter, ++ch_riter) { const char tmp_ch(*ch_iter); *ch_iter = complement(*ch_riter); *ch_riter = complement(tmp_ch); } if ((str_len % 2) != 0) { *ch_iter = complement(*ch_iter); } } /*****************************************************************************/ char norm_blank(const char orig_ch) { switch (orig_ch) { case 'A': case 'C': case 'G': case 'T': return orig_ch; } return the_blank_ch; } /*****************************************************************************/ Align_File_Data::Align_File_Data(const std::string& data_file_name, const File_Buffer::Compression_Type compression) : Align_Data(data_file_name, compression) { ; } /*****************************************************************************/ Align_File_Data::~Align_File_Data() { ; } /*****************************************************************************/ bool Align_File_Data::get_next_read_align_info(Read_Align_Info& read_align_info, bool& at_eof) { std::string align_file_line_str; bool data_line_found(false); while (!data_line_found) { if (!my_file_buffer.read_line(align_file_line_str, at_eof)) { return false; } if (align_file_line_str.empty() || (align_file_line_str[0] == '#')) { continue; } data_line_found = true; Str_Vec align_field_str_vec = tokenize_str(align_file_line_str); read_align_info.my_lane_num = 1; // arbitrary non-zero lane number read_align_info.my_tile_num = 1; // arbitrary non-zero tile number // Already the defaults. // read_align_info.my_read_num = 1; // read_align_info.my_passed_filter_flag = true; read_align_info.my_read_str = align_field_str_vec[READ_SEQ_FIELD_IND]; std::transform(read_align_info.my_read_str.begin(), read_align_info.my_read_str.end(), read_align_info.my_read_str.begin(), norm_blank); // Quality values are not available. read_align_info.my_qual_str = ""; // Indel representation is not supported but already initialised to 0. // read_align_info.my_num_insertions = 0; // read_align_info.my_num_deletions = 0; switch (align_field_str_vec.size()) { case ALIGNED_NUM_FIELDS: read_align_info.my_ref_str = align_field_str_vec[REF_SEQ_FIELD_IND]; if (read_align_info.my_ref_str.length() != read_align_info.my_read_str.length()) { std::cerr << "ERROR: Ref sequence length " << read_align_info.my_ref_str.length() << " does not match read sequence length " << read_align_info.my_read_str.length() << std::endl; throw std::range_error(""); } if (align_field_str_vec[STRAND_DIR_FIELD_IND] == "R") { reverse_complement(read_align_info.my_ref_str); } else { std::transform(read_align_info.my_ref_str.begin(), read_align_info.my_ref_str.end(), read_align_info.my_ref_str.begin(), norm_blank); } read_align_info.my_aligned_flag = true; read_align_info.my_align_score = atoi(align_field_str_vec[ALIGN_SCORE_FIELD_IND].c_str()); break; case NOT_ALIGNED_NUM_FIELDS: read_align_info.my_aligned_flag = false; read_align_info.my_align_score = 0; break; default: std::cerr << "ERROR: Found unexpected number of fields (" << align_field_str_vec.size() << ") in line " << align_file_line_str << std::endl; return false; } } return true; } /*****************************************************************************/ char Align_File_Data::blank_base() { return '.'; } /*****************************************************************************/