// // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// \author Ivan Mikoulitch /// #include "SamMD/samReader.h" #include "stringSplitterJet.h" #include "SamMD/fastaReader.h" #include "SamMD/log.h" #include "SamMD/utils.h" ////////////////////////////////////////////////////////////////////// SamReader::SamReader(string fileName) { fileName_=fileName; total_reads_processed_=0; out_file_name_ = fileName_+".samMD"; out_file_name_not_found_ = fileName_+".not_found"; out_file_name_excluded_ = fileName_+".excluded"; } ////////////////////////////////////////////////////////////////////// SamReader::~SamReader(void) { Close(); } ////////////////////////////////////////////////////////////////////// void SamReader::Close() { fileName_.clear(); ref_map_.clear(); fs_.close(); ifs_.close(); ofs_.close(); ofs_not_found_.close(); ofs_excluded_.close(); return; } ////////////////////////////////////////////////////////////////////// bool SamReader::ReadSamFile() { Log::Info("Reading sam file: '" + fileName_ + "'"); num_reads=0; try { fs_.open(fileName_.c_str()); if(!fs_.is_open()) return false; string rname, data; //int pos; bool passedHeader=false; unsigned int line_number = 0; const int BUFFER_SIZE = 4096; char buffer[BUFFER_SIZE]; while(fs_.getline(buffer, BUFFER_SIZE)) { ++line_number; if(!passedHeader) { if (buffer[0]=='@') continue; // header line else passedHeader=true; } GetInMemoryData(buffer, rname, data); // automatically inserts rname if not present ref_map_[rname].insert(make_pair(line_number, data)); ++num_reads; if (num_reads % 100000 == 0) Log::Info(" # sam reads = " + to_string(num_reads) + "."); } } catch(...) { return false; } fs_.close(); Log::Info("Done. # sam reads = " + to_string(num_reads) + "."); Log::Info("Output will be written to : '" + out_file_name_ + "'"); return true; } ////////////////////////////////////////////////////////////////////// // We need this data: QNAME #1, RNAME #3, POS #4, CIGAR #6, SEQ #10 // The line is tab delimited // Returns POS unsigned int SamReader::GetInMemoryData(const char* pchar, string& rname, string& data) { // const int line_size = line.size(); // const char * pchar = line.c_str(); const char * pchar_start = pchar; int tab_index=0; // unsigned int pos=0; ostringstream ss; while(*pchar!=0) { if (*pchar=='\t') { switch(++tab_index) { /* case 1 : // QNAME // data.assign(pchar_start, pchar); // using line number ss << line_number << '\t'; break; */ case 3: // RNAME rname.assign(pchar_start, pchar); break; case 4: // POS // pos=atoi(string(pchar_start, pchar).c_str()); ss << string(pchar_start, pchar) << '\t'; break; case 6: // CIGAR ss << string(pchar_start, pchar) << '\t'; break; case 10: // SEQ ss << string(pchar_start, pchar) << '\t'; data.assign(ss.str()); return 0; } pchar_start=pchar+1; } ++pchar; } return 0; } ////////////////////////////////////////////////////////////////////// bool SamReader::ProcessReferenceData(FastaReader& fr) { string fasta_ref_header = fr.GetRefHeader(); int index = fasta_ref_header.find(" "); // ivan: usign space as separator fasta_ref_header=fasta_ref_header.substr(1, index-1); // 1 to remove '>' Log::Info(" >>> Creating match descriptors for reference: '" + fasta_ref_header + "'"); unsigned int reads_processed=0; map_type::iterator map_it = ref_map_.find(fasta_ref_header); if(map_it==ref_map_.end()) { Log::Info(" <<< Done. # reads processed = " + to_string(reads_processed) + "."); return false; } map& data = map_it->second; map::iterator data_it = data.begin(); while(data_it!=data.end()) { string match_descriptor = fr.GetMatchDescriptorInfo(data_it->second); data_it->second=match_descriptor; ++data_it; ++reads_processed; } Log::Info(" <<< Done. # reads processed = " + to_string(reads_processed) + "."); total_reads_processed_+=reads_processed; return true; } ////////////////////////////////////////////////////////////////////// bool SamReader::CreateModifiedSamFile() { unsigned counter=0; unsigned counter_all=0; unsigned counter_not_found=0; unsigned counter_excluded=0; try { Log::Info(">>> Generating modified sam file: " + out_file_name_); ifs_.open(fileName_.c_str()); if (!ifs_.is_open()) { Log::Error("Cannot open sam file: '" + fileName_ + "'"); return false; } ofs_.open(out_file_name_.c_str()); if(!ofs_.is_open()) { Log::Error("Cannot open output file: '" + out_file_name_ + "'"); return false; } ofs_not_found_.open(out_file_name_not_found_.c_str()); if(!ofs_not_found_.is_open()) { Log::Error("Cannot open output file: '" + out_file_name_not_found_ + "'"); return false; } ofs_excluded_.open(out_file_name_excluded_.c_str()); if(!ofs_excluded_.is_open()) { Log::Error("Cannot open output file: '" + out_file_name_excluded_ + "'"); return false; } string line, new_line; bool passedHeader=false; unsigned int line_number = 0; const int BUFFER_SIZE = 4096; char buffer[BUFFER_SIZE]; try { while(ifs_.getline(buffer, BUFFER_SIZE)) { ++line_number; if(!passedHeader) { if (buffer[0]=='@') { ofs_ << buffer << '\n'; continue; // header line } else passedHeader=true; } line.assign(buffer); new_line = ModifySamLine(line, buffer, line_number); ++counter_all; if(new_line.empty()) { ofs_not_found_ << line << '\n'; ++counter_not_found; } else if(new_line=="-") { ofs_excluded_ << line << '\n'; ++counter_excluded; } else { ofs_ << new_line << '\n'; ++counter; } if (counter_all % 100000 == 0) Log::Info(" # sam reads = " + to_string(counter_all) + "."); } } catch(...) { Log::Exception("Exception occured while creating modified sam file at the sam line: " + line); return false; } } catch(...) { Log::Exception("Exception occured while creating modified sam file."); return false; } ifs_.close(); ofs_.close(); ofs_not_found_.close(); ofs_excluded_.close(); Log::Info("<<< Done."); Log::Info("# total reads processed = " + to_string(counter)); Log::Info("# total reads not found = " + to_string(counter_not_found)); Log::Info("# total reads excluded = " + to_string(counter_excluded)); Log::Info("# total sam reads = " + to_string(counter_all)); return true; } const int BUFFER_SIZE = 4096; char data[BUFFER_SIZE]; ////////////////////////////////////////////////////////////////////// string SamReader::ModifySamLine(string& line_string, char* line_buffer, const unsigned int line_number) { StringSplitterJet line_ss(line_buffer); char* line_rname = line_ss.GetString(3); char* line_pos = line_ss.GetString(4); unsigned int line_pos_int = atoi(line_pos); // map_type::iterator map_it = ref_map_[line_rname]; // taking reference data map >::iterator ref_map_it = ref_map_.find(line_rname); if (ref_map_it==ref_map_.end()) return ""; // taking match data for the given line numner map::iterator ref_map_data_it = ref_map_it->second.find(line_number); if (ref_map_data_it==ref_map_it->second.end()) return ""; // // finding match descriptor data for the position // // multiple entries with the same position are possible // pair p = map_it->second.equal_range(line_pos_int); // multimap_type::const_iterator p_start_it = p.first; // multimap_type::const_iterator p_end_it = p.second; // while(p_start_it!=p_end_it) // { // match descriptor data const string& data_str = ref_map_data_it->second; // to exclude this record; empty match descriptor means deletion > 150 if(data_str.empty()) return "-"; int num_char_copied = data_str.copy(data, data_str.size()); data[num_char_copied] = 0x0; // processed lines has first char='\t' if (data[0]!='\t') return ""; StringSplitterJet data_ss(data); string data_descriptor=data_ss.GetString(2-1+1); // char* data_line_number_str = data_ss.GetString(1+1); // unsigned int data_line_number = atoi(data_line_number_str); // string line_qname = line_ss.GetString(1); // if (line_number==data_line_number) // { ostringstream new_line; // string data_match_descriptor=data_ss.GetString(2+1); string data_s_start_length=data_ss.GetString(3-1+1); string data_s_end_length=data_ss.GetString(4-1+1); // we can have soft or hard clipping // for soft clipping we need to adjust the following data: // POS #4, CIGAR #6, SEQ #10, QUAL #11 // for hard clipping to modify only CIGAR #6 if (data_s_start_length!="0"||data_s_end_length!="0") // soft or hard clipping { int s_start_length = atoi(data_s_start_length.c_str()); int s_end_length = atoi(data_s_end_length.c_str()); string line_seq=line_ss.GetString(10); // #10 string line_qual=line_ss.GetString(11); // #11 string line_cigar=line_ss.GetString(6); // #6 if (s_start_length>0) { if (line_cigar[data_s_start_length.size()]=='S') // soft clipping { // No need to add soft clipped length. // Position points to the fisrt base of clipped sequence // line_pos_int+=s_start_length; // #4 line_seq=line_seq.substr(s_start_length); line_qual=line_qual.substr(s_start_length); } line_cigar=line_cigar.substr(data_s_start_length.size()+1); } if (s_end_length>0) { if (line_cigar[line_cigar.size()-1]=='S') // soft clipping { line_seq = line_seq.erase(line_seq.size()-s_end_length); line_qual = line_qual.erase(line_qual.size()-s_end_length); } line_cigar= line_cigar.erase(line_cigar.size()-data_s_end_length.size()-1); } // now to assemble new modified sam line new_line << line_ss.GetString(1) << '\t'; new_line << line_ss.GetString(2) << '\t'; new_line << line_ss.GetString(3) << '\t'; new_line << line_pos_int << '\t'; new_line << line_ss.GetString(5) << '\t'; new_line << line_cigar << '\t'; new_line << line_ss.GetString(7) << '\t'; new_line << line_ss.GetString(8) << '\t'; new_line << line_ss.GetString(9) << '\t'; new_line << line_seq << '\t'; new_line << line_qual; // string line_end = line_ss.GetStringEnd(12); // if (line_end.size()>0) new_line << '\t'+line_end; // Need to replace XD field if it exists bool found_XD_Z_field=false; for(int i=12; i