// // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// \author Ivan Mikoulitch /// #include "SamMD/fastaReader.h" #include "stringSplitterJet.h" #include "SamMD/log.h" #include "SamMD/utils.h" ////////////////////////////////////////////////////////////////////// FastaReader::FastaReader(void) { } ////////////////////////////////////////////////////////////////////// FastaReader::FastaReader(string fileName) { fileName_=fileName; } ////////////////////////////////////////////////////////////////////// FastaReader::~FastaReader(void) { Close(); } ////////////////////////////////////////////////////////////////////// bool FastaReader::Open() { // pos_=0; fs_.open(fileName_.c_str()); return fs_.is_open(); } ////////////////////////////////////////////////////////////////////// void FastaReader::Close() { if(fs_.is_open()) fs_.close(); fileName_.clear(); ref_header_.clear(); ref_header_next_.clear(); ref_sequence_.clear(); // line_.clear(); // sequences_.clear(); // current_ref_sequence_.clear(); // current_ref_id.clear(); } /* ////////////////////////////////////////////////////////////////////// string FastaReader::GetNextFastaReference() { string line; while(getline(fs_, line)) { if(line.empty()) continue; if(IsHeaderLine(line)) { // reading first sequence line getline(fs_, line_); pos_=0; start_=1; end_ = start_-1+line_.size(); return line; } } return ""; } ////////////////////////////////////////////////////////////////////// void FastaReader::ReadSequences() { ifstream fs; try { fs.open(fileName_.c_str()); if(!fs.is_open()) return; ReadSequences(fs); } catch(...) { } if(fs.is_open()) fs.close(); } */ ////////////////////////////////////////////////////////////////////// bool inline FastaReader::IsHeaderLine(string& line) { return line.size()>0 && line[0]=='>'; } ////////////////////////////////////////////////////////////////////// bool FastaReader::ReadNextReference() { ref_sequence_.clear(); if(fs_.eof()) return false; // looking for the first header line if(ref_header_next_.empty()) { while(getline(fs_, ref_header_)) { if(IsHeaderLine(ref_header_)) break; } } else { ref_header_=ref_header_next_; ref_header_next_.clear(); } if (ref_header_.empty()) return false; // end of file Log::Info(">>> Reading reference '" + ref_header_ + "'"); Log::Info(">>> From file: '" + fileName_ + "'."); const int BUFFER_SIZE = 4096; char buffer[BUFFER_SIZE]; unsigned long num_reads=0; while(fs_.getline(buffer, BUFFER_SIZE)) { if(buffer[0]=='>') // new sequence { ref_header_next_.assign(buffer); break; } else { ToUpper(buffer); ref_sequence_.append(buffer); } if (++num_reads % 100000 == 0) Log::Info(" # reference reads = " + to_string(num_reads) + "."); } Log::Info("<<< Done."); return !ref_header_.empty() && !ref_sequence_.empty(); } /* ////////////////////////////////////////////////////////////////////// void FastaReader::ReadSequences(ifstream& fs) { string line; string header_line; string seq_line; // looking for the first header line while(getline(fs, header_line)) { if(IsHeaderLine(header_line)) break; } if (header_line.empty()) return; // end of file while(getline(fs, line)) { if(line.empty()) continue; if(IsHeaderLine(line)) // new sequence { sequences_[header_line]=seq_line; header_line=line; seq_line.clear(); } else { seq_line.append(line); } } if(!header_line.empty() && !seq_line.empty()) { sequences_[header_line]=seq_line; } return; } ////////////////////////////////////////////////////////////////////// string FastaReader::GetSequence(const string& ref_id) { string ref_id_gt=">"+ref_id; map::const_iterator p = sequences_.begin(); while(p!=sequences_.end()) { if(0==ref_id_gt.compare(0, ref_id_gt.size(), p->first)) { return p->second; } ++p; } } ////////////////////////////////////////////////////////////////////// string FastaReader::GetMatchDescriptor(const string& read, const string& ref_id, long ref_position, const string& cigar) { if (current_ref_id!=ref_id) { string sequence = GetSequence(ref_id); if(sequence.empty()) return ""; current_ref_sequence_=sequence; current_ref_id=ref_id; } char cigar_operation; int cigar_length; int cigar_position=0; int read_position=0; ostringstream match_descriptor; for(int i=0; i0) { match_descriptor << match_length; match_length=0; } match_descriptor << current_ref_sequence_[ref_position]; } ++read_position; ++ref_position; } if (match_length>0) match_descriptor << match_length; break; } case 'I': { match_descriptor << '^'; for(int j=0; i150 && (cigar[i]=='D' or cigar[i]=='N')) // csaunders - change to treat 'N' exactly like 'D' { // to exclude this read return ""; } break; default: continue; } switch(cigar[i]) { case 'M': { for(int j=0; j0) { match_descriptor << match_length; match_length=0; } // match_descriptor << ref_sequence_[ref_position]; // Modified by request from Chris Saunders: // I don't anticipate any aligners/packages besides CASAVA will support // export format in the future, and I'm not confident that any component // of CASAVA will be able to handle input containing ambiguous characters // other than 'N'. Unfortunately I don't think we have a choice but to compress // all ambiguous characters to 'N'. switch(ref_sequence_[ref_position]) { case 'A': case 'C': case 'G': case 'T': case 'N': case 'a': case 'c': case 'g': case 't': case 'n': { match_descriptor << ref_sequence_[ref_position]; break; } default: { match_descriptor << 'N'; break; } } } ++read_position; ++ref_position; } break; } // sam insertion makes deletion ^5$ case 'I': { if (match_length>0) { match_descriptor << match_length; match_length=0; } match_descriptor << '^' << cigar_length << '$'; read_position+=cigar_length; break; } // sam deletion makes insertion ^ACGT$ case 'D': case 'N': // csaunders -- change to treat 'N' exactly like 'D' { if (match_length>0) { match_descriptor << match_length; match_length=0; } match_descriptor << '^'; for(int j=0; j0) { match_descriptor << match_length; match_length=0; } string mdi(1,'\t'); // first tab means that the line was processed to create match descriptor // not needed // mdi.append(qname); // mdi.append(1,'\t'); mdi.append(match_descriptor.str()); mdi.append(1,'\t'); mdi.append(s_start_length.size()?s_start_length:"0"); mdi.append(1,'\t'); mdi.append(s_end_length.size()?s_end_length:"0"); return mdi; }