// // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// \author Ivan Mikoulitch /// #include #include "SamMD2Export/sam2Export.h" #include "samReader.h" #include "stringSplitter.h" #include "export.h" #include "SamMD2Export/utils.h" // Updated by Chris Saunders enum FLAG { FLAG_READ_PAIRED=0x01, FLAG_READ_PAIRED_MAPPED=0x02, FLAG_SEQUENCE_UNMAPPED=0x04, FLAG_MATE_UNMAPPED=0x08, FLAG_STRAND_REVERSE=0x10, FLAG_MATE_STRAND=0x20, FLAG_FIRST_READ_PAIR=0x40, FLAG_SECOND_READ_PAIR=0x80, FLAG_ALIGNMENT_NOT_PRIMARY=0x100, FLAG_READ_FAILS=0x200, FLAG_READ_DUPLICATE=0x400 }; inline string get_tag(StringSplitterJet& ss, int index); Sam2Export::Sam2Export(const string& samFile, const string& prefix, const string& suffix) { samFile_=samFile; prefix_=prefix; suffix_=suffix; have_prefix_=prefix_.size()>0; have_suffix_=suffix_.size()>0; } Sam2Export::~Sam2Export(void) { } // Export format only allows bases from [ACGTN]: void ExportBaseNormalize(char &c) { switch (c) { case 'A' : case 'a' : c='A'; return; case 'C' : case 'c' : c='C'; return; case 'G' : case 'g' : c='G'; return; case 'T' : case 't' : c='T'; return; default : c='N'; return; } } // Export format only allows bases from [ACGTN]: void BaseComplement(char &c) { switch (c) { case 'A' : case 'a' : c='T'; return; case 'C' : case 'c' : c='G'; return; case 'G' : case 'g' : c='C'; return; case 'T' : case 't' : c='A'; return; default : c='N'; return; } } bool Sam2Export::CreateExportFile() { SamReader sr(samFile_); if(!sr.Open()) return false; string line; unsigned int total_num_reads=0; unsigned int num_reads=0; int XD_index; int SM_index; int AS_index; int H0_index; int H1_index; int H2_index; string tag; int tag_index; char qname_buffer[512]; try { while(sr.GetNextSamLine()) { ++total_num_reads; ++num_reads; Export exr; StringSplitterJet ss(sr.BUFFER); try // single read exception, to report in error file { // QNAME string qname = ss.GetString(1); strcpy(qname_buffer, qname.c_str()); StringSplitterJet ssq(qname_buffer,':'); int numQnameElements = ssq.GetNumElements(); if(numQnameElements==5) { exr.Machine = ssq.GetString(1); int index = exr.Machine.find_last_of("_"); if (index>0) { exr.RunNumber=exr.Machine.substr(index+1); exr.Machine=exr.Machine.substr(0, index); } else { exr.RunNumber="0"; } exr.Lane = ssq.GetString(2); exr.Tile = ssq.GetString(3); exr.XCoordinateCluster = ssq.GetString(4); exr.YCoordinateCluster = ssq.GetString(5); // we may have some non numeric data at the end of QNAME // DW: to remove #0/1 int end4charIndex = exr.YCoordinateCluster.size()-4; if(end4charIndex>=0 && exr.YCoordinateCluster[end4charIndex]=='#') { exr.YCoordinateCluster=exr.YCoordinateCluster.substr(0, end4charIndex); } } else { exr.Machine = ssq.GetString(1); exr.RunNumber = "0"; exr.Lane = "0"; exr.Tile = "0"; exr.XCoordinateCluster = "0"; exr.YCoordinateCluster = "0"; } exr.Read = ss.GetString(10); exr.QualityString = ss.GetString(11); // FLAG int flag = atoi(ss.GetString(2).c_str()); // reverse strand if(flag&FLAG_STRAND_REVERSE) // 0x10 { exr.MatchStrand="R"; std::reverse(exr.Read.begin(), exr.Read.end()); std::for_each(exr.Read.begin(), exr.Read.end(), BaseComplement); std::reverse(exr.QualityString.begin(), exr.QualityString.end()); } else { std::for_each(exr.Read.begin(), exr.Read.end(), ExportBaseNormalize); exr.MatchStrand="F"; } // adjusting quality values string new_quality_string; string::const_iterator pch = exr.QualityString.begin(); while(pch!=exr.QualityString.end()) { new_quality_string.push_back(*pch+31); ++pch; } exr.QualityString=new_quality_string; // finding tags XD_index=-1; SM_index=-1; AS_index=-1; H0_index=-1; H1_index=-1; H2_index=-1; for (int i=12; i<=ss.GetNumElements(); i++) { tag = ss.GetString(i); switch (tag[0]) { case 'X': case 'x': if( (tag[1]=='D' || tag[1]=='d') && tag[2]==':') XD_index=i; break; case 'S': case 's': if( (tag[1]=='M' || tag[1]=='m') && tag[2]==':') SM_index=i; break; case 'A': case 'a': if( (tag[1]=='S' || tag[1]=='s') && tag[2]==':') AS_index=i; break; case 'H': case 'h': switch(tag[1]) { case '0': if(tag[2]==':') H0_index=i; break; case '1': if(tag[2]==':') H1_index=i; break; case '2': if(tag[2]==':') H2_index=i; break; } break; } } // 0 for non-index run exr.Index="0"; // SE&PE 1) == filter ==================================== if(flag&FLAG_READ_FAILS) { exr.FilteringYN = "N"; } else { exr.FilteringYN = "Y"; } // SE&PE 2) == match chromosome & match contig ===== string match_chromosome; string::size_type index; if (flag&FLAG_SEQUENCE_UNMAPPED) // 0x0004 { exr.MatchStrand=""; if(H0_index>=0&&H1_index>=0&&H2_index>=0) { match_chromosome.append(get_tag(ss, H0_index)); match_chromosome.append(":"); match_chromosome.append(get_tag(ss, H1_index)); match_chromosome.append(":"); match_chromosome.append(get_tag(ss, H2_index)); exr.MatchChromosome=match_chromosome; // Leaving export contig name blank } else { exr.MatchChromosome="NM"; // Leaving export contig name blank } } else { string rname = ss.GetString(3); index = rname.find('/'); if(index!=string::npos) { match_chromosome = rname.substr(0, index); exr.MatchContig = rname.substr(index+1); } else { match_chromosome=rname; // Leaving export contig name blank } // DW. The current workflow for aligning human RNA or DNA data to the human genome, // running CASAVA, and loading the CASAVA tree into GenomeStudio expects // all of the column 11 values (Match Chromosomes) to have a 'c' prefix and a '.fa' suffix. // Now prefix and suffix are optional parameters to the application if (have_prefix_) exr.MatchChromosome.append(prefix_); exr.MatchChromosome.append(match_chromosome); if (have_suffix_) exr.MatchChromosome.append(suffix_); // match position exr.MatchPosition = ss.GetString(4); // match descriptor if (XD_index>=0) { tag = ss.GetString(XD_index); tag_index = tag.find_last_of(':'); if (tag_index>0) { exr.MatchDescriptor = tag.substr(tag_index+1); if(flag&FLAG_STRAND_REVERSE) { string reverse_descriptor = ReverseMatchDescriptor(exr.MatchDescriptor); if (reverse_descriptor=="") { sr.WriteFailedRead("Invalid match descriptor. Cannot reverse complement it.", ss.GetOriginalString()); goto SKIP_EXPORT_LINE; } else { exr.MatchDescriptor=reverse_descriptor; } } } } else { sr.WriteFailedRead("No 'XD' field in SAM record containing the export match descriptor.", ss.GetOriginalString()); goto SKIP_EXPORT_LINE; } // single read alignment score if (SM_index>=0) { exr.SingleReadAlignmentScore = get_tag(ss, SM_index); } else { // MAPQ if SM is mising exr.SingleReadAlignmentScore = ss.GetString(5); } } // PE reads if(flag&FLAG_READ_PAIRED) { // PE 4) == Read number ================================ // ReadNumber: 1 or 2 for paired-read analysis, 0 for single-read analysis if(flag&FLAG_FIRST_READ_PAIR) { exr.ReadNumber="1"; } else if(flag&FLAG_SECOND_READ_PAIR) { exr.ReadNumber="2"; } else { // Both fields are false. Read number is unknown. exr.ReadNumber=""; } if(flag&FLAG_FIRST_READ_PAIR && flag&FLAG_SECOND_READ_PAIR) { sr.WriteFailedRead("Both flags 0x40 and 0x80 are ON", ss.GetOriginalString()); goto SKIP_EXPORT_LINE; } // PE 5) == NMNM read (part of unmapped pair) ============ if(flag&FLAG_SEQUENCE_UNMAPPED && flag&FLAG_MATE_UNMAPPED) { // Skip all remaining steps goto WRITE_EXPORT_LINE; } ss.MakeUpperCase(3); // RNAME can contain '/' to separate contig string RNAME = ss.GetString(3); // PE 6) == Abandoned shadow ============================= if(flag&FLAG_SEQUENCE_UNMAPPED && !(flag&FLAG_MATE_UNMAPPED) && (RNAME=="*" || exr.MatchPosition=="0")) { // Skip all remaining steps goto WRITE_EXPORT_LINE; } // PE 7) == Partner strand =============================== if(flag&FLAG_SEQUENCE_UNMAPPED || flag&FLAG_MATE_UNMAPPED) { exr.PartnerStrand="N"; } else if(flag&FLAG_MATE_STRAND) { exr.PartnerStrand="R"; } else { exr.PartnerStrand="F"; } // PE 8) == Mapping (alignment) scores =================== if (AS_index>=0) { exr.PairedReadAlignmentScore = get_tag(ss, AS_index); } else { if(flag&FLAG_MATE_UNMAPPED) { exr.PairedReadAlignmentScore = "0"; } else { // MAPQ if AS is missing and mate is mapped: exr.PairedReadAlignmentScore = ss.GetString(5); } } // PE 9) == Partner offset ============================== //ss.MakeUpperCase(7); // MRNM can contain '/' to separate contig string MRNM = ss.GetString(7); string MPOS = ss.GetString(8); int impos = atoi(MPOS.c_str()); if(flag&FLAG_MATE_UNMAPPED || MRNM=="*") { exr.PartnerOffset="0"; } else if( MRNM!= "=" && MRNM != RNAME) // chimera { if (impos>0) exr.PartnerOffset=MPOS; else { exr.PartnerOffset=""; sr.WriteFailedRead("Partner offset. (MRNM!='=' && MRNM != RNAME && MPOS is not integer).", ss.GetOriginalString()); goto SKIP_EXPORT_LINE; } } else { if (impos>0) { int ipos = atoi(exr.MatchPosition.c_str()); exr.PartnerOffset=to_string(impos-ipos); } else { int isize = atoi(ss.GetString(9).c_str()); if (isize!=0) { int size = isize - exr.Read.length(); exr.PartnerOffset=to_string(size); } else { exr.PartnerOffset=""; sr.WriteFailedRead("No data for partner offset.", ss.GetOriginalString()); goto SKIP_EXPORT_LINE; } } } // PE 10) == Partner chromosome & contig ==================== if (MRNM=="=" || MRNM==RNAME) { // leave the both fields blank } else { if(flag&FLAG_MATE_UNMAPPED || MRNM=="*") { // leave the both fields blank } else { // optional prefix if (have_prefix_) exr.PartnerChromosome.append(prefix_); index = MRNM.find('/'); if(index!=string::npos) { exr.PartnerChromosome.append(MRNM.substr(0, index)); exr.PartnerContig = MRNM.substr(index+1); } else { exr.PartnerChromosome.append(MRNM); // partner contig is blank in this case } // optional suffix if (have_suffix_) exr.PartnerChromosome.append(suffix_); } } // PE 11) == Shadow reads =================================== if(flag&FLAG_SEQUENCE_UNMAPPED && !(flag&FLAG_MATE_UNMAPPED)) { // Set alrfeady in section SE&PE 2) (2a) // exr.MatchChromosome="NM"; // exr.MatchContig=""; exr.MatchPosition = ss.GetString(4); /* if(exr.PartnerStrand=="R") exr.MatchStrand="F"; else if(exr.PartnerStrand=="F") exr.MatchStrand="R"; else exr.MatchStrand="N"; */ if(flag&FLAG_MATE_STRAND) { exr.MatchStrand="F"; } else { exr.MatchStrand="R"; } exr.MatchDescriptor="-"; exr.SingleReadAlignmentScore="-1"; exr.PairedReadAlignmentScore="0"; } } else // Single read { // ReadNumber: 1 or 2 for paired-read analysis, 1 for single-read analysis // // csaunders -- update the SE read number from 0 to 1 to reflect update in the export spec -- 20100305 // exr.ReadNumber="1"; } WRITE_EXPORT_LINE: sr.WriteExportLine(exr); SKIP_EXPORT_LINE:; } catch(...) { sr.WriteFailedRead("Read processing failed.", ss.GetOriginalString()); } if (num_reads>=10000) { cout << "# sam reads processed = " + to_string(total_num_reads) << endl; num_reads=0; } } } catch(...) { string error("Exception occured while creating export file. "); error.append("Read number: " + to_string(total_num_reads)+". "); error.append("Read : "); error.append(sr.BUFFER); sr.WriteError(error); } sr.WriteInfo("Done. Total sam reads processed = " + to_string(total_num_reads) + "."); bool success=sr.WriteWarningErrorSummary(); sr.Close(); return success; } inline string get_tag(StringSplitterJet& ss, int index) { string tag = ss.GetString(index); int tag_index = tag.find_last_of(':'); if (tag_index>0) { return tag.substr(tag_index+1); } else { return ""; } } string Sam2Export::ReverseMatchDescriptor(const string& str) { vector parts; size_t start = 0; size_t end; string part; while(start deletion { // no reverse needed for a number } else // character -> insertion { // to reverse and complement std::reverse(part.begin()+1, part.end()-1); std::for_each(part.begin()+1, part.end()-1, BaseComplement); } parts.push_back(part); start+=part.size(); } else // substitution { switch(c) { case 'A': case 'C': case 'G': case 'T': case 'N': case 'a': case 'c': case 'g': case 't': case 'n': { // looking for the end of the substitution end = str.find_first_not_of("ACGTNacgtn", start+1); if (end==string::npos) // end of part descriptor { part = str.substr(start); } else { part = str.substr(start, end-start); } // to reverse and complement std::reverse(part.begin(), part.end()); std::for_each(part.begin(), part.end(), BaseComplement); parts.push_back(part); start+=part.size(); break; } default: // unexpected, to report the problem // WriteError("Invalid reverse match descriptor: '"+str+"'. Bad character: '"+c+"'."); return ""; } } // looking for next part // start=str.find_first_of("123456789ACGTacgt^", start); } std::reverse(parts.begin(), parts.end()); string reverse_descriptor; vector::const_iterator ppart = parts.begin(); while(ppart!=parts.end()) { reverse_descriptor.append(*(ppart++)); } return reverse_descriptor; } // Current problems: // 1) Quality scores are different in sorted and sam.export // 2) We check 0x2 flag for PE test, sorted appears using 0x1 flag. Chris to look at.