// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file #include "variance/tagAligner.h" #include #include #include #include #include using namespace std; /* ----- ----- ----- ----- ----- ----- * * ----- Auxillary funcitons ----- * * ----- ----- ----- ----- ----- ----- */ vector TagAligner::getAlignSeqs(const string& read, const string& desc) { string raSeq; string taSeq; string matStr; string insStr; string misStr; string delStr; bool inINDEL = false; bool inINS = false; bool inDEL = false; int taIdx = 0; const unsigned dsize(desc.length()); for (unsigned ii(0); ii < dsize; ii++) { if (desc[ii] == '^') { if (matStr.length() != 0) { int matLen = atoi(matStr.c_str()); taSeq.append(read.substr(taIdx, matLen)); raSeq.append(read.substr(taIdx, matLen)); taIdx += matLen; matStr = ""; } inINDEL = true; } else if (desc[ii] == '$') { if (inINS) { raSeq.append(insStr); taSeq.append(insStr.length(), '-'); //multiChar("-", insStr.length())); } else if (inDEL) { int delLen = atoi(delStr.c_str()); taSeq.append(read.substr(taIdx, delLen)); raSeq.append(delLen, '-'); //multiChar("-", delLen)); taIdx += delLen; } else { cerr << "[ERROR]: Failed ins or del in getAlnFromDesc: " << read << ", " << desc << ", " << ii << endl; exit(EXIT_CODE_INPUT); } inINDEL = false; inINS = false; inDEL = false; insStr = ""; delStr = ""; } else if (inINDEL) { if (isdigit(desc[ii])) { delStr.append(1, desc[ii]); inDEL = true; } else { insStr.append(1, desc[ii]); inINS = true; } } else if (isdigit(desc[ii])) { matStr.append(1, desc[ii]); } else { if (matStr.length() != 0) { int matLen = atoi(matStr.c_str()); taSeq.append(read.substr(taIdx, matLen)); raSeq.append(read.substr(taIdx, matLen)); taIdx += matLen; matStr = ""; } raSeq.append(1, desc[ii]); taSeq.append(read.substr(taIdx, 1)); taIdx++; } } if (matStr.length() != 0) { int matLen = atoi(matStr.c_str()); taSeq.append(read.substr(taIdx, matLen)); raSeq.append(read.substr(taIdx, matLen)); taIdx += matLen; matStr = ""; } vector ret; ret.push_back(raSeq); ret.push_back(taSeq); return(ret); } string TagAligner::getMatchDesc(const string& raSeq, const string& taSeq) { ostringstream out; if (raSeq.length() != taSeq.length()) { return("-"); } int mat = 0; int insCnt = 0; string delStr; const unsigned rsize(raSeq.length()); for (unsigned ii(0); ii < rsize; ii++) { if (raSeq[ii] == '-') { if (mat > 0) { out << mat; mat = 0; } if (delStr.length() != 0) { out << "^" << delStr << "$"; delStr.assign(""); } insCnt++; } else if (taSeq[ii] == '-') { if (mat > 0) { out << mat; mat = 0; } if (insCnt != 0) { out << "^" << insCnt << "$"; insCnt = 0; } delStr.append(1, raSeq[ii]); } else if (raSeq[ii] != taSeq[ii]) { if (delStr.length() != 0) { out << "^" << delStr << "$"; delStr.assign(""); } if (insCnt != 0) { out << "^" << insCnt << "$"; insCnt = 0; } if (mat > 0) { out << mat; mat = 0; } out << raSeq[ii]; //desc.append(1, raSeq[ii]); } else { if (delStr.length() != 0) { out << "^" << delStr << "$"; delStr.assign(""); } if (insCnt != 0) { out << "^" << insCnt << "$"; insCnt = 0; } mat++; } } if (mat > 0) { out << mat; mat = 0; } if (delStr.length() != 0) { out << "^" << delStr << "$"; delStr.assign(""); } if (insCnt != 0) { out << "^" << insCnt << "$"; insCnt = 0; } return(out.str()); } void TagAligner::checkAlnDescFunc(const string& read, const string& desc, const string& raSeq, const string& taSeq) { // Rebuild alignments and match descriptor vector aln = TagAligner::getAlignSeqs(read, desc); string newDesc = TagAligner::getMatchDesc(aln[0], aln[1]); // Make sure newDesc = old desc if (newDesc.compare(desc) != 0) { cerr << "[ERROR]: Failed checkAlnDescFunc(matchDescGen):" << endl; cerr << "[ERROR]: Read\t" << read << endl << endl; cerr << "[ERROR]: desc\t" << desc << endl; cerr << "[ERROR]: newD\t" << newDesc << endl << endl; cerr << "[ERROR]: raSe\t" << raSeq << endl; cerr << "[ERROR]: aln1\t" << aln[0] << endl; cerr << "[ERROR]: taSe\t" << taSeq << endl; cerr << "[ERROR]: aln2\t" << aln[1] << endl; exit(EXIT_CODE_FAILURE); } // Make sure new alignment = refAlign + queAlign if (aln[0].compare(raSeq) != 0 || aln[1].compare(taSeq) != 0) { cerr << "[ERROR]: Failed checkAlnDescFunc(alignGen):" << endl; cerr << "[ERROR]: Read\t" << read << endl << endl; cerr << "[ERROR]: desc\t" << desc << endl; cerr << "[ERROR]: newD\t" << newDesc << endl << endl; cerr << "[ERROR]: raSe\t" << raSeq << endl; cerr << "[ERROR]: aln1\t" << aln[0] << endl; cerr << "[ERROR]: taSe\t" << taSeq << endl; cerr << "[ERROR]: aln2\t" << aln[1] << endl; exit(EXIT_CODE_FAILURE); } } ScoreType TagAligner::scoreAlignment(const string& raSeq, const string& taSeq, ScoreType m, ScoreType mm, ScoreType e, ScoreType o) { // Find min len unsigned int len = raSeq.length(); if (len > taSeq.length()) { len = taSeq.length(); } ScoreType score = 0; bool gapOpen = false; for (unsigned int ii = 0; ii < len; ii++) { if (raSeq[ii] == '-' || taSeq[ii] == '-') { if (! gapOpen) { gapOpen = true; score -= o; } else { score -= e; } } else if (raSeq[ii] == taSeq[ii]) { gapOpen = false; score += m; } else { gapOpen = false; score -= mm; } } return(score); } /* ----- ----- ----- ----- ----- ----- * * ----- Manipulation Functions ----- * * ----- ----- ----- ----- ----- ----- */ // cts - is there an error in printing out types of AAC-GG/A-CTGG???, // the output doesn't make sense to me? // cts - qual scores could be integrated into the match score here -- // tricky to balance out against the affine gap penalties... // cts - great spot to grab the appropriate SeqAn function..... void TagAligner::setReadAlnScore() { // Ensure the ref and tag alignment strings are of equal length // Otherwise prefix and suffix trimming failed assert(this->refAlnSeq.length() == this->tagAlnSeq.length()); // Get and set tag alignment score with generic alignmnet // score function this->readAlnScore = TagAligner::scoreAlignment(this->refAlnSeq, this->tagAlnSeq, this->matchCost, this->misMatchCost, this->gapExtendCost, this->gapOpenCost); // Set normalized tag alignment score this->readNormAlnScore = this->readAlnScore / this->tagSeq.length(); // Also set misMatch, inserts, deletes and gap statistics const unsigned int len = this->tagAlnSeq.length(); this->numMatches = 0; this->numMisMatches = 0; this->numInserts = 0; this->numDeletes = 0; this->numGaps = 0; for (unsigned pos = 0; pos < len; pos++) { if (this->refAlnSeq[pos] == '-') { this->numInserts++; this->numGaps++; } else if (this->tagAlnSeq[pos] == '-') { this->numDeletes++; this->numGaps++; } else if (this->tagAlnSeq[pos] != this->refAlnSeq[pos]) { this->numMisMatches++; } else { assert(this->tagAlnSeq[pos] == this->refAlnSeq[pos]); this->numMatches++; } } // Make sure all the numbers add up const bool is_add_up((numInserts+numMisMatches+numMatches) == static_cast(tagSeq.length())); if (not is_add_up){ cerr << "No Match\t" << this->numInserts << "\t" << this->numMisMatches << "\t" << this->numMatches << "\t!=\t" << this->tagSeq.length() << endl; cerr << this->toString() << endl; } assert(is_add_up); } /* static void max3(ScoreType& max, NScoreType& which, const ScoreType v0, const ScoreType v1, const ScoreType v2 ) { max=v0; which=0; if (v1>v0) { max=v1; which=1; } if (v2>max) { max=v2; which=2; } } // ~max3 */ ScoreType TagAligner::align() { // Local variable declarations // Set seqs const char* x = this->refSeq.c_str(); const char* y = this->tagSeq.c_str(); string xt_ = ""; string yt_ = ""; string at_ = ""; at_.clear(); // Set local match, misMatch, extend and open costs const double w_match_(this->matchCost); const double w_mismatch_(this->misMatchCost); const double w_extend_(this->gapExtendCost); const double w_open_(this->gapOpenCost); // Bring in missing variables from object const unsigned x_size = this->refSeq.length(); const unsigned y_size = this->tagSeq.length(); const unsigned ys(y_size+1); const unsigned xs(x_size+1); allocateMatrices(xs, ys); #ifdef CACHE_AND_REUSE // Dynamic programing matrices DPMatrix E_(xs,std::vector(ys,0.)); DPMatrix F_(xs,std::vector(ys,0.)); DPMatrix G_(xs,std::vector(ys,0.)); DPMatrixN TE_(xs,std::vector(ys,0)); DPMatrixN TF_(xs,std::vector(ys,0)); DPMatrixN TG_(xs,std::vector(ys,0)); #endif // Init DP edges for (unsigned i(0); i <= x_size; i++) { //E_[i][0]=-w_open_-(i*w_extend_); //E_[i][0] = -10000; E_[i][0] = 0; F_[i][0] = -10000; G_[i][0] = -10000; } // Init DP edges for (unsigned j(0); j <= y_size; j++) { E_[0][j] = -10000; //F_[0][j]=-w_open_-(j*w_extend_); F_[0][j] = 0; G_[0][j] = -10000; } // Fill DP matrix for (unsigned i(1); i <= x_size; i++) { for (unsigned j(1); j <= y_size; j++) { // update G_ max3(G_[i][j], TG_[i][j], G_[i-1][j-1], E_[i-1][j-1], F_[i-1][j-1]); if (x[i-1] == y[j-1]) { G_[i][j] += w_match_; //G_[i][j] += pow(w_match_, qualProb[j]); } else { G_[i][j] -= w_mismatch_; //G_[i][j] -= pow(w_mismatch_, qualProb[j]); } // update E_ max3(E_[i][j], TE_[i][j], G_[i][j-1] - w_open_, E_[i][j-1], F_[i][j-1] - w_open_); E_[i][j] -= w_extend_; // update F_ max3(F_[i][j], TF_[i][j], G_[i-1][j] - w_open_, E_[i-1][j] - w_open_, F_[i-1][j]); F_[i][j] -= w_extend_; } // ~for j } // ~for i // Find max score ScoreType max(0),thisMax; NScoreType whichMatrix(0),thisMatrix; int ii(0),jj(0); for (unsigned i(0); i <= x_size; i++) { max3(thisMax, thisMatrix, G_[i][y_size], E_[i][y_size], F_[i][y_size] ); if ((i==0) or (thisMax > max)) { max = thisMax; ii = i; jj = y_size; whichMatrix = thisMatrix; } // ~if } // ~for i for (unsigned j(0); j <= y_size; j++) { max3(thisMax, thisMatrix, G_[x_size][j], E_[x_size][j], F_[x_size][j]); if (thisMax > max) { max = thisMax; ii = x_size; jj = j; whichMatrix = thisMatrix; } // ~if } // Set alignment score this->alignmentScore = max; this->refAlnEnd = ii; this->tagAlnEnd = jj; int tagFlushEnd = this->tagAlnEnd; // Trace sequence back DPMatrixN* pT_ = &TE_; while ((ii > 0) && (jj > 0)) { if (whichMatrix == 0) { pT_ = &TG_; } else if (whichMatrix == 1) { pT_ = &TE_; } else { assert(whichMatrix == 2); pT_ = &TF_; } const NScoreType nextMatrix = (*pT_)[ii][jj]; if (whichMatrix == 0) { xt_ += x[ii-1]; yt_ += y[jj-1]; at_ += ((x[ii-1] == y[jj-1])?'|':'.'); ii--; jj--; // whichMatrix=0; } else if (whichMatrix == 1) { xt_ += '-'; yt_ += y[jj-1]; at_ += ' '; jj--; // whichMatrix = 1; } else { assert(whichMatrix == 2); xt_ += x[ii-1]; yt_ += '-'; at_ += ' '; ii--; } // ~else whichMatrix = nextMatrix; } // ~while reverse(xt_.begin(),xt_.end()); reverse(at_.begin(),at_.end()); reverse(yt_.begin(),yt_.end()); // We need to trim off excess alignment ref alignment // and extend read with gaps // Trim reference gap prefix int refPrefixLen = 0; while(yt_[refPrefixLen] == '-') { refPrefixLen++; } xt_ = xt_.substr(refPrefixLen, xt_.length() - refPrefixLen); at_ = at_.substr(refPrefixLen, at_.length() - refPrefixLen); yt_ = yt_.substr(refPrefixLen, yt_.length() - refPrefixLen); this->refAlnStart = ii + refPrefixLen; // Trim reference gap suffix int refSuffixLen = 0; while(yt_[refSuffixLen] == '-') { refSuffixLen++; } xt_ = xt_.substr(0, xt_.length() - refSuffixLen); at_ = at_.substr(0, at_.length() - refSuffixLen); yt_ = yt_.substr(0, yt_.length() - refSuffixLen); this->refAlnEnd -= refSuffixLen; // Add remaining unabligned tag suffix and prefix int tagFlushStart = jj; while(jj > 0) { xt_ = '-' + xt_; yt_ = y[jj-1] + yt_; at_ = ' ' + at_; jj--; } while(this->tagAlnEnd < static_cast(y_size)) { xt_+='-'; yt_+=y[this->tagAlnEnd]; at_+=' '; this->tagAlnEnd++; } this->tagAlnStart = jj; // Update class variables this->refAlnSeq = xt_; this->tagAlnSeq = yt_; this->alnSeq = at_; // Update match descriptor and read alignment scores this->setMatchDesc(); this->setReadAlnScore(); // This is just a double check to QA the functions if (this->strand == 'R') { checkAlnDescFunc(Export::reverse(Export::complement(this->tagSeq)), this->matchDesc, Export::reverse(Export::complement(this->refAlnSeq)), Export::reverse(Export::complement(this->tagAlnSeq))); } else { checkAlnDescFunc(this->tagSeq, this->matchDesc, this->refAlnSeq, this->tagAlnSeq); } // Update insideRef if (refAlnStart - tagFlushStart < 0 || (refAlnEnd == static_cast(refSeq.length()) && tagFlushEnd != static_cast(tagSeq.length()))) { // Old check that didn't work on next line: //this->refAlnEnd + (this->tagSeq.length() - this->tagAlnEnd) > this->refSeq.length()) { this->insideRef = false; } else { this->insideRef = true; } return(this->alignmentScore); } // ~ScoreType Aligner::operator()( const string& x, const string& y) void TagAligner::setMatchDesc() { if (this->strand == 'R') { this->matchDesc = TagAligner::getMatchDesc(Export::reverse(Export::complement(this->refAlnSeq)), Export::reverse(Export::complement(this->tagAlnSeq))); } else { this->matchDesc = TagAligner::getMatchDesc(this->refAlnSeq, this->tagAlnSeq); } } void TagAligner::allocateMatrices( unsigned int xs, unsigned int ys ) { if ((xs>xmax_)||(ys>ymax_)) { xmax_=xs; ymax_=ys; E_=DPMatrix(xs,std::vector(ys,0.)); F_=DPMatrix(xs,std::vector(ys,0.)); G_=DPMatrix(xs,std::vector(ys,0.)); TE_=DPMatrixN(xs,std::vector(ys,0)); TF_=DPMatrixN(xs,std::vector(ys,0)); TG_=DPMatrixN(xs,std::vector(ys,0)); } } /* static int first_indel_offset(const char* align_contig, const char* align_ref){ static const char skip('-'); const unsigned ns(strlen(align_contig)); assert(ns==strlen(align_ref)); unsigned state_count(0); for(unsigned i(0);irefSeq << endl; con << "Strand\t" << this->strand << endl; con << "TAG\t" << this->tagSeq << endl; con << "SCORE 1\t" << this->alignmentScore << endl; con << "SCORE 2\t" << this->readAlnScore << endl; con << "SCORE 3\t" << this->readNormAlnScore << endl; con << "LEN REF\t" << this->refSeq.length() << endl; con << "ALN REF\t" << this->refAlnStart << " - " << this->refAlnEnd << endl; con << "ALN REF\t" << this->refAlnSeq << endl; con << " \t" << this->alnSeq << endl; con << "ALN TAG\t" << this->tagAlnSeq << endl; con << "ALN TAG\t" << this->tagAlnStart << " - " << this->tagAlnEnd << endl; con << "LEN TAG\t" << this->tagSeq.length() << endl; con << "MATCH DESC\t" << this->matchDesc << endl; return(con.str()); } /* ----- ----- ----- ----- ----- * * ----- Test Functions -- ----- * * ----- ----- ----- ----- ----- */ /* int main( int numArgs, char* args [] ) { //string seq1 = "GCGAGCTGAACGGGCAATGCAGGAAGAGTTCTACCTGGAACTGAAAGAAGGCTTACTGGAGCCGCTGGCAGTGACGGAACGGCTGGCCATTATCTCGGTGGTAGGTGATGGTATGCGCACCTTGCGTGGGATCTCGGCGAAATTCTTTGCCGCACTGGCCCGCGCCAATATCAACATTGTCGCCATTGCTCAGGGATCTTCTGAACGCTCAATCTCTGTCGTGGTAAATAACGATGATGCGACCACTGGCGTGCGCGTTACTCATCAGATGCTGTTCAATACCGATCAGGTTATCGAAGTGTTTGTGATTGGCGTCGGTGGCGTTGGCGGTGCGCTGCTGGAGCAACTGAAGCGTCAGCAAAGCTGGCTGAAGAATAAACATATCGACTTACGTGTCTGCGGTGTTGCCAACTCGAAGGCTCTGCTCACCAATGTA"; //seq1 = Export::reverse(Export::complement(seq1)); string refSeq = "CATGGCCTTAATGGCAGGAAGAACTGGCGTCACCAAATTTTTTTT"; refSeq = "ACTTACGTGTCTGCGGTGTTGCCAACTCGAAGGCTCTGCTCACCAATGTA" + refSeq; //string seq2 = "CATGGCCTTAATCTGGAAAACTGGCAGGAAGAACTGGCGTCAAGCCAAAG"; string read = "CCCCCCGCCTTAATCTGGAAAACTGGCAGAGAACTGGCGTCAAGCCAAA"; //TagAligner::TagAligner align(5, 4, 1, 16, refSeq, seq2); //TagAligner::TagAligner align; TagAligner::TagAligner align(5, 4, 1, 16); align.setSequences(refSeq.c_str(), read.c_str(),""); //double score = align.align(); align.align(); ScoreType score = align.getAlnScore(); cerr << align.toString() << endl; return(0); } */ // END OF FILE