/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "findQueryWithinSubject.h" #include "getAlignment.h" #include #include #ifndef WALRUS_BUILD #include #endif #include "numutil.h" #include #include "assert.h" #include #include "consedParameters.h" // will use the entire query, but not entire subject static void findQueryWithinSubjectDownRight( const RWCString& soQueryAcross, const RWCString& soSubjectDown, const int nGap, const int nMatch, const int nMismatch, int& nSubjectEnd, int& nBestScore ) { int nBestDown; int nScoreDiagonallyAbove; int nToBeScoreDiagonallyAbove; int nScoreByDiagonal; int nAcross; int nDown; int* nScore; char* pQueryAcross = soQueryAcross.data(); char* pSubjectDown = soSubjectDown.data(); int nSeqAcrossLength = soQueryAcross.length(); int nSeqDownLength = soSubjectDown.length(); // the +1 is because the size of the NW matrix is 1 greater // than that of the sequence (in either dimension) nScore = (int*) calloc( nSeqAcrossLength + 1, sizeof(int) ); // scores for for nDown = 0. for ( nAcross = 0; nAcross <= nSeqAcrossLength; ++nAcross ) { nScore[ nAcross ] = -nGap * nAcross; } nBestScore = nScore[ nSeqAcrossLength ]; nBestDown = 0; // the above corresponds to a path like this: // >>>>>>>>>>> // | | // | | // | | // | | // | | // | | // | | // ----------- // That is, just across the top of the matrix. for( nDown = 1; nDown <= nSeqDownLength; ++nDown ) { nToBeScoreDiagonallyAbove = nScore[0]; /* case of first column--zero since alignment can start at any row of the first column */ nScore[0] = 0; /* now handle case of other columns */ for( nAcross = 1; nAcross <= nSeqAcrossLength; ++nAcross ) { nScoreDiagonallyAbove = nToBeScoreDiagonallyAbove; nToBeScoreDiagonallyAbove = nScore[nAcross]; // the -1 in each case is because the NW matrix // has its 0,0 before the start of the sequences // When moving from 0,0 to 1,1 you pass the // 0th char of each sequence if ( pSubjectDown[nDown - 1] == pQueryAcross[nAcross - 1] ) nScoreByDiagonal = nScoreDiagonallyAbove + nMatch; else nScoreByDiagonal = nScoreDiagonallyAbove - nMismatch; nScore[nAcross] = MAX( nScore[nAcross] - nGap, // move down MAX( nScore[nAcross - 1] - nGap, // move right nScoreByDiagonal // move down right ) ); } /* for( nAcross = 1 ... */ // we are only interested in the best score on the right edge // since we want the entire query to be used // We are now on the right edge since nAcross is at its max if (nScore[nSeqAcrossLength] > nBestScore ) { nBestScore = nScore[nSeqAcrossLength]; // nBestAcross = nAcross; not needed since this will always // be nSeqAcrossLength--ouch! This is beyond the end of the // sequence! Yes, nBestDown is not a sequence position but // a position within the SW matrix. The subject sequence ends // at nBestDown - 1. nBestDown = nDown; } } /* for( nDown = 1 ... */ nSubjectEnd = nBestDown - 1; // this is because if the best path // are in the NW matrix is from (a,b) to (c,d), the aligned // sequences are from (a,b) to (c-1,d-1) free( (char*) nScore ); } static void findStartOfAlignment( const RWCString& soQueryAcross, const RWCString& soSubjectDown, // char* szText, // const int nTextLength, // char* szQuery, // const int nQueryLength, const int n0TrySubjectStart, const int n0SubjectEnd, const int nGapPenalty, // positive const int nMatchBoost, // positive const int nMismatchPenalty, // positive int& n0SubjectStart, int& nBestScore ) { int nQueryAcrossLength = soQueryAcross.length(); char* szQuery = soQueryAcross.data(); char* szSubjectDown = soSubjectDown.data(); // we need to find the beginning of the alignment in the // consensus (or read). There may be insertion or deletion errors // so the number of consensus bases in the alignment may be more // or less than the number of bases in the query. However, using // the nMismatches, we know the furthest left it can start. // query------> // subject // |? (does alignment start here?) // |? (does alignment start here?) // |? (does alignment start here?) // | // | // | // | // | // v // keep scores for one row across // The +1 is due to the fact that the NW matrix is 1 greater // than that of the sequence in either dimension // Thus the index as we move from right to left will be // from soQuery.length() to 0 int* nScore = (int*) calloc( nQueryAcrossLength + 1, sizeof( int ) ); // compute scores for bottom row, starting on right hand side. // The bottom row is nDown = n0SubjectEnd + 1 int nAcross; for( nAcross = nQueryAcrossLength; nAcross >= 0; --nAcross ) { nScore[ nAcross ] = -nGapPenalty * ( nQueryAcrossLength - nAcross ); } int nBestScoreOnLeftEdge = nScore[0]; int nBestDownOnLeftEdge = n0SubjectEnd + 1; // I think the +1 is correct because the NW matrix goes from 0 to // subject_length+1 // the above corresponds to a path like this: // ----------- // | | // | | // | | // | | // | | // | | // | | // <<<<<<<<<<< // That is, just across the bottom of the matrix. // now compute scores for the other rows, starting with the row // just above the bottom row int nToBeScoreDiagonallyBelowRight; int nScoreDiagonallyBelowRight; int nScoreByDiagonal; // is there any +1 here? --no, since we already (above) considered // the row nDown = n0SubjectEnd + 1 for( int nDown = n0SubjectEnd; nDown >= n0TrySubjectStart; --nDown ) { nToBeScoreDiagonallyBelowRight = nScore[ nQueryAcrossLength ]; // case of rightmost column. The MAX with 0 allows us to start // the alignment somewhere besides at the very bottom. Thus we // may find a better alignment than the one that ended at the // position agrep gave us. This handles // the case of agrep giving several ending positions of the alignments // that are all variations of the same alignment. nScore[ nQueryAcrossLength ] = nScore[ nQueryAcrossLength ] - nGapPenalty; // now handle other columns for( nAcross = nQueryAcrossLength - 1; nAcross >= 0; --nAcross ) { nScoreDiagonallyBelowRight = nToBeScoreDiagonallyBelowRight; nToBeScoreDiagonallyBelowRight = nScore[ nAcross ]; if ( szQuery[nAcross] == szSubjectDown[nDown] ) nScoreByDiagonal = nScoreDiagonallyBelowRight + nMatchBoost; else nScoreByDiagonal = nScoreDiagonallyBelowRight - nMismatchPenalty; nScore[ nAcross ] = MAX( nScore[nAcross] - nGapPenalty, // move straight up MAX( nScore[ nAcross + 1 ] - nGapPenalty, // move straight left nScoreByDiagonal ) ); // move diagonally up left } // now reached left edge if ( nScore[0] > nBestScoreOnLeftEdge ) { nBestScoreOnLeftEdge = nScore[0]; nBestDownOnLeftEdge = nDown; } } // for( int nDown = nConsensusZeroBasedEndOfMatch - 1; ... free( nScore ); n0SubjectStart = nBestDownOnLeftEdge; nBestScore = nBestScoreOnLeftEdge; } // this uses the entire query // but it can start and end anywhere within the subject // need to sort out which of these penalties/boosts are positive and // which negative--they are all positive just as in // extendAlignment.cpp void findQueryWithinSubject( const RWCString& soQueryAcross, const RWCString& soSubjectDown, const int nGap, const int nMatch, const int nMismatch, const int nMinScore, const int nMaxIndels, bool& bFoundMatch, int& n0SubjectStart, int& n0SubjectEnd, int& nScore ) { // void findQueryWithinSubject( // char* pQueryAcross, // char* pSubjectDown, // int nSeqDownLength, // int nSeqAcrossLength, // int* pnBestDown, // int* pnBestAcross, // int* pnBestScore, // const int nGap, // const int nMatch, // const int nMismatch, // const int nMinScore, // const int nMaxIndels, // bool* pFoundMatch ) { findQueryWithinSubjectDownRight( soQueryAcross, soSubjectDown, nGap, nMatch, nMismatch, n0SubjectEnd, nScore ); if ( nScore < nMinScore ) { bFoundMatch = false; // cerr << "failed findQueryWithinSubjectDownRight nScore = " << nScore << endl; return; } bFoundMatch = true; // if reached here, we've found a match and it ends // at position n0SubjectEnd within soSubjectDown int n0TrySubjectStart = n0SubjectEnd - soQueryAcross.length() - nMaxIndels + 1; // the -1 is for good-measure if ( n0TrySubjectStart < 0 ) n0TrySubjectStart = 0; int n0SubjectStartReturned; int nBestScore2; findStartOfAlignment( soQueryAcross, soSubjectDown, n0TrySubjectStart, n0SubjectEnd, nGap, nMatch, nMismatch, n0SubjectStartReturned, nBestScore2 ); assert( nScore == nBestScore2 ); n0SubjectStart = n0SubjectStartReturned; // just to test RWCString soMatchingPartOfSubject = soSubjectDown( n0SubjectStart, n0SubjectEnd - n0SubjectStart + 1 ); RWCString soAlignmentTop; RWCString soAlignmentBottom; int nScore3; getAlignment( soQueryAcross.data(), soMatchingPartOfSubject.data(), nGap, nMatch, nMismatch, soAlignmentTop, soAlignmentBottom, nScore3 ); assert( nScore == nScore3 ); assert( soAlignmentTop.length() == soAlignmentBottom.length() ); RWCString soAlignmentMiddle( (size_t) soAlignmentTop.length() ); for( int n = 0; n < soAlignmentTop.length(); ++n ) { if ( soAlignmentTop[n] == soAlignmentBottom[n] ) soAlignmentMiddle += " "; else { soAlignmentMiddle += "x"; // if ( soAlignmentTop[n] == '*' ) { // ++pCP->nDebugging_; // } // else if ( soAlignmentBottom[n] == '*' ) { // ++pCP->nDebugging2_; // } // else { // ++pCP->nDebugging3_; // } } } // if ( pCP->soDebuggingString_ == "FGXQUUK01BJZM7" ) { // cerr << soAlignmentTop << endl; // cerr << soAlignmentMiddle << endl; // cerr << soAlignmentBottom << endl; // } }