/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include #include "assert.h" #include #include #include "findRepeat.h" #include "consedParameters.h" #include "min.h" #include "max.h" #include "setMatchMatrix.h" #define nBandSize 5 #define nGapScore -4 #define nMatchScore 1 #define nMismatchScore -4 #define MAX4( A, B, C, D ) ( ( (A) >= (B) && (A) >= (C) && (A) >= (D) ) ? \ (A) : \ ( ( (B) >= (C) && (B) >= (D) ) ? \ (B) : \ ( (C) >= (D) ? (C) : (D) ) ) ) static inline int nWhereCameFromIndexFromRowCol( const int nRow, const int nCol ) { return( nRow * ( nBandSize - 1 ) + nCol - 1 ); } void findRepeat( char* szString, int nStringLength, int* pBestScore, int* pRowOfBestScore, int* pColOfBestScore ) { int nScoreLength; int* nScore; int nBestScoreSoFar; int nColOfBestScore; int nRowOfBestScore; int nToBeScoreDiagonallyAbove; int nRow; int nCol; int nMaxCol; int nNewScore; setMatchMatrix( nMatchScore, -( nMismatchScore ) ); // nMismatchPenalty is positive // extra parenthesis are for HP compiler nScoreLength = nStringLength + 1; nScore = (int*) calloc( nScoreLength, sizeof( int ) ); assert( nScore ); /* b's are the bases, s's are the scores b0 b1 b2 b3 ... s0 s1 s2 s3 ... b0 b1 b2 b3 We want to just look above the diagonal */ /* the reason for going only to nScoreLength - nBandSize instead of to nScoreLength is that we don't want to have to check for subscript problems in the inner loop. Thus we want nCol + nBandSize to always be within bounds. So nCol goes as far as nScoreLength - nBandSize - 1 and thus nMaxCol goes as far as nScoreLength - 1 which is the last row in the nScore array. We've already done the nCol = 0 case. In that case all nScore's are zero and we did that with the calloc. */ nBestScoreSoFar = 0; for( nRow = 1; nRow < ( nScoreLength - nBandSize ); ++nRow ) { /* this value is the the first nCol value ( nRow + 1 ) minus 1 so nRow. Notice: D A B C D E F where D is on the diagonal. When starting a new row, such as the 2nd row above, the column starts with E and thus the nToBeScoreDiagonallyAbove is A. This is the starting column - 1. The starting column is nRow + 1 so the column of A is simply nRow. */ nToBeScoreDiagonallyAbove = nScore[ nRow ]; nMaxCol = nRow + nBandSize; for( nCol = nRow + 1; nCol <= nMaxCol; ++nCol ) { /* A B C D E When computing D, if we replace the B value with it, then when we compute E, we no longer have the B value. So we need to save B for the next round. That is the nToBeScoreDiagonallyAbove */ nNewScore = MAX4( 0, nToBeScoreDiagonallyAbove + nMatchMatrix[ szString[ nCol - 1 ]][ szString[ nRow - 1 ] ], nScore[ nCol - 1 ] + nGapScore, nScore[ nCol ] + nGapScore ); nToBeScoreDiagonallyAbove = nScore[ nCol ]; nScore[ nCol ] = nNewScore; if ( nNewScore > nBestScoreSoFar ) { nBestScoreSoFar = nNewScore; nColOfBestScore = nCol; nRowOfBestScore = nRow; } } // if ( nRow <= 117 ) { // printf( "row %d %c: ", nRow, szString[ nRow - 1] ); // for( nCol = nRow + 1; nCol <= nMaxCol; ++nCol ) // printf( " %d: %3d", nCol, nScore[ nCol ] ); // printf( "\n" ); // } } *pBestScore = nBestScoreSoFar; *pColOfBestScore = nColOfBestScore; *pRowOfBestScore = nRowOfBestScore; } void findAlignment( char* szString, const int nStringLength, const int nExpectedBestScore, const int nEndingRowOfBestScoredAlignment, const int nEndingColOfBestScoredAlignment, int* pBeginningRowOfBestScoredAlignment, int* pBeginningColOfBestScoredAlignment, RWCString& soAlignment1, RWCString& soAlignment2, RWCString& soAlignment3 ) { int nScoreLength; int* nScore; int nBestScoreSoFar; int nLocationOfBestScore; int nToBeScoreDiagonallyBelow; int nRow; int nCol; int nMaxCol; int nNewScore; char* pWhereCameFrom; // nScoreLength = nStringLength + 1 // and rows and columns go from 0 to nScoreLength - 1 hence to // nStringLength assert( nEndingRowOfBestScoredAlignment <= nStringLength ); assert( nEndingColOfBestScoredAlignment <= nStringLength ); setMatchMatrix( nMatchScore, - ( nMismatchScore ) ); // nMismatchPenalty is positive // extra parenthesis are for HP compiler nScoreLength = nStringLength + 1; nScore = (int*) calloc( nScoreLength, sizeof( int ) ); assert( nScore ); // make room for the array to hold where we came from pWhereCameFrom = (char*) calloc( nScoreLength * nBandSize, sizeof( char ) ); assert( pWhereCameFrom ); for( nRow = nEndingRowOfBestScoredAlignment; nRow >=0; --nRow ) { /* this value is the the first nCol value ( nRow + 1 ) minus 1 so nRow. Notice: D A B C D E F G where D is on the diagonal. When starting a new row, such as the 1st row above, the column starts with C and thus the nToBeScoreDiagonallyBelow is G. This is the starting column + 1. The starting column is nRow + nBandSize so the column of A is nRow + nBandSize + 1 If we are against the nEndingColOfBestScoredAlignment, then nToBeScoreDiagonallyBelow won't be used. */ if ( nRow + nBandSize + 1 <= nEndingColOfBestScoredAlignment ) nToBeScoreDiagonallyBelow = nScore[ nRow + nBandSize + 1 ]; else // otherwise nToBeScoreDiagonallyBelow isn't used. nToBeScoreDiagonallyBelow = -666; nMaxCol = MIN( nRow + nBandSize, nEndingColOfBestScoredAlignment ); for( nCol = nMaxCol; nCol >= nRow + 1; --nCol ) { // notice that we do not have 0 in the recursion relation--the // path cannot restart--we have pinned it to the ending location // already found // D A B C // D E F G // D H I J // D A B C // D E F // D G (nEndingRowOfBestScoredAlignment) // ^ // nEndingColOfBestScoredAlignment // cases: 1) not on bottom row and not on rightmost column within a row // 2) not on bottom row and not on rightmost column for the whole matrix, but rightmost column of the row // 3) not on bottom row, but on rightmost column for whole matrix // 4) on bottom row and not on rightmost column of matrix // 5) on bottom row and on rightmost column of matrix // U, up from below // L, left from right // D, diagonally from below right // Z, end of the alignment if ( nCol < nMaxCol && nRow < nEndingRowOfBestScoredAlignment ) { int nDiagonal = nToBeScoreDiagonallyBelow + nMatchMatrix[ szString[ nCol ]][ szString[ nRow ] ]; int nRight = nScore[ nCol + 1 ] + nGapScore; int nDown = nScore[ nCol ] + nGapScore; if ( nDiagonal >= nRight && nDiagonal >= nDown ) { nNewScore = nDiagonal; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'D'; } else if ( nRight >= nDown ) { nNewScore = nRight; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'L'; } else { nNewScore = nDown; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'U'; } } else if ( nCol < nEndingColOfBestScoredAlignment && nRow < nEndingRowOfBestScoredAlignment ) { // case of nCol == nMaxCol, i.e., nCol == ( nRow + nBandSize ) int nDiagonal = nToBeScoreDiagonallyBelow + nMatchMatrix[ szString[ nCol ]][ szString[ nRow ] ]; int nDown = nScore[ nCol ] + nGapScore; if ( nDiagonal >= nDown ) { nNewScore = nDiagonal; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'D'; } else { nNewScore = nDown; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'U'; } } else if ( nCol == nEndingColOfBestScoredAlignment && nRow < nEndingRowOfBestScoredAlignment ) { // 3) not on bottom row, but on rightmost column for whole matrix nNewScore = nScore[ nCol ] + nGapScore; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'U'; } else if ( nRow == nEndingRowOfBestScoredAlignment && nCol < nEndingColOfBestScoredAlignment ) { // 4) on bottom row and not on rightmost column of matrix nNewScore = nScore[ nCol + 1 ] + nGapScore; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'L'; } else if ( nRow == nEndingRowOfBestScoredAlignment && nCol == nEndingColOfBestScoredAlignment ) { nNewScore = 0; pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] = 'Z'; } else assert( false ); nToBeScoreDiagonallyBelow = nScore[ nCol ]; nScore[ nCol ] = nNewScore; if ( nNewScore == nExpectedBestScore ) { *pBeginningColOfBestScoredAlignment = nCol; *pBeginningRowOfBestScoredAlignment = nRow; goto foundBestScore; } } // for( nCol ... // if ( *pBeginningRowOfBestScoredAlignment <= nRow && // nRow <= nEndingRowOfBestScoredAlignment ) { // if ( nRow <= 100 ) { // printf( "row %3d %c: ", nRow, szString[ nRow ] ); // for( nCol = nRow + 1; nCol <= nMaxCol; ++nCol ) // printf( " %d: %3d", nCol, nScore[ nCol ] ); // printf( "\n" ); // } } // for( int nRow ... // couldn't find best score cerr << "couldn't find best score" << endl; assert( false ); foundBestScore: // now reconstruct the alignment soAlignment1 = ""; // going down S/W matrix soAlignment2 = ""; soAlignment3 = ""; // going across S/W matrix nRow = *pBeginningRowOfBestScoredAlignment; nCol = *pBeginningColOfBestScoredAlignment; char cWhereCameFrom; while( ( cWhereCameFrom = pWhereCameFrom[ nWhereCameFromIndexFromRowCol( nRow, nCol ) ] ) != 'Z' ) { if ( cWhereCameFrom == 'U' ) { soAlignment1.append( szString[ nRow ] ); soAlignment2.append( '*' ); soAlignment3.append( '*' ); ++nRow; } else if ( cWhereCameFrom == 'L' ) { soAlignment1.append( '*' ); soAlignment2.append( '*' ); soAlignment3.append( szString[ nCol ] ); ++nCol; } else if ( cWhereCameFrom == 'D' ) { soAlignment1.append( szString[ nRow ] ); soAlignment2.append( ( ( szString[ nRow ] == szString[ nCol ] ) ? ' ' : 'X' ) ); soAlignment3.append( szString[ nCol ] ); ++nRow; ++nCol; } } assert( soAlignment1.length() == soAlignment2.length() ); assert( soAlignment2.length() == soAlignment3.length() ); }