/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // search_for_string.cpp // // implementation for SearchForString class // // Used to put up a dialog box with query string, get a user response, // and return a list of matches // // David Gordon, July 1995 // #include "gui_search_for_string.h" #include "search_for_string.h" #include using namespace std; #include "consed.h" #include "assembly.h" #include "rwcstring.h" #include "rwctokenizer.h" #include "rwcregexp.h" #include #include "complement_so.h" #include "please_wait.h" #include "guiMultiContigNavigator.h" #include "filePopupAndGetFilename.h" #include "popupErrorMessage.h" #include "searchMethods.h" #include "popupErrorMessage2.h" #include "nNumberFromBase.h" #include "getAlignment.h" #include "myAgrepAndFilter.h" #include "consedParameters.h" #include "soGetErrno.h" void popupOutputWindowAndDoSearch( const RWCString& soQueryStringReadOnly, const int nSearchMethod, const bool bJustContigsNotReads, const int nMaxPerCentMismatch, const FileName& filSingletsFile ) { RWCString soQueryString = soQueryStringReadOnly; // added June 2004 since useful for pasting in NCBI SNP flanking // sequence. (Used to just strip a single trailing whitespace). soQueryString.stripAllWhitespaceIncludingInternal(); bool bRemovedPads = false; if ( soQueryString.bContainsChar('*') ) { soQueryString.removeSingleCharacterAllOccurrences('*'); bRemovedPads = true; } bool bTruncateErrorMessage = false; int nMaxLengthOfQuery = sizeof( long long ) * 8; if ( nSearchMethod == nInexact ) { if ( soQueryString.length() > nMaxLengthOfQuery ) { soQueryString = soQueryString( 0, nMaxLengthOfQuery ); bTruncateErrorMessage = true; } } char szLength[100]; sprintf( szLength, "%d", soQueryString.length() ); RWCString soTextForFirstLine; if ( bRemovedPads ) { soTextForFirstLine = "Query string: " + soQueryString + " (pads removed )" + " (length: " + szLength + ") (case insensitive) "; } else { soTextForFirstLine = "Query string: " + soQueryString + " (length: " + szLength + ") (case insensitive) "; } // for( int n = 0; n < soQueryString.length(); ++n ) { // cout << n << ": " << ( (int) soQueryString[n] ) << endl; // } RWCString soWindowTitle; if (bJustContigsNotReads) soWindowTitle = "Searching Contigs"; else soWindowTitle = "Searching Reads in .screen File"; guiMultiContigNavigator* pGuiMultiContigNav = new guiMultiContigNavigator( soWindowTitle, soTextForFirstLine ); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( pGuiMultiContigNav ); if ( bTruncateErrorMessage ) popupErrorMessage2( pGuiMultiContigNav->widPopupShell_, "warning: search will only use the first %d characters of the query string", nMaxLengthOfQuery ); PleaseWait* pPleaseWait = new PleaseWait( pGuiMultiContigNav->widPopupShell_ ); // searches are case-insensitive RWCString soQueryStringLower = soQueryString; soQueryStringLower.toLower(); doSearch( soQueryStringLower, pGuiMultiContigNav, nSearchMethod, bJustContigsNotReads, nMaxPerCentMismatch, filSingletsFile ); delete pPleaseWait; } void doSearch(const RWCString& soQueryStringLower, guiMultiContigNavigator* pGuiMultiContigNavigator, const int nSearchMethod, const bool bJustContigsNotReads, const int nMaxPerCentMismatch, const FileName& filSingletsFile ) { if (bJustContigsNotReads ) searchJustContigs( soQueryStringLower, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); else searchAllReads( soQueryStringLower, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch, filSingletsFile ); } void searchAllReads(const RWCString& soQueryString, guiMultiContigNavigator* pGuiMultiContigNavigator, const int nSearchMethod, const int nMaxPerCentMismatch, const FileName& filSingletsFile ) { Assembly* pAssembly = ConsEd::pGetAssembly(); RWCString soReadName( (size_t) 100 ); RWCString soUnpaddedReadSequence( (size_t) 1000 ); int nReadsSearched = 0; for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); soReadName = pLocFrag->soGetName(); pLocFrag->getUnpaddedSequence( soUnpaddedReadSequence ); searchBothStrandsOfARead( pLocFrag, soReadName, soUnpaddedReadSequence, soQueryString, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); ++nReadsSearched; if ( ( (nReadsSearched % 1000 ) == 0 ) || ( nReadsSearched < 5 ) ) { cout << "searched " << nReadsSearched << " reads" << endl; cout.flush(); } } } cout << "searched " << nReadsSearched << " in assembly" << endl; // now search the singlets file if ( !filSingletsFile.isNull() ) { searchReadsInSingletsFile( soQueryString, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch, filSingletsFile ); } cout.flush(); } void searchReadsInSingletsFile( const RWCString& soQueryString, guiMultiContigNavigator* pGuiMultiContigNavigator, const int nSearchMethod, const int nMaxPerCentMismatch, const FileName& filSingletsFile ) { FILE* pSingletsFile = fopen( filSingletsFile.data(), "r" ); if ( !pSingletsFile ) { RWCString soErrorMessage = "could not open "; soErrorMessage += filSingletsFile; soErrorMessage += " for read. "; soErrorMessage += soGetErrno(); popupErrorMessage( soErrorMessage ); return; } // file looks like this: // >L2966P101FA3.T0.scf CHROMAT_FILE: L2966P101FA3.T0.scf PHD_FILE: L2966P101FA3.T0.s // cf.phd.1 CHEM: term DYE: big TIME: Wed Dec 5 22:55:05 2001 // GGGGTGAGTNNTTATNGGGGGACTNTTTNNGGCCTTCCGGGCGGCTAAAT // AGTTAGACTGGGGGGTCACTNAAGAAGNACTGCCTCTCTCCGGTTTGAAT // GGGGATCATTAATTGGGGGGAAAAAAGCTGGTGGAGAGGATGTGGNAAAT // AGGGAAANTTTGGTGATGTTGGTAGGGATGTAAAATAGNTTAAAGGTTGG // GGGAGGTAATGTGGGGGTTTNTNAGGGGTTTAGGATTGGAATAANATTTG // GGNNAGTTTTTTTTTTTTTTTGANGGGGGGGGGGGGNANNNAANAAGAAG // GANNNNAAAAAAAAGNNNNNNGGGGGGGGGGNGNNNNNNNAAAAAAAANN // GGGNNNNANNGGGAAANAANNNNNNGGGGNNANAGGGGANNNAGGGGGGG // AAGGAAAAGGGGGGGNAANANNNNNANGGGGAANGANGNNGGGGNNAAAA // AAAGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN // >L2966P101FA6.T0.scf CHROMAT_FILE: L2966P101FA6.T0.scf PHD_FILE: L2966P101FA6.T0.s // cf.phd.1 CHEM: term DYE: big TIME: Wed Dec 5 22:55:07 2001 // GGGGGATTGAGTTNTTNTGAATNGGGGNGGACCNNNNNTTTGNGNGGGCC RWCString soDNA( (size_t) 1000 ); RWCString soReadName; int nSingletsSearched = 0; const int SZLINE_SIZE = 2000; RWCString soLine( (size_t) SZLINE_SIZE + 1 ); bool bFirstTime = true; while( fgets( soLine.data(), SZLINE_SIZE, pSingletsFile ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); if ( soLine.length() > 1 && soLine[0] == '>' ) { if ( !bFirstTime ) { searchBothStrandsOfARead( NULL, // pLocFrag soReadName, soDNA, soQueryString, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); ++nSingletsSearched; if ( ( ( nSingletsSearched % 1000 ) == 0 ) || ( nSingletsSearched < 5 ) ) { cout << "searched " << nSingletsSearched << " singlets" << endl; cout.flush(); } } else { bFirstTime = false; } // now work on the header line of the next read // remove the leading ">" and get the read name RWCTokenizer tokNext( soLine ); soReadName = tokNext(); soReadName = soReadName.remove( 0, 1 ); soDNA = ""; } else { soLine.toLower(); soDNA.append( soLine ); } } // must search last read, providing the singlets file is not empty if ( !soDNA.isNull() ) { ++nSingletsSearched; searchBothStrandsOfARead( NULL, soReadName, soDNA, soQueryString, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); } cout << "searched " << nSingletsSearched << " singlets" << endl; } void searchBothStrandsOfARead( LocatedFragment* pLocFrag, const RWCString& soFragmentName, const RWCString& soDNA, const RWCString& soQueryString, guiMultiContigNavigator *pGuiMultiContigNavigator, const int nSearchMethod, const int nMaxPerCentMismatch ) { searchOneStrandOfARead( pLocFrag, soFragmentName, soDNA, soQueryString, False, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); RWCString soComplementStrand = soComplementSO( soQueryString ); searchOneStrandOfARead( pLocFrag, soFragmentName, soDNA, soComplementStrand, True, pGuiMultiContigNavigator, nSearchMethod, nMaxPerCentMismatch ); } void searchOneStrandOfARead( LocatedFragment* pLocFrag, const RWCString& soFragmentName, const RWCString& soDNA, const RWCString& soPattern, bool bQueryComplemented, guiMultiContigNavigator *pGuiMultiContigNavigator, const int nSearchMethod, const int nMaxPerCentMismatch ) { if ( nSearchMethod == nExact ) searchOneStrandOfAReadExact( pLocFrag, soFragmentName, soDNA, soPattern, bQueryComplemented, pGuiMultiContigNavigator ); else searchOneStrandOfAReadApproximate( pLocFrag, soFragmentName, soDNA, soPattern, bQueryComplemented, pGuiMultiContigNavigator, nMaxPerCentMismatch ); } void searchOneStrandOfAReadExact( LocatedFragment* pLocatedFragment, const RWCString& soFragmentName, const RWCString& soDNA, const RWCString& soPattern, bool bQueryComplemented, guiMultiContigNavigator *pGuiMultiContigNavigator ) { int nStartSearchPosition = 0; bool bContinue = True; while( bContinue ) { int nFoundPosition = soDNA.index( soPattern, nStartSearchPosition ); if (nFoundPosition == RW_NPOS ) { bContinue = FALSE; } else { // the Rogue Wave .index function returns positions 0-based // We want to return 1-based positions, so increment it. ++nFoundPosition; if (pLocatedFragment ) { Contig* pContig = pLocatedFragment->pGetContig(); int nReadPaddedStart = pLocatedFragment->nPaddedIndex( nFoundPosition ); int nReadPaddedEnd = pLocatedFragment->nPaddedIndex( nFoundPosition + soPattern.length() - 1 ); int nConsPaddedStart = pLocatedFragment->nGetConsPosFromFragIndex( nReadPaddedStart ); int nConsPaddedEnd = pLocatedFragment->nGetConsPosFromFragIndex( nReadPaddedEnd ); int nConsUnpaddedStart = pContig->nUnpaddedIndex(nConsPaddedStart); int nConsUnpaddedEnd = pContig->nUnpaddedIndex( nConsPaddedEnd ); // If a match was found on a read using the complemented // query sequence, and the read is right to left, then // it will appear to the user to be a match against the // uncomplemented query sequence. Hence the complex // exclusive or. RWCString soComp; if ( bQueryComplemented ) soComp = "(complemented)"; else soComp = "(uncomplemented)"; gotoItem* pGotoItem = new gotoItem( pContig, pLocatedFragment, nConsPaddedStart, nConsPaddedEnd, nConsUnpaddedStart, nConsUnpaddedEnd, soComp ); pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); } else { RWCString soComp; if (bQueryComplemented) soComp = "(complemented)"; else soComp = "(uncomplemented)"; RWCString soExplanation( (size_t) 100 ); soExplanation = RWCString( (long) nFoundPosition ) + "-" + RWCString( (long) ( nFoundPosition + soPattern.length() - 1 ) ) + " in read " + soFragmentName + " in singlets file " + soComp; gotoItem* pGotoItem = new gotoItem( NULL, NULL, nFoundPosition, nFoundPosition + soPattern.length() - 1, nFoundPosition, nFoundPosition + soPattern.length() - 1, soExplanation, false ); // bPrefixContigToDescription pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); } // set up for next time around nStartSearchPosition = nFoundPosition + soPattern.length(); if (nStartSearchPosition + (int) soPattern.length() - 1 > (int) soDNA.length() - 1 ) bContinue = False; } } } void searchJustContigs( const RWCString& soQueryStringLower, guiMultiContigNavigator* pGuiMultiContigNavigator, const int nSearchMethod, const int nMaxPerCentMismatch ) { RWCString soQueryComplement = soComplementSO( soQueryStringLower ); Assembly* pAssembly = ConsEd::pGetAssembly(); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); int nUnpaddedConsensusLength; char* szUnpaddedConsensus = pContig->szGetUnpaddedConsensus( nUnpaddedConsensusLength); searchContigForString( soQueryStringLower, pGuiMultiContigNavigator, szUnpaddedConsensus, pContig, nSearchMethod, false, nMaxPerCentMismatch ); searchContigForString( soQueryComplement, pGuiMultiContigNavigator, szUnpaddedConsensus, pContig, nSearchMethod, true, nMaxPerCentMismatch ); delete szUnpaddedConsensus; } } void searchContigForString( const RWCString& soQueryStringLower, guiMultiContigNavigator* pGuiMultiContigNavigator, char* szUnpaddedConsensus, Contig* pContig, const int nSearchMethod, const bool bQueryComplemented, const int nMaxPerCentMismatch ) { if ( nSearchMethod == nExact ) searchContigForStringExact( soQueryStringLower, pGuiMultiContigNavigator, szUnpaddedConsensus, pContig, bQueryComplemented ); else searchContigForStringApproximate( soQueryStringLower, pGuiMultiContigNavigator, szUnpaddedConsensus, pContig, bQueryComplemented, nMaxPerCentMismatch ); } void searchContigForStringExact( const RWCString& soQueryStringLower, guiMultiContigNavigator* pGuiMultiContigNavigator, char* szUnpaddedConsensus, Contig* pContig, const bool bQueryComplemented) { int nConsensusLength = strlen( szUnpaddedConsensus ); int nStartLookingHereInConsensus = 0; bool bProcessMoreMatches = true; while( bProcessMoreMatches ) { // look for a match bool bFound = false; while( !bFound && ( nStartLookingHereInConsensus + soQueryStringLower.length()) <= nConsensusLength ) { int nOffset = 0; bool bStillLookingHere = true; while( bStillLookingHere && ( nOffset < soQueryStringLower.length()) ) { if ( szUnpaddedConsensus[ nStartLookingHereInConsensus + nOffset ] == soQueryStringLower[ nOffset ] ) ++nOffset; else bStillLookingHere = false; } if (bStillLookingHere ) bFound = true; else ++nStartLookingHereInConsensus; } if (! bFound ) bProcessMoreMatches = false; else { // found a match! Display it // change from 0-based to 1-based index int nStartUnpaddedPosition = nStartLookingHereInConsensus + pContig->nGetStartIndex(); int nEndUnpaddedPosition = nStartUnpaddedPosition + soQueryStringLower.length() - 1; int nStartPaddedPosition = pContig->nPaddedIndex( nStartUnpaddedPosition ); int nEndPaddedPosition = nStartPaddedPosition + soQueryStringLower.length() - 1; RWCString soComp; if (bQueryComplemented) soComp = "(complemented)"; else soComp = "(uncomplemented)"; gotoItem* pGotoItem = new gotoItem( pContig, NULL, nStartPaddedPosition, nEndPaddedPosition, nStartUnpaddedPosition, nEndUnpaddedPosition, soComp ); pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); // set up for another look nStartLookingHereInConsensus += soQueryStringLower.length(); } } } // pretty weird constants, no? Well, that's what agrep essentially // uses. If we use another (e.g., 6 for gap penalty), then the S/W // finds a different alignment than agrep does. The alignment that // S/W finds may have a worse per cent mismatch than agrep and then // people would ask me why it didn't work const int nGapPenalty = 2; const int nMatchBoost = 2; const int nMismatchPenalty = 2; static void findStartOfAlignment( char* szText, const int nTextLength, char* szQuery, const int nQueryLength, const int nRightMatchTextPos, const int nMismatches, int& nLeftMatchTextPos ) { // we need to find the beginning of the alignment in the // consensus (or read). There may be insertion or deletion errors // so the number of consensus bases in the alignment may be more // or less than the number of bases in the query. However, using // the nMismatches, we know the furthest left it can start. // query------> // consensus // |? (does alignment start here?) // |? (does alignment start here?) // |? (does alignment start here?) // | // | // | // | // | // v int nLeftMostMatchTextPos = nRightMatchTextPos - nQueryLength + 1 - nMismatches; if ( nLeftMostMatchTextPos < 0 ) nLeftMostMatchTextPos = 0; // keep scores for one row across // The +1 is due to the fact that the NW matrix is 1 greater // than that of the sequence in either dimension // Thus the index as we move from right to left will be // from soQuery.length() to 0 int* nScore = (int*) calloc( nQueryLength + 1, sizeof( int ) ); // compute scores for bottom row, starting on right hand side. int nAcross; for( nAcross = nQueryLength; nAcross >= 0; --nAcross ) { nScore[ nAcross ] = -nGapPenalty * ( nQueryLength - nAcross ); } int nBestScoreOnLeftEdge = nScore[0]; int nBestDownOnLeftEdge = nRightMatchTextPos; // now compute scores for the other rows, starting with the row // just above the bottom row int nToBeScoreDiagonallyBelowRight; int nScoreDiagonallyBelowRight; int nScoreByDiagonal; for( int nDown = nRightMatchTextPos; nDown >= nLeftMostMatchTextPos; --nDown ) { nToBeScoreDiagonallyBelowRight = nScore[ nQueryLength ]; // case of rightmost column. The MAX with 0 allows us to start // the alignment somewhere besides at the very bottom. Thus we // may find a better alignment than the one that ended at the // position agrep gave us. This handles // the case of agrep giving several ending positions of the alignments // that are all variations of the same alignment. nScore[ nQueryLength ] = nScore[ nQueryLength ] - nGapPenalty; // now handle other columns for( nAcross = nQueryLength - 1; nAcross >= 0; --nAcross ) { nScoreDiagonallyBelowRight = nToBeScoreDiagonallyBelowRight; nToBeScoreDiagonallyBelowRight = nScore[ nAcross ]; if ( szQuery[nAcross] == szText[nDown] ) nScoreByDiagonal = nScoreDiagonallyBelowRight + nMatchBoost; else nScoreByDiagonal = nScoreDiagonallyBelowRight - nMismatchPenalty; nScore[ nAcross ] = MAX( nScore[nAcross] - nGapPenalty, // move straight up MAX( nScore[ nAcross + 1 ] - nGapPenalty, // move straight left nScoreByDiagonal ) ); // move diagonally up left } if ( nScore[0] > nBestScoreOnLeftEdge ) { nBestScoreOnLeftEdge = nScore[0]; nBestDownOnLeftEdge = nDown; } } // for( int nDown = nConsensusZeroBasedEndOfMatch - 1; ... free( nScore ); nLeftMatchTextPos = nBestDownOnLeftEdge; } static void displayReadMatch( LocatedFragment* pLocatedFragment, const RWCString& soQuery, const RWCString& soDNA, const int nUnpaddedStartOfMatch, const int nUnpaddedEndOfMatch, guiMultiContigNavigator* pGuiMultiContigNavigator, const RWCString& soFragmentName, const bool bQueryComplemented ) { // let's create the alignment for displaying char* pSeqQuery = soQuery.data(); int nReadRegionLength = nUnpaddedEndOfMatch - nUnpaddedStartOfMatch + 1; char* pSeqRead = (char*) malloc( nReadRegionLength + 1 ); memcpy( pSeqRead, soDNA.data() + nUnpaddedStartOfMatch - 1, nReadRegionLength ); pSeqRead[ nReadRegionLength ] = '\0'; RWCString soAlignmentQuery; RWCString soAlignmentRead; int nScore; getAlignment( pSeqQuery, pSeqRead, nGapPenalty, nMatchBoost, nMismatchPenalty, soAlignmentQuery, soAlignmentRead, nScore ); printf( "see the pretty alignment (query/read) for read %s from %d to %d:\n%s\n%s\n\n", soFragmentName.data(), nUnpaddedStartOfMatch, nUnpaddedEndOfMatch, soAlignmentQuery.data(), soAlignmentRead.data() ); int nMismatches = 0; for( int n = 0; n < MIN( soAlignmentQuery.length(), soAlignmentRead.length() ); ++n ) if ( soAlignmentQuery[ n ] != soAlignmentRead[ n ] ) ++nMismatches; free( pSeqRead ); // now format information for displaying int nPerCentMismatch = nMismatches * 100 / soQuery.length(); if (pLocatedFragment ) { // case in which read is in the assembly Contig* pContig = pLocatedFragment->pGetContig(); int nReadPaddedStart; int nReadPaddedEnd; nReadPaddedStart = pLocatedFragment->nPaddedIndex( nUnpaddedStartOfMatch ); nReadPaddedEnd = pLocatedFragment->nPaddedIndex( nUnpaddedEndOfMatch ); int nConsPaddedStart = pLocatedFragment->nGetConsPosFromFragIndex( nReadPaddedStart ); int nConsPaddedEnd = pLocatedFragment->nGetConsPosFromFragIndex( nReadPaddedEnd ); int nConsUnpaddedStart = pContig->nUnpaddedIndex(nConsPaddedStart); int nConsUnpaddedEnd = pContig->nUnpaddedIndex( nConsPaddedEnd ); // If a match was found on a read using the complemented // query sequence, and the read is right to left, then // it will appear to the user to be a match against the // uncomplemented query sequence. Hence the complex // exclusive or. RWCString soComp; if ( bQueryComplemented ) soComp = "(complemented)"; else soComp = "(uncomplemented)"; soComp = soComp + " " + RWCString( (long) nPerCentMismatch ) + "% mismatch"; gotoItem* pGotoItem = new gotoItem( pContig, pLocatedFragment, nConsPaddedStart, nConsPaddedEnd, nConsUnpaddedStart, nConsUnpaddedEnd, soComp ); pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); } else { RWCString soComp; if (bQueryComplemented) soComp = "(complemented)"; else soComp = "(uncomplemented)"; soComp = soComp + " " + RWCString( (long) nPerCentMismatch ) + "% mismatch"; RWCString soExplanation = "in read " + soFragmentName + " not in any contig " + soComp; gotoItem* pGotoItem = new gotoItem( NULL, NULL, nUnpaddedStartOfMatch, nUnpaddedEndOfMatch, nUnpaddedStartOfMatch, nUnpaddedEndOfMatch, soExplanation ); pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); } } static void displayContigMatch( const RWCString& soQuery, char* szUnpaddedConsensus, const int nUnpaddedStartOfMatch, const int nUnpaddedEndOfMatch, guiMultiContigNavigator* pGuiMultiContigNavigator, Contig* pContig, const bool bQueryComplemented ) { // Now we can create the alignment using a simple NW. char* pSeqQuery = soQuery.data(); int nConsensusRegionLength = nUnpaddedEndOfMatch - nUnpaddedStartOfMatch + 1; char* pSeqConsensus = (char*) malloc( nConsensusRegionLength + 1 ); memcpy( pSeqConsensus, szUnpaddedConsensus + nUnpaddedStartOfMatch - 1, nConsensusRegionLength ); pSeqConsensus[ nConsensusRegionLength ] = '\0'; RWCString soAlignmentQuery; RWCString soAlignmentConsensus; int nScore; getAlignment( pSeqQuery, pSeqConsensus, nGapPenalty, nMatchBoost, nMismatchPenalty, soAlignmentQuery, soAlignmentConsensus, nScore ); printf( "see the pretty alignment (query/consensus) from %d to %d:\n%s\n%s\n\n", nUnpaddedStartOfMatch, nUnpaddedEndOfMatch, soAlignmentQuery.data(), soAlignmentConsensus.data() ); int nMismatches = 0; for( int n = 0; n < MIN( soAlignmentQuery.length(), soAlignmentConsensus.length() ); ++n ) if ( soAlignmentQuery[ n ] != soAlignmentConsensus[ n ] ) ++nMismatches; free( pSeqConsensus ); // now format information for displaying int nPerCentMismatch = nMismatches * 100 / soQuery.length(); int nStartPaddedPosition = pContig->nPaddedIndexFast( nUnpaddedStartOfMatch ); int nEndPaddedPosition = pContig->nPaddedIndexFast( nUnpaddedEndOfMatch ); RWCString soComp; if (bQueryComplemented) soComp = "(complemented) "; else soComp = "(uncomplemented)"; soComp = soComp + " " + RWCString( (long) nPerCentMismatch ) + "% mismatch"; gotoItem* pGotoItem = new gotoItem( pContig, NULL, nStartPaddedPosition, nEndPaddedPosition, nUnpaddedStartOfMatch, nUnpaddedEndOfMatch, soComp ); pGuiMultiContigNavigator->appendToMultiContigNavigator( pGotoItem ); } void searchContigForStringApproximate( const RWCString& soQueryStringLower, guiMultiContigNavigator* pGuiMultiContigNavigator, char* szUnpaddedConsensus, Contig* pContig, const bool bQueryComplemented, const int nMaxPerCentMismatch ) { ConsEd::pGetAssembly()->setAllPaddedPositionsArrays(); int nMaxErrors = soQueryStringLower.length() * nMaxPerCentMismatch / 100; if ( nMaxErrors == 0 && nMaxPerCentMismatch > 0 ) nMaxErrors = 1; int nConsensusLength = strlen( szUnpaddedConsensus ); RWTValOrderedVector aMatchPositions; RWTValOrderedVector aMatchScores; RWTValOrderedVector aUseMatches; myAgrepAndFilter( szUnpaddedConsensus, // szText nConsensusLength, // nTextLength, soQueryStringLower.data(), // char* szQuery, soQueryStringLower.length(), // const int nQueryLength, nMaxErrors, &aMatchPositions, &aMatchScores, &aUseMatches ); assert( aMatchPositions.length() == aMatchScores.length() ); assert( aMatchPositions.length() == aUseMatches.length() ); for( int nMatch = 0; nMatch < aMatchPositions.length(); ++nMatch ) { if ( !aUseMatches[ nMatch ] ) continue; int nZeroBasedStartOfMatch; findStartOfAlignment( szUnpaddedConsensus, nConsensusLength, soQueryStringLower.data(), soQueryStringLower.length(), aMatchPositions[ nMatch ], aMatchScores[ nMatch ], nZeroBasedStartOfMatch ); int nUnpaddedStart = nZeroBasedStartOfMatch + 1; int nUnpaddedEnd = aMatchPositions[ nMatch ] + 1; displayContigMatch( soQueryStringLower, szUnpaddedConsensus, nUnpaddedStart, nUnpaddedEnd, pGuiMultiContigNavigator, pContig, bQueryComplemented ); } } void searchOneStrandOfAReadApproximate( LocatedFragment* pLocFrag, const RWCString& soFragmentName, const RWCString& soDNA, const RWCString& soQueryStringLower, bool bQueryComplemented, guiMultiContigNavigator *pGuiMultiContigNavigator, const int nMaxPerCentMismatch ) { int nMaxErrors = soQueryStringLower.length() * nMaxPerCentMismatch / 100; if ( nMaxErrors == 0 && nMaxPerCentMismatch > 0 ) nMaxErrors = 1; RWTValOrderedVector aMatchPositions; RWTValOrderedVector aMatchScores; RWTValOrderedVector aUseMatches; myAgrepAndFilter( soDNA.data(), // szText soDNA.length(), // nTextLength, soQueryStringLower.data(), // char* szQuery, soQueryStringLower.length(), // const int nQueryLength, nMaxErrors, &aMatchPositions, &aMatchScores, &aUseMatches ); assert( aMatchPositions.length() == aMatchScores.length() ); assert( aMatchPositions.length() == aUseMatches.length() ); for( int nMatch = 0; nMatch < aMatchPositions.length(); ++nMatch ) { if ( !aUseMatches[ nMatch ] ) continue; int nZeroBasedStartOfMatch; findStartOfAlignment( soDNA.data(), soDNA.length(), soQueryStringLower.data(), soQueryStringLower.length(), aMatchPositions[ nMatch ], aMatchScores[ nMatch ], nZeroBasedStartOfMatch ); int nUnpaddedStart = nZeroBasedStartOfMatch + 1; int nUnpaddedEnd = aMatchPositions[ nMatch ] + 1; displayReadMatch(pLocFrag, soQueryStringLower, soDNA, nUnpaddedStart, nUnpaddedEnd, pGuiMultiContigNavigator, soFragmentName, bQueryComplemented ); } }