/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.
#
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
#
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "myAgrepAndFilter.h"
#include    "myAgrep.h"
#include    "assert.h"
#include    "abs.h"


// Approximate-match search followed by de-duplication of nearby hits.
//
// Calls myAgrep() with the text, the query and the error limit, then
// reads back the two parallel vectors it was handed (match positions and
// match scores).  For every match one flag is appended to *pUseMatches;
// the flag ends up true if the match should be kept and false if a
// better (or equally good but earlier) match lies close by.
//
//   szText, nTextLength     text to search; passed straight to myAgrep
//   szQuery, nQueryLength   pattern to look for; passed straight to myAgrep
//   nMaxErrors              error limit handed to myAgrep; also sets the
//                           clustering distance (2 * nMaxErrors)
//   pMatchPositions         out: one position per match
//   pMatchScores            out: one score per match, lower is better
//                           because the score counts mismatches
//   pUseMatches             out: one keep/discard flag per match
//
// NOTE(review): the flags are appended with insert() but later indexed
// in parallel with pMatchPositions, so this only lines up if
// *pUseMatches (and the other two vectors) are empty on entry--confirm
// against the callers and against myAgrep.
//
// NOTE(review): the element types of the three vectors are not visible
// in this file; positions and scores are used as ints and the flags as
// bools--confirm against myAgrepAndFilter.h.

void myAgrepAndFilter( char* szText,
                       const int nTextLength,
                       char* szQuery,
                       const int nQueryLength,
                       const int nMaxErrors,
                       RWTValOrderedVector* pMatchPositions,
                       RWTValOrderedVector* pMatchScores,
                       RWTValOrderedVector* pUseMatches ) {

   myAgrep( szText,
            nTextLength,
            szQuery,
            nQueryLength,
            nMaxErrors,
            pMatchPositions,
            pMatchScores );

   // now need to filter these

   assert( pMatchPositions->length() == pMatchScores->length() );

   // neither vector changes size below, so ask for the length only once
   const int nNumberOfMatches = pMatchPositions->length();

   // start by saying use all matches

   int n;
   for( n = 0; n < nNumberOfMatches; ++n ) {
      pUseMatches->insert( true );
   }

   // two matches closer together than this are taken to be the same hit
   const int nMaxClusterSpread = 2 * nMaxErrors;

   for( n = 0; n < nNumberOfMatches; ++n ) {

      const int nPos = (*pMatchPositions)[ n ];

      // The cluster for match n is match n itself plus the unbroken run
      // of following matches that lie within nMaxClusterSpread of it.

      const int nBeginningOfCluster = n;
      int nEndOfCluster = n;

      int n2;
      for( n2 = n + 1; n2 < nNumberOfMatches; ++n2 ) {
         if ( ABS( (*pMatchPositions)[ n2 ] - nPos ) > nMaxClusterSpread )
            break;  // reached end of cluster

         nEndOfCluster = n2;
      }

      // Within the cluster, find the lowest scoring match (lowest means
      // best because we are measuring mismatches).  The search is seeded
      // with the first member rather than with a sentinel score, so the
      // index is always valid, whatever scores myAgrep reported.  The
      // strict '<' keeps the earliest match when scores tie.

      int nIndexOfLowestScoringMatch = nBeginningOfCluster;

      for( n2 = nBeginningOfCluster + 1; n2 <= nEndOfCluster; ++n2 ) {
         if ( (*pMatchScores)[ n2 ] <
              (*pMatchScores)[ nIndexOfLowestScoringMatch ] ) {
            nIndexOfLowestScoringMatch = n2;
         }
      }

      // everybody else in the cluster loses

      for( n2 = nBeginningOfCluster; n2 <= nEndOfCluster; ++n2 ) {
         if ( n2 != nIndexOfLowestScoringMatch )
            (*pUseMatches)[ n2 ] = false;
      }

      // I could try to advance n over the cluster, but I am not sure if
      // I would miss some, so every match gets to start its own cluster.
      // A flag is only ever cleared, never set back to true, so a match
      // rejected by an earlier cluster stays rejected.
   }
}