/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contigMatchTablesType.h" #include "lookForImperfect3Prime4MerFalsePrimerMatchesOfOnePrimerInOneContig.h" #include "contig.h" #include "nLengthOf4Mer.h" #include "nFindScoreOfMatch.h" #include "consedParameters.h" #include "whyIsPrimerNotAcceptableTypes.h" #include "nConvertFrom4Mer.h" // We added this to handle the case in which there was a nearly perfect // false match like this: // acgggaaaaaactctta // acgggaaaaaactgtta // This false match would not be found by the routine // lookForPerfect4MerFalsePrimerMatchesOfOnePrimerInOneContig // since the latter will only find a false match if it matches perfectly // in the last 4 bases. Thus we added this routine, which will find // such matches. // This routine looks for perfect 6 base matches. Of those, it looks // for those which score above the nPrimersMaxMatchElsewhereScore_ // threshold. These primers are considered unacceptable and are // excluded. void lookForImperfect3Prime4MerFalsePrimerMatchesOfOnePrimerInOneContig( primerType* pPrimer, Contig* pContigOfTarget, const bool bComplementedContig, const bool bLookEverywhereInContig, const int nUnpaddedConsPosOfTargetStart, const int nUnpaddedConsPosOfTargetEnd, const bool bForwardNotReversePrimer ) { contigMatchTablesType* pUnpaddedContig = bComplementedContig ? pContigOfTarget->pUnpaddedContigComplemented_ : pContigOfTarget->pUnpaddedContig_; int nUnpaddedConsPosOfTargetStartPossiblyComplemented; int nUnpaddedConsPosOfTargetEndPossiblyComplemented; if ( ! bLookEverywhereInContig ) { if ( bComplementedContig ) { nUnpaddedConsPosOfTargetStartPossiblyComplemented = pContigOfTarget->nComplementUnpaddedIndex( nUnpaddedConsPosOfTargetEnd ); nUnpaddedConsPosOfTargetEndPossiblyComplemented = pContigOfTarget->nComplementUnpaddedIndex( nUnpaddedConsPosOfTargetStart ); } else { nUnpaddedConsPosOfTargetStartPossiblyComplemented = nUnpaddedConsPosOfTargetStart; nUnpaddedConsPosOfTargetEndPossiblyComplemented = nUnpaddedConsPosOfTargetEnd; } } const int nSix = 6; // xxxxxxxxxxxxxxxxxxxxxxx // ^ ^ ^ // a b c // start here and back up // c = szPrimer_ + nUnpaddedLength_ // so a = szPrimer_ + nUnpaddedLength_ - 7 // I'm not sure that saving any subtotals will help since we are // not even calculating every subtotal--just the ones in which // the end has a perfect 4-mer match within the range of consensus // positions we are looking for a false match for( char* p6MerLeftEnd = pPrimer->szPrimer_ + pPrimer->nUnpaddedLength_ - 7; p6MerLeftEnd >= pPrimer->szPrimer_; --p6MerLeftEnd ) { // look at last 4 bases of the 6mer int n4MerToLookup = nConvertFrom4Mer( p6MerLeftEnd + 2 ); blockOfLocationsOfA4MerType* pBlockOfLocationsOfA4Mer = &( pUnpaddedContig->blockOfLocationsOfA4Mer_[ n4MerToLookup ] ); bool bAgain = true; while( bAgain ) { for( int nIndexWithinBlock = 0; nIndexWithinBlock < pBlockOfLocationsOfA4Mer->nCurrentLength_; ++nIndexWithinBlock ) { int nUnpaddedPosOfMatchingContig4Mer = pBlockOfLocationsOfA4Mer->nUnpaddedPosition_[ nIndexWithinBlock ]; // avoid segmentation faults if ( nUnpaddedPosOfMatchingContig4Mer < 3 ) continue; char* sz66Bases = pUnpaddedContig->szUnpaddedBases_ + nUnpaddedPosOfMatchingContig4Mer - 3; // see diagram below and subtract 1 to change to zero-based // so nUnpaddedPosOfMatchingContig4Mer - 3 is // zero-based unpadded of 66 bases // check that previous 2 bases match precisely // if not, ignore this match since we are only interested // in perfect 6 base matches. if ( memcmp( p6MerLeftEnd, sz66Bases, 2 ) != 0 ) continue; // xxxxxxxxxxxx66mmmmxxxxxxxxxxx // ^ this is nUnpaddedPosOfMatchingContig4Mer // ^ is nUnpaddedPosOfMatchingContig4Mer - 2 // Distance to leftmost x is p6MerLeftEnd - pPrimer->szPrimer_ // So unpadded cons pos of leftmost x is // nUnpaddedPosOfMatchingContig4Mer - 2 - // ( p6MerLeftEnd - pPrimer->szPrimer_ ) // and then subtract 1 to get the position in zero based // coordinates: int nContigZeroBasedUnpaddedOfFirstBase = nUnpaddedPosOfMatchingContig4Mer - 2 - (p6MerLeftEnd - pPrimer->szPrimer_ ) - 1; if ( !bLookEverywhereInContig ) { if ( ! ( ( nUnpaddedConsPosOfTargetStartPossiblyComplemented <= nContigZeroBasedUnpaddedOfFirstBase ) && ( nContigZeroBasedUnpaddedOfFirstBase <= nUnpaddedConsPosOfTargetEndPossiblyComplemented ) ) ) { // the match is outside the subclone so it will not // cause mispriming continue; } } // this could happen if, for example, we have 4 perfectly // matching bases right at the beginning of the contig. // In such a case, would the primer misprime? We don't // really know--perhaps we should exclude it. For now, // I'm ignoring it. if ( nContigZeroBasedUnpaddedOfFirstBase < 0 ) continue; int nContigZeroBasedUnpaddedOfLastBaseOfSequenceToCompareToPrimer = nContigZeroBasedUnpaddedOfFirstBase + pPrimer->nUnpaddedLength_ - 1; // This check below insures we don't get subscript errors // by having a false match to very close to the end // of the pUnpaddedContig->szUnpaddedBases_ such that // when nFindScoreOfMatch runs, it runs off the end of // pUnpaddedContig->szUnpaddedBases_ (szFalseMatch is // just a pointer to within this structure ) // pUnpaddedContig->nUnpaddedBases_ - 1 is the index in // unpadded zero-based coordinates of the last base in // pUnpaddedContig->szUnpaddedBases if ( nContigZeroBasedUnpaddedOfLastBaseOfSequenceToCompareToPrimer > ( pUnpaddedContig->nUnpaddedBases_ - 1 ) ) { // printf("ran off end since nContigZeroBasedUnpaddedOfFirstBase = %d and pUnpaddedContig->nUnpaddedBases_ = %d and pPrimer->nUnpaddedLength_ = %d\n", // nContigZeroBasedUnpaddedOfFirstBase, // pUnpaddedContig->nUnpaddedBases_, // pPrimer->nUnpaddedLength_ ); continue; } if ( bForwardNotReversePrimer && !bComplementedContig && ( pContigOfTarget == pPrimer->pContig_ ) && ( nContigZeroBasedUnpaddedOfFirstBase == ( pPrimer->nUnpaddedStart_ - 1 ) ) ) continue; // primer is the same as the target if ( !bForwardNotReversePrimer && bComplementedContig && ( pContigOfTarget == pPrimer->pContig_ ) ) { int nComplementedStartOfPrimer = pPrimer->pContig_->nComplementUnpaddedIndex( pPrimer->nUnpaddedEnd_ ); if ( nContigZeroBasedUnpaddedOfFirstBase == nComplementedStartOfPrimer - 1 ) continue; // primer is the same as the target } char* szFalseMatch = pUnpaddedContig->szUnpaddedBases_ + nContigZeroBasedUnpaddedOfFirstBase; int nPosOfMaxCumulativeScore; int nScoreOfFalseMatch = nFindScoreOfMatch( pPrimer->nUnpaddedLength_, szFalseMatch, nPosOfMaxCumulativeScore ); // at least the last 4 bases must match--bullshit // As a matter of fact, in general the last 4 bases // will not match--that is the purpose of this routine. if ( nScoreOfFalseMatch > pPrimer->nScoreOfStickiestFalseMatch_ ) { pPrimer->nScoreOfStickiestFalseMatch_ = nScoreOfFalseMatch; pPrimer->pContigOfStickiestFalseMatch_ = pContigOfTarget; pPrimer->nUnpaddedOfStickiestFalseMatch_ = nContigZeroBasedUnpaddedOfFirstBase + 1; pPrimer->bContigOfStickiestFalseMatchIsComplemented_ = bComplementedContig; if (pPrimer->nScoreOfStickiestFalseMatch_ > pCP->nPrimersMaxMatchElsewhereScoreToUse_ ) { pPrimer->bAcceptable_ = false; pPrimer->nWhyIsPrimerNotAcceptable_ = BAD_PRIMER_STICKS_TO_SOMEWHERE_ELSE; // this saves checking for other false matches // Note that they may be worse, but this one is bad // enough so don't even show it. return; } } } // for( int nIndexWithinBlock = 0; if ( pBlockOfLocationsOfA4Mer->pNextBlockOfLocationsOfA4Mer_ ) pBlockOfLocationsOfA4Mer = pBlockOfLocationsOfA4Mer->pNextBlockOfLocationsOfA4Mer_; else bAgain = false; } // while( bAgain ) } // for( char* p6MerLeftEnd = }