/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. 
# It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "readTypes.h" #include using namespace std; #include "consedParameters.h" #include "numutil.h" #include "bPrimerHasMatchesElsewhere.h" #include "createPrimerCandidatesWith3PrimeEnd.h" #include "bPrimerHasMatchesAgainstSequencesInFile.h" #include "whichPrimersHaveAcceptableMeltingTemp.h" #include "whichPrimersHaveAcceptableSelfMatch.h" #include "pFindShortestAcceptablePrimer.h" #include #include "ucGetReadType.h" #include "bIsTopStrandRead.h" #include #include #include "experiment.h" #include "guiapp.h" #include "consed.h" #include "please_wait.h" #include "paddedFromUnpadded.h" #include "checkPrimersForMononucleotideRepeats.h" #include "whyIsPrimerNotAcceptableTypes.h" #include #include "mbtPtrVector.h" #include "fileDefines.h" #include "dErrorRateFromQuality.h" #include "dGetErrorsFixedAtBase.h" #include "nGetAmountOfOverlap.h" #include "abs.h" #include "nMAX_QUALITY_LEVEL2.h" #include "multiplyDistributions.h" #include "multiplyDistributions2.h" #include "multiplyDistributions3.h" #include "multiplyDistributionWithMinimum.h" #include "makeDistributionAllOnes.h" #include "assignDistributions.h" #include "dumpDistribution.h" #include "library.h" #include "bigArrayOfLittleArraysOfDistributions.h" #include "checkPrimersForACGT.h" void Contig :: setUpForCreatingExperimentsList() { // prepare for creating primers // this is done already in autoFinish.cpp and in guiFindFinishingReads // by setContigMatchTablesInAllContigs // setContigMatchTables(); setTargetNumberOfErrorsForThisContig(); createFakeSubcloneTTemplatesForWholeCloneReads(); // now this is set by Assembly::setAllUnpaddedErrorProbablities // 
setUnpaddedErrorProbabilities(); setExpectedNumberOfErrorsForThisContig(); setUnpaddedReadsThatCouldImproveQualities(); createCumulativeErrorArray(); if ( pCP->bAutoFinishSuggestSpecialChemistryForRunsAndStops_ ) { determineLocationsOfRunsAndStops(); } pCandidateExperiments_ = new mbtPtrOrderedVector; pChosenExperiments_ = new mbtPtrOrderedVector; pUniversalPrimerExperimentsForLowQualityRegions_ = new mbtPtrOrderedVector; } void Contig :: batchFindFinishingReads() { fprintf( pAO, "\n\nCONTIG: %s ( with %d reads and length %d )\n", (char*) soGetName().data(), nGetNumberOfFragsInContig(), nGetUnpaddedSeqLength() ); setUpForCreatingExperimentsList(); fprintf( pAO, "Estimated number of errors in consensus (not including doNotFinish tags): %.2f\n", dExpectedNumberOfErrorsInConsensus_ ); fprintf( pAO, "Estimated number of errors in contig (includes gaps on ends): %.2f\n", dExpectedNumberOfErrorsInExtendedContig_ ); fprintf( pAO, "Target number of errors for contig: %.2f\n\n", dTargetNumberOfErrors_ ); if ( bIsThisEndTheEndOfTheClone( nLeftGap ) ) { fprintf( pAO, "Left end of contig %s is one end of the clone\n", soGetName().data() ); } if ( bIsThisEndTheEndOfTheClone( nRightGap ) ) { fprintf( pAO, "Right end of contig %s is one end of the clone\n", soGetName().data() ); } // this will examine fwd/rev pair information, and possibly // suggest minilibraries or suggest reads to flank the gaps dealWithContigEnds(); if ( pCP->bAutoFinishEmulate9_66Behavior_ ) { if ( consedParameters::pGetConsedParameters()->bAutoFinishCoverLowConsensusQualityRegions_ || consedParameters::pGetConsedParameters()->bAutoFinishCloseGaps_ ) { createExperimentCandidatesListFor9_66(); chooseExperimentsToCoverLowQualityRegionsAndGapsFor9_66(); } goto skipFor9_66Behavior; } if ( consedParameters::pGetConsedParameters()->bAutoFinishCoverLowConsensusQualityRegions_ || consedParameters::pGetConsedParameters()->bAutoFinishCloseGaps_ ) { saveStateOfConsensus(); if ( 
pCP->bAutoFinishNearGapsSuggestEachMissingReadOfReadPairs_ ) { if ( pCP->bAutoFinishAllowDeNovoUniversalPrimerSubcloneReads_ ) { fprintf( pAO, "\n\nChoosing de novo universal primer reads to try to close gaps\n\n" ); nearGapsSuggestEachMissingReadOfReadPairs(); } else { fprintf( pAO, "\n\nYou have specified:\nconsed.autoFinishAllowDeNovoUniversalPrimerSubcloneReads: false\nconsed.autoFinishNearGapsSuggestEachMissingReadOfReadPairs: true\nso autofinish will *not* suggest each missing read of read pairs\n" ); } } fprintf( pAO, "\n\nChoosing universal primer reads to cover low quality regions and close gaps...\n\n" ); createUniversalPrimerExperimentCandidatesList(); chooseExperimentsToCoverLowQualityRegionsAndGaps( true ); // universal primer phase 1 if ( pCP->dAutoFinishRedundancy_ > 1.0 ) { restoreStateOfConsensus(); recalculateCurrentExpectedNumberOfErrorsForThisContig( dExpectedNumberOfErrorsInConsensus_, dExpectedNumberOfErrorsInExtendedContig_ ); recalculateErrorsFixedOfExperimentCandidates(); fprintf( pAO, "\n\nChoosing redundant univeral primer reads to cover low quality regions and close gaps...\n\n" ); chooseExperimentsToCoverLowQualityRegionsAndGaps( true ); // universal primer phase 2 restoreStateOfConsensus(); recalculateConsensusUsingAllChosenUniversalPrimerExperiments(); recalculateCurrentExpectedNumberOfErrorsForThisContig( dExpectedNumberOfErrorsInConsensus_, dExpectedNumberOfErrorsInExtendedContig_ ); clearExperimentCandidatesList(); } createCustomPrimerExperimentCandidatesList(); fprintf( pAO, "\n\nChoosing custom primer reads to cover low quality regions and close gaps...\n\n" ); chooseExperimentsToCoverLowQualityRegionsAndGaps( false ); // const bool bUniversalPrimerNotCustomPrimerReads ) if ( pCP->bAutoFinishCloseGaps_ ) warnUserIfCouldNotExtendContig(); } skipFor9_66Behavior: if ( consedParameters::pGetConsedParameters()->bAutoFinishCoverSingleSubcloneRegions_ ) { fprintf( pAO, "\n\nChoosing custom primer reads to cover single subclone regions 
for contig %s\n\n", (char*) soGetName().data() ); coverSingleSubcloneRegions(); } fprintf( pAO, "Contig %s had %d suggested experiments.\n\n\n", (char*) soGetName().data(), pChosenExperiments_->length() ); } void Contig :: chooseExperimentsToCoverLowQualityRegionsAndGaps( const bool bUniversalPrimerNotCustomPrimerReads ) { // Possible problem: if the consensus is high quality, but you want // autofinish to extend the consensus, it might not do so. // (DG, Feb 2000) if ( dExpectedNumberOfErrorsInExtendedContig_ <= dTargetNumberOfErrors_ ) { fprintf( pAO, "Estimated number of errors in extended contig %.2f is already less than the maximum %.2f for this contig\n", dExpectedNumberOfErrorsInExtendedContig_, dTargetNumberOfErrors_ ); return; } chooseExperimentsToCoverLowQualityRegionsAndGaps2( bUniversalPrimerNotCustomPrimerReads ); } void Contig :: warnUserIfCouldNotExtendContig() { if ( !bIsThisEndTheEndOfTheClone( nLeftGap ) && bDoTagsAllowExtendingThisEnd( nLeftGap ) ) { if ( nUnpaddedHighQualitySegmentStart_ == nUpdatedUnpaddedHighQualitySegmentStart_ ) fprintf( pAO, "WARNING: COULD NOT EXTEND CONTIG TO LEFT--MIGHT NEED PCR\n" ); else { ConsEd::pGetAssembly()->bAnyReadsToCloseGaps_ = true; bReadsWillExtendLeftEnd_ = true; } } if ( !bIsThisEndTheEndOfTheClone( nRightGap ) && bDoTagsAllowExtendingThisEnd( nRightGap ) ) { if ( nUnpaddedHighQualitySegmentEnd_ == nUpdatedUnpaddedHighQualitySegmentEnd_ ) fprintf( pAO, "WARNING: COULD NOT EXTEND CONTIG TO RIGHT--MIGHT NEED PCR\n" ); else { ConsEd::pGetAssembly()->bAnyReadsToCloseGaps_ = true; bReadsWillExtendRightEnd_ = true; } } } void Contig :: chooseExperimentsToCoverLowQualityRegionsAndGaps2( const bool bUniversalPrimerNotCustomPrimerReads) { if ( pCandidateExperiments_->length() == 0 ) { fprintf( pAO, "No possible experiments found\n" ); return; } bool bMoreAvailableExperiments; bool bConsiderMoreExperiments = true; do { experiment* pBestExp = pGetBestExperimentCandidate(); if ( pBestExp == NULL ) { 
bMoreAvailableExperiments = false; bConsiderMoreExperiments = false; } else { // found the best experiment assert( pBestExp->dErrorsFixed_ >= consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ ); // the best experiment is fine--transfer it pBestExp->nPurpose_ = nFixWeakRegion; pBestExp->chooseExperiment(); possiblyUpdateContigHighQualitySegment( pBestExp ); pChosenExperiments_->insert( pBestExp ); if ( bUniversalPrimerNotCustomPrimerReads ) pUniversalPrimerExperimentsForLowQualityRegions_->insert( pBestExp ); ConsEd::pGetAssembly()->pAllChosenExperiments_->insert( pBestExp ); adjustExpectedNumberOfErrors( pBestExp ); findWindowOfMostErrorsFixed( pBestExp ); pBestExp->print(); fprintf( pAO, "After this experiment,\ncalculated number of errors in extended contig: %.2f and consensus: %.2f\n\n", dExpectedNumberOfErrorsInExtendedContig_, dExpectedNumberOfErrorsInConsensus_ ); if ( !bStillTooManyExpectedErrors() ) bConsiderMoreExperiments = false; else { adjustRemainingExperimentCandidates( pBestExp ); // check no bugs double dErrorsInConsensus; double dErrorsInExtendedContig; recalculateCurrentExpectedNumberOfErrorsForThisContig( dErrorsInConsensus, dErrorsInExtendedContig ); if ( ( ABS( dErrorsInConsensus - dExpectedNumberOfErrorsInConsensus_ ) > .000001 ) || ( ABS( dErrorsInExtendedContig - dExpectedNumberOfErrorsInExtendedContig_ ) > .000001 ) ) { fprintf( pAO, "Warning: dExpectedNumberOfErrorsInConsensus = %e but calculated is %e and dExpectedNumberOfErrorsInExtendedContig_ = %e but calculated is %e\n", dExpectedNumberOfErrorsInConsensus_, dErrorsInConsensus, dExpectedNumberOfErrorsInExtendedContig_, dErrorsInExtendedContig ); } else { fprintf( pAO, "OK3\n" ); } // end check no bugs } } // if ( pBestExp == NULL ) else } while( bConsiderMoreExperiments ); // tell user why we stopped // We could have stopped because: // 1) the error rate was below the threshold // 2) there were no more experiments // 3) there were more 
experiments, but they weren't worth doing if ( bStillTooManyExpectedErrors() ) { fprintf( pAO, "There are still too many expected errors but we stopped because\n" ); if ( bMoreAvailableExperiments ) fprintf( pAO, "all remaining available experiments each solved too \nfew errors\nto be worthwhile doing\n" ); else fprintf( pAO, "there were no more experiments this program could \nfind to resolve the remaining errors.\n" ); printWorstRemainingRegion(); } else fprintf( pAO, "The expected error count for the contig should be acceptable after doing these experiments\n" ); } void Contig :: printWorstRemainingRegion() { int nWindowOfWorstErrors = ConsEd::pGetAssembly()->pModelRead_->length() - ConsEd::pGetAssembly()->nModelReadHighQualitySegmentStart_; double dConsensusErrors = 0; double dConsensusErrorsInWindow = 0; double dMaxErrorsInWindow = -1; int nUnpaddedMaxLocationLeft; int nUnpaddedMaxLocationRight; if ( pUnpaddedErrorProbabilities_->nGetStartIndex() + nWindowOfWorstErrors - 1 > pUnpaddedErrorProbabilities_->nGetEndIndex() ) { // this is the case in which the extended consensus is shorter than // a single read nUnpaddedMaxLocationLeft = nGetUnpaddedStartIndex(); nUnpaddedMaxLocationRight = nGetUnpaddedEndIndex(); dMaxErrorsInWindow = dExpectedNumberOfErrorsInExtendedContig_; } else { // get the first window's worth double dErrorsInCurrentWindow = 0.0; for( int nUnpaddedConsPos = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpaddedConsPos <= pUnpaddedErrorProbabilities_->nGetStartIndex() + nWindowOfWorstErrors - 1; ++nUnpaddedConsPos ) { dErrorsInCurrentWindow += (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPos ]; } dMaxErrorsInWindow = dErrorsInCurrentWindow; nUnpaddedMaxLocationLeft = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpaddedMaxLocationRight = pUnpaddedErrorProbabilities_->nGetStartIndex() + nWindowOfWorstErrors - 1; for( int nUnpaddedConsPosRight = pUnpaddedErrorProbabilities_->nGetStartIndex() + nWindowOfWorstErrors; nUnpaddedConsPosRight 
<= pUnpaddedErrorProbabilities_->nGetEndIndex(); ++nUnpaddedConsPosRight ) { // remove the base just to the left of the leftmost and add the // base at the right dErrorsInCurrentWindow = dErrorsInCurrentWindow - (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPosRight - nWindowOfWorstErrors ] + (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPosRight ]; if ( dErrorsInCurrentWindow > dMaxErrorsInWindow ) { dMaxErrorsInWindow = dErrorsInCurrentWindow; nUnpaddedMaxLocationLeft = nUnpaddedConsPosRight - nWindowOfWorstErrors + 1; nUnpaddedMaxLocationRight = nUnpaddedConsPosRight; } } } // if ( pUnpaddedErrorProbabilities_->nGetStartIndex() ... else fprintf( pAO, "Worst %d base region that is left unfixed is from %d to %d with %.3f errors\n", nWindowOfWorstErrors, nUnpaddedMaxLocationLeft, nUnpaddedMaxLocationRight, dMaxErrorsInWindow ); } void Contig :: createCumulativeErrorArray() { int nArrayLength = 2* ( consedParameters::pGetConsedParameters()->nAutoFinishNumberOfBasesBetweenContigsAssumed_ ) + nGetUnpaddedSeqLength(); int nOffset = -consedParameters::pGetConsedParameters()->nAutoFinishNumberOfBasesBetweenContigsAssumed_ + Contig :: nGetUnpaddedStartIndex(); pCumUnpaddedErrors_ = new mbtValVector( nArrayLength, nOffset, 0 ); (*pCumUnpaddedErrors_)[pCumUnpaddedErrors_->nGetStartIndex() ] = (*pUnpaddedErrorProbabilities_)[ pCumUnpaddedErrors_->nGetStartIndex() ]; for( int nPos = pCumUnpaddedErrors_->nGetStartIndex() + 1; nPos <= pCumUnpaddedErrors_->nGetEndIndex(); ++nPos ) { (*pCumUnpaddedErrors_)[ nPos ] = (*pUnpaddedErrorProbabilities_)[ nPos ] + (*pCumUnpaddedErrors_)[ nPos - 1 ]; } } void Contig :: resetCumulativeErrorArray() { (*pCumUnpaddedErrors_)[pCumUnpaddedErrors_->nGetStartIndex() ] = (*pUnpaddedErrorProbabilities_)[ pCumUnpaddedErrors_->nGetStartIndex() ]; for( int nPos = pCumUnpaddedErrors_->nGetStartIndex() + 1; nPos <= pCumUnpaddedErrors_->nGetEndIndex(); ++nPos ) { (*pCumUnpaddedErrors_)[ nPos ] = (*pUnpaddedErrorProbabilities_)[ nPos ] + 
(*pCumUnpaddedErrors_)[ nPos - 1 ]; } } void Contig :: setUnpaddedErrorProbabilitiesAndMore() { setUnpaddedErrorProbabilities(); // now look for tags that indicate not finishing. Whenever you find // one, set the errors to zero there. setErrorsToZeroWithinDoNotFinishTags(); handleTagsToNotExtendContig(); lookForCloneEndTags(); setHighQualitySegment(); // special case for people who are using autofinish just to close gaps. // This is a little inefficient since it first sets all error probabilities // (above) and then sets most of them to back to zero. It leaves // the low quality quality errors near the ends of the consensus // so that autofinish can improve its quality so it can extend // into the gaps. if ( !pCP->bAutoFinishCoverLowConsensusQualityRegions_ ) { if ( nUnpaddedHighQualitySegmentStart_ != -1 && nUnpaddedHighQualitySegmentEnd_ != -1 ) { for( int nUnpadded = nUnpaddedHighQualitySegmentStart_; nUnpadded <= nUnpaddedHighQualitySegmentEnd_; ++nUnpadded ) { (*pUnpaddedErrorProbabilities_)[ nUnpadded ] = 0.0; } } } // save a copy since the other copy will be updated as autofinish // gets reads (this may change) pOriginalUnpaddedErrorProbabilities_ = new mbtValVector( pUnpaddedErrorProbabilities_->length(), pUnpaddedErrorProbabilities_->nOffset_, 0 ); for( int nPos = pUnpaddedErrorProbabilities_->nGetStartIndex(); nPos <= pUnpaddedErrorProbabilities_->nGetEndIndex(); ++nPos ) { (*pOriginalUnpaddedErrorProbabilities_)[ nPos ] = (*pUnpaddedErrorProbabilities_)[ nPos ]; } // set the array of arrays of distributions // Intially, since there are no reads chosen, at each base, the // array of distributions will be null. 
pDistributionsOfChosenExperimentsArray_ = new bigArrayOfLittleArraysOfDistributions( pUnpaddedErrorProbabilities_->length(), pUnpaddedErrorProbabilities_->nOffset_ ); pSavedDistributionsOfChosenExperimentsArray_ = new bigArrayOfLittleArraysOfDistributions( pUnpaddedErrorProbabilities_->length(), pUnpaddedErrorProbabilities_->nOffset_ ); } void Contig :: setUnpaddedErrorProbabilities() { int nLength = nGetUnpaddedSeqLength() + 2 * consedParameters::pGetConsedParameters()-> nAutoFinishNumberOfBasesBetweenContigsAssumed_; pUnpaddedErrorProbabilities_ = new mbtValVector( nLength, -consedParameters::pGetConsedParameters()-> nAutoFinishNumberOfBasesBetweenContigsAssumed_ + Contig::nGetUnpaddedStartIndex(), 0 ); pSavedUnpaddedErrorProbabilities_ = new mbtValVector( *pUnpaddedErrorProbabilities_ ); // the Contig::nGetUnpaddedStartIndex above makes this array so that position 1 is the same // as position 1 in the contig in unpadded consensus positions // We count a gap base as an error *unless* it is off the end of the // clone end. To figure this out, we need to use the clone end code. 
double dErrorOfALeftGapBase; if ( pCP->bAutoFinishCloseGaps_ ) { if ( bIsThisEndTheEndOfTheClone( nLeftGap ) ) dErrorOfALeftGapBase = 0.0; else dErrorOfALeftGapBase = 1.0; } else dErrorOfALeftGapBase = 0.0; int nUnpaddedConsPos; for( nUnpaddedConsPos = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpaddedConsPos <= 0; ++nUnpaddedConsPos ) (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPos ] = dErrorOfALeftGapBase; double dErrorOfARightGapBase; if ( pCP->bAutoFinishCloseGaps_ ) { if ( bIsThisEndTheEndOfTheClone( nRightGap ) ) dErrorOfARightGapBase = 0.0; else dErrorOfARightGapBase = 1.0; } else dErrorOfARightGapBase = 0.0; // found and fixed bug via desk check for( nUnpaddedConsPos = nGetUnpaddedEndIndex() + 1; nUnpaddedConsPos <= pUnpaddedErrorProbabilities_->nGetEndIndex(); ++nUnpaddedConsPos ) (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPos ] = dErrorOfARightGapBase; int nUnpaddedPos = Contig::nGetUnpaddedStartIndex(); Ntide nt; int nQuality; double dError; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { nt = ntGetCons( nConsPos ); if ( !nt.bIsPad() ) { // changed 3/17/99 since 98 should not be considered a high quality // dErrorRateFromQuality handles this correctly--see // setErrorRateFromQuality.cpp nQuality = nt.qualGetQuality(); (*pUnpaddedErrorProbabilities_)[ nUnpaddedPos ] = dErrorRateFromQuality[ nQuality ]; ++nUnpaddedPos; } } assert( nUnpaddedPos == ( nGetUnpaddedEndIndex() + 1 ) ); } void Contig :: handleTagsToNotExtendContig() { determineIfTagsPreventExtendingContig(); if ( bTagsSayDoNotExtendToLeft_ ) { for( int nUnpaddedConsPos = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpaddedConsPos <= Contig::nGetUnpaddedStartIndex() - 1; ++nUnpaddedConsPos ) (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPos ] = 0.0; } if ( bTagsSayDoNotExtendToRight_ ) { for( int nUnpaddedConsPos = Contig::nGetUnpaddedEndIndex() + 1; nUnpaddedConsPos <= pUnpaddedErrorProbabilities_->nGetEndIndex(); 
++nUnpaddedConsPos ) (*pUnpaddedErrorProbabilities_)[ nUnpaddedConsPos ] = 0.0; } } RWCString soDoNotDoPCR( "doNotDoPCR" ); void Contig :: determineIfTagsPreventPCR() { if ( bAlreadyExaminedContigForDoNotDoPCRTags_ ) return; bAlreadyExaminedContigForDoNotDoPCRTags_ = true; fprintf( pAO, "looking for tags indicating do not do PCR in contig %s\n", soGetName().data() ); // first look in the reads for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pTag->soGetTagType() != soDoNotDoPCR ) continue; int nUnpaddedConsPosRangeStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ) - pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; int nUnpaddedConsPosRangeEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ) + pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; bool bOverlapsLeftEnd = false; bool bOverlapsRightEnd = false; if ( nUnpaddedConsPosRangeStart <= nGetUnpaddedStartIndex() && nGetUnpaddedStartIndex() <= nUnpaddedConsPosRangeEnd ) { bOverlapsLeftEnd = true; } if ( nUnpaddedConsPosRangeStart <= nGetUnpaddedEndIndex() && nGetUnpaddedEndIndex() <= nUnpaddedConsPosRangeEnd ) { bOverlapsRightEnd = true; } if ( bOverlapsLeftEnd && bOverlapsRightEnd ) { /* pathological case in which this is a small contig and we are having trouble figuring out which end the tag refers to */ int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ); int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ); if ( ABS( nUnpaddedConsPosStart - nGetUnpaddedStartIndex() ) < ABS( nUnpaddedConsPosEnd - nGetUnpaddedEndIndex() ) ) bTagsSayDoNotDoPCRWithLeftEnd_ = true; else bTagsSayDoNotDoPCRWithRightEnd_ = true; } else { if ( bOverlapsLeftEnd ) bTagsSayDoNotDoPCRWithLeftEnd_ = true; else bTagsSayDoNotDoPCRWithRightEnd_ = true; } } // for( int nTag = ... 
} // (for( int nRead = ... // now look at consensus tags for( int nConsensusTag = 0; nConsensusTag < nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pGetTag( nConsensusTag ); if ( pTag->soGetTagType() != soDoNotDoPCR ) continue; int nUnpaddedConsPosRangeStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ) - pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; int nUnpaddedConsPosRangeEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ) + pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; bool bOverlapsLeftEnd = false; bool bOverlapsRightEnd = false; if ( nUnpaddedConsPosRangeStart <= nGetUnpaddedStartIndex() && nGetUnpaddedStartIndex() <= nUnpaddedConsPosRangeEnd ) bOverlapsLeftEnd = true; if ( nUnpaddedConsPosRangeStart <= nGetUnpaddedEndIndex() && nGetUnpaddedEndIndex() <= nUnpaddedConsPosRangeEnd ) bOverlapsRightEnd = true; if ( bOverlapsLeftEnd && bOverlapsRightEnd ) { /* pathological case in which this is a small contig and we are having trouble figuring out which end the tag refers to */ int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ); int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ); if ( ABS( nUnpaddedConsPosStart - nGetUnpaddedStartIndex() ) < ABS( nUnpaddedConsPosEnd - nGetUnpaddedEndIndex() ) ) bTagsSayDoNotDoPCRWithLeftEnd_ = true; else bTagsSayDoNotDoPCRWithRightEnd_ = true; } else if ( bOverlapsLeftEnd ) bTagsSayDoNotDoPCRWithLeftEnd_ = true; else if ( bOverlapsRightEnd ) bTagsSayDoNotDoPCRWithRightEnd_ = true; } if ( bTagsSayDoNotDoPCRWithLeftEnd_ ) fprintf( pAO, "tags say do not do pcr with left end\n" ); if ( bTagsSayDoNotDoPCRWithRightEnd_ ) fprintf( pAO, "tags say do not do pcr with right end\n" ); } void Contig :: determineIfTagsPreventExtendingContig() { if ( bAlreadyExaminedContigForDoNotExtendTags_ ) return; bAlreadyExaminedContigForDoNotExtendTags_ = true; fprintf( pAO, "looking for tags indicating do not extend contig in contig %s\n", soGetName().data() ); // first 
look in the reads for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pCP->aAutoFinishTagsToNotExtend_.index( pTag->soGetTagType() ) == RW_NPOS ) continue; int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ) - pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ) + pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; if ( nUnpaddedConsPosStart <= nGetUnpaddedStartIndex() && nGetUnpaddedStartIndex() <= nUnpaddedConsPosEnd ) { bTagsSayDoNotExtendToLeft_ = true; fprintf( pAO, "do not extend contig %s to left because of read tag %s on read %s\n", soGetName().data(), pTag->soGetTagType().data(), pLocFrag->soGetName().data() ); } if ( nUnpaddedConsPosStart <= nGetUnpaddedEndIndex() && nGetUnpaddedEndIndex() <= nUnpaddedConsPosEnd ) { bTagsSayDoNotExtendToRight_ = true; fprintf( pAO, "do not extend contig %s to right because of read tag %s on read %s\n", soGetName().data(), pTag->soGetTagType().data(), pLocFrag->soGetName().data() ); } } } // now look in consensus tags for( int nConsensusTag = 0; nConsensusTag < nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pGetTag( nConsensusTag ); if ( pCP->aAutoFinishTagsToNotExtend_.index( pTag->soGetTagType() ) == RW_NPOS ) continue; int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ) - pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ) + pCP->nAutoFinishDoNotExtendContigsIfTagsAreThisCloseToContigEnd_; if ( nUnpaddedConsPosStart <= nGetUnpaddedStartIndex() && nGetUnpaddedStartIndex() <= nUnpaddedConsPosEnd ) { bTagsSayDoNotExtendToLeft_ = true; fprintf( pAO, "do not extend contig %s to left because of consensus tag %s\n", 
soGetName().data(), pTag->soGetTagType().data() ); } if ( nUnpaddedConsPosStart <= nGetUnpaddedEndIndex() && nGetUnpaddedEndIndex() <= nUnpaddedConsPosEnd ) { fprintf( pAO, "do not extend contig %s to right because of consensus tag %s\n", soGetName().data(), pTag->soGetTagType().data() ); bTagsSayDoNotExtendToRight_ = true; } } } void Contig :: lookForCloneEndTags() { fprintf( pAO, "looking for cloneEnd tags in contig %s\n", soGetName().data() ); // first look in the reads for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pTag->soGetTagType() != "cloneEnd" ) continue; // found a cloneEnd tag! dealWithCloneEndTag( pTag ); } } // now look in consensus tags for( int nConsensusTag = 0; nConsensusTag < nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pGetTag( nConsensusTag ); if ( pTag->soGetTagType() != "cloneEnd" ) continue; // found a cloneEnd tag! 
dealWithCloneEndTag( pTag ); } } void Contig :: dealWithCloneEndTag( tag* pTag ) { fprintf( pAO, "found clone end tag at %d to %d with insert at %s\n", nUnpaddedIndex( pTag->nPaddedConsPosStart_ ), nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ), pTag->soMiscData_.data() ); int nUnpaddedLeft; int nUnpaddedRight; if ( pTag->soMiscData_ == "->" ) { nUnpaddedLeft = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpaddedRight = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ); bLeftEndOfContigExaminedForEndOfClone_ = true; bLeftEndOfContigIsEndOfClone_ = true; } else if ( pTag->soMiscData_ == "<-" ) { nUnpaddedLeft = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ); nUnpaddedRight = pUnpaddedErrorProbabilities_->nGetEndIndex(); bRightEndOfContigExaminedForEndOfClone_ = true; bRightEndOfContigIsEndOfClone_ = true; } else { // something screwed up with this tag return; } // set the error probability to zero within the clone vector // so it isn't finished. for( int nUnpadded = nUnpaddedLeft; nUnpadded <= nUnpaddedRight; ++nUnpadded ) { (*pUnpaddedErrorProbabilities_)[ nUnpadded ] = 0.0; } } void Contig :: setErrorsToZeroWithinDoNotFinishTags() { // look for do-not-finish tags in both reads and in the consensus // First look in the reads. 
for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( consedParameters::pGetConsedParameters()->aAutoFinishTagsToNotFinish_.index( pTag->soGetTagType() ) == RW_NPOS ) continue; int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ); int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ); // read tags might be on a read that protrudes far beyond the // end of the consensus, and thus isn't on the extended contig int nOverlapLeft; int nOverlapRight; if ( bIntersect( nUnpaddedConsPosStart, nUnpaddedConsPosEnd, pUnpaddedErrorProbabilities_->nGetStartIndex(), pUnpaddedErrorProbabilities_->nGetEndIndex(), nOverlapLeft, nOverlapRight ) ) { for( int nUnpadded = nOverlapLeft; nUnpadded <= nOverlapRight; ++nUnpadded ) { (*pUnpaddedErrorProbabilities_)[ nUnpadded ] = 0.0; } } } // for( nTag = 0 ... } // for( int nRead = ... // now look for consensus tags for( int nConsensusTag = 0; nConsensusTag < nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pGetTag( nConsensusTag ); if ( consedParameters::pGetConsedParameters()->aAutoFinishTagsToNotFinish_.index( pTag->soGetTagType() ) == RW_NPOS ) continue; int nUnpaddedConsPosStart = nUnpaddedIndex( pTag->nPaddedConsPosStart_ ); int nUnpaddedConsPosEnd = nUnpaddedIndex( pTag->nPaddedConsPosEnd_ ); for( int nUnpadded = nUnpaddedConsPosStart; nUnpadded <= nUnpaddedConsPosEnd; ++nUnpadded ) { (*pUnpaddedErrorProbabilities_)[ nUnpadded ] = 0.0; } } // for( int nConsensusTag = 0 ... 
} void Contig :: setUnpaddedReadsThatCouldImproveQualities() { int nArrayLength = 2* ( consedParameters::pGetConsedParameters()->nAutoFinishNumberOfBasesBetweenContigsAssumed_ ) + nGetUnpaddedSeqLength(); int nOffset = -consedParameters::pGetConsedParameters()->nAutoFinishNumberOfBasesBetweenContigsAssumed_ + nGetUnpaddedStartIndex(); pUnpaddedReadTypesCoveringEachBase_ = new mbtValVector( nArrayLength, nOffset, '\0' ); pOriginalUnpaddedReadTypesCoveringEachBase_ = new mbtValVector( nArrayLength, nOffset ); pSavedUnpaddedReadTypesCoveringEachBase_ = new mbtValVector( nArrayLength, nOffset ); unsigned char ucZero = 0; for( int nPos = pUnpaddedReadTypesCoveringEachBase_->nGetStartIndex(); nPos <= pUnpaddedReadTypesCoveringEachBase_->nGetEndIndex(); ++nPos ) { (*pUnpaddedReadTypesCoveringEachBase_)[ nPos ] = ucZero; } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // such a read doesn't belong where it is--do not // consider it as covering the region if ( pLocFrag->bWholeReadIsUnaligned_ ) continue; int nUnpaddedLeft; int nUnpaddedRight; // avoid subscript errors if ( bIntersect( nUnpaddedIndex( pLocFrag->nAlignClipStart_ ), nUnpaddedIndex( pLocFrag->nAlignClipEnd_ ), pUnpaddedReadTypesCoveringEachBase_->nGetStartIndex(), pUnpaddedReadTypesCoveringEachBase_->nGetEndIndex(), nUnpaddedLeft, nUnpaddedRight ) ) { unsigned char ucStrandAndChemistry = pLocFrag->ucGetReadType(); for( int nUnpaddedPos = nUnpaddedLeft; nUnpaddedPos <= nUnpaddedRight; ++nUnpaddedPos ) { (*pUnpaddedReadTypesCoveringEachBase_)[ nUnpaddedPos ] |= ucStrandAndChemistry; } } } // copy to pOriginalUnpaddedReadTypes_ for( int nUnpadded = pUnpaddedReadTypesCoveringEachBase_->nGetStartIndex(); nUnpadded <= pUnpaddedReadTypesCoveringEachBase_->nGetEndIndex(); ++nUnpadded ) { (*pOriginalUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] = (*pUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ]; } } void Contig :: 
createUniversalPrimerExperimentCandidatesList() {

    // Build the candidate list of universal-primer experiments for this
    // contig.  Each sub-step is gated by a consed parameter and logs its
    // progress to the autofinish output file pAO.

    if ( pCP->bAutoFinishAllowResequencingReads_ ) {
        fprintf( pAO, "creating list of possible experiments with universal primers..." );
        fflush( pAO );
        createExperimentsWithResequencingUniversalPrimers();
        fprintf( pAO, "done\n" );
        fflush( pAO );
    }

    if ( pCP->bAutoFinishAllowDeNovoUniversalPrimerSubcloneReads_ ) {
        fprintf( pAO, "creating list of possible experiments of reverses where there is only a forward or forwards where there is only a reverse..." );
        fflush( pAO );
        createExperimentsWithFirstForwardsOrFirstReverses();
        fprintf( pAO, "done\n\n" );
        fflush( pAO );
    }
}


// Build the candidate list of custom-primer experiments (subclone walks
// and/or whole-clone walks), if either kind is allowed by the parameters.
void Contig :: createCustomPrimerExperimentCandidatesList() {
    if ( pCP->bAutoFinishAllowCustomPrimerSubcloneReads_ ||
         pCP->bAutoFinishAllowWholeCloneReads_ ) {
        fprintf( pAO, "creating list of possible experiments with custom primers..." );
        fflush( pAO );
        createExperimentsWithCustomPrimers();
        fprintf( pAO, "done\n\n" );
        fflush( pAO );
    }
}


// Walk every possible read start position on both strands and, where a
// primer of at least minimum length fits on the consensus with adequate
// quality, consider creating whole-clone and/or subclone custom-primer
// experiments at that position.
void Contig :: createExperimentsWithCustomPrimers() {

    // now create experiments with custom primers
    pCP->setParametersForSequencingPrimersNotPCRPrimers( true );

    // create an experiment if it will solve more than
    // nAutoFinishMinimumErrorThresholdOfRead
    for( int nReadType = 0; nReadType < 2; ++nReadType ) {

        readTypes ucRead;
        // for now, just consider terminator reads
        if (nReadType == 0 )
            ucRead = TOP_STRAND_TERMINATOR_READS;
        else if ( nReadType == 1)
            ucRead = BOTTOM_STRAND_TERMINATOR_READS;
        else
            assert( false );

        bool bTopStrandNotBottomStrand = bIsTopStrandRead( ucRead );

        // NOTE(review): these two locals appear unused in the visible body
        // of this function -- confirm before removing.
        int nFirstExperimentHighQualityLeftEnd;
        int nLastExperimentHighQualityLeftEnd;

        // g=gap  C=consensus base
        // ggggggCCCCCCCCCCCCCCCCCCCCCCCCCCCgggggg
        // top strand reads from leftmost to rightmost
        //       ppppHHHHHHllll    ppppppHHHHHHHllllll
        // bottom strand reads
        //  lllllHHHHpppp      llllHHHHHpppp
        // That is, we try to make reads wherever there is
        // room on the consensus for the primer.
        // Primers can be of varying lengths.  Shall we save enough
        // room for the longest?
        int nFirstExperimentFirstBaseOfReadUnpadded;
        int nLastExperimentFirstBaseOfReadUnpadded;

        if ( bTopStrandNotBottomStrand ) {
            // we must be able to fit a primer onto the consensus.
            // ggggggCCCCCCCCCCCCCCCCCCCCCCCCCCCgggggg
            //       ppppHHHHHHllll    ppppppHHHHHHHllllll
            nFirstExperimentFirstBaseOfReadUnpadded =
                nGetUnpaddedStartIndex() + pCP->nPrimersMinimumLengthOfAPrimerToUse_;
            nLastExperimentFirstBaseOfReadUnpadded =
                nGetUnpaddedEndIndex() - pCP->nPrimersMinimumLengthOfAPrimerToUse_ + 1;
        }
        else {
            // ggggggCCCCCCCCCCCCCCCCCCCCCCCCCCCgggggg
            //  lllllHHHHpppp      llllHHHHHpppp
            nFirstExperimentFirstBaseOfReadUnpadded = nGetUnpaddedStartIndex() - 1;
            nLastExperimentFirstBaseOfReadUnpadded =
                nGetUnpaddedEndIndex() - pCP->nPrimersMinimumLengthOfAPrimerToUse_;
        }

        // NOTE(review): despite the name, this is initialized from the
        // min-errors-fixed parameter and appears unused below -- confirm.
        double dMinQuality =
            consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_;

        // all unpadded positions
        // nFirstBaseOfReadUnpadded is the most 5' base of the read, after the primer
        // Thus it is the left end of the read for top strand reads,
        // and the right end of the read for bottom strand reads.
        for( int nFirstBaseOfReadUnpadded = nFirstExperimentFirstBaseOfReadUnpadded;
             nFirstBaseOfReadUnpadded <= nLastExperimentFirstBaseOfReadUnpadded;
             ++nFirstBaseOfReadUnpadded ) {

            // see if this experiment will solve more than the minimum # of errors
            // I will do that by asking a simpler question:  "is there more than
            // the minimum number of errors in the entire region covered by
            // the potential read?"  If not, skip this experiment.
            double dMaxErrors = dGetInitialMaxErrorsFixed2( nFirstBaseOfReadUnpadded,
                                                            bTopStrandNotBottomStrand );

            if ( dMaxErrors < pCP->dAutoFinishMinNumberOfErrorsFixedByAnExp_ )
                continue;

            // However, the quality might be too
            // low to create a primer.  If a primer can't be created,
            // there is no point in considering this experiment further.
            // So check that next:
            if ( !bCustomPrimerQualityTest( nFirstBaseOfReadUnpadded,
                                            bTopStrandNotBottomStrand ) )
                continue;

            if ( pCP->bAutoFinishAllowWholeCloneReads_ ) {
                maybeCreateExperimentWithCustomPrimer( ucRead,
                                                       bTopStrandNotBottomStrand,
                                                       nFirstBaseOfReadUnpadded,
                                                       true,     // bCloneTemplateNotSubcloneTemplate
                                                       false );  // bForCoveringSingleSubcloneRegions
            }

            if ( pCP->bAutoFinishAllowCustomPrimerSubcloneReads_ ) {
                maybeCreateExperimentWithCustomPrimer( ucRead,
                                                       bTopStrandNotBottomStrand,
                                                       nFirstBaseOfReadUnpadded,
                                                       false,    // bCloneTemplateNotSubcloneTemplate
                                                       false );  // bForCoveringSingleSubcloneRegions
            }
        } // for( int nFirstBaseOfReadUnpadded = ...
    } // for (int nReadType...

    // no longer sorting this list--just pick out top one each time.
    // Much more efficient.
}


// Returns true if a minimum-length primer for a read whose first base
// (1-based unpadded) is nFirstBaseOfReadUnpadded lies entirely on the
// consensus and every base under the primer meets nPrimersMinQuality_.
bool Contig :: bCustomPrimerQualityTest( const int nFirstBaseOfReadUnpadded,
                                         const bool bTopStrandNotBottomStrand ) {

    // nFirstBaseOfReadUnpadded is in 1based unpadded coordinates, but primer
    // arrays in in 0 based coordinates so convert
    int nZeroBasedUnpaddedLeftMost;
    int nZeroBasedUnpaddedRightMost;

    if ( bTopStrandNotBottomStrand ) {
        // subtract 1 to convert to 0 based coordinates.
        // then subtract 1 again to move from read to primer
        nZeroBasedUnpaddedRightMost = nFirstBaseOfReadUnpadded - 2;
        nZeroBasedUnpaddedLeftMost = nZeroBasedUnpaddedRightMost -
            consedParameters::pGetConsedParameters()->nPrimersMinimumLengthOfAPrimerToUse_ + 1;
    }
    else {
        // substract 1 to convert to 0-based coordinates
        // then add 1 to move from read to primer
        nZeroBasedUnpaddedLeftMost = nFirstBaseOfReadUnpadded;
        nZeroBasedUnpaddedRightMost = nZeroBasedUnpaddedLeftMost +
            consedParameters::pGetConsedParameters()->nPrimersMinimumLengthOfAPrimerToUse_ - 1;
    }

    int n1BasedLeft = nZeroBasedUnpaddedLeftMost + 1;
    int n1BasedRight = nZeroBasedUnpaddedRightMost + 1;

    // if the primer is off of the consensus, then can't make a custom oligo
    // Note that this only checks that the minimum length primer is on the
    // consensus.  It is possible that a longer length primer will be
    // off the consensus.  We will need to check for that when we create
    // the primers.
    if ( n1BasedLeft < nGetUnpaddedStartIndex() ) {
        return( false );
    }
    if ( n1BasedRight > nGetUnpaddedEndIndex() ) {
        return( false );
    }

    unsigned char ucMinAcceptableQuality =
        consedParameters::pGetConsedParameters()->nPrimersMinQuality_;

    for( int nPrimerPos = nZeroBasedUnpaddedLeftMost;
         nPrimerPos <= nZeroBasedUnpaddedRightMost;
         ++nPrimerPos ) {
        if ( pUnpaddedContig_->pUnpaddedQualityArray_[ nPrimerPos ] < ucMinAcceptableQuality )
            return( false );
    }

    return( true );
} // bCustomPrimerQualityTest


// Try to create a custom-primer experiment whose read's 5' base (1-based
// unpadded) is nFirstBaseOfReadUnpadded.  Generates primer candidates
// sharing that 3' end and runs them through the acceptance pipeline
// (false match, melting temp, self match, mononucleotide repeats, ACGT
// content, template availability); on success hands off to
// createExperimentWithCustomPrimer.
void Contig :: maybeCreateExperimentWithCustomPrimer( const readTypes ucRead,
                                                      const bool bTopStrandNotBottomStrand,
                                                      const int nFirstBaseOfReadUnpadded, // 1 -based coordinates
                                                      const bool bCloneTemplateNotSubcloneTemplate,
                                                      const bool bForCoveringSingleSubcloneRegions ) {

    int nUnpaddedPosOf3PrimeEnd;
    if ( bTopStrandNotBottomStrand ) {
        // subtract 1 to move from read to primer
        // (both are in 1-based coordinates)
        nUnpaddedPosOf3PrimeEnd = nFirstBaseOfReadUnpadded - 1;
    }
    else {
        // add 1 to move from read to primer
        // (both are in 1-based coordinates)
        nUnpaddedPosOf3PrimeEnd = nFirstBaseOfReadUnpadded + 1;
    }

    primerType* pPrimerArray;
    int nNumberOfPrimers;
    bool bSomeAcceptablePrimers = true;

    createPrimerCandidatesWith3PrimeEnd( this,
                                         nUnpaddedPosOf3PrimeEnd,
                                         bTopStrandNotBottomStrand,
                                         pPrimerArray,
                                         nNumberOfPrimers,
                                         bSomeAcceptablePrimers );

    if (!bSomeAcceptablePrimers )
        return;

    // only check the shortest--if it fails, they all will fail
    primerType* pShortestPrimer = pFindShortestAcceptablePrimer( pPrimerArray,
                                                                 nNumberOfPrimers );

    // the following must be true or else bPrimerHasMatchesElsewhere won't
    // do any checking
    if ( pShortestPrimer->nUnpaddedLength_ !=
         consedParameters::pGetConsedParameters()->nPrimersMinimumLengthOfAPrimerToUse_ ) {
        cout << "primer strange length = " << pShortestPrimer->nUnpaddedLength_ << endl;
        cout.flush();
        assert( false );
}

    // see whether any candidate primer is the special debug primer the
    // user asked autofinish to trace
    bool bFoundPrimer = false;
    for( int nPrimer = 0; nPrimer < nNumberOfPrimers; ++nPrimer ) {
        primerType* pPrimer = pPrimerArray + nPrimer;
        if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugPrimer( pPrimer ) ) {
            bFoundPrimer = true;
            break;
        }
    }

    if ( bFoundPrimer ) {
        cerr << "found primer before bPrimerHasMatchesElsewhere, shortest version with same 3' end is ";
        for( int n = 0; n < pShortestPrimer->nUnpaddedLength_; ++n ) {
            cerr << pShortestPrimer->szPrimer_[ n ];
        }
        cerr << endl;
    }

    if ( bPrimerHasMatchesElsewhere( pShortestPrimer,
                                     bTopStrandNotBottomStrand,
                                     bCloneTemplateNotSubcloneTemplate ) )
        return;

    // only check the shortest--if it fails, they all will fail
    if ( bPrimerHasMatchesAgainstSequencesInFile( pShortestPrimer,
                                                  bCloneTemplateNotSubcloneTemplate) )
        return;

    whichPrimersHaveAcceptableMeltingTemp( pPrimerArray,
                                           nNumberOfPrimers,
                                           bSomeAcceptablePrimers );
    if ( !bSomeAcceptablePrimers )
        return;

    whichPrimersHaveAcceptableSelfMatch( pPrimerArray,
                                         nNumberOfPrimers,
                                         bSomeAcceptablePrimers );
    if ( !bSomeAcceptablePrimers )
        return;

    int nNumberOfAcceptablePrimers;
    checkPrimersForMononucleotideRepeats( pPrimerArray,
                                          nNumberOfPrimers,
                                          nNumberOfAcceptablePrimers );
    if ( nNumberOfAcceptablePrimers == 0 )
        return;

    checkPrimersForACGT( pPrimerArray,
                         nNumberOfPrimers,
                         nNumberOfAcceptablePrimers );
    if ( nNumberOfAcceptablePrimers == 0 )
        return;

    // if reached here, there is at least 1 acceptable primer for this
    // position.  There may be more, in which case I think that we
    // should just take the shortest.
    primerType* pPrimer = pFindShortestAcceptablePrimer( pPrimerArray,
                                                         nNumberOfPrimers );

    // if this is using a subclone template, we need to be sure
    // that there is some acceptable template
    if ( !bCloneTemplateNotSubcloneTemplate ) {
        // isn't pPrimer->pContig_ always this?  (DG, Dec 1998)
        if ( !pPrimer->pContig_->bFindTemplatesForPrimer( pPrimer, ucRead ) ) {
            return;
        }
    }

    if ( bFoundPrimer ) {
        cerr << "found primer after all checks except errors fixed ";
        for( int n = 0; n < pShortestPrimer->nUnpaddedLength_; ++n ) {
            cerr << pShortestPrimer->szPrimer_[ n ];
        }
        cerr << endl;
    }

    // before we just checked roughly if the read would fix enough errors.
    // Now let's check precisely
    // this needs to take into account the template lengths for subclone
    // reads.  For whole clone reads, we will assume that the read will
    // be the whole length of the model read.  In the future, we may
    // downgrade this quality.
    // find the read in unpadded consensus coordinates
    double dErrorsFixed;
    if ( bForCoveringSingleSubcloneRegions ) {
        dErrorsFixed = -2.0; // no purpose served by calculating this.
                             // since it isn't used.
    }
    else {
        if ( bCloneTemplateNotSubcloneTemplate ) {
            dErrorsFixed =
                dGetErrorsFixedForWholeCloneRead( bTopStrandNotBottomStrand,
                                                  nFirstBaseOfReadUnpadded,
                                                  false,    // bChooseExperiment
                                                  false,    // bJustConsensusNotGaps
                                                  false );  // bFindWindowOfMostErrorsFixed
        }
        else {
            // subclone template(s)
            // This calculation will consider all of the different templates
            dErrorsFixed =
                dGetErrorsFixedForCustomPrimerSubcloneRead( pPrimer,
                                                            bTopStrandNotBottomStrand,
                                                            nFirstBaseOfReadUnpadded,
                                                            false,    // bChooseExperiment
                                                            false,    // bJustConsensusNotGaps
                                                            false );  // bFindWindowOfMostErrorsFixed
        }

        if ( dErrorsFixed < pCP->dAutoFinishMinNumberOfErrorsFixedByAnExp_ )
            return;
    } // if ( bForCoveringSingleSubcloneRegions ) else

    if ( bFoundPrimer ) {
        cerr << "found primer before createExperimentWithCustomPrimer ";
        for( int n = 0; n < pShortestPrimer->nUnpaddedLength_; ++n ) {
            cerr << pShortestPrimer->szPrimer_[ n ];
        }
        cerr << " dErrorsFixed = " << dErrorsFixed << endl;
        cerr << endl;
    }

    createExperimentWithCustomPrimer( pPrimer,
                                      ucRead,
                                      bTopStrandNotBottomStrand,
                                      nFirstBaseOfReadUnpadded,
                                      bCloneTemplateNotSubcloneTemplate,
                                      bForCoveringSingleSubcloneRegions,
                                      dErrorsFixed );
}


// Package an accepted primer into a new experiment object, fill in its
// cost, extents and state, and (in the common case) its error data.
void Contig ::
createExperimentWithCustomPrimer( primerType* pPrimer,
                                  const readTypes ucRead,
                                  const bool bTopStrandNotBottomStrand,
                                  const int nFirstBaseOfReadUnpadded,
                                  const bool bCloneTemplateNotSubcloneTemplate,
                                  const bool bForCoveringSingleSubcloneRegions,
                                  const double dErrorsFixed ) {

    // we will get rid of the primer's memory so save primer info
    primerType* pSavedPrimer = (primerType*) malloc( sizeof( primerType ) );
    memcpy( pSavedPrimer, pPrimer, sizeof( primerType ) );

    experiment* pExp = new experiment();

    pExp->dErrorsFixed_ = dErrorsFixed;
    pExp->ucStrandAndChemistry_ = ucRead;
    // pExp->bCustomPrimerNotUniversalPrimer_ = true;
    pExp->nReadType_ = nWalk;
    pExp->bCloneTemplateNotSubcloneTemplate_ = bCloneTemplateNotSubcloneTemplate;
    pExp->fCost_ = ( bCloneTemplateNotSubcloneTemplate ) ?
        consedParameters::pGetConsedParameters()->dAutoFinishCostOfCustomPrimerCloneReaction_ :
        consedParameters::pGetConsedParameters()->dAutoFinishCostOfCustomPrimerSubcloneReaction_;
    pExp->pCustomPrimer_ = pSavedPrimer;
    pExp->pContig_ = this;
    pExp->pSubcloneTemplate_ = NULL;
    pExp->nFirstBaseOfReadUnpadded_ = nFirstBaseOfReadUnpadded;

    // Note:  these positions do *not* take into account the extents of
    // the subclone templates
    if ( bTopStrandNotBottomStrand ) {
        pExp->nUnpaddedLeft_ = nFirstBaseOfReadUnpadded;
        pExp->nUnpaddedRight_ = nFirstBaseOfReadUnpadded +
            ConsEd::pGetAssembly()->pModelRead_->length() - 1;
    }
    else {
        pExp->nUnpaddedRight_ = nFirstBaseOfReadUnpadded;
        pExp->nUnpaddedLeft_ = nFirstBaseOfReadUnpadded -
            ConsEd::pGetAssembly()->pModelRead_->length() + 1;
    }

    pExp->bTopStrandNotBottomStrand_ = bTopStrandNotBottomStrand;
    pExp->nState_ = nAvailableExperiment;

    // note in the below calculations that the nAutoFinishPotential...
// parameters are 0-based

    if ( !bForCoveringSingleSubcloneRegions )
        setVariousErrorData( pExp );
    else {
        orderTemplatesAndSetSingleSubcloneValues( pExp );

        if ( pExp->nSingleSubcloneBasesFixed_ <
             consedParameters::pGetConsedParameters()->nAutoFinishMinNumberOfSingleSubcloneBasesFixedByAnExp_ ) {
            // this experiment doesn't fix enough single subclone bases--reject it
            delete pExp;
            return;
        }

        // in the case of custom primer subclone experiments for
        // single subclone regions, the errors fixed has not yet been set
        // (I actually don't know if it matters, because this value is not
        // printed out--let's see if we can get away without even calculating
        // it.
        // if ( pExp->bIsCustomPrimerSubcloneRead() ) {
        //     pExp->dErrorsFixed_ = dGetErrorsFixedForCustomPrimerSubcloneRead(
        //        pPrimer,
        //        bTopStrandNotBottomStrand,
        //        nFirstBaseOfReadUnpadded,
        //        false,   // bChooseExperiment
        //        false,   // bJustConsensusNotGaps
        //        false ); // bFindWindowOfMostErrorsFixed
        // }
    }

    pCandidateExperiments_->insert( pExp );
}


// Fill in the derived error bookkeeping for a new candidate experiment:
// the consensus-only errors fixed, which gap (if any) the read extends
// into, and the errors-fixed-per-dollar figure of merit.
void Contig :: setVariousErrorData( experiment* pExp ) {

    calculateErrorsFixedJustInConsensus( pExp );

    if ( ( nGetUnpaddedStartIndex() <= pExp->nUnpaddedLeft_ ) &&
         ( pExp->nUnpaddedRight_ <= nGetUnpaddedEndIndex() ) ) {
        pExp->nExtendsIntoWhichGap_ = nNoGap;
    }
    else {
        if ( pExp->nUnpaddedLeft_ < nGetUnpaddedStartIndex() )
            pExp->nExtendsIntoWhichGap_ = nLeftGap;
        else
            pExp->nExtendsIntoWhichGap_ = nRightGap;
    }

    pExp->dErrorsFixedPerDollar_ = pExp->dErrorsFixed_ / pExp->fCost_;

    // let's just drop the whole business (DG, 10/98)
    // The .00000001 business is because, due to rounding error, the
    // errors fixed in the consensus can be larger
    // if ( ! (pExp->dErrorsFixedJustInConsensus_ <=
    //         pExp->dErrorsFixed_ + 0.0000000001 ) ) {
    //     printf( "problem with the following experiment: " );
    //     printf( "diff = %e\n",
    //             pExp->dErrorsFixedJustInConsensus_ -
    //             pExp->dErrorsFixed_ );
    //     printf( " pExp->dErrorsFixedJustInConsensus_ = %f \npExp->dErrorsFixed_ = %f\n",
    //             pExp->dErrorsFixedJustInConsensus_,
    //             pExp->dErrorsFixed_ );
    //     pExp->print();
    //     assert( false );
    // }
}


// Returns true while the extended contig's expected error count still
// exceeds the target set by setTargetNumberOfErrorsForThisContig().
bool Contig :: bStillTooManyExpectedErrors() {
    if ( dExpectedNumberOfErrorsInExtendedContig_ > dTargetNumberOfErrors_ )
        return true;
    else
        return false;
}


// Compute pExp->dErrorsFixedJustInConsensus_ (and the per-dollar
// version).  When the read lies entirely on the consensus this equals
// dErrorsFixed_; otherwise it is recomputed counting consensus bases only.
void Contig :: calculateErrorsFixedJustInConsensus( experiment* pExp ) {

    // now calculate the dErrorsFixedJustInConsensus_
    // If the high quality region and low quality regions are on the
    // consensus entirely, then
    // this is the same as the dErrorsFixed_
    // Otherwise, we have to calculate it explicitly.
    if ( ( nGetUnpaddedStartIndex() <= pExp->nUnpaddedLeft_ ) &&
         ( pExp->nUnpaddedRight_ <= nGetUnpaddedEndIndex() ) )
        pExp->dErrorsFixedJustInConsensus_ = pExp->dErrorsFixed_;
    else {
        // case in which the read extends out over the end of the consensus
        // and thus only some of the errors fixed improve the consensus
        // accuracy
        if ( pExp->bIsWholeCloneRead() ) {
            pExp->dErrorsFixedJustInConsensus_ =
                dGetErrorsFixedForWholeCloneRead( pExp->bTopStrandNotBottomStrand_,
                                                  pExp->nFirstBaseOfReadUnpadded_,
                                                  false,    // bChooseExperiment
                                                  true,     // bJustConsensusNotGaps
                                                  false );  // bFindWindowOfMostErrorsFixed
        }
        else if ( pExp->bIsCustomPrimerSubcloneRead() ) {
            pExp->dErrorsFixedJustInConsensus_ =
                dGetErrorsFixedForCustomPrimerSubcloneRead( pExp->pCustomPrimer_,
                                                            pExp->bTopStrandNotBottomStrand_,
                                                            pExp->nFirstBaseOfReadUnpadded_,
                                                            false,    // bChooseExperiment
                                                            true,     // bJustConsensusNotGaps
                                                            false );  // bFindWindowOfMostErrorsFixed
        }
        else if ( pExp->bIsUniversalPrimerRead() ) {
            pExp->dErrorsFixedJustInConsensus_ =
                dGetErrorsFixedForSubcloneRead( &(pExp->pSubcloneTemplate_),
                                                1,        // nNumberOfSubclones
                                                pExp->bTopStrandNotBottomStrand_,
                                                pExp->nFirstBaseOfReadUnpadded_,
                                                true,     // bExcludeVectorBasesAtBeginningOfRead
                                                false,    // bChooseExperiment
                                                true,     // bJustConsensusNotGaps
                                                false );  // bFindWindowOfMostErrorsFixed
        }
        else
            assert( false );
    } // if ( ( nGetUnpaddedStartIndex() <= ... else

    pExp->dErrorsFixedJustInConsensusPerDollar_ =
        pExp->dErrorsFixedJustInConsensus_ / pExp->fCost_;
}


// Deduct a chosen experiment's contribution from the running expected
// error totals for the extended contig and for the consensus alone.
void Contig :: adjustExpectedNumberOfErrors( experiment* pExp ) {
    dExpectedNumberOfErrorsInExtendedContig_ -= pExp->dErrorsFixed_;
    dExpectedNumberOfErrorsInConsensus_ -= pExp->dErrorsFixedJustInConsensus_;
}


// Scan all candidate experiments and return the available one with the
// highest errors-fixed-per-dollar, applying the spacing and
// high-quality-segment-overlap rejection rules along the way.  May
// return NULL.  (The returned experiment may duplicate one tried before;
// pGetBestExperimentCandidate() filters for that.)
experiment* Contig :: pGetBestExperimentCandidateButMightHaveBeenTriedBefore() {

    // this has been written so that it only returns experiments
    // that fix the min required number of errors in the consensus
    experiment* pBestSoFar = NULL;
    double dErrorsFixedPerDollarBestSoFar = 0.0;

    for( int nCan = 0; nCan < pCandidateExperiments_->length(); ++nCan ) {
        experiment* pExp = (*pCandidateExperiments_)[ nCan ];

        if ( pExp->nState_ == nChosenExperiment )
            continue;
        if ( pExp->nState_ == nRejectedExperiment )
            continue;

        // commented out because rounding error can make this not true
        // assert( pExp->dErrorsFixedJustInConsensus_ <=
        //         pExp->dErrorsFixed_ + 0.0000000001 );

        // I don't believe we should have any requirement about fixing a minimum
        // number of errors in the consensus.  DG, Jan 2000.  But we should
        // have a minimum requirement about fixing total errors.
if ( pExp->dErrorsFixed_ <
             consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ )
            continue;

        if ( pExp->dErrorsFixedPerDollar_ > dErrorsFixedPerDollarBestSoFar ) {

            // check that, if the read is a subclone custom primer read,
            // it isn't too close to another subclone custom primer read
            // on the same strand that has already been chosen
            if ( pCP->bAutoFinishDoNotAllowSubcloneCustomPrimerReadsCloseTogether_ ) {
                if ( pExp->nReadType_ == nWalk &&
                     !pExp->bCloneTemplateNotSubcloneTemplate_ &&
                     bThereIsAnotherSubcloneCustomPrimerReadTooCloseOnTheSameStrand( pExp ) ) {
                    // this experiment can never be considered again in this
                    // round of autofinish, so reject it
                    pExp->nState_ = nRejectedExperiment;
                    continue;
                }
            }

            // same spacing rule for whole-clone custom primer reads
            if ( pCP->bAutoFinishDoNotAllowWholeCloneCustomPrimerReadsCloseTogether_ &&
                 pExp->nReadType_ == nWalk &&
                 pExp->bCloneTemplateNotSubcloneTemplate_ &&
                 bThereIsAnotherWholeCloneCustomPrimerReadTooCloseOnTheSameStrand( pExp ) ) {
                pExp->nState_ = nRejectedExperiment;
                continue;
            }

            // If the experiment is partly off the extended high quality
            // segment of the
            // consensus, then check that it overlaps be enough
            if ( ( pExp->nUnpaddedLeft_ < nUpdatedUnpaddedHighQualitySegmentStart_ ) ||
                 ( nUnpaddedHighQualitySegmentEnd_ < pExp->nUnpaddedRight_ ) ) {
                if ( nGetAmountOfOverlap( pExp->nUnpaddedLeft_,
                                          pExp->nUnpaddedRight_,
                                          nUnpaddedHighQualitySegmentStart_,
                                          nUnpaddedHighQualitySegmentEnd_ ) <
                     pCP->nAutoFinishMinBaseOverlapBetweenAReadAndHighQualitySegmentOfConsensus_ ) {
                    fprintf( pAO, "\nrejecting experiment from %d to %d because it doesn't overlap the high quality segment %d to %d by at least %d bases\n",
                             pExp->nUnpaddedLeft_,
                             pExp->nUnpaddedRight_,
                             nUnpaddedHighQualitySegmentStart_,
                             nUnpaddedHighQualitySegmentEnd_,
                             pCP->nAutoFinishMinBaseOverlapBetweenAReadAndHighQualitySegmentOfConsensus_ );
                    continue;
                }
            }

            pBestSoFar = pExp;
            dErrorsFixedPerDollarBestSoFar = pExp->dErrorsFixedPerDollar_;
        }
    }

    // note:  pBestSoFar might be null if all experiments are chosen
    return( pBestSoFar );
}


// Repeatedly take the best remaining candidate, rejecting any that
// duplicate an experiment already tried, until one survives or none is
// left.  May return NULL.
experiment* Contig :: pGetBestExperimentCandidate() {

    experiment* pBestExperiment = NULL;
    bool bTryAgain = false;

    do {
        pBestExperiment = pGetBestExperimentCandidateButMightHaveBeenTriedBefore();
        if ( pBestExperiment ) {
            if ( pBestExperiment->bHasThisExperimentOrOneLikeItBeenTriedBefore() ) {
                pBestExperiment->nState_ = nRejectedExperiment;
                bTryAgain = true;
            }
            else
                bTryAgain = false;
        }
        else
            bTryAgain = false;
    } while( bTryAgain );

    return( pBestExperiment );
}


// After an experiment is chosen, update the error-probability arrays and
// then re-derive the errors fixed by every still-available candidate that
// overlaps the chosen one.
void Contig :: adjustRemainingExperimentCandidates( experiment* pExperimentJustChosen ) {

    // first, adjust the ErrorsFixed arrays
    adjustConsensusDistributionsAndArrays( pExperimentJustChosen );

    // don't bother adjusting the 1st experiment, since that is
    // the one we just chose
    if ( pCandidateExperiments_->length() < 2 )
        return;

    for( int nExperiment = 0;
         nExperiment < pCandidateExperiments_->length();
         ++nExperiment ) {

        experiment* pExperimentToBeAdjusted = (*pCandidateExperiments_)[ nExperiment ];

        if (pExperimentToBeAdjusted == pExperimentJustChosen )
            continue;

        // note that once an experiment falls below the minimum number of
        // errors fixed, it is rejected and won't be adjusted or considered
        // again
        if (pExperimentToBeAdjusted->nState_ != nAvailableExperiment )
            continue;

        if ( pExperimentJustChosen->bOverlaps( pExperimentToBeAdjusted ) ) {
            adjustErrorsFixed( pExperimentToBeAdjusted );
        }
    }
}


// After having adjusted the pUnpaddedErrorProbabilities_ array,
// recalculate the errors fixed by each experiment.  Note that it is
// only necessary to do this to the experiments that overlap the one
// that was just chosen, since any other experiment will fix the same
// number of errors that it did before.
// Recompute the expected number of errors fixed by a still-available
// candidate experiment after the error-probability arrays were updated
// (because an overlapping experiment was just chosen).
//
// Dispatches on the three kinds of candidate reads (custom primer
// subclone, whole clone, universal primer), refreshes dErrorsFixed_,
// the consensus-only figure, and the errors-per-dollar ratio, and
// rejects the experiment outright once it no longer fixes the minimum
// required number of consensus errors.
void Contig :: adjustErrorsFixed( experiment* pExperimentToBeAdjusted ) {

    experiment* pExp = pExperimentToBeAdjusted;

    if ( pExp->bIsCustomPrimerSubcloneRead() ) {

        pExp->dErrorsFixed_ =
            dGetErrorsFixedForCustomPrimerSubcloneRead(
                pExp->pCustomPrimer_,
                pExp->bTopStrandNotBottomStrand_,
                pExp->nFirstBaseOfReadUnpadded_,
                false,    // bChooseExperiment
                false,    // bJustConsensusNotGaps
                false );  // bFindWindowOfMostErrorsFixed

        if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugPrimer( pExp->pCustomPrimer_ ) ) {
            fprintf( pAO, "\nfound primer after adjustErrorsFixed--printing:\n" );
            pExp->print();
        }
    }
    else if ( pExp->bIsWholeCloneRead() ) {

        pExp->dErrorsFixed_ =
            dGetErrorsFixedForWholeCloneRead(
                pExp->bTopStrandNotBottomStrand_,
                pExp->nFirstBaseOfReadUnpadded_,
                false,    // bChooseExperiment
                false,    // bJustConsensusNotGaps
                false );  // bFindWindowOfMostErrorsFixed

        if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugPrimer( pExp->pCustomPrimer_ ) ) {
            fprintf( pAO, "\nfound debug read after adjustErrorsFixed--printing:\n" );
            pExp->print();
        }
    }
    else if ( pExp->bIsUniversalPrimerRead() ) {

        // A resequencing read may not be allowed to extend the contig; in
        // that case only errors fixed within the consensus itself count.
        bool bJustCountConsensusErrorsFixed = false;
        if ( !pExp->bDeNovoUniversalPrimerRead_ &&
             !pCP->bAutoFinishAllowResequencingReadsToExtendContigs_ )
            bJustCountConsensusErrorsFixed = true;

        pExp->dErrorsFixed_ =
            dGetErrorsFixedForSubcloneRead(
                &(pExp->pSubcloneTemplate_),
                1,        // nNumberOfSubclones
                pExp->bTopStrandNotBottomStrand_,
                pExp->nFirstBaseOfReadUnpadded_,
                true,     // bExcludeVectorBasesAtBeginningOfRead
                false,    // bChooseExperiment
                bJustCountConsensusErrorsFixed,  // bJustConsensusNotGaps
                false );  // bFindWindowOfMostErrorsFixed

        if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugRead( pExp ) ) {
            fprintf( pAO, "\nfound debug read after adjustErrorsFixed--printing:\n" );
            pExp->print();
        }
    }
    else
        assert( false );

    calculateErrorsFixedJustInConsensus( pExp );

    pExp->dErrorsFixedPerDollar_ = pExp->dErrorsFixed_ / pExp->fCost_;

    // This routine is only applied to available experiments that overlap
    // an experiment just picked, so the errors fixed really do need
    // recalculating here.  Once an experiment falls below the minimum it
    // is rejected and never adjusted or considered again.
    // NOTE(review): the rejection threshold here uses the consensus-only
    // figure, while pGetBestExperimentCandidateButMightHaveBeenTriedBefore
    // filters on total dErrorsFixed_ -- confirm that asymmetry is intended.
    if ( pExp->dErrorsFixedJustInConsensus_ <
         consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ ) {
        pExp->nState_ = nRejectedExperiment;
    }
}


// When an experiment is chosen, adjust the pUnpaddedErrorProbabilities_
// to reflect the fact that some of the errors have been fixed in
// the region covered by the chosen experiment.
void Contig :: adjustConsensusDistributionsAndArrays( experiment* pExperimentJustChosen ) {

    // (commented-out debug trap retained from the original:)
    // if ( pExperimentJustChosen->nUnpaddedLeft_ == -177 ) {
    //     char szPrimer[ 200 ];
    //     for( int n = 0; n < pExperimentJustChosen->pCustomPrimer_->nUnpaddedLength_; ++n )
    //         szPrimer[n] = pExperimentJustChosen->pCustomPrimer_->szPrimer_[n];
    //     szPrimer[ pExperimentJustChosen->pCustomPrimer_->nUnpaddedLength_ ] = '\0';
    //     fprintf( pAO, "found experiment at -177\n" );
    //     if ( strcmp( szPrimer, "cctccacacctcccc" ) == 0 ) {
    //         fprintf( pAO, "found experiment we think should not have been chosen:" );
    //         pExperimentJustChosen->print();
    //         bFoundDebug = true;
    //     }
    // }

    // 3 cases:  custom primer subclone, whole clone, univ primer
    // (bChooseExperiment is passed true so the called routine also updates
    // the distributions/arrays, not just computes the number)
    double dErrorsFixed;
    if ( pExperimentJustChosen->bIsUniversalPrimerRead() ) {

        bool bJustCountConsensusErrorsFixed = false;
        if ( !pExperimentJustChosen->bDeNovoUniversalPrimerRead_ &&
             !pCP->bAutoFinishAllowResequencingReadsToExtendContigs_ )
            bJustCountConsensusErrorsFixed = true;

        dErrorsFixed =
            dGetErrorsFixedForSubcloneRead( &(pExperimentJustChosen->pSubcloneTemplate_),
                                            1,        // nNumberOfSubclones
                                            pExperimentJustChosen->bTopStrandNotBottomStrand_,
                                            pExperimentJustChosen->nFirstBaseOfReadUnpadded_,
                                            true,     // bExcludeVectorBasesAtBeginningOfRead
                                            true,     // bChooseExperiment
                                            bJustCountConsensusErrorsFixed,  // bJustConsensusNotGaps
                                            false );  // bFindWindowOfMostErrorsFixed

        if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugRead( pExperimentJustChosen ) ) {
            fprintf( pAO, "\nfound debug read in adjustConsensusDistributionsAndArrays, fixed %g--printing:\n", dErrorsFixed );
            pExperimentJustChosen->print();
        }
    }
    else if ( pExperimentJustChosen->bIsCustomPrimerSubcloneRead() ) {
        dErrorsFixed =
            dGetErrorsFixedForCustomPrimerSubcloneRead( pExperimentJustChosen->pCustomPrimer_,
                                                        pExperimentJustChosen->bTopStrandNotBottomStrand_,
                                                        pExperimentJustChosen->nFirstBaseOfReadUnpadded_,
                                                        true,     // bChooseExperiment
                                                        false,    // bJustConsensusNotGaps
                                                        false );  // bFindWindowOfMostErrorsFixed
    }
    else if ( pExperimentJustChosen->bIsWholeCloneRead() ) {
        dErrorsFixed =
            dGetErrorsFixedForWholeCloneRead( pExperimentJustChosen->bTopStrandNotBottomStrand_,
                                              pExperimentJustChosen->nFirstBaseOfReadUnpadded_,
                                              true,     // bChooseExperiment
                                              false,    // bJustConsensusNotGaps
                                              false );  // bFindWindowOfMostErrorsFixed
    }
    else
        assert( false ); // shouldn't be any other kind of experiment

    // sanity check:  the recomputed errors fixed should agree with the
    // value cached on the experiment (within rounding)
    if ( ( ABS( dErrorsFixed - pExperimentJustChosen->dErrorsFixed_ ) > .000001 ) &&
         ( pExperimentJustChosen->nPurpose_ != nCoverSingleSubcloneBases ) ) {
        fprintf( pAO, "debugging: adjustConsensusDistributionsAndArrays, dErrorsFixed = %e but pExperimentJustChosen->dErrorsFixed_ = %e for experiment from %d to %d id start: %d\n",
                 dErrorsFixed,
                 pExperimentJustChosen->dErrorsFixed_,
                 pExperimentJustChosen->nUnpaddedLeft_,
                 pExperimentJustChosen->nUnpaddedRight_,
                 pExperimentJustChosen->aExpID_[0] );
        cerr << "alert--see \"debugging\" in .out file--there is a bug" << endl;
    }
}


// Report (to the autofinish output file) that this contig is already
// below the error threshold and needs no experiments.
void Contig :: printNoExperiments() {
    fprintf( pAO, "CONTIG: %s does not need any experiments because it is already below the error threshold ",
             (char*) soGetName().data() );
    fprintf( pAO, "ACCEPTABLE ERRORS: %.1f CURRENT ERRORS EXTENDED CONTIG: %.1f CONSENSUS: %.1f\n ",
             dTargetNumberOfErrors_,
             dExpectedNumberOfErrorsInExtendedContig_,
             dExpectedNumberOfErrorsInConsensus_ );
}


// Set dTargetNumberOfErrors_ from the acceptable-errors-per-megabase
// parameter, counting only positions with a nonzero error probability.
void Contig :: setTargetNumberOfErrorsForThisContig() {
    // this includes gap errors
    int nBasesToCareAbout = 0;
    for( int nUnpadded = pOriginalUnpaddedErrorProbabilities_->nGetStartIndex();
         nUnpadded <= pOriginalUnpaddedErrorProbabilities_->nGetEndIndex();
         ++nUnpadded ) {
        if ( (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ] > 0.0 )
            ++nBasesToCareAbout;
    }

    dTargetNumberOfErrors_ =
        ( (double) consedParameters::pGetConsedParameters()->nAutoFinishMaxAcceptableErrorsPerMegabase_ *
          (double) nBasesToCareAbout ) / (double) 1000000.0;
}


// Sum the original per-base error probabilities into the expected error
// totals for the extended contig and for the consensus alone.
void Contig :: setExpectedNumberOfErrorsForThisContig() {
    // this includes errors both in the consensus and in the gaps
    dExpectedNumberOfErrorsInExtendedContig_ = 0.0;
    int nUnpadded;
    for( nUnpadded = pOriginalUnpaddedErrorProbabilities_->nGetStartIndex();
         nUnpadded <= pOriginalUnpaddedErrorProbabilities_->nGetEndIndex();
         ++nUnpadded )
        dExpectedNumberOfErrorsInExtendedContig_ +=
            (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ];

    dExpectedNumberOfErrorsInConsensus_ = 0.0;
    for( nUnpadded = nGetUnpaddedStartIndex();
         nUnpadded <= nGetUnpaddedEndIndex();
         ++nUnpadded )
        dExpectedNumberOfErrorsInConsensus_ +=
            (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ];
}


// Same sums as above, but over the *current* (adjusted) error
// probability array, returned through the reference parameters.
void Contig :: recalculateCurrentExpectedNumberOfErrorsForThisContig( double& dErrorsInConsensus, double& dErrorsInExtendedContig ) {
    dErrorsInExtendedContig = 0.0;
    int nUnpadded;
    for( nUnpadded = pUnpaddedErrorProbabilities_->nGetStartIndex();
         nUnpadded <= pUnpaddedErrorProbabilities_->nGetEndIndex();
         ++nUnpadded ) {
        dErrorsInExtendedContig += (*pUnpaddedErrorProbabilities_)[ nUnpadded ];
    }

    dErrorsInConsensus = 0.0;
    for( nUnpadded = nGetUnpaddedStartIndex();
         nUnpadded <= nGetUnpaddedEndIndex();
         ++nUnpadded ) {
        dErrorsInConsensus += (*pUnpaddedErrorProbabilities_)[ nUnpadded ];
    }
}


// Expected errors per megabase over the (unpadded, unedited) consensus
// bases, derived from each base's quality value.  Returns -1 if there
// are no such bases.
double Contig :: dGetExpectedNumberOfErrorsPerMegabaseForConsensus() {
    // this includes only errors in the consensus
    int nUnpaddedBases = 0;
    double dExpectedNumberOfErrors = 0.0;
    double dError;
    // NOTE(review): dQuality appears unused -- confirm before removing.
    double dQuality;
    for( int nPos = nGetStartConsensusIndex(); nPos <= nGetEndConsensusIndex(); ++nPos ) {
        // tried ntGetCons( nPos ).bIsPad() but that still called the
        // operator=
        Ntide nt = ntGetCons( nPos );
        if ( !nt.bIsPad() ) {
            Quality qual = nt.qualGetQuality();
            // do not include edited bases in the quality
            // calculation
            if (qual == ucQualityLowEdited )
                continue;
            if (qual == ucQualityHighEdited )
                continue;
            dError = dErrorRateFromQuality[ qual ];
            dExpectedNumberOfErrors += dError;
            ++nUnpaddedBases;
        }
    }
    if ( nUnpaddedBases > 0 )
        dExpectedNumberOfErrors = (dExpectedNumberOfErrors * 1000000.0)/nUnpaddedBases;
    else
        dExpectedNumberOfErrors = -1;
    return( dExpectedNumberOfErrors );
}


// Find subclone templates that could support a read made from pPrimer in
// the orientation implied by ucRead, attaching acceptable templates to
// the primer.  Returns false (and marks the primer unacceptable with a
// reason code) if no template, or too few templates, cover the read.
bool Contig :: bFindTemplatesForPrimer( primerType* pPrimer, const readTypes ucRead ) {

    // We use the template's error rates (and the redundancy) to see which
    // template will be best and will cover.  (I am not going to use Pat's
    // parameter for length or not--there might be situations in which
    // we would only have a very short template, but it would be good
    // enough for the problem at hand.)
    // We are only interested in templates that cover this area:
    //      (primer)     (high quality)
    //      ---> llllllHHHHHHHHHHHHHHllllllllllllllllllllllllllllll
    //      a         d              b
    // where a = nReadConsPosLeft and b = nReadConsPosRight
    //      -----  (minimum acceptable extents of template--one more
    //              base than primer)
    // try templates like this:
    //   -----------                    (not ok)
    //   ------------------------------ (ok)
    //    --------                      (not ok)
    //    -----------------                ok
    //     ------------------------------
    //      ------------------------------
    //      ...
    // all of the ones below are past the primer, and all further
    // ones will be also, so we can stop looking at templates
    //
    // or for bottom strand templates:
    //
    //   llllllllllllHHHHHHHHHllll <---
    //                        -----  (minimum acceptable extents
    //                                of template--one more base
    //                                than primer)
    // try templates like this:
    //   --------------------------------   (not ok)
    //   ---------------------------------------  ok
    //    ------------------------------    (not ok)
    //    --------------------------------------  ok
    //    ...
    // all of the ones below are no good because they don't overlap
    // the high quality or low quality part of the read at all.  Since
    // the templates are sorted by left end, we can stop looking at templates

    bool bTopStrandNotBottomStrand = bIsTopStrandRead( ucRead );

    int nReadUnpaddedLeft;
    int nReadUnpaddedRight;

    if ( bTopStrandNotBottomStrand ) {
        // note that even though the read won't start until
        // pPrimer->nUnpaddedEnd_ + 1, the template must start at
        // pPrimer->nUnpaddedStart_ or to the left of it.  Otherwise
        // the primer won't bind to the template.
        nReadUnpaddedLeft = pPrimer->nUnpaddedStart_;
        nReadUnpaddedRight = pPrimer->nUnpaddedStart_ +
            pCP->nPrimersWhenChoosingATemplateMinPotentialReadLength_;
    }
    else {
        // for reverse primers, nUnpaddedEnd_ is still
        // the rightmost base of the primer
        nReadUnpaddedRight = pPrimer->nUnpaddedEnd_;
        nReadUnpaddedLeft = pPrimer->nUnpaddedEnd_ -
            pCP->nPrimersWhenChoosingATemplateMinPotentialReadLength_;
    }

    // since the templates are sorted by left-end, let's be generous and
    // look back further than is possible for a template to start and
    // still cover this read (if the user is honest with
    // the maximum template size)
    subcloneTTemplate* pSubTT = new subcloneTTemplate();
    pSubTT->nUnpaddedStart_ = nReadUnpaddedRight -
        consedParameters::pGetConsedParameters()->nPrimersMaxInsertSizeOfASubclone_;

    int nTemplate = aSubcloneTemplates_.nFindIndexOfMatchOrSuccessor( pSubTT );

    delete pSubTT; // this was just used to find the first one

    if ( nTemplate == RW_NPOS ) {
        pPrimer->bAcceptable_ = false;
        pPrimer->nWhyIsPrimerNotAcceptable_ = BAD_PRIMER_HAS_NO_TEMPLATE;
        return( false ); // no templates cover the read
    }

    bool bTemplatesStillMightCover = true;
    for( ;
         nTemplate < aSubcloneTemplates_.entries() && bTemplatesStillMightCover;
         ++nTemplate ) {

        pSubTT = aSubcloneTemplates_[ nTemplate ];

        if ( pSubTT->nUnpaddedStart_ > nReadUnpaddedLeft )
            bTemplatesStillMightCover = false;
        else {
            if ( !pSubTT->bOKToUseThisTemplate_ )
                continue;

            // check if the end of the template is to the right of the
            // read we want to make
            if ( pSubTT->nUnpaddedEnd_ < nReadUnpaddedRight )
                continue;

            // if reached here, the template covers the desired read
            // check that this template can be used for a read in the
            // desired orientation
            if ( bTopStrandNotBottomStrand ) {
                if ( !pSubTT->bOKToUseForTopStrandReads_ ) {
                    continue;
                }
            }
            else {
                if ( !pSubTT->bOKToUseForBottomStrandReads_ ) {
                    continue;
                }
            }

            pSubTT->tryToAddTemplateToPrimer( pPrimer );
        }
    }

    // count how many templates ended up attached to the primer
    int nNumberOfTemplates = 0;
    for( nTemplate = 0; nTemplate < nNUMBER_OF_TEMPLATES; ++nTemplate ) {
        if ( pPrimer->pSubcloneTemplate_[ nTemplate ] )
            ++nNumberOfTemplates;
        else
            break;
    }

    // check to see that at least one template was found
    if ( nNumberOfTemplates == 0 ) {
        pPrimer->bAcceptable_ = false;
        pPrimer->nWhyIsPrimerNotAcceptable_ = BAD_PRIMER_HAS_NO_TEMPLATE;
        return( false );
    }

    if ( nNumberOfTemplates < pCP->nPrimersMinNumberOfTemplatesForPrimers_ ) {
        pPrimer->bAcceptable_ = false;
        pPrimer->nWhyIsPrimerNotAcceptable_ = BAD_PRIMER_NOT_ENOUGH_TEMPLATES;
        return( false );
    }

    return( true );
}


// For each usable subclone template, find the best existing universal
// primer read on each strand (in this contig) and create a resequencing
// experiment from it.
void Contig :: createExperimentsWithResequencingUniversalPrimers() {

    for( int nTemplate = 0; nTemplate < aSubcloneTemplates_.entries(); ++nTemplate ) {

        subcloneTTemplate* pSubcloneTemplate = aSubcloneTemplates_[ nTemplate ];
        if ( !pSubcloneTemplate->bOKToUseThisTemplate_ )
            continue;

        LocatedFragment* pLocFragOfExistingTopStrandUniversalPrimer = NULL;
        LocatedFragment* pLocFragOfExistingBottomStrandUniversalPrimer = NULL;

        for( int nRead = 0; nRead < pSubcloneTemplate->aReads_.entries(); ++nRead ) {

            LocatedFragment* pLocFrag = pSubcloneTemplate->aReads_[ nRead ];

            // Below has the effect of creating an experiment for each
            // universal primer dye primer read.  This is ok, but
            // inefficient in 2 ways:
            // 1) there may already by a dye
            // terminator read and thus the new read won't fix any errors
            // and
            // 2) there may be more than one dye primer read.
// // There is a better way--just check to see if there is any // dye primer read and no dye terminator read. This would // not be helpful, though, in the redundant phase. For now, // I'm not changing it since efficiency is not a problem // currently (8/99 DG) // DG 11/99: Now that the initial and redundant phases are done // at once, we can do the following: just check if there is any // top strand read from this template. If so, do another. Check // if there is any bottom strand read from this template. If so, // do another. Don't worry about whether the existing reads // are dye primer or dye terminator reads. That will be taken // care of by the error probability arrays--reads of redundant // chemistry/strand will fix fewer errors. // fixed error 9/02 Jody Schwartz, LBL, // in which the existing read was a pcr end read // Autofinish tried to do a resequence from it and got a // segmentation fault. if ( !pLocFrag->bIsUniversalPrimerRead() ) continue; // this can happen since some templates have reads in // different contigs if ( pLocFrag->pGetContig() != this ) continue; if ( pLocFrag->bComp() ) { if ( !pLocFragOfExistingBottomStrandUniversalPrimer ) pLocFragOfExistingBottomStrandUniversalPrimer = pLocFrag; else { // which read is better quality? if ( pLocFrag->nGetHighQualitySegmentLengthSortOfUnpadded() > pLocFragOfExistingBottomStrandUniversalPrimer->nGetHighQualitySegmentLengthSortOfUnpadded() ) pLocFragOfExistingBottomStrandUniversalPrimer = pLocFrag; } } else { if ( !pLocFragOfExistingTopStrandUniversalPrimer ) pLocFragOfExistingTopStrandUniversalPrimer = pLocFrag; else { if ( pLocFrag->nGetHighQualitySegmentLengthSortOfUnpadded() > pLocFragOfExistingTopStrandUniversalPrimer->nGetHighQualitySegmentLengthSortOfUnpadded() ) pLocFragOfExistingTopStrandUniversalPrimer = pLocFrag; } } } // for( int nRead = 0; ... 
if ( pLocFragOfExistingTopStrandUniversalPrimer ) createExperimentWithResequencingUniversalPrimer( pLocFragOfExistingTopStrandUniversalPrimer, pSubcloneTemplate ); if ( pLocFragOfExistingBottomStrandUniversalPrimer ) createExperimentWithResequencingUniversalPrimer( pLocFragOfExistingBottomStrandUniversalPrimer, pSubcloneTemplate ); } } void Contig :: createExperimentWithResequencingUniversalPrimer( LocatedFragment* pExistingLocFrag, subcloneTTemplate* pSubcloneTemplate ) { bool bTopStrandNotBottomStrand = !pExistingLocFrag->bComp(); int nFirstBaseOfReadUnpadded; // these used to be off by 1 so that you would see 79 399 in the // autoFinishExpTag's (DG 7/29/99) // DG 11/99: The location of the beginning of the new read is best // determined by an existing universal primer read--*not* by the location // of the vector/insert junction since universal primer reads start // *way* before (80 or so bases) the vector/insert junction. I changed // the above so we check which read is highest quality and use that one // if there is more than one if ( bTopStrandNotBottomStrand ) { nFirstBaseOfReadUnpadded = pExistingLocFrag->nGetAlignStartUnpadded(); } else { nFirstBaseOfReadUnpadded = pExistingLocFrag->nGetAlignEndUnpadded(); } readTypes ucStrandAndChemistry = ( bTopStrandNotBottomStrand ? 
TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS ); // if we know where the end of the template is, we should use // that information // if we have already run into vector, use that information // to shorten the length of the possible read int nUnpaddedLeft; int nUnpaddedRight; if ( bTopStrandNotBottomStrand ) { nUnpaddedLeft = nFirstBaseOfReadUnpadded; nUnpaddedRight = nFirstBaseOfReadUnpadded + ConsEd::pGetAssembly()->pModelRead_->length() - 1; } else { nUnpaddedRight = nFirstBaseOfReadUnpadded; nUnpaddedLeft = nFirstBaseOfReadUnpadded - ConsEd::pGetAssembly()->pModelRead_->length() + 1; } // the read starts at the end of the X's rather than // at the start of the existing read if ( pSubcloneTemplate->bUnpaddedStartKnownPrecisely_ ) { nUnpaddedLeft = MAX( nUnpaddedLeft, pSubcloneTemplate->nUnpaddedStart_ ); } if ( pSubcloneTemplate->bUnpaddedEndKnownPrecisely_ ) { nUnpaddedRight = MIN( nUnpaddedRight, pSubcloneTemplate->nUnpaddedEnd_ ); } if ( nUnpaddedLeft > nUnpaddedRight ) return; if ( pCP->bAutoFinishAllowResequencingReadsOnlyForRunsAndStops_ ) { bool bCrossesARun; bool bCrossesAStop; if ( !bDoesRegionCrossARunOrStop( nUnpaddedLeft, nUnpaddedRight, bCrossesARun, bCrossesAStop ) ) return; } // since dGetErrorsFixed is expensive, let's first see if we can // quickly eliminate this read cheaply double dMaxErrorsFixed = dGetInitialMaxErrorsFixed( nUnpaddedLeft, nUnpaddedRight ); if ( dMaxErrorsFixed < pCP->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ ) return; // now, if the read has made the cut, then check it more rigorously bool bJustCountConsensusErrorsFixed = pCP->bAutoFinishAllowResequencingReadsToExtendContigs_ ? 
false : true; double dErrorsFixed = dGetErrorsFixedForSubcloneRead( &pSubcloneTemplate, 1, // only 1 subclone bTopStrandNotBottomStrand, nFirstBaseOfReadUnpadded, true, // exclude vector bases false, // bChooseExperiment bJustCountConsensusErrorsFixed, // bJustConsensusNotGaps false ); // bFindWindowOfMostErrorsFixed // don't suggest an experiment if the errors fixed // is not sufficiently high if ( dErrorsFixed < consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ ) return; experiment* pExp = new experiment(); pExp->dErrorsFixed_ = dErrorsFixed; pExp->ucStrandAndChemistry_ = ucStrandAndChemistry; // pExp->bCustomPrimerNotUniversalPrimer_ = false; pExp->nReadType_ = pExistingLocFrag->nReadType_; pExp->bCloneTemplateNotSubcloneTemplate_ = false; pExp->fCost_ = consedParameters::pGetConsedParameters()->dAutoFinishCostOfResequencingUniversalPrimerSubcloneReaction_; pExp->pCustomPrimer_ = NULL; pExp->pContig_ = this; pExp->pSubcloneTemplate_ = pSubcloneTemplate; pExp->nFirstBaseOfReadUnpadded_ = nFirstBaseOfReadUnpadded; pExp->nUnpaddedLeft_ = nUnpaddedLeft; pExp->nUnpaddedRight_ = nUnpaddedRight; pExp->bTopStrandNotBottomStrand_ = bTopStrandNotBottomStrand; pExp->nState_ = nAvailableExperiment; assert( pExp->nUnpaddedLeft_ <= pExp->nUnpaddedRight_ ); setVariousErrorData( pExp ); pCandidateExperiments_->insert( pExp ); if ( ConsEd::pGetAssembly()->bFoundAutoFinishDebugRead( pExp ) ) { fprintf( pAO, "\nfound debug read in createExperimentWithResequencingUniversalPrimer--printing:\n" ); pExp->print(); } } typedef struct { LocatedFragment* pLocFrag_; int nVectorInsertJunction_; // mark on not-X base } vectorReadType; static int nSortByXNotXJunction( const vectorReadType* pRead1, const vectorReadType* pRead2 ) { if ( pRead1->nVectorInsertJunction_ < pRead2->nVectorInsertJunction_ ) return( -1 ); else if ( pRead1->nVectorInsertJunction_ > pRead2->nVectorInsertJunction_ ) return( 1 ); else return( 0 ); } bool Contig :: 
bIsThisEndTheEndOfTheClone( const int nWhichEnd ) {
    // Lazily computed and cached:  the first query for each end runs
    // determineWhetherEndOfClone() and records the answer in the
    // b{Left,Right}EndOfContig... members.
    if ( nWhichEnd == nLeftGap ) {
        if ( !bLeftEndOfContigExaminedForEndOfClone_ )
            determineWhetherEndOfClone( nWhichEnd );
        return( bLeftEndOfContigIsEndOfClone_ );
    }
    else {
        if ( !bRightEndOfContigExaminedForEndOfClone_ )
            determineWhetherEndOfClone( nWhichEnd );
        return( bRightEndOfContigIsEndOfClone_ );
    }
}


// Decide (cDNA or genomic method, per bAutoFinishCDNANotGenomic_) whether
// the given contig end is the end of the clone, and cache the result.
void Contig :: determineWhetherEndOfClone( const int nWhichEnd ) {

    bool bEndOfClone;
    if ( pCP->bAutoFinishCDNANotGenomic_ )
        bEndOfClone = bIsThisEndTheEndOfTheCloneCDNA( nWhichEnd );
    else
        bEndOfClone = bIsThisEndTheEndOfTheCloneGenomic( nWhichEnd );

    if ( nWhichEnd == nLeftGap ) {
        bLeftEndOfContigExaminedForEndOfClone_ = true;
        bLeftEndOfContigIsEndOfClone_ = bEndOfClone;
    }
    else {
        bRightEndOfContigExaminedForEndOfClone_ = true;
        bRightEndOfContigIsEndOfClone_ = bEndOfClone;
    }
}


// cDNA case:  the end is considered the clone end if any read pointing
// into the contig within a +-50 base window of the consensus end is a
// universal forward/reverse read.
bool Contig :: bIsThisEndTheEndOfTheCloneCDNA( const int nWhichEnd ) {

    const int nAmountIntoConsensus = 50;
    const int nAmountOffConsensus = 50;

    int nWindowToLookLeft;
    int nWindowToLookRight;
    bool bLookForTopStrandNotBottomStrandRead;
    if ( nWhichEnd == nLeftGap ) {
        nWindowToLookLeft = nGetStartConsensusIndex() - nAmountOffConsensus;
        nWindowToLookRight = nGetStartConsensusIndex() + nAmountIntoConsensus;
        bLookForTopStrandNotBottomStrandRead = true;
    }
    else {
        nWindowToLookLeft = nGetEndConsensusIndex() - nAmountIntoConsensus;
        nWindowToLookRight = nGetEndConsensusIndex() + nAmountOffConsensus;
        bLookForTopStrandNotBottomStrandRead = false;
    }

    ContigView* pContigView = pGetContigView( nWindowToLookLeft,
                                              nWindowToLookRight );

    bool bThisIsCloneEnd = false;
    for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments();
         ++nRead ) {
        LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead );

        // skip reads on the wrong strand (bComp() true == bottom strand)
        if ( bLookForTopStrandNotBottomStrandRead == pLocFrag->bComp() )
            continue;

        if ( pLocFrag->nReadType_ == nUniversalForward ||
             pLocFrag->nReadType_ == nUniversalReverse ) {
            bThisIsCloneEnd = true;
            break;
        }
    }

    return( bThisIsCloneEnd );
}


bool Contig ::
bIsThisEndTheEndOfTheCloneGenomic( const int nWhichEnd ) {
    // Genomic case:  look for two reads from different templates whose
    // sequencing-vector/insert (X / not-X) junctions agree to within a few
    // bases, then verify no high-quality read is all non-X just outside
    // that putative junction.

    const int nAmountIntoConsensus = 50;
    const int nAmountOffConsensus = 10;

    int nWindowToLookLeft;
    int nWindowToLookRight;
    if ( nWhichEnd == nLeftGap ) {
        nWindowToLookLeft = nGetStartConsensusIndex() - nAmountOffConsensus;
        nWindowToLookRight = nGetStartConsensusIndex() + nAmountIntoConsensus;
    }
    else {
        nWindowToLookLeft = nGetEndConsensusIndex() - nAmountIntoConsensus;
        nWindowToLookRight = nGetEndConsensusIndex() + nAmountOffConsensus;
    }

    // get all reads in a big window (60 bases) around the end of the consensus
    ContigView* pContigView = pGetContigView( nWindowToLookLeft,
                                              nWindowToLookRight );

    if ( pContigView->nGetNumberOfFragments() < 2 )
        return( false );

    vectorReadType* pVectorReadArray =
        ( vectorReadType*) calloc( pContigView->nGetNumberOfFragments() + 1,
                                   sizeof( vectorReadType ) );

    int nNumberOfXNotXReads = 0;

    // states for the little X/not-X scanning state machines below
    const int nLookingForX = 1;
    const int nLookingForNotX = 2;

    int nRead;
    for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) {
        LocatedFragment* pFrag = pContigView->pLocatedFragmentGet( nRead );
        if ( pFrag->bIsWholeReadLowQuality() )
            continue;

        int nHighQualityStart = pFrag->nGetHighQualityStart();
        int nHighQualityEnd = pFrag->nGetHighQualityEnd();

        if ( !bIntervalsIntersect( nHighQualityStart,
                                   nHighQualityEnd,
                                   nWindowToLookLeft,
                                   nWindowToLookRight ) )
            continue;

        // if reached here, we have a read whose high quality segment
        // intersects the window around the end of the consensus
        // widen this window a little to possibly include some low quality
        // bases as well, but not larger than the window around the
        // end of the consensus

        int nConsPosStart;
        int nConsPosEnd;
        assert( bIntersect( nWindowToLookLeft, nWindowToLookRight,
                            pFrag->nGetAlignStart(), pFrag->nGetAlignEnd(),
                            nConsPosStart, nConsPosEnd ) );

        // find first/last x
        bool bFoundX = false;
        bool bFoundNotX = false;
        int nXNotXJunction; // mark on not-X base

        if ( nWhichEnd == nLeftGap ) {
            int nState = nLookingForX; // want to find where XXXXBBBBB junction is
            bool bStillLooking = true;
            for( int nConsPos = nConsPosStart;
                 nConsPos <= nConsPosEnd && bStillLooking;
                 ++nConsPos ) {
                char c = pFrag->ntGetFragFromConsPos( nConsPos ).cGetBase();
                if ( c == '*' )
                    continue;   // pads don't count in the X/not-X scan
                if ( nState == nLookingForX ) {
                    if ( c == 'x' ) {
                        bFoundX = true;
                        nState = nLookingForNotX;
                    }
                }
                else if ( nState == nLookingForNotX ) {
                    if ( c != 'x' ) {
                        bFoundNotX = true;
                        nXNotXJunction = nConsPos;
                        bStillLooking = false;
                    }
                }
            } // for( int nConsPos
        } // if ( nWhichEnd == nLeftGap ) {
        else {
            int nState = nLookingForNotX; // want to find where (notX)XXXXX junction is
            bool bStillLooking = true;
            for( int nConsPos = nConsPosStart;
                 nConsPos <= nConsPosEnd && bStillLooking;
                 ++nConsPos ) {
                char c = pFrag->ntGetFragFromConsPos( nConsPos ).cGetBase();
                if ( c == '*' )
                    continue;
                if ( nState == nLookingForNotX ) {
                    if ( c != 'x' ) {
                        bFoundNotX = true;
                        nState = nLookingForX;
                    }
                }
                else if ( nState == nLookingForX ) {
                    if ( c == 'x' ) {
                        bFoundX = true;
                        nXNotXJunction = nConsPos - 1;
                        bStillLooking = false;
                    }
                }
            } // for (int nConsPos ..
        }

        // for this read to be informative, there must be an X/notX junction
        if ( bFoundX && bFoundNotX ) {
            // add this read and its junction to the list
            pVectorReadArray[ nNumberOfXNotXReads ].pLocFrag_ = pFrag;
            pVectorReadArray[ nNumberOfXNotXReads ].nVectorInsertJunction_ =
                nXNotXJunction;
            ++nNumberOfXNotXReads;
        }
    } // for( int nRead...

    qsort( pVectorReadArray,
           nNumberOfXNotXReads,
           sizeof( vectorReadType ),
           ( ( int(*) ( const void*, const void* ) ) nSortByXNotXJunction ) );

    // look for a pair of reads in which one of the reads that both have
    // junction close to each other and one of them is over 100 bases from
    // the beginning

    const int nLittleWindow = 6;

    bool bFoundPairOfReads = false;
    int nVectorInsertJunction = 0;
    for( nRead = 0; nRead < nNumberOfXNotXReads - 1; ++nRead ) {
        int nLocDiff = pVectorReadArray[ nRead + 1 ].nVectorInsertJunction_ -
            pVectorReadArray[ nRead ].nVectorInsertJunction_;
        assert( nLocDiff >= 0 ); // array is sorted by junction position

        if ( nLocDiff <= nLittleWindow ) {
            // found possible vector/insert junction
            // Let's see if one of these reads has the junction
            // not in the 1st hundred bases

            // If the gap is the left end, then the read must be top strand
            // (pointing in) and have the xxxxxAAAAAA be over base position 100.
            // This is because bottom strand reads could have
            // xxxxxxxAAAAAAA over base position 100 due to a short insert
            // size and the sequence running into sequencing vector.

            // If the gap is on the right end, then the read must be bottom
            // (pointing in) and have the AAAAAAxxxxx after base 100.
            // This is because top strand (pointing out) reads could have
            // AAAAAAxxxxxx due to short insert size.  I actually saw this.

            // Phil had me remove the above condition and instead have
            // a condition that there is no read that has all high quality
            // non-X in a window around the junction.

            // I want to check that the 2 reads are not from the same template.
            // If they are, then the fact that 2 reads have XXXX's starting
            // in the same place could just be due to sequencing (e.g., M13)
            // vector.  Phil agreed 3/99
            if ( pVectorReadArray[ nRead ].pLocFrag_->soGetTemplateName() ==
                 pVectorReadArray[ nRead + 1 ].pLocFrag_->soGetTemplateName() )
                continue;

            if ( ( pVectorReadArray[ nRead ].pLocFrag_->nOrientedUnpaddedFragPosFromConsPos(
                       pVectorReadArray[ nRead ].nVectorInsertJunction_ ) > 100 ) ||
                 ( pVectorReadArray[ nRead + 1].pLocFrag_->nOrientedUnpaddedFragPosFromConsPos(
                       pVectorReadArray[ nRead + 1 ].nVectorInsertJunction_ ) > 100 ) ) {
                // I think we've found a vector/insert junction
                bFoundPairOfReads = true;
                nVectorInsertJunction =
                    pVectorReadArray[ nRead ].nVectorInsertJunction_;
            }
        } // if ( nLocDiff <= 6 )
    } // for( int nRead ...

    // now we have a putative vector/insert junction

    free( pVectorReadArray );

    if ( !bFoundPairOfReads )
        return( false );

    // now do this additional check--make sure that all other reads
    // are either:
    // 1) low quality
    // 2) have some X's pointing out of the contig if we look far enough
    // That is, if a read is high quality at the junction, then it must
    // not be all non-X's in a window.
    // That window used to be only 6 bases.  This failed in a case like this:
    //                   BBBBBBBBBBBBBBBBBBB(contig)   (consensus)
    // XXXXXXXXXXXXXXXXXXBBBBBBBBBBBBBBBBBBBBBBBBBBB  (read 1)
    // xxxxxxxxxxxxxxxxxxxbbbbbbbbbbbbbbbbbb
    // xxxxxxxxxxxxxxxxxxxbbbbbbbbbbbbbbbbbb
    // read 1 was the high quality read.
    // It's vector wasn't masked all the way to the vector/insert junction.
    // So now I look 15 bases to see if the read is all high quality out of
    // the contig.  If I have no X's in that 15 base high quality window,
    // then I assume not a clone end.

    const int nWindowForLookingForHighQuality = 15;

    nWindowToLookLeft = ( nWhichEnd == nLeftGap ) ?
        ( nVectorInsertJunction - nWindowForLookingForHighQuality ) :
        ( nVectorInsertJunction ) ;

    nWindowToLookRight = ( nWhichEnd == nLeftGap ) ?
        ( nVectorInsertJunction ) :
        ( nVectorInsertJunction + nWindowForLookingForHighQuality );

    for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) {
        LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead );
        if ( pLocFrag->bIsWholeReadLowQuality() )
            continue;

        int nHighQualityStart = pLocFrag->nGetHighQualityStart();
        int nHighQualityEnd = pLocFrag->nGetHighQualityEnd();

        int nIntersect1;
        int nIntersect2;
        if ( !bIntersect( nHighQualityStart, nHighQualityEnd,
                          nWindowToLookLeft, nWindowToLookRight,
                          nIntersect1, nIntersect2 ) )
            continue;

        // only reads whose high quality segment CONTAINS the whole window
        // are disqualifying candidates
        if ( ( nIntersect1 != nWindowToLookLeft ) ||
             ( nIntersect2 != nWindowToLookRight ) )
            continue;

        // if reached here, we have a read whose high quality segment
        // contains the window just outside the putative vector/insert
        // junction.
        // The condition that will show that this is *not* the
        // cloning site is if all these bases are high quality not X's

        bool bFoundX = false;
        for( int nConsPos = nIntersect1;
             nConsPos <= nIntersect2 && !bFoundX;
             ++nConsPos )
            if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'x' )
                bFoundX = true;

        if ( !bFoundX ) {
            // oh, oh.  There is a read with all non-X's outside the putative
            // insert.  So we must not have found the vector/insert junction.
            return( false );
        }
    }// for( int nRead...

    // so if passed all these tests, then it must be a cloning site
    return( true );
}


// For each subclone template on which a de novo universal primer read may
// be called, build a candidate "first forward / first reverse" experiment
// and add it to pCandidateExperiments_.
void Contig :: createExperimentsWithFirstForwardsOrFirstReverses() {

    // NOTE(review): dMinQuality is assigned but never used in the visible
    // text of this function -- possibly a leftover, or used in source text
    // missing from this copy (see below).  TODO confirm against a clean
    // copy of the file.
    double dMinQuality =
        consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_;

    for( int nTemplate = 0; nTemplate < aSubcloneTemplates_.entries();
         ++nTemplate ) {
        subcloneTTemplate* pSub = aSubcloneTemplates_[ nTemplate ];

        bool bTopNotBottomStrand;
        int nReadType;
        if ( !pSub->bOKToCallDeNovoUniversalPrimerRead( bTopNotBottomStrand,
                                                        nReadType,
                                                        this ) )
            continue;

        if ( pSub->bChosenPriorToMakingListOfExperimentCandidates_ )
            continue;

        int nFirstBaseOfReadUnpadded;
        int nUnpaddedLeft;
        int nUnpaddedRight;
        if ( bTopNotBottomStrand ) {
            // case like this:
            //      xxxx<--------xxxx( existing read)
            //  -------->xxxx( reverse to be called)
            // or this:
            //  -------->            <--------xxxx(existing read)
            //  <-------insert size-------->
            nFirstBaseOfReadUnpadded = pSub->nUnpaddedStart_ -
                pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_;
            nUnpaddedLeft = pSub->nUnpaddedStart_;
            nUnpaddedRight = nFirstBaseOfReadUnpadded +
                ConsEd::pGetAssembly()->pModelRead_->length() - 1;
        }
        else {
            //  ---------->   (existing univ primer read )
            //      <-------------------- (mate to suggest)
            //  ^ ( nConsPosStart_ )
            //      (first base of read)^
            // case like this:
            //  xxxx----------->xxxxx (existing read)
            //            <-------  (candidate reverse)
            // or case like this:
            //  xxxx----------->   (existing read)
            //         <---------  (candidate reverse)
            //      <-------insert size------>
            nFirstBaseOfReadUnpadded = pSub->nUnpaddedEnd_ +
                pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_;
            nUnpaddedLeft = nFirstBaseOfReadUnpadded -
                ConsEd::pGetAssembly()->pModelRead_->length() + 1;
            nUnpaddedRight = pSub->nUnpaddedEnd_;
        }

        // case in which very short model read and long parameter
        // pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_;
        if ( nUnpaddedLeft > nUnpaddedRight )
            continue;

        // is it possible to have a low quality region on the extended contig
        // but not a high quality region?  Yes:
        //                      (contig)
        //  ------------------------------
        //                        ------>  (existing read)
        //
        // NOTE(review): source text appears to be MISSING here in this copy
        // of the file:  pExp and dErrorsFixed are used below but never
        // declared/computed in the visible text, and the next statement
        // has lost its "pExp->" prefix.  TODO: restore this region from a
        // clean copy of the file before building.
        dErrorsFixed_ = dErrorsFixed;

        readTypes ucChemistryAndStrand = ( bTopNotBottomStrand ?
            TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS );
        pExp->ucStrandAndChemistry_ = ucChemistryAndStrand;
        pExp->nReadType_ = nReadType;
        pExp->bCloneTemplateNotSubcloneTemplate_ = false;
        pExp->fCost_ = pCP->dAutoFinishCostOfDeNovoUniversalPrimerSubcloneReaction_;
        pExp->pCustomPrimer_ = 0;

        // must figure out the errors on just the consensus

        pExp->pContig_ = this;
        pExp->pSubcloneTemplate_ = pSub;
        pExp->nFirstBaseOfReadUnpadded_ = nFirstBaseOfReadUnpadded;
        pExp->nUnpaddedLeft_ = nUnpaddedLeft;
        pExp->nUnpaddedRight_ = nUnpaddedRight;

        // case in which the model read is shorter than the
        // parameter nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_
        if ( pExp->nUnpaddedLeft_ > pExp->nUnpaddedRight_ )
            continue;

        pExp->bTopStrandNotBottomStrand_ = bTopNotBottomStrand;
        pExp->nState_ = nAvailableExperiment;
        pExp->bDeNovoUniversalPrimerRead_ = true;

        setVariousErrorData( pExp );

        pCandidateExperiments_->insert( pExp );
    } // for( int nTemplate = 0; nTemplate < aSubcloneTemplates_.entries();
}


// Sum of the ORIGINAL per-base consensus error probabilities over the
// part of [nUnpaddedLeft, nUnpaddedRight] that lies on the consensus.
// Note: this can only be used before pUnpaddedErrorProbabilities_
// starts being modified
double Contig :: dGetOriginalConsensusErrors( const int nUnpaddedLeft,
                                              const int nUnpaddedRight ) {

    int nUnpaddedLeftOnConsensus;
    int nUnpaddedRightOnConsensus;
    if ( !bIntersect( nUnpaddedLeft, nUnpaddedRight,
                      nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(),
                      nUnpaddedLeftOnConsensus, nUnpaddedRightOnConsensus )) {
        // the region is completely off the consensus
        return( 0.0 );
    }

    double dOriginalErrors = 0.0;
    for( int nUnpadded = nUnpaddedLeftOnConsensus;
         nUnpadded <= nUnpaddedRightOnConsensus;
         ++nUnpadded ) {
        dOriginalErrors += (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ];
    }

    return( dOriginalErrors );
}


// Upper bound on errors a read starting at nFirstBaseOfReadUnpadded could
// fix, assuming it has the model read's length; delegates to
// dGetInitialMaxErrorsFixed with the read's unpadded extent.
double Contig :: dGetInitialMaxErrorsFixed2( const int
nFirstBaseOfReadUnpadded,
                                             const bool bTopStrandNotBottomStrand ) {

    int nUnpaddedLeft;
    int nUnpaddedRight;
    if ( bTopStrandNotBottomStrand ) {
        nUnpaddedLeft = nFirstBaseOfReadUnpadded;
        nUnpaddedRight = nUnpaddedLeft +
            ConsEd::pGetAssembly()->pModelRead_->length() - 1;
    }
    else {
        nUnpaddedRight = nFirstBaseOfReadUnpadded;
        nUnpaddedLeft = nUnpaddedRight -
            ConsEd::pGetAssembly()->pModelRead_->length() + 1;
    }

    return( dGetInitialMaxErrorsFixed( nUnpaddedLeft, nUnpaddedRight ) );
}


// Cheap upper bound on the errors fixable in [nUnpaddedLeft,
// nUnpaddedRight], computed in O(1) from the cumulative-errors array.
double Contig :: dGetInitialMaxErrorsFixed( const int nUnpaddedLeft,
                                            const int nUnpaddedRight ) {

    int nRightEndOfRead = nUnpaddedRight;
    int nJustToLeftOfLeftEndOfRead = nUnpaddedLeft - 1;

    // I think this method may not take into account the error probability
    // of the leftmost base if the read is hanging off the left end
    // (DG 1/00).  Yes, it does, since the pCumUnpaddedErrors starts
    // one base to the left of the leftmost extended consensus error probability
    // array.  (DG 1/00).

    int nIntersectLeft;
    int nIntersectRight;
    if ( !bIntersect( nJustToLeftOfLeftEndOfRead, nRightEndOfRead,
                      pCumUnpaddedErrors_->nGetStartIndex(),
                      pCumUnpaddedErrors_->nGetEndIndex(),
                      nIntersectLeft, nIntersectRight ) )
        return( 0.0 );

    // difference of cumulative sums == total errors in the interval
    return( (*pCumUnpaddedErrors_)[ nIntersectRight ] -
            (*pCumUnpaddedErrors_)[ nIntersectLeft ] );
}


// Debugging aid:  writes quality/error-probability values for a
// hard-coded unpadded range (2851..2860) to error_probabilities.<nFileID>.
void Contig :: dumpContigErrorProbabilities( const int nFileID ) {

    FileName filDump = "error_probabilities." + RWCString( (long) nFileID );

    fprintf( pAO, "%s\n", filDump.data() );

    FILE* pDump = fopen( filDump.data(), "w" );
    if ( !pDump ) {
        PANIC_OST( ost ) << "In dumpContigErrorProbablities: cannot open "
                         << filDump << ends;
        throw SysRequestFailed( ost.str().c_str() );
    }

    // NOTE(review): the dumped range is hard-coded -- clearly left over
    // from debugging a particular assembly.
    for( int nUnpadded = 2851; nUnpadded <= 2860; ++nUnpadded ) {
        double dErrorProbability = (*pUnpaddedErrorProbabilities_)[ nUnpadded ];
        double dQuality = -10.0 * log10( dErrorProbability );
        fprintf( pDump, "%d %.2f %.6e\n",
                 nUnpadded, dQuality, dErrorProbability );
    }

    fclose( pDump );
}


// True if some already-chosen subclone custom-primer (nWalk) experiment on
// the same strand starts within the configured minimum distance of the
// candidate experiment.
bool Contig :: bThereIsAnotherSubcloneCustomPrimerReadTooCloseOnTheSameStrand(
    experiment* pExperimentCandidate ) {

    for( int nExp = 0; nExp < pChosenExperiments_->length(); ++nExp ) {
        experiment* pExp = (*pChosenExperiments_)[ nExp ];
        if ( ABS( pExp->nFirstBaseOfReadUnpadded_ -
                  pExperimentCandidate->nFirstBaseOfReadUnpadded_ ) <
             pCP->nAutoFinishDoNotAllowSubcloneCustomPrimerReadsCloserThanThisManyBases_ ) {
            if ( ( pExp->bTopStrandNotBottomStrand_ ==
                   pExperimentCandidate->bTopStrandNotBottomStrand_ ) &&
                 ( pExp->nReadType_ == nWalk ) &&
                 !pExp->bCloneTemplateNotSubcloneTemplate_ ) {
                return( true );
            }
        }
    }

    return( false );
}


// Same check as above, but for whole-clone custom-primer experiments
// (bCloneTemplateNotSubcloneTemplate_ true) and the whole-clone distance
// parameter.
bool Contig :: bThereIsAnotherWholeCloneCustomPrimerReadTooCloseOnTheSameStrand(
    experiment* pExperimentCandidate ) {

    for( int nExp = 0; nExp < pChosenExperiments_->length(); ++nExp ) {
        experiment* pExp = (*pChosenExperiments_)[ nExp ];
        if ( ABS( pExp->nFirstBaseOfReadUnpadded_ -
                  pExperimentCandidate->nFirstBaseOfReadUnpadded_ ) <
             pCP->nAutoFinishDoNotAllowWholeCloneCustomPrimerReadsCloserThanThisManyBases_ ) {
            if ( ( pExp->bTopStrandNotBottomStrand_ ==
                   pExperimentCandidate->bTopStrandNotBottomStrand_ ) &&
                 ( pExp->nReadType_ == nWalk ) &&
                 pExp->bCloneTemplateNotSubcloneTemplate_ ) {
                return( true );
            }
        }
    }

    return( false );
}


// Widen the contig's updated high-quality segment to include the span of
// a newly chosen experiment.
void Contig :: possiblyUpdateContigHighQualitySegment( experiment* pChosenExp ) {

    if ( pChosenExp->nUnpaddedLeft_ < nUpdatedUnpaddedHighQualitySegmentStart_ )
        nUpdatedUnpaddedHighQualitySegmentStart_ = pChosenExp->nUnpaddedLeft_;

    if ( pChosenExp->nUnpaddedRight_ > nUpdatedUnpaddedHighQualitySegmentEnd_ )
        nUpdatedUnpaddedHighQualitySegmentEnd_ = pChosenExp->nUnpaddedRight_;
}


// The unpadded extent over which one subclone template covers a proposed
// read, plus the template itself.  Stored in a value vector and sorted
// with qsort in dGetErrorsFixedForSubcloneRead.
class endpointsOfSubcloneOfRead {
  public:
    endpointsOfSubcloneOfRead( const int nUnpaddedLeftEndpoint,
                               const int nUnpaddedRightEndpoint,
                               subcloneTTemplate* pSub ) :
        nUnpaddedLeftEndpoint_( nUnpaddedLeftEndpoint ),
        nUnpaddedRightEndpoint_( nUnpaddedRightEndpoint ),
        pSub_( pSub ) {}

    // default ctor to keep RWTValOrderedVector happy
    endpointsOfSubcloneOfRead() {}

    // also needed to keep RWTValOrderedVector happy on hp-ux
    bool operator==( const endpointsOfSubcloneOfRead& end ) {
        if ( nUnpaddedLeftEndpoint_ == end.nUnpaddedLeftEndpoint_ &&
             nUnpaddedRightEndpoint_ == end.nUnpaddedRightEndpoint_ &&
             pSub_ == end.pSub_ )
            return( true );
        else
            return( false );
    }

  public:
    int nUnpaddedLeftEndpoint_;
    int nUnpaddedRightEndpoint_;
    subcloneTTemplate* pSub_;
};


// qsort comparator:  ascending left endpoint.
static int nCompareLeftEndpoints( endpointsOfSubcloneOfRead* pEndPoints1,
                                  endpointsOfSubcloneOfRead* pEndPoints2 ) {
    if ( pEndPoints1->nUnpaddedLeftEndpoint_ <
         pEndPoints2->nUnpaddedLeftEndpoint_ )
        return( -1 );
    else if ( pEndPoints1->nUnpaddedLeftEndpoint_ ==
              pEndPoints2->nUnpaddedLeftEndpoint_ )
        return( 0 );
    else
        return( 1 );
}


// qsort comparator:  ascending right endpoint.
static int nCompareRightEndpoints( endpointsOfSubcloneOfRead* pEndPoints1,
                                   endpointsOfSubcloneOfRead* pEndPoints2 ) {
    if ( pEndPoints1->nUnpaddedRightEndpoint_ <
         pEndPoints2->nUnpaddedRightEndpoint_ )
        return( -1 );
    else if ( pEndPoints1->nUnpaddedRightEndpoint_ ==
              pEndPoints2->nUnpaddedRightEndpoint_ )
        return( 0 );
    else
        return( 1 );
}


// Errors fixed by a custom-primer subclone read:  counts how many of the
// primer's templates will actually be used (capped by the configured
// number of reactions), then delegates to dGetErrorsFixedForSubcloneRead.
double Contig :: dGetErrorsFixedForCustomPrimerSubcloneRead(
    primerType* pPrimer,
    const bool bTopStrandNotBottomStrand,
    const int nFirstBaseOfReadUnpadded,
    const bool bChooseExperiment,
    const bool bJustConsensusNotGaps,
    const bool bFindWindowOfMostErrorsFixed ) {

    int nNumberOfTemplatesToUse = 0;
    for( int nTemplate = 0; nTemplate <
pCP->nAutoFinishHowManyTemplatesYouIntendToUseForCustomPrimerSubcloneReactions_;
         ++nTemplate ) {
        if ( pPrimer->pSubcloneTemplate_[ nTemplate ] )
            ++nNumberOfTemplatesToUse;
    }

    return( dGetErrorsFixedForSubcloneRead( pPrimer->pSubcloneTemplate_,
                                            nNumberOfTemplatesToUse,
                                            bTopStrandNotBottomStrand,
                                            nFirstBaseOfReadUnpadded,
                                            false, // bExcludeVectorBaseAtBeginningOfRead
                                            bChooseExperiment, // bChooseExperiment
                                            bJustConsensusNotGaps,
                                            bFindWindowOfMostErrorsFixed ) );
}


// File-scope scratch state shared by dGetErrorsFixedForSubcloneRead and
// its window-of-most-errors-fixed bookkeeping.  NOTE(review): this makes
// the computation non-reentrant; safe only for single-threaded use.
static int nWindowOfMostErrorsFixedUnpaddedRight;
static double dErrorsFixedInWindowOfMostErrorsFixed;
static double dErrorsFixedInCurrentWindow;
static double dErrorsFixed[10];
static int nPointerToLastAddedBase;
static int n10thBaseUnpadded;

static float fProductDistributionSameStrand[ nMAX_QUALITY_LEVEL2 + 1 ];
static float fProductDistributionOtherStrand[ nMAX_QUALITY_LEVEL2 + 1 ];
bool bProductDistributionSameStrandAllOnes = true;
bool bProductDistributionOtherStrandAllOnes = true;


// Core Autofinish computation:  how many consensus errors would a new
// subclone read (possibly done on several templates) fix?  If
// bChooseExperiment is true, the consensus error-probability and
// read-type arrays and the per-base distribution arrays are updated as
// if the experiment were actually performed.
// NOTE(review): several container declarations below
// (RWTValOrderedVector, MBTValOrderedOffsetVector, RWTPtrOrderedVector,
// RWTValVector) appear to have had their template arguments stripped by
// whatever produced this copy of the file (angle brackets eaten);
// restore them from a clean copy before building.
double Contig :: dGetErrorsFixedForSubcloneRead(
    subcloneTTemplate** ppSubcloneTemplateArray,
    const int nNumberOfTemplates,
    const bool bTopStrandNotBottomStrand,
    const int nFirstBaseOfReadUnpadded,
    const bool bExcludeVectorBasesAtBeginningOfRead,
    const bool bChooseExperiment,
    const bool bJustConsensusNotGaps,
    const bool bFindWindowOfMostErrorsFixed ) {

    // fixing bug in which, if the entire read is off the extended consensus,
    // such as when finding all reads that might close a gap, the
    // window of most errors fixed was garbage
    if ( bFindWindowOfMostErrorsFixed ) {
        // these will be changed if we are able to calculate them
        nWindowOfMostErrorsFixedUnpaddedRight = 0;
        dErrorsFixedInWindowOfMostErrorsFixed = 0.0;
    }

    // nLeftmostBaseOfRead and nRightmostBaseOfRead and the
    // extents of the read(s) created with this primer if the read
    // were not to be limited by the primers.
    int nLeftmostBaseOfRead;
    int nRightmostBaseOfRead;
    if ( bTopStrandNotBottomStrand ) {
        if ( bExcludeVectorBasesAtBeginningOfRead )
            nLeftmostBaseOfRead = nFirstBaseOfReadUnpadded +
                pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_;
        else
            nLeftmostBaseOfRead = nFirstBaseOfReadUnpadded;

        nRightmostBaseOfRead = nFirstBaseOfReadUnpadded +
            ConsEd::pGetAssembly()->pModelRead_->length() - 1;
    }
    else {
        nLeftmostBaseOfRead = nFirstBaseOfReadUnpadded -
            ConsEd::pGetAssembly()->pModelRead_->length() + 1;

        if ( bExcludeVectorBasesAtBeginningOfRead )
            nRightmostBaseOfRead = nFirstBaseOfReadUnpadded -
                pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_;
        else
            nRightmostBaseOfRead = nFirstBaseOfReadUnpadded;
    }

    // this would happen if this project had short reads--shorter than
    // the beginning vector on universal primer reads
    if ( ! ( nLeftmostBaseOfRead <= nRightmostBaseOfRead ) )
        return( 0.0 );

    // intersect this with the extended consensus--we are only interested
    // in the portion of the read on the extended consensus since no credit
    // is given for any part of the read that sticks out beyond this.
    // (usually this will be all of it)
    int nOverlapReadLeft;
    int nOverlapReadRight;
    if ( ! bIntersect( pUnpaddedErrorProbabilities_->nGetStartIndex(),
                       pUnpaddedErrorProbabilities_->nGetEndIndex(),
                       nLeftmostBaseOfRead,
                       nRightmostBaseOfRead,
                       nOverlapReadLeft,
                       nOverlapReadRight ) )
        return( 0.0 );

    if ( bJustConsensusNotGaps ) {
        if ( ! bIntersect( nOverlapReadLeft, nOverlapReadRight,
                           nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(),
                           nOverlapReadLeft, nOverlapReadRight ) ) {
            return( 0.0 );
        }
    }

    // clip each template's extent against the read and collect the
    // transition points between different amounts of template coverage
    RWTValOrderedVector
        aEndpointsOfTransitionsBetweenTemplateCoverage( (size_t) nNumberOfTemplates );

    for( int nTemplate = 0; nTemplate < nNumberOfTemplates; ++nTemplate ) {
        subcloneTTemplate* pSub = ppSubcloneTemplateArray[ nTemplate ];

        int nTemplateReadLeft;
        int nTemplateReadRight;
        if ( bIntersect( nOverlapReadLeft, nOverlapReadRight,
                         pSub->nUnpaddedStart_, pSub->nUnpaddedEnd_,
                         nTemplateReadLeft, nTemplateReadRight ) ) {
            aEndpointsOfTransitionsBetweenTemplateCoverage.insert(
                endpointsOfSubcloneOfRead( nTemplateReadLeft,
                                           nTemplateReadRight,
                                           pSub ) );
        }
    } // for( int nTemplate = 0;

    // sort this array
    void* pArray =
        (void*) aEndpointsOfTransitionsBetweenTemplateCoverage.pArray_;
    size_t nNumberOfElements =
        aEndpointsOfTransitionsBetweenTemplateCoverage.nCurrentLength_;
    size_t nSizeOfAnElement = sizeof( endpointsOfSubcloneOfRead );

    if ( bTopStrandNotBottomStrand )
        qsort( pArray, nNumberOfElements, nSizeOfAnElement,
               ( (int(*) ( const void*, const void*) ) nCompareRightEndpoints ) );
    else
        qsort( pArray, nNumberOfElements, nSizeOfAnElement,
               ( (int(*) ( const void*, const void*) ) nCompareLeftEndpoints ) );

    // now for each base, see how many errors will be fixed
    // Note that each read here will be the same strand and chemistry

    // now for each *unpadded* consensus base, we will compute how many
    // errors are fixed.  A read gets no credit for fixing anything
    // where there is a pad in the consensus.

    readTypes ucNewReadsStrandAndChemistry = ( bTopStrandNotBottomStrand ?
        TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS );

    readTypes ucOtherStrandAndChemistry = ( !bTopStrandNotBottomStrand ?
        TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS );

    if ( bFindWindowOfMostErrorsFixed ) {
        dErrorsFixedInWindowOfMostErrorsFixed = 0.0;
        dErrorsFixedInCurrentWindow = 0.0;
        // find 2 critical points:
        // ---------------------------------------
        //         [        ]
        //                  ^ 10th base
        //                                       ^ last base
        // before the 10th base, we just accumulate errors fixed
        // at the 10th base, we set the first best window
        // after the 10th base, when we add a base, we take one away
        n10thBaseUnpadded =
            aEndpointsOfTransitionsBetweenTemplateCoverage[ 0 ].nUnpaddedLeftEndpoint_ + 9;
        nPointerToLastAddedBase = -1; // prepare to increment to 0
    }

    double dTotalErrorsFixed = 0.0;

    for( int nTemplateTransitions = 0;
         nTemplateTransitions <
             aEndpointsOfTransitionsBetweenTemplateCoverage.length();
         ++nTemplateTransitions ) {

        int nUnpaddedLeft;
        int nUnpaddedRight;
        int nNumberOfTemplatesInThisInterval;

        // top strand reads have templates like this:
        //    ------        (template 1)
        //    ----------    (template 2)
        //    ------------  (template 3)
        //    <----><--><>
        //      A    B   C
        // So when the nNumberOfTemplatesInThisInterval is 2 (region B), we use
        // template 2 and 3

        // bottom strand reads have templates like this:
        //  --------------------  template 1
        //      ----------------  template 2
        //           -----------  template 3
        //  <--><---><--------->
        //    A    B      C
        // so when nNumberOfTemplatesInThisInterval is 2 (region B), we use templates
        // 1 and 2

        // nUnpaddedLeft and nUnpaddedRight (below) are the boundaries
        // of the regions A, B, C in turn
        if ( bTopStrandNotBottomStrand ) {
            nNumberOfTemplatesInThisInterval =
                aEndpointsOfTransitionsBetweenTemplateCoverage.length() -
                nTemplateTransitions;
            if ( nTemplateTransitions == 0 )
                nUnpaddedLeft =
                    aEndpointsOfTransitionsBetweenTemplateCoverage[ 0 ].nUnpaddedLeftEndpoint_;
            else
                nUnpaddedLeft =
                    aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplateTransitions - 1 ].nUnpaddedRightEndpoint_ + 1;

            nUnpaddedRight =
                aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplateTransitions ].nUnpaddedRightEndpoint_;
        }
        else {
            // bottom strand
            nNumberOfTemplatesInThisInterval = nTemplateTransitions + 1;

            nUnpaddedLeft =
                aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplateTransitions ].nUnpaddedLeftEndpoint_;

            if ( nTemplateTransitions ==
                 ( aEndpointsOfTransitionsBetweenTemplateCoverage.length() - 1 ) )
                nUnpaddedRight =
                    aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplateTransitions ].nUnpaddedRightEndpoint_;
            else
                nUnpaddedRight =
                    aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplateTransitions + 1 ].nUnpaddedLeftEndpoint_ - 1;
        }

        MBTValOrderedOffsetVector* pModelReadWithThisNumberOfTemplates =
            (*(ConsEd::pGetAssembly()->pArrayOfModelReadsWithSeveralTemplates_))[ nNumberOfTemplatesInThisInterval ];

        // set up for walking along read
        int nIncrement;
        int nSeqPosition;
        if ( bTopStrandNotBottomStrand ) {
            nSeqPosition = nUnpaddedLeft - nFirstBaseOfReadUnpadded + 1;
            nIncrement = 1;
            //    -------------->  read
            //  -----------        extended consensus
            //    ^ nOverlapLeft (start here and nSeqPosition then increases
            //
        }
        else {
            //     <-----------------  read
            //  ---------------        extended consensus
            //                      ^ nFirstBaseOfReadUnpadded
            //                ^ nOverlapRight
            //     ^ nOverlapLeft (start here and nSeqPosition then decreases
            nSeqPosition = nFirstBaseOfReadUnpadded - nUnpaddedLeft + 1;
            nIncrement = -1;
        }

        // walk along read
        double dErrorsFixedThisBase2;
        for( int nUnpadded = nUnpaddedLeft;
             nUnpadded <= nUnpaddedRight;
             ++nUnpadded, nSeqPosition += nIncrement ) {

            unsigned char ucMedianMaxQualityOfNewReads;

            RWTPtrOrderedVector* pArrayOfDistributions =
                (*pDistributionsOfChosenExperimentsArray_)[ nUnpadded ];
            if ( !pArrayOfDistributions ) {
                // case of no experiments chosen at this consensus location
                // This is no longer true since there may be a minilibrary
                // called which will make pUnpaddedErrorProbabilities zero
                // where it covers the consensus.
                // assert( (*pUnpaddedErrorProbabilities_)[ nUnpadded ] ==
                //      (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ] );

                ucMedianMaxQualityOfNewReads =
                    (*pModelReadWithThisNumberOfTemplates)[ nSeqPosition ];

                dErrorsFixedThisBase2 =
                    dGetErrorsFixedAtBase(
                        (*pUnpaddedErrorProbabilities_)[ nUnpadded ],
                        dErrorRateFromQuality[ ucMedianMaxQualityOfNewReads ],
                        ucNewReadsStrandAndChemistry,
                        (*pUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] );

                dTotalErrorsFixed += dErrorsFixedThisBase2;

                // bChooseExperiment && nUnpadded >= -28 && nUnpadded <= -26
                // if ( bFoundDebug && bChooseExperiment ) {
                //    fprintf( pAO, "no exp yet, nUnpadded = %d and total errors fixed = %g median quality = %d seq pos = %d\n",
                //             nUnpadded, dErrorsFixedAtBase2, (int) ucMedianMaxQualityOfNewReads,
                //             nSeqPosition );
                // }

                if ( bChooseExperiment ) {
                    // update the consensus error probabilities
                    (*pUnpaddedErrorProbabilities_)[ nUnpadded ] -=
                        dErrorsFixedThisBase2;

                    (*pUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] |=
                        ucNewReadsStrandAndChemistry;

                    // must put an array of distributions on the consensus
                    RWTPtrOrderedVector* pArrayOfDistributions =
                        new RWTPtrOrderedVector( (size_t) 3 );

                    (*pDistributionsOfChosenExperimentsArray_)[ nUnpadded ] =
                        pArrayOfDistributions;

                    // now make a distribution for each template add it to
                    // the array of distributions
                    int nTemplateStart;
                    int nTemplateEnd;
                    if ( bTopStrandNotBottomStrand ) {
                        nTemplateStart =
                            aEndpointsOfTransitionsBetweenTemplateCoverage.length() -
                            nNumberOfTemplatesInThisInterval;
                        nTemplateEnd =
                            aEndpointsOfTransitionsBetweenTemplateCoverage.length() - 1;
                    }
                    else {
                        nTemplateStart = 0;
                        nTemplateEnd = nNumberOfTemplatesInThisInterval - 1;
                    }

                    float* pModelReadDistribution =
                        (*ConsEd::pGetAssembly()->pModelReadDistributions_)[ nSeqPosition ];

                    for( int nTemplate = nTemplateStart;
                         nTemplate <= nTemplateEnd;
                         ++nTemplate ) {
                        distribution* pDis = new distribution();
                        // fixed bug 5/8/2000
                        pDis->pSub_ =
                            aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplate ].pSub_;
                        pDis->bTopStrandNotBottomStrand_ =
                            bTopStrandNotBottomStrand;

                        for( int nQuality = 0;
                             nQuality <= nMAX_QUALITY_LEVEL2;
                             ++nQuality ) {
                            pDis->fCumulativeQuality_[ nQuality ] =
                                pModelReadDistribution[ nQuality ];
                        }

                        pArrayOfDistributions->insert( pDis );
                    }
                }
            }
            else {
                // all hell and fury

                float* pModelReadDistribution =
                    (*ConsEd::pGetAssembly()->pModelReadDistributions_)[ nSeqPosition ];

                // top strand reads have templates like this:
                //    ------        (template 1)
                //    ----------    (template 2)
                //    ------------  (template 3)
                //    <----><--><>
                //      A    B   C
                // So when the nNumberOfTemplatesInThisInterval is 2 (region B), we use
                // template 2 and 3

                // bottom strand reads have templates like this:
                //  --------------------  template 1
                //      ----------------  template 2
                //           -----------  template 3
                //  <--><---><--------->
                //    A    B      C
                // so when nNumberOfTemplatesInThisInterval is 2 (region B), we use templates
                // 1 and 2
                int nTemplateStart;
                int nTemplateEnd;
                if ( bTopStrandNotBottomStrand ) {
                    nTemplateStart =
                        aEndpointsOfTransitionsBetweenTemplateCoverage.length() -
                        nNumberOfTemplatesInThisInterval;
                    nTemplateEnd =
                        aEndpointsOfTransitionsBetweenTemplateCoverage.length() - 1;
                }
                else {
                    nTemplateStart = 0;
                    nTemplateEnd = nNumberOfTemplatesInThisInterval - 1;
                }

                // make a little array of bool that indicates which of the
                // consensus templates we have already used
                RWTValVector
                    aChosenTemplatesCombined( pArrayOfDistributions->length(),
                                              false );

                bProductDistributionSameStrandAllOnes = true;
                bProductDistributionOtherStrandAllOnes = true;

                for( int nTemplate = nTemplateStart;
                     nTemplate <= nTemplateEnd;
                     ++nTemplate ) {
                    // found bug 5/8/2000
                    subcloneTTemplate* pSubOfConsideredRead =
                        aEndpointsOfTransitionsBetweenTemplateCoverage[ nTemplate ].pSub_;

                    // check if this template is found in one of the chosen
                    // reads at this base position
                    distribution* pDistributionWithSameTemplate = NULL;
                    int nDistributionWithSameTemplate;
                    for( int nDis = 0;
                         nDis < pArrayOfDistributions->length();
                         ++nDis ) {
                        if ( pSubOfConsideredRead ==
(*pArrayOfDistributions)[ nDis ]->pSub_ ) { if ( bTopStrandNotBottomStrand == (*pArrayOfDistributions)[ nDis ]->bTopStrandNotBottomStrand_ ) { pDistributionWithSameTemplate = (*pArrayOfDistributions)[ nDis ]; nDistributionWithSameTemplate = nDis; break; } } } if (!pDistributionWithSameTemplate ) { // easy case--just multiply this distribution if ( bProductDistributionSameStrandAllOnes ) { assignDistributions( fProductDistributionSameStrand, pModelReadDistribution ); bProductDistributionSameStrandAllOnes = false; } else multiplyDistributions( fProductDistributionSameStrand, pModelReadDistribution ); if ( bChooseExperiment ) { // must add the distribution for this new template distribution* pNewDistribution = new distribution(); pNewDistribution->pSub_ = pSubOfConsideredRead; pNewDistribution->bTopStrandNotBottomStrand_ = bTopStrandNotBottomStrand; for( int nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pNewDistribution->fCumulativeQuality_[ nQuality ] = pModelReadDistribution[ nQuality ]; pArrayOfDistributions->insert( pNewDistribution ); } } else { // case in which there is a chosen read with the same // template. In this case, find the max of the two // distributions, and then multiply it. aChosenTemplatesCombined[ nDistributionWithSameTemplate ] = true; if ( bProductDistributionSameStrandAllOnes ) { makeDistributionAllOnes( fProductDistributionSameStrand ); bProductDistributionSameStrandAllOnes = false; } multiplyDistributionWithMinimum( fProductDistributionSameStrand, pModelReadDistribution, pDistributionWithSameTemplate->fCumulativeQuality_ ); if ( bChooseExperiment ) { for( int nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { pDistributionWithSameTemplate->fCumulativeQuality_[ nQuality ] = MIN( pModelReadDistribution[ nQuality ], pDistributionWithSameTemplate->fCumulativeQuality_[ nQuality ] ); } } // if ( bChooseExperiment ) {... 
} } // for( int nTemplate = nTemplateStart; // We have figured out the product distribution of all // templates in the considered read. We need now to include // any distributions on chosen reads that have have templates // distinct from any on the considered read. for( int nChosenTemplate = 0; nChosenTemplate < aChosenTemplatesCombined.length(); ++nChosenTemplate ) { if ( !aChosenTemplatesCombined[ nChosenTemplate ] ) { float* pCumulativeQualityArray = (*pArrayOfDistributions)[ nChosenTemplate ]->fCumulativeQuality_; if ( (*pArrayOfDistributions)[ nChosenTemplate ]->bTopStrandNotBottomStrand_ == bTopStrandNotBottomStrand ) { if ( bProductDistributionSameStrandAllOnes ) { assignDistributions( fProductDistributionSameStrand, pCumulativeQualityArray ); bProductDistributionSameStrandAllOnes = false; } else multiplyDistributions2( fProductDistributionSameStrand, pCumulativeQualityArray ); } else { if ( bProductDistributionOtherStrandAllOnes ) { assignDistributions( fProductDistributionOtherStrand, pCumulativeQualityArray ); bProductDistributionOtherStrandAllOnes = false; } else multiplyDistributions3( fProductDistributionOtherStrand, pCumulativeQualityArray ); } } } // for( int nChosenTemplate = 0; ... // have figured out the product distribution at this // base position for the considered read and the chosen reads // of the same strand and of the other strand. // figure out the median of the distributions of each strand // and then combine these medians with the original // consensus quality to get the new consensus quality. unsigned char ucMedianQualitySameStrand; if ( bProductDistributionSameStrandAllOnes ) ucMedianQualitySameStrand = 0; else { for( int nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( fProductDistributionSameStrand[ nQuality ] >= 0.5 ) { ucMedianQualitySameStrand = nQuality; break; } } } // there should be some way of speeding this up when there // is *no* reads on the opposite strand. 
I don't know how // often this will be the case--perhaps more than half the time? // Sped up. unsigned char ucMedianQualityOtherStrand; if ( bProductDistributionOtherStrandAllOnes ) ucMedianQualityOtherStrand = 0; else { for( int nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( fProductDistributionOtherStrand[ nQuality ] >= 0.5 ) { ucMedianQualityOtherStrand = nQuality; break; } } } double dErrorsFixedByOtherStrand = dGetErrorsFixedAtBase( (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ], dErrorRateFromQuality[ ucMedianQualityOtherStrand ], ucOtherStrandAndChemistry, (*pOriginalUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] ); double dErrorsFixedBySameStrand = dGetErrorsFixedAtBase( (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ] - dErrorsFixedByOtherStrand, dErrorRateFromQuality[ ucMedianQualitySameStrand ], ucNewReadsStrandAndChemistry, (*pOriginalUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] ); double dErrorsFixedThisBase = dErrorsFixedBySameStrand + dErrorsFixedByOtherStrand; // this represents the total errors fixed at this base by all // reads chosen plus the read currently considered. // But we are only interested in the *additional* errors // fixed by this read, not counting the errors fixed by // previous reads. 
double dAdditionalErrorsFixed = dErrorsFixedThisBase2 = dErrorsFixedThisBase + (*pUnpaddedErrorProbabilities_)[ nUnpadded ] - (*pOriginalUnpaddedErrorProbabilities_)[ nUnpadded ]; // note that (*pOriginalErrorProbabilities_)[ nUnpadded ] - // (*pUnpaddedErrorProbabilities_)[ nUnpadded ] // is the errors fixed by already chosen experiments dTotalErrorsFixed += dAdditionalErrorsFixed; // if ( bChooseExperiment && nUnpadded >= -28 && nUnpadded <= -26 ) { // if ( bFoundDebug && bChooseExperiment ) { // fprintf( pAO, "choosing additional experiment, nUnpadded = %d and total errors fixed = %g additional errors fixed = %g seq pos = %d dTotalErrors fixed = %g\n", // nUnpadded, dErrorsFixedThisBase, // dAdditionalErrorsFixed, // nSeqPosition, // dTotalErrorsFixed ); // if ( nUnpadded == 180 ) { // if ( !bProductDistributionSameStrandAllOnes ) { // fprintf( pAO, "product dist same strand: " ); // dumpDistribution( fProductDistributionSameStrand ); // } // else // fprintf( pAO, "product dist same strand all ones\n" ); // if ( !bProductDistributionOtherStrandAllOnes ) { // fprintf( pAO, "product dist other strand: " ); // dumpDistribution( fProductDistributionOtherStrand ); // } // else // fprintf( pAO, "product dist other strand all ones\n" ); // fprintf( pAO, "model read distribution: " ); // dumpDistribution( pModelReadDistribution ); // } // } if ( bChooseExperiment ) { (*pUnpaddedErrorProbabilities_)[ nUnpadded ] -= dAdditionalErrorsFixed; (*pUnpaddedReadTypesCoveringEachBase_)[ nUnpadded ] |= ucNewReadsStrandAndChemistry; } } // if ( !pArrayOfDistributions ) else if ( bFindWindowOfMostErrorsFixed ) { if ( nUnpadded <= n10thBaseUnpadded ) { ++nPointerToLastAddedBase; dErrorsFixed[ nPointerToLastAddedBase ] = dErrorsFixedThisBase2; dErrorsFixedInCurrentWindow += dErrorsFixedThisBase2; if ( nUnpadded == n10thBaseUnpadded ) { dErrorsFixedInWindowOfMostErrorsFixed = dErrorsFixedInCurrentWindow; nWindowOfMostErrorsFixedUnpaddedRight = nUnpadded; } } else { 
++nPointerToLastAddedBase; if ( nPointerToLastAddedBase > 9 ) nPointerToLastAddedBase = 0; dErrorsFixedInCurrentWindow -= dErrorsFixed[ nPointerToLastAddedBase ]; dErrorsFixedInCurrentWindow += dErrorsFixedThisBase2; dErrorsFixed[ nPointerToLastAddedBase ] = dErrorsFixedThisBase2; if ( dErrorsFixedInCurrentWindow > dErrorsFixedInWindowOfMostErrorsFixed ) { dErrorsFixedInWindowOfMostErrorsFixed = dErrorsFixedInCurrentWindow; nWindowOfMostErrorsFixedUnpaddedRight = nUnpadded; } } } // if ( bFindWindowOfMostErrorsFixed ) { } // int nUnpadded =... } // for( int nTemplateTransitions = 0; return( dTotalErrorsFixed ); } // dGetErrorsFixedForCustomPrimerRead double Contig :: dGetErrorsFixedForWholeCloneRead( const bool bTopStrandNotBottomStrand, const int nFirstBaseOfReadUnpadded, const bool bChooseExperiment, const bool bJustConsensusNotGaps, const bool bFindWindowOfMostErrorsFixed ) { return( dGetErrorsFixedForSubcloneRead( &pFakeSubcloneForWholeCloneReads_, 1, bTopStrandNotBottomStrand, nFirstBaseOfReadUnpadded, false, // bExcludeVectorBasesAtBeginningOfRead, bChooseExperiment, bJustConsensusNotGaps, bFindWindowOfMostErrorsFixed ) ); } void Contig :: createFakeSubcloneTTemplatesForWholeCloneReads() { pFakeSubcloneForWholeCloneReads_ = new subcloneTTemplate(); int nUnpaddedMin = pUnpaddedErrorProbabilities_->nGetStartIndex(); int nUnpaddedMax = pUnpaddedErrorProbabilities_->nGetEndIndex(); pFakeSubcloneForWholeCloneReads_->nUnpaddedStart_ = nUnpaddedMin; pFakeSubcloneForWholeCloneReads_->nUnpaddedEnd_ = nUnpaddedMax; pFakeSubcloneForWholeCloneReads_->bOKToUseThisTemplate_ = true; // pFakeSubcloneForWholeCloneReads_->bTemplateIsDoubleStranded_ = true; pFakeSubcloneForWholeCloneReads_->pContig_ = this; pFakeSubcloneForWholeCloneReads_->bUnpaddedStartKnownPrecisely_ = true; pFakeSubcloneForWholeCloneReads_->bUnpaddedEndKnownPrecisely_ = true; pFakeSubcloneForWholeCloneReads_->bOKToUseForTopStrandReads_ = true; pFakeSubcloneForWholeCloneReads_->bOKToUseForBottomStrandReads_ 
= true; } void Contig :: findWindowOfMostErrorsFixed( experiment* pExp ) { double dErrorsFixed; if ( pExp->bIsUniversalPrimerRead() ) { bool bJustCountConsensusErrorsFixed = false; if ( !pExp->bDeNovoUniversalPrimerRead_ && !pCP->bAutoFinishAllowResequencingReadsToExtendContigs_ ) bJustCountConsensusErrorsFixed = true; dErrorsFixed = dGetErrorsFixedForSubcloneRead( &(pExp->pSubcloneTemplate_), 1, // nNumberOfSubclones pExp->bTopStrandNotBottomStrand_, pExp->nFirstBaseOfReadUnpadded_, true, // bExcludeVectorBasesAtBeginningOfRead false, // bChooseExperiment bJustCountConsensusErrorsFixed, // bJustConsensusNotGaps true ); // bFindWindowOfMostErrorsFixed } else if ( pExp->bIsCustomPrimerSubcloneRead() ) { dErrorsFixed = dGetErrorsFixedForCustomPrimerSubcloneRead( pExp->pCustomPrimer_, pExp->bTopStrandNotBottomStrand_, pExp->nFirstBaseOfReadUnpadded_, false, // bChooseExperiment false, // bJustConsensusNotGaps true ); // bFindWindowOfMostErrorsFixed } else if ( pExp->bIsWholeCloneRead() ) { dErrorsFixed = dGetErrorsFixedForWholeCloneRead( pExp->bTopStrandNotBottomStrand_, pExp->nFirstBaseOfReadUnpadded_, false, // bChooseExperiment false, // bJustConsensusNotGaps true ); // bFindWindowOfMostErrorsFixed } else assert( false ); // shouldn't be any other kind of experiment // the result is stored in these static variables pExp->nWindowOfMostErrorsFixedUnpaddedRight_ = nWindowOfMostErrorsFixedUnpaddedRight; pExp->dErrorsFixedInWindow_ = dErrorsFixedInWindowOfMostErrorsFixed; } void Contig :: clearExperimentCandidatesList() { for( int nExp = 0; nExp < pCandidateExperiments_->length(); ++nExp ) { experiment* pExp = (*pCandidateExperiments_)[ nExp ]; if ( pExp->nState_ != nChosenExperiment ) delete pExp; } pCandidateExperiments_->clear(); } void Contig :: saveStateOfConsensus() { *pSavedUnpaddedErrorProbabilities_ = *pUnpaddedErrorProbabilities_; *pSavedUnpaddedReadTypesCoveringEachBase_ = *pUnpaddedReadTypesCoveringEachBase_; *pSavedDistributionsOfChosenExperimentsArray_ = 
*pDistributionsOfChosenExperimentsArray_; } void Contig :: restoreStateOfConsensus() { *pUnpaddedReadTypesCoveringEachBase_ = *pSavedUnpaddedReadTypesCoveringEachBase_; // restore old distribution array *pDistributionsOfChosenExperimentsArray_ = *pSavedDistributionsOfChosenExperimentsArray_; // now restore (partly) the pUnpaddedErrorProbabilities_ array double dRedundancy = pCP->dAutoFinishRedundancy_; if ( dRedundancy < 1.0 ) dRedundancy = 1.0; if ( dRedundancy > 2.0 ) dRedundancy = 2.0; for( int nUnpadded = pUnpaddedErrorProbabilities_->nGetStartIndex(); nUnpadded <= pUnpaddedErrorProbabilities_->nGetEndIndex(); ++nUnpadded ) { if ( (*pUnpaddedErrorProbabilities_)[ nUnpadded ] == 0.0 || (*pSavedUnpaddedErrorProbabilities_)[ nUnpadded ] == 0.0 ) (*pUnpaddedErrorProbabilities_)[ nUnpadded ] = 0.0; else { // create affine sum of error probabilities before and // after the first pass of universal primer reads // Use the affine sum of logs instead of error probabilities // since this will reflect more closely the number of additional // reads to be called. 
(*pUnpaddedErrorProbabilities_)[ nUnpadded ] = exp( (2.0 - dRedundancy) * log( (*pUnpaddedErrorProbabilities_)[ nUnpadded ] ) + ( dRedundancy - 1.0 ) * log( (*pSavedUnpaddedErrorProbabilities_)[ nUnpadded ] ) ); } // if ( dRedundancy == 2.0 ) { // if ( ABS( (*pSavedUnpaddedErrorProbabilities_)[ nUnpadded ] - // (*pUnpaddedErrorProbabilities_)[ nUnpadded ] ) // > .001 ) { // fprintf( pAO, "too big of difference\n" ); // fprintf( pAO, "%.9g ", // (*pSavedUnpaddedErrorProbabilities_)[ nUnpadded ] - // (*pUnpaddedErrorProbabilities_)[ nUnpadded ] ); // } // } // else if ( dRedundancy == 1.0 ) { // fprintf( pAO, "%.9g ", // (*pUnpaddedErrorProbabilities_)[ nUnpadded ] - d ); // } } } void Contig :: recalculateConsensusUsingAllChosenUniversalPrimerExperiments() { for( int nExp = 0; nExp < pUniversalPrimerExperimentsForLowQualityRegions_->length(); ++nExp ) { experiment* pExp = (*pUniversalPrimerExperimentsForLowQualityRegions_)[ nExp ]; double d = dGetErrorsFixedForSubcloneRead( &(pExp->pSubcloneTemplate_), 1, // nNumberOfSubclones pExp->bTopStrandNotBottomStrand_, pExp->nFirstBaseOfReadUnpadded_, true, // bExcludeVectorBasesAtBeginningOfRead true, // bChooseExperiment false, // bJustConsensusNotGaps false ); // bFindWindowOfMostErrorsFixed } } void Contig :: recalculateErrorsFixedOfExperimentCandidates() { for( int nCan = 0; nCan < pCandidateExperiments_->length(); ++nCan ) { experiment* pExp = (*pCandidateExperiments_)[ nCan ]; if ( pExp->nState_ == nChosenExperiment ) continue; // make all experiments fair game again if ( pExp->nState_ == nRejectedExperiment ) pExp->nState_ = nAvailableExperiment; bool bJustCountConsensusErrorsFixed = false; // optionally do not allow resequencing reads to close gaps if ( pExp->bIsUniversalPrimerRead() && !pExp->bDeNovoUniversalPrimerRead_ && !pCP->bAutoFinishAllowResequencingReadsToExtendContigs_ ) bJustCountConsensusErrorsFixed = true; pExp->dErrorsFixed_ = dGetErrorsFixedForSubcloneRead( &(pExp->pSubcloneTemplate_), 1, // 
nNumberOfSubclones pExp->bTopStrandNotBottomStrand_, pExp->nFirstBaseOfReadUnpadded_, true, // bExcludeVectorBasesAtBeginningOfRead false, // bChooseExperiment bJustCountConsensusErrorsFixed, // bJustConsensusNotGaps false ); // bFindWindowOfMostErrorsFixed setVariousErrorData( pExp ); } } void Contig :: chooseExperimentsToCoverLowQualityRegionsAndGapsFor9_66() { fprintf( pAO, "now attempting to solve weak regions in consensus and extend the consensus...\n\n" ); // Possible problem: if the consensus is high quality, but you want // autofinish to extend the consensus, it might not do so. // (DG, Feb 2000) if ( dExpectedNumberOfErrorsInExtendedContig_ <= dTargetNumberOfErrors_ ) { fprintf( pAO, "Estimated number of errors in extended contig %.2f is already less than the maximum %.2f for this contig\n", dExpectedNumberOfErrorsInExtendedContig_, dTargetNumberOfErrors_ ); return; } chooseExperimentsToCoverLowQualityRegionsAndGapsFor9_66_2(); if ( pCP->bAutoFinishCloseGaps_ ) warnUserIfCouldNotExtendContig(); } void Contig :: chooseExperimentsToCoverLowQualityRegionsAndGapsFor9_66_2() { if ( pCandidateExperiments_->length() == 0 ) { fprintf( pAO, "No possible experiments found\n" ); return; } bool bMoreAvailableExperiments; bool bConsiderMoreExperiments = true; do { experiment* pBestExp = pGetBestExperimentCandidate(); if ( pBestExp == NULL ) { bMoreAvailableExperiments = false; bConsiderMoreExperiments = false; } else { // found the best experiment assert( pBestExp->dErrorsFixed_ >= consedParameters::pGetConsedParameters()->dAutoFinishMinNumberOfErrorsFixedByAnExpToUse_ ); // the best experiment is fine--transfer it pBestExp->nPurpose_ = nFixWeakRegion; pBestExp->chooseExperiment(); possiblyUpdateContigHighQualitySegment( pBestExp ); pChosenExperiments_->insert( pBestExp ); ConsEd::pGetAssembly()->pAllChosenExperiments_->insert( pBestExp ); adjustExpectedNumberOfErrors( pBestExp ); findWindowOfMostErrorsFixed( pBestExp ); pBestExp->print(); fprintf( pAO, "After this 
experiment,\ncalculated number of errors in extended contig: %.2f and consensus: %.2f\n\n", dExpectedNumberOfErrorsInExtendedContig_, dExpectedNumberOfErrorsInConsensus_ ); if ( !bStillTooManyExpectedErrors() ) bConsiderMoreExperiments = false; else { adjustRemainingExperimentCandidates( pBestExp ); // check no bugs double dErrorsInConsensus; double dErrorsInExtendedContig; recalculateCurrentExpectedNumberOfErrorsForThisContig( dErrorsInConsensus, dErrorsInExtendedContig ); if ( ( ABS( dErrorsInConsensus - dExpectedNumberOfErrorsInConsensus_ ) > .000001 ) || ( ABS( dErrorsInExtendedContig - dExpectedNumberOfErrorsInExtendedContig_ ) > .000001 ) ) { fprintf( pAO, "Warning: dExpectedNumberOfErrorsInConsensus = %e but calculated is %e and dExpectedNumberOfErrorsInExtendedContig_ = %e but calculated is %e\n", dExpectedNumberOfErrorsInConsensus_, dErrorsInConsensus, dExpectedNumberOfErrorsInExtendedContig_, dErrorsInExtendedContig ); } else { fprintf( pAO, "OK3\n" ); } // end check no bugs } } // if ( pBestExp == NULL ) else } while( bConsiderMoreExperiments ); // tell user why we stopped // We could have stopped because: // 1) the error rate was below the threshold // 2) there were no more experiments // 3) there were more experiments, but they weren't worth doing if ( bStillTooManyExpectedErrors() ) { fprintf( pAO, "There are still too many expected errors but we stopped because\n" ); if ( bMoreAvailableExperiments ) fprintf( pAO, "all remaining available experiments each solved too \nfew errors\nto be worthwhile doing\n" ); else fprintf( pAO, "there were no more experiments this program could \nfind to resolve the remaining errors.\n" ); printWorstRemainingRegion(); } else fprintf( pAO, "The expected error count for the contig should be acceptable after doing these experiments\n" ); } void Contig :: createExperimentCandidatesListFor9_66() { fprintf( pAO, "creating list of possible experiments with universal primers..." 
); fflush( pAO ); createExperimentsWithResequencingUniversalPrimers(); fprintf( pAO, "done\n" ); fflush( pAO ); if ( pCP->bAutoFinishAllowCustomPrimerSubcloneReads_ || pCP->bAutoFinishAllowWholeCloneReads_ ) { fprintf( pAO, "creating list of possible experiments with custom primers..." ); fflush( pAO ); createExperimentsWithCustomPrimers(); fprintf( pAO, "done\n\n" ); fflush( pAO ); } fprintf( pAO, "creating list of possible experiments of reverses where there is only a forward or forwards where there is only a reverse..." ); fflush( pAO ); createExperimentsWithFirstForwardsOrFirstReverses(); fprintf( pAO, "done\n\n" ); fflush( pAO ); } bool mbtValVectorOfBool::bMasksSet_ = false; unsigned char mbtValVectorOfBool::ucSetBitMasks_[nNumberOfBitsInMbtValVectorOfBoolElement]; unsigned char mbtValVectorOfBool::ucClearBitMasks_[nNumberOfBitsInMbtValVectorOfBoolElement]; void Contig :: determineLocationsOfRunsAndStops() { int nLength = nGetUnpaddedSeqLength(); int nOffset = nGetUnpaddedStartIndex(); pUnpaddedLocationsOfRuns_ = new mbtValVectorOfBool( nLength, nOffset, "Contig::pUnpaddedLocationsOfRuns_" ); pUnpaddedLocationsOfStops_ = new mbtValVectorOfBool( nLength, nOffset, "Contig::pUnpaddedLocationsOfStops_" ); gotoList* pStopsGotoList = new gotoList(); addStopsCausingLCQs( pStopsGotoList, false ); // bSetPaddedPositionsArray const int nWindowSizeOfStops = 20; int nGotoItem; for( nGotoItem = 0; nGotoItem < pStopsGotoList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = pStopsGotoList->pGetGotoItem( nGotoItem ); int nUnpaddedLeft; int nUnpaddedRight; if ( pGotoItem->soInitialDescription_ == " stop left of here" ) { nUnpaddedRight = pGotoItem->nUnpaddedGotoItemEnd_; nUnpaddedLeft = pGotoItem->nUnpaddedGotoItemEnd_ - nWindowSizeOfStops + 1; } else if ( pGotoItem->soInitialDescription_ == " stop right of here" ) { nUnpaddedLeft = pGotoItem->nUnpaddedGotoItemStart_; nUnpaddedRight = nUnpaddedLeft + nWindowSizeOfStops - 1; } // don't be off the consensus if ( 
bIntersect( nUnpaddedLeft, nUnpaddedRight, nGetUnpaddedStartIndex(), // consensus nGetUnpaddedEndIndex(), nUnpaddedLeft, nUnpaddedRight ) ) { for( int nUnpadded = nUnpaddedLeft; nUnpadded <= nUnpaddedRight; ++nUnpadded ) { pUnpaddedLocationsOfStops_->setValue( nUnpadded, 1 ); } } } // for( int nGotoItem = 0; delete pStopsGotoList; gotoList* pRunsGotoList = new gotoList(); addRunsAndMicrosatellitesCausingLCQs( pRunsGotoList, false ); // bSetPaddedPositionsArray for( nGotoItem = 0; nGotoItem < pRunsGotoList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = pRunsGotoList->pGetGotoItem( nGotoItem ); int nUnpaddedLeft = pGotoItem->nUnpaddedGotoItemStart_; int nUnpaddedRight = pGotoItem->nUnpaddedGotoItemEnd_; // don't be off the consensus if ( bIntersect( nUnpaddedLeft, nUnpaddedRight, nGetUnpaddedStartIndex(), // consensus nGetUnpaddedEndIndex(), nUnpaddedLeft, nUnpaddedRight ) ) { for( int nUnpadded = nUnpaddedLeft; nUnpadded <= nUnpaddedRight; ++nUnpadded ) { pUnpaddedLocationsOfRuns_->setValue( nUnpadded, 1 ); } } } // for( int nGotoItem = 0; delete pRunsGotoList; } bool Contig :: bDoesRegionCrossARunOrStop( const int nUnpaddedLeft, const int nUnpaddedRight, bool& bCrossesRun, bool& bCrossesStop ) { bCrossesRun = false; bCrossesStop = false; int nUnpaddedLeftOnConsensus; int nUnpaddedRightOnConsensus; if ( bIntersect( nUnpaddedLeft, nUnpaddedRight, nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(), nUnpaddedLeftOnConsensus, nUnpaddedRightOnConsensus ) ) { for( int nUnpadded = nUnpaddedLeftOnConsensus; nUnpadded <= nUnpaddedRightOnConsensus; ++nUnpadded ) { if ( (*pUnpaddedLocationsOfStops_)[ nUnpadded ] ) bCrossesStop = true; if ( (*pUnpaddedLocationsOfRuns_)[ nUnpadded ] ) bCrossesRun = true; } } if ( bCrossesRun || bCrossesStop ) { return( true ); } else return( false ); } void Contig :: determineWhetherThisExperimentNeedsSpecialChemistry( experiment* pExp ) { bDoesRegionCrossARunOrStop( pExp->nUnpaddedLeft_, pExp->nUnpaddedRight_, pExp->bCrossesRun_, 
pExp->bCrossesStop_ ); }