/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // module for calling reverses for flanking gaps #include "contig.h" #include "experiment.h" #include "consedParameters.h" #include "locatedFragment.h" #include "contigview.h" #include "rwcstring.h" #include "consed.h" #include "autoFinishExpTag.h" #include "fileDefines.h" #include "library.h" #include "contigEndPair.h" #include "templateForMinilibrary.h" #include "listOfLibraries.h" #include "rwtptrorderedvector.h" void Contig :: dealWithContigEnds() { dealWithOneContigEnd( nLeftGap ); dealWithOneContigEnd( nRightGap ); } void Contig :: dealWithOneContigEnd( const int nWhichGap ) { if ( bIsThisEndTheEndOfTheClone( nWhichGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO,"Not Considering Reverses to Flank Gap at %s End of Contig %s because\nthis is the clone end\n\n", ( (nWhichGap == nLeftGap ) ? "Left" : "Right" ), (char*) soGetName().data() ); } return; } if ( !bDoTagsAllowExtendingThisEnd( nWhichGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "Tags do not allow extending %s end of contig %s\n", ( (nWhichGap == nLeftGap ) ? "Left" : "Right" ), (char*) soGetName().data() ); } } if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "\n\nExamining existing fwd/rev pairs that flank gap at %s end of %s\n\n", (( nWhichGap == nLeftGap ) ? "Left" : "Right" ), (char*) soGetName().data() ); } contigEndPair* pPair = NULL; int nNumberOfExistingForwardReversePairs; findNeighborForOneGap( pPair, nWhichGap, nNumberOfExistingForwardReversePairs ); if ( pPair ) { Assembly* pAssembly = ConsEd::pGetAssembly(); if ( !pAssembly->pContigEndPairArray_ ) pAssembly->pContigEndPairArray_ = new RWTPtrOrderedVector( (size_t) pAssembly->nNumContigs(), "dealWithOneContigEnd RWTPtrOrderedVector" ); // look to see whether this pair of contig/ends is already // in the list. (It might be a different object, but it would // contain the same contig/ends.) (Question July 2002: How could // it not already be in the list? ) pAssembly->insertContigEndPairIfItIsNotAlreadyThere( pPair ); // consider making a minilibrary for this gap, if it hasn't // already been considered when the other end of the gap was // examined if ( pCP->bAutoFinishAllowMinilibraries_ && pPair && bDoTagsAllowExtendingThisEnd( nWhichGap ) ) { if ( pPair->bAlreadySuggestedMinilibrariesForThisGap_ ) { // I think we should zero the error probabilities here anyway, since // the minilibrary should help here also. (DG, 4/01) zeroErrorProbabilitiesWhereChoseMinilibrary( pPair ); } else { // haven't already chose a minilibrary for this gap. See if // we should: if ( pCP->bAutoFinishAlwaysCloseGapsUsingMinilibraries_ || ( pPair->nGapSize_ >= pCP->nAutoFinishSuggestMinilibraryIfGapThisManyBasesOrLarger_ ) ) { suggestMinilibrary( pPair ); zeroErrorProbabilitiesWhereChoseMinilibrary( pPair ); } } } } // if ( pPair ) if ( ( nNumberOfExistingForwardReversePairs < pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) && bDoTagsAllowExtendingThisEnd( nWhichGap ) ) { // only reach here if there are not enough fwd/rev pairs if ( pCP->bAutoFinishAllowDeNovoUniversalPrimerSubcloneReads_ ) { int nNumberOfReversesToCall = pCP->nAutoFinishCallHowManyReversesToFlankGaps_ - nNumberOfExistingForwardReversePairs; fprintf( pAO, "Insufficient existing fwd/rev pairs to connect %s end of %s\n\n", (( nWhichGap == nLeftGap ) ? "Left" : "Right" ), (char*) soGetName().data() ); if ( pCP->bAutoFinishCallReversesToFlankGaps_ ) { autoFinishCallReversesToFlankGapsOnOneEnd2( nWhichGap, nNumberOfReversesToCall ); } } else { fprintf( pAO, "not considering reverses to flank %s of contig %s since consed.autoFinishAllowDeNovoUniversalPrimerSubcloneReads is false\n", szWhichGap( nWhichGap ), soGetName().data() ); } } // if ( nNumberOfForwardReversePairs < ... } // dealWithOneContigEnd // this is the routine that actually calls the reverses--everything up // to here is just to determine whether reverses need to be called at // all and, if so, how many void Contig :: autoFinishCallReversesToFlankGapsOnOneEnd2( const int nWhichGap, const int nNumberOfReversesToCall ) { fprintf( pAO, "Attempting to suggest de novo univeral primer reads to flank gap...\n\n" ); for( int nNumberOfReversesCalled = 0; nNumberOfReversesCalled < nNumberOfReversesToCall; ++nNumberOfReversesCalled ) { subcloneTTemplate* pBestTemplateSoFar = NULL; if ( nWhichGap == nLeftGap ) { // just start at beginning of subcloneTTemplates array looking // for the first one that is not already chosen and that doesn't // already have a fwd/rev pair for it. // Where do we stop looking? When the left end of the template is // at or to the right of the left end of the consensus, then stop // since all templates to the right will be useless for the // purpose of flanking the left gap. int nTemplateUnpaddedStartMustBeLeftOfThis = nGetUnpaddedStartIndex() - pCP->nAutoFinishReversesForFlankingGapsTemplateMustProtrudeFromContigThisMuch_; int nLeftUnpaddedOfBestTemplateSoFar; bool bContinueCheckingTemplates = true; for( int nSub = 0; nSub < aSubcloneTemplates_.entries() && bContinueCheckingTemplates; ++nSub ) { subcloneTTemplate* pTrialSub = aSubcloneTemplates_[ nSub ]; // this check has an added benefit--no reverse will be chosen // on a template whose left end is the left end of the // consensus. Such templates are useless for flanking gaps. // template picking proceeds as follows: // ---------------------- (consensus) // --------- template 0 // --------- template 1 // --------- template 2 // --------- template 3 // --------- template 4 // ---------- template 5 // Note that template 5 is completely to the right of the // left end of the consensus so there is no point in picking // it. if ( nGetUnpaddedStartIndex() <= pTrialSub->nUnpaddedStart_ ) bContinueCheckingTemplates = false; else { // such a reverse will not be far enough left to // be useful if ( nTemplateUnpaddedStartMustBeLeftOfThis <= pTrialSub->nUnpaddedStart_ ) continue; if ( !pBestTemplateSoFar || pTrialSub->nUnpaddedStart_ < nLeftUnpaddedOfBestTemplateSoFar ) { // found a better one // According to Phil 10/31/99, the template quality // is ignored for the purpose of choosing flanking // universal primer reads. The only thing that // matters is that the new read is of high enough // quality to go into the other contig so that the // contigs can be oriented. If the existing // template produced a read that was high enough // quality to go into this contig, it can produce // another one. int nReadTypeToSuggest; if ( !bWillThisTemplateWorkForFlankingThisGap( pTrialSub, nWhichGap, nReadTypeToSuggest ) ) continue; // have we tried it in an earlier round of autofinish? autoFinishExpTag* pAutoFinishExpTag; if ( ConsEd::pGetAssembly()->bIsThereAnAutoFinishExpTagForThisReverse( pTrialSub->soTemplateName_, pAutoFinishExpTag ) ) { fprintf( pAO, "rejecting experiment: reverse universal primer read with template %s\n", (char*) pTrialSub->soTemplateName_.data() ); fprintf( pAO, "because an earlier round of autofinish called this with expid: %d\n", pAutoFinishExpTag->aExpID_[0] ); continue; } // but have we already chosen it? if ( !pTrialSub->bChosenPriorToMakingListOfExperimentCandidates_ ) { // not already found--so this one is better pBestTemplateSoFar = pTrialSub; nLeftUnpaddedOfBestTemplateSoFar = pTrialSub->nUnpaddedStart_; } } } } // for( int nSub = 0 ... } // if (nWhichGap == nLeftGap ) ... else { // right end of contig int nTemplateUnpaddedEndMustBeRightOfThis = nGetUnpaddedEndIndex() + pCP->nAutoFinishReversesForFlankingGapsTemplateMustProtrudeFromContigThisMuch_; int nUnpaddedConsPosEndOfContig = nGetUnpaddedEndIndex(); int nUnpaddedLeftConsPosOfTemplate = nUnpaddedConsPosEndOfContig - consedParameters::pGetConsedParameters()->nPrimersMaxInsertSizeOfASubclone_; subcloneTTemplate* pDummyTT = new subcloneTTemplate(); pDummyTT->nUnpaddedStart_ = nUnpaddedLeftConsPosOfTemplate; int nTemplate = aSubcloneTemplates_.nFindIndexOfMatchOrSuccessor( pDummyTT ); delete pDummyTT; if ( nTemplate == RW_NPOS ) { fprintf( pAO, "no acceptable templates to call reverses on right end\n" ); return; } int nRightUnpaddedOfBestTemplateSoFar; // examine all templates to the right of this one looking for the one that reaches furthest // (whose right end is furthest to the right) for( ; nTemplate < aSubcloneTemplates_.entries(); ++nTemplate ) { subcloneTTemplate* pTrialSub = aSubcloneTemplates_[ nTemplate ]; if ( pTrialSub->nUnpaddedEnd_ <= nTemplateUnpaddedEndMustBeRightOfThis ) continue; if ( !pBestTemplateSoFar || ( nRightUnpaddedOfBestTemplateSoFar < pTrialSub->nUnpaddedEnd_ ) ) { // found a better one // does it already exist? int nReadTypeToSuggest; if (!bWillThisTemplateWorkForFlankingThisGap( pTrialSub, nWhichGap, nReadTypeToSuggest ) ) continue; // have we tried it in an earlier round of autofinish? autoFinishExpTag* pAutoFinishExpTag; if ( ConsEd::pGetAssembly()->bIsThereAnAutoFinishExpTagForThisReverse( pTrialSub->soTemplateName_, pAutoFinishExpTag ) ) { fprintf( pAO, "rejecting experiment: reverse universal primer read with template %s\n", (char*) pTrialSub->soTemplateName_.data() ); fprintf( pAO, "because an earlier round of autofinish called this with expid: %d\n", pAutoFinishExpTag->aExpID_[0] ); continue; } // have we already chosen it? if ( !pTrialSub->bChosenPriorToMakingListOfExperimentCandidates_ ) { // not chosen pBestTemplateSoFar = pTrialSub; nRightUnpaddedOfBestTemplateSoFar = pTrialSub->nUnpaddedEnd_; } } } } if ( !pBestTemplateSoFar ) { fprintf( pAO, "need more reverses for this end, but there are no more suitable\n" ); return; } // found another template--add it to our list of chosen templates pBestTemplateSoFar->bChosenPriorToMakingListOfExperimentCandidates_ = true; experiment* pExp = new experiment(); int nReadTypeToSuggest; // we did this before to make sure this template would work, but // we didn't save the read type to suggest. Now we need it. assert( bWillThisTemplateWorkForFlankingThisGap( pBestTemplateSoFar, nWhichGap, nReadTypeToSuggest ) ); pExp->nReadType_ = nReadTypeToSuggest; pExp->ucStrandAndChemistry_ = ( nWhichGap == nLeftGap ) ? TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS; pExp->bCloneTemplateNotSubcloneTemplate_ = false; pExp->fCost_ = pCP->dAutoFinishCostOfDeNovoUniversalPrimerSubcloneReaction_; pExp->pCustomPrimer_ = NULL; pExp->nExtendsIntoWhichGap_ = nWhichGap; pExp->pContig_ = this; pExp->pSubcloneTemplate_ = pBestTemplateSoFar; pExp->bTopStrandNotBottomStrand_ = ( nWhichGap == nLeftGap ) ? true : false; // pExp->bCustomPrimerNotUniversalPrimer_ = false; pExp->nPurpose_ = nFlankGap; // Let's figure out where the read will go (yes, it will be off // the consensus, but still let's figure it out). This will // just be used for printing out // This makes the assumption that nConsPosStart_ or nConsPosEnd_ // is the beginning // of the template. This may not be correct. nConsPosStart_ // or nConsPosEnd_ may be determined by a walking read that // has not yet run into vector. // ---------------------------------------------------> contig // ------ <------ // walk univ // // I think it is ok because // bDoesDesiredUniversalPrimerAlreadyExist would have found // that, in this case, the universal primer read was pointing // in to the end of the contig so the code above would have // rejected the template. assert( pBestTemplateSoFar->pContig_ == this ); if ( pExp->bTopStrandNotBottomStrand_ ) { // -----------> new read // <-------- existing read // ^ // pBestTemplateSoFar->nConsPosEnd_ pExp->nFirstBaseOfReadUnpadded_ = pBestTemplateSoFar->nUnpaddedEnd_ - pBestTemplateSoFar->pLibrary_->nAverageInsertSizeToUse_; pExp->nUnpaddedLeft_ = pExp->nFirstBaseOfReadUnpadded_; pExp->nUnpaddedRight_ = pExp->nFirstBaseOfReadUnpadded_ + ConsEd::pGetAssembly()->pModelRead_->length() - 1; } else { pExp->nFirstBaseOfReadUnpadded_ = pBestTemplateSoFar->nUnpaddedStart_ + pBestTemplateSoFar->pLibrary_->nAverageInsertSizeToUse_; pExp->nUnpaddedRight_ = pExp->nFirstBaseOfReadUnpadded_; pExp->nUnpaddedLeft_ = pExp->nFirstBaseOfReadUnpadded_ - ConsEd::pGetAssembly()->pModelRead_->length() + 1; } pExp->bDeNovoUniversalPrimerRead_ = true; // will not be necessary--so don't call this // pExp->setPrimerNumberIfNecessary(); pExp->chooseExperiment(); pExp->print(); pChosenExperiments_->insert( pExp ); ConsEd::pGetAssembly()->pAllChosenExperiments_->insert( pExp ); ConsEd::pGetAssembly()->bAnyReadsToCloseGaps_ = true; } // for( int nNumberOfReversesCalled = 0; } // currently, this is giving the gap size from consensus to consensus // (i.e.--not from high quality to high quality. Low quality bases // are not considered part of the gap.) int Contig :: nGetGapSize( arrayOfTemplates* pArrayOfInformativeTemplates ) { // Since pArrayOfInformativeTemplates may contain duplicates, // remove any duplicates RWTValOrderedVector aInformativeTemplates( (size_t) pArrayOfInformativeTemplates->length(), "nGetGapSize aInformativeTemplates" ); for( int n = 0; n < pArrayOfInformativeTemplates->length(); ++n ) { subcloneTTemplate* pSub = (*pArrayOfInformativeTemplates)[ n ]; if ( aInformativeTemplates.index( pSub ) == RW_NPOS ) { aInformativeTemplates.insert( pSub ); } } // now aInformativeTemplates does not contain duplicate templates // see if I can find a template where both vector/insert junctions // are known precisely int nSumOfPutativeGapSizes = 0; int nNumberOfPutativeGapSizes = 0; for( int nSub = 0; nSub < aInformativeTemplates.length(); ++nSub ) { subcloneTTemplate* pSub = aInformativeTemplates[ nSub ]; LocatedFragment* pLocFrag1; LocatedFragment* pLocFrag2; if (! pSub->bHasForwardReversePairSortByContigName( pLocFrag1, pLocFrag2 ) ) continue; Contig* pContig1 = pLocFrag1->pContig_; Contig* pContig2 = pLocFrag2->pContig_; if ( pContig1 == pContig2 ) continue; // don't use this template for calculating gap size if ( pLocFrag1->bIsWholeReadUnaligned() ) continue; // don't use this template for calculating gap size if ( pLocFrag2->bIsWholeReadUnaligned() ) continue; // don't use this template for calculating gap size int nUnpaddedLeft1; int nUnpaddedRight1; pLocFrag1->getFromStartOfInsertToEndOfContig( nUnpaddedLeft1, nUnpaddedRight1 ); int nContig1Overlap = nUnpaddedRight1 - nUnpaddedLeft1 + 1; int nUnpaddedLeft2; int nUnpaddedRight2; pLocFrag2->getFromStartOfInsertToEndOfContig( nUnpaddedLeft2, nUnpaddedRight2 ); int nContig2Overlap = nUnpaddedRight2 - nUnpaddedLeft2 + 1; int nEstimatedGapSize = pSub->pLibrary_->nAverageInsertSizeToUse_ - nContig1Overlap - nContig2Overlap; nSumOfPutativeGapSizes += nEstimatedGapSize; ++nNumberOfPutativeGapSizes; } // int nSub = 0; nSub < aInformativeTemplates.length(); ++nSub ) { if ( nNumberOfPutativeGapSizes == 0 ) return( 0 ); else return( nSumOfPutativeGapSizes / nNumberOfPutativeGapSizes ); } bool Contig :: bWillThisTemplateWorkForFlankingThisGap( subcloneTTemplate* pTrialSub, const int nWhichGap, int& nReadTypeToSuggest ) { // Does this one already have a reverse for it? bool bUniversalForwardExists; bool bUniversalForwardExistsInThisContig; bool bUniversalForwardIsInBottomStrandOfThisContig; bool bUniversalReverseExists; bool bUniversalReverseExistsInThisContig; bool bUniversalReverseIsInBottomStrandOfThisContig; pTrialSub->inventoryUniversalPrimerReadsForGapFlanking( bUniversalForwardExists, bUniversalForwardExistsInThisContig, bUniversalForwardIsInBottomStrandOfThisContig, bUniversalReverseExists, bUniversalReverseExistsInThisContig, bUniversalReverseIsInBottomStrandOfThisContig, this ); if ( bUniversalForwardExists && bUniversalReverseExists ) return( false ); if ( bUniversalForwardExistsInThisContig ) { if ( bUniversalForwardIsInBottomStrandOfThisContig ) { if ( nWhichGap == nLeftGap ) { nReadTypeToSuggest = nUniversalReverse; return( true ); } else return( false ); } else { if ( nWhichGap == nLeftGap ) return( false ); else { nReadTypeToSuggest = nUniversalReverse; return( true ); } } } // you may worry that the "return( false )" above will prematurely // terminate this without checking the code below. However, that // would only happen if there were already a universal forward primer, // none of which were in the right orientation, so there is no use in // checking if there is a reverse because we don't want to suggest another // forward if ( bUniversalReverseExistsInThisContig ) { if ( bUniversalReverseIsInBottomStrandOfThisContig ) { if ( nWhichGap == nLeftGap ) { nReadTypeToSuggest = nUniversalForward; return( true ); } else return( false ); } else { if ( nWhichGap == nLeftGap ) return( false ); else { nReadTypeToSuggest = nUniversalForward; return( true ); } } } // case in which there is either no universal primer read // (unlikely) or else a universal primer read but it // is not in this contig so suggesting the other univ read // will not help for orienting the contigs return( false ); } static int compareTemplatesForMinilibraries( const templateForMinilibrary** ppMini1, const templateForMinilibrary** ppMini2 ) { if ( (*ppMini1)->bSizeTooFarFromMean_ && !(*ppMini2)->bSizeTooFarFromMean_ ) return( -1 ); else if ( !(*ppMini1)->bSizeTooFarFromMean_ && (*ppMini2)->bSizeTooFarFromMean_ ) return( 1 ); else { // both are within 1 standard dev of mean, or else both // are not within 1 standard dev of mean if ( (*ppMini1)->dErrorsFixedPerDollar_ > (*ppMini2)->dErrorsFixedPerDollar_ ) return( -1 ); else if ( (*ppMini1)->dErrorsFixedPerDollar_ == (*ppMini2)->dErrorsFixedPerDollar_ ) return( 0 ); else return( 1 ); } } void Contig :: suggestMinilibrary( contigEndPair* pPair ) { // we need to calculate the number of errors fixed for each // template and also what the cost of the template is. fprintf( pAO, "\nTrying to suggest minilibrary for gap between %s of %s and %s of %s\n", szWhichGap( pPair->nWhichEnd_[0] ), pPair->pContig_[0]->soGetName().data(), szWhichGap( pPair->nWhichEnd_[1] ), pPair->pContig_[1]->soGetName().data() ); RWTPtrOrderedVector* pSortedArrayOfMinilibraries = new RWTPtrOrderedVector( (size_t) 20, "suggestMinilibrary, RWTPtrOrderedVector" ); double dMaxErrorsFixedPerDollar = -1.0; subcloneTTemplate* pBestTemplate = NULL; int n; for( n = 0; n < pPair->pArrayOfTemplates_->length(); ++n ) { subcloneTTemplate* pSub = (*pPair->pArrayOfTemplates_)[ n ]; LocatedFragment* pLocFrag1; LocatedFragment* pLocFrag2; if ( !pSub->bHasForwardReversePairSortByContigName( pLocFrag1, pLocFrag2 ) ) { fprintf( pAO, "template %s does not have a fwd/rev pair so can't use it for minilibrary to close gap\n", pSub->soTemplateName_.data() ); continue; } int nUnpaddedLeft1; int nUnpaddedRight1; pLocFrag1->getFromStartOfInsertToEndOfContig( nUnpaddedLeft1, nUnpaddedRight1 ); double dErrorsContig1; int nTotalNonEditedBases1; double dErrorRate1; pLocFrag1->pContig_->getErrorInfo( nUnpaddedLeft1, nUnpaddedRight1, dErrorsContig1, nTotalNonEditedBases1, dErrorRate1 ); int nUnpaddedLeft2; int nUnpaddedRight2; pLocFrag2->getFromStartOfInsertToEndOfContig( nUnpaddedLeft2, nUnpaddedRight2 ); double dErrorsContig2; int nTotalNonEditedBases2; double dErrorRate2; pLocFrag2->pContig_->getErrorInfo( nUnpaddedLeft2, nUnpaddedRight2, dErrorsContig2, nTotalNonEditedBases2, dErrorRate2 ); int nTemplateSize = nUnpaddedRight1 - nUnpaddedLeft1 + pPair->nGapSize_ + nUnpaddedRight2 - nUnpaddedLeft2; double dGapErrors = ( pPair->nGapSize_ >= 0 ) ? (double) pPair->nGapSize_ : 0.0; double dTotalErrorsFixed = dErrorsContig1 + dErrorsContig2 + dGapErrors; // fprintf( pAO, "template %s has dErrorsContig1 = %.12g dErrorsContig2 = %.12g nGapSize = %d dTotalErrorsFixed = %.12g errors fixed\n", // pSub->soTemplateName_.data(), // dErrorsContig1, // dErrorsContig2, // nGapSize, // dTotalErrorsFixed ); double dErrorsFixedPerDollar = dTotalErrorsFixed / pSub->pLibrary_->dCostToMakeMinilibrary_; templateForMinilibrary* pMini = new templateForMinilibrary(); pMini->pSub_ = pSub; pMini->dErrorsFixed_ = dTotalErrorsFixed; pMini->dErrorsFixedPerDollar_ = dErrorsFixedPerDollar; pMini->nSize_ = nTemplateSize; // warning: standard deviation may not be set if there are no // forward/reverse pairs if ( ABS( pMini->nSize_ - pMini->pSub_->pLibrary_->nAverageInsertSizeToUse_ ) > pCP->dAutoFinishMinilibrariesPreferTemplateIfSizeThisManyStdDevsFromMean_ * pMini->pSub_->pLibrary_->nStandardDeviationOfInsertSize_ ) pMini->bSizeTooFarFromMean_ = true; else pMini->bSizeTooFarFromMean_ = false; pSortedArrayOfMinilibraries->insert( pMini ); } if ( pSortedArrayOfMinilibraries->entries() == 0 ) { fprintf( pAO, "Sorry--no acceptable templates for minilibraries for this gap\n" ); return; } // sort the templates void* pArray = (void*) pSortedArrayOfMinilibraries->data(); size_t nNumberOfTemplates = pSortedArrayOfMinilibraries->entries(); size_t nSizeOfAnElement = sizeof( templateForMinilibrary* ); qsort( pArray, nNumberOfTemplates, nSizeOfAnElement, ( (int(*) ( const void*, const void*) ) compareTemplatesForMinilibraries ) ); // check that they are now sorted for( n = 0; n < pSortedArrayOfMinilibraries->entries() - 1; ++n ) { templateForMinilibrary* pMini1 = (*pSortedArrayOfMinilibraries)[ n ]; templateForMinilibrary* pMini2 = (*pSortedArrayOfMinilibraries)[ n + 1 ]; if ( pMini1->bSizeTooFarFromMean_ == pMini2->bSizeTooFarFromMean_ ) { if ( pMini1->dErrorsFixedPerDollar_ < pMini2->dErrorsFixedPerDollar_ ) { PANIC_OST( ost ) << "Contig::suggestMinilibrary didn't work for elements " << n << " and " << n + 1 << " out of a total of " << pSortedArrayOfMinilibraries->entries() << ends; throw ProgramLogicError( ost.str().c_str() ); } } } // need to print out minilibrary pPair->pChoiceOfTemplatesForMinilibraries_ = new RWTPtrOrderedVector( (size_t) pCP->nAutoFinishSuggestThisManyMinilibrariesPerGap_, "suggestMinilibrary RWTPtrOrderedVector" ); templateForMinilibrary* pMini = (*pSortedArrayOfMinilibraries)[ 0 ]; fprintf( pAO, "MINILIBRARY{\n" ); fprintf( pAO, "best template: %s from lib %s\n", pMini->pSub_->soTemplateName_.data(), pMini->pSub_->pLibrary_->soName_.data() ); fprintf( pAO, "size: %d errors fixed: %.2f errors fixed per dollar: %.2f\n", pMini->nSize_, pMini->dErrorsFixed_, pMini->dErrorsFixedPerDollar_ ); fprintf( pAO, "connecting %s end of %s to %s end of %s with estimated gap size %d\n", ( pPair->nWhichEnd_[0] == nLeftGap ? "left" : "right" ), pPair->pContig_[0]->soGetName().data(), ( pPair->nWhichEnd_[1] == nLeftGap ? "left" : "right" ), pPair->pContig_[1]->soGetName().data(), pPair->nGapSize_ ); pPair->pChoiceOfTemplatesForMinilibraries_->insert( pMini ); for( int nMini = 1; nMini < pCP->nAutoFinishSuggestThisManyMinilibrariesPerGap_ && nMini < pSortedArrayOfMinilibraries->length(); ++nMini ) { pMini = (*pSortedArrayOfMinilibraries)[ nMini ]; fprintf( pAO, "\n" ); fprintf( pAO, "alternative template: %s from lib %s\n", pMini->pSub_->soTemplateName_.data(), pMini->pSub_->pLibrary_->soName_.data() ); fprintf( pAO, "size: %d errors fixed: %.2f errors fixed per dollar: %.2f\n", pMini->nSize_, pMini->dErrorsFixed_, pMini->dErrorsFixedPerDollar_ ); pPair->pChoiceOfTemplatesForMinilibraries_->insert( pMini ); } fprintf( pAO,"}\n\n" ); pPair->bAlreadySuggestedMinilibrariesForThisGap_ = true; // need to add it to the list of minilibraries--why? So that // we can print out a file of minilibraries? So we don't // call the same minilibrary again for the other end. // need to zero out error probabilities in arrays } void Contig :: zeroErrorProbabilitiesWhereChoseMinilibrary( contigEndPair* pPair ) { // handle case in which no minilibraries were chosen // (this can occur in pathological cases such as when // a gap is captured by a fwd/walk pair instead of a // fwd/rev pair) if ( !pPair->pChoiceOfTemplatesForMinilibraries_ ) return; if ( pPair->pChoiceOfTemplatesForMinilibraries_->length() == 0 ) return; templateForMinilibrary* pMini = (*pPair->pChoiceOfTemplatesForMinilibraries_)[ 0 ]; subcloneTTemplate* pSub = pMini->pSub_; LocatedFragment* pLocFrag1; LocatedFragment* pLocFrag2; assert( pSub->bHasAlignedForwardReversePair( pLocFrag1, pLocFrag2 ) ); pLocFrag1->zeroErrorProbabilitiesWhereChoseMinilibrary(); pLocFrag2->zeroErrorProbabilitiesWhereChoseMinilibrary(); // One contig has already had its cumulative probability array // calculated, so we need to recalculate it. The other one has // not, and thus we don't need to worry about it--it will be // correctly calculated when Autofinish reaches that contig. // DG, 4/01: the *cumulative* error probability array will // be correctly calculated, but the pUnpaddedErrorProbabilities_ // has to be adjusted (and is) when we are looking for minilibraries // for the second contig and find the same minilibrary. resetCumulativeErrorArray(); recalculateCurrentExpectedNumberOfErrorsForThisContig( dExpectedNumberOfErrorsInExtendedContig_, dExpectedNumberOfErrorsInConsensus_ ); } void Contig :: nearGapsSuggestEachMissingReadOfReadPairs() { if ( !bIsThisEndTheEndOfTheClone( nLeftGap ) ) { if ( bDoTagsAllowExtendingThisEnd( nLeftGap ) ) { suggestEachMissingReadOfReadPairsOnOneEnd( nLeftGap ); } else { fprintf( pAO, "not suggesting de novo reads to close gap on left end of contig %s because tags say not to close this gap\n", soGetName().data() ); } } if ( !bIsThisEndTheEndOfTheClone( nRightGap ) ) { if ( bDoTagsAllowExtendingThisEnd( nRightGap ) ) { suggestEachMissingReadOfReadPairsOnOneEnd( nRightGap ); } else { fprintf( pAO, "not suggesting de novo reads to close gap on right end of contig %s because tags say not to close this gap\n", soGetName().data() ); } } } void Contig :: suggestEachMissingReadOfReadPairsOnOneEnd( const int nWhichGap ) { int nHowFarDoWeStartLookingFromEnd = 0; for( int nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; int nStd = pLib->nStandardDeviationOfInsertSize_; if ( nStd <= 0 ) nStd = pLib->nAverageInsertSizeToUse_ / 5; // just a guess nHowFarDoWeStartLookingFromEnd = MAX( nHowFarDoWeStartLookingFromEnd, pLib->nAverageInsertSizeToUse_ + nStd * pCP->dAutoFinishStandardDeviationsFromMeanFromGapToLookForTemplatesForSuggestingEachMissingReadOfReadPairs_ ); } int nExtremeTemplateUnpaddedLeft; int nExtremeTemplateUnpaddedRight; bool bWantTopStrandReads; if ( nWhichGap == nLeftGap ) { nExtremeTemplateUnpaddedLeft = nGetUnpaddedStartIndex(); nExtremeTemplateUnpaddedRight = nExtremeTemplateUnpaddedLeft + nHowFarDoWeStartLookingFromEnd; bWantTopStrandReads = true; } else { nExtremeTemplateUnpaddedRight = nGetUnpaddedEndIndex(); nExtremeTemplateUnpaddedLeft = nExtremeTemplateUnpaddedRight - nHowFarDoWeStartLookingFromEnd; bWantTopStrandReads = false; } int nExtremeTemplateConsPosLeft = nPaddedIndexFast( nExtremeTemplateUnpaddedLeft ); int nExtremeTemplateConsPosRight = nPaddedIndexFast( nExtremeTemplateUnpaddedRight ); ContigView* pContigView = pGetContigView( nExtremeTemplateConsPosLeft, nExtremeTemplateConsPosRight ); for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); // if right gap, want bottom strand reads, so existing // read must be top (not complemented). if ( pLocFrag->bComp() != bWantTopStrandReads ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->nReadType_ != nUniversalForward && pLocFrag->nReadType_ != nUniversalReverse ) continue; if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) continue; // fix bug reported by Emmanuel in which gets the following error: // exception thrown: File 'locatedFragment.cpp', line 1123 in // LocatedFragment :: bDoesReadHaveAMate with read JoiningRead_1.a // with template name: JoiningRead_1 but couldn't find this read in // the list of reads sorted by template name // fixed 12/2/99 by following: if ( pLocFrag->bIsAFakeRead() ) continue; // now check that it comes from a template that is does not have // inconsistent reads or possibly misassembled reads // ask assembly for the template object of this read // ask the subclone template object if this template is ok // Then ask it if there is a read in another contig from // this same template (doesn't matter whether univ primer/univ primer // pair or univ primer/walk pair // (If performance ever becomes a problem, we could do this more // efficiently by putting the pointer pSub into each locatedFragment // and thus saving this expensive lookup.) subcloneTTemplate* pSub = ConsEd::pGetAssembly()->pGetSubcloneTemplateByTemplateName( pLocFrag->soGetTemplateName() ); // I can't think of any reason that the template object would not // be there, but let's handle that case anyway. if ( !pSub ) continue; if ( !pSub->bOKToUseThisTemplate_ ) continue; bool bTopNotBottomStrandOfSuggestedRead; int nReadTypeOfSuggestedRead; if ( !pSub->bOKToCallDeNovoUniversalPrimerRead( bTopNotBottomStrandOfSuggestedRead, nReadTypeOfSuggestedRead, this ) ) continue; // if we have reached here, it is ok to use this template to // call a de novo read. if ( bTopNotBottomStrandOfSuggestedRead != bWantTopStrandReads ) { cerr << "How could this happen: fwd/rev universal primer reads on same strand template: " << pSub->soGetName() << endl; continue; } // check that this template will likely extend the contig int nStdOfThisLibrary = pSub->pLibrary_->nStandardDeviationOfInsertSize_; if ( nStdOfThisLibrary <= 0 ) nStdOfThisLibrary = pSub->pLibrary_->nAverageInsertSizeToUse_ / 5; // just a guess int nHowFarCanThisTemplateBeFromEndOfContig = pSub->pLibrary_->nAverageInsertSizeToUse_ + nStdOfThisLibrary * pCP->dAutoFinishStandardDeviationsFromMeanFromGapToLookForTemplatesForSuggestingEachMissingReadOfReadPairs_; if ( nWhichGap == nLeftGap ) { int nExtremeTemplateUnpaddedRight2 = nGetUnpaddedStartIndex() + nHowFarCanThisTemplateBeFromEndOfContig; // ----------------------------------------- contig // ^ nExtremeTemplateUnpaddedRight2 // ---------- ok template // ------------ not ok template if ( nExtremeTemplateUnpaddedRight2 < pSub->nUnpaddedEnd_ ) continue; // check for this case (the left vector-insert junction is // on the consensus so adding the reverse won't extend the contig): // xxx--- if ( pSub->bUnpaddedStartKnownPrecisely_ && nGetUnpaddedStartIndex() <= pSub->nUnpaddedStart_ ) continue; } else { // contig --------------------------------------------- // ^ nUnpaddedLeft // ok template ----------- // not ok template ------------- int nExtremeTemplateUnpaddedLeft2 = nGetUnpaddedEndIndex() - nHowFarCanThisTemplateBeFromEndOfContig; if ( pSub->nUnpaddedStart_ < nExtremeTemplateUnpaddedLeft2 ) continue; // check for this case: // ---xxx if ( pSub->bUnpaddedEndKnownPrecisely_ && pSub->nUnpaddedEnd_ <= nGetUnpaddedEndIndex() ) continue; } // perhaps we already chose this for flanking gaps if ( pSub->bChosenPriorToMakingListOfExperimentCandidates_ ) continue; // has this de novo universal primer ever been chosen before by // Autofinish? if ( ConsEd::pGetAssembly()->bHasUniversalPrimerReadBeenTriedBeforeByAutoFinish( pSub, nReadTypeOfSuggestedRead ) ) { fprintf( pAO,"Not choosing %s of template %s because this was chosen by Autofinish before and it failed\n", szGetReadType2( nReadTypeOfSuggestedRead ), pSub->soGetName().data() ); continue; } // if reached here, good chance that a de novo read can extend the // contig // point of no return--we will choose this read int nFirstBaseOfReadUnpadded; int nUnpaddedLeft; int nUnpaddedRight; if ( nWhichGap == nLeftGap ) { nFirstBaseOfReadUnpadded = pSub->nUnpaddedStart_ - pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_; nUnpaddedLeft = pSub->nUnpaddedStart_; nUnpaddedRight = nFirstBaseOfReadUnpadded + ConsEd::pGetAssembly()->pModelRead_->length() - 1; } else { // right gap nFirstBaseOfReadUnpadded = pSub->nUnpaddedEnd_ + pCP->nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_; nUnpaddedLeft = nFirstBaseOfReadUnpadded - ConsEd::pGetAssembly()->pModelRead_->length() + 1; nUnpaddedRight = pSub->nUnpaddedEnd_; } experiment* pExp = new experiment(); pExp->dErrorsFixed_ = dGetErrorsFixedForSubcloneRead( &pSub, 1, // only 1 subclone bTopNotBottomStrandOfSuggestedRead, nFirstBaseOfReadUnpadded, true, // exclude vector bases false, // bChooseExperiment false, // bJustConsensusNotGaps true ); // bFindWindowOfMostErrorsFixed readTypes ucChemistryAndStrand = ( bTopNotBottomStrandOfSuggestedRead ? TOP_STRAND_TERMINATOR_READS : BOTTOM_STRAND_TERMINATOR_READS ); pExp->ucStrandAndChemistry_ = ucChemistryAndStrand; pExp->nReadType_ = nReadTypeOfSuggestedRead; pExp->bCloneTemplateNotSubcloneTemplate_ = false; pExp->fCost_ = pCP->dAutoFinishCostOfDeNovoUniversalPrimerSubcloneReaction_; pExp->pCustomPrimer_ = 0; // must figure out the errors on just the consensus pExp->pContig_ = this; pExp->pSubcloneTemplate_ = pSub; pExp->nFirstBaseOfReadUnpadded_ = nFirstBaseOfReadUnpadded; pExp->nUnpaddedLeft_ = nUnpaddedLeft; pExp->nUnpaddedRight_ = nUnpaddedRight; // case in which the model read is shorter than the // parameter nAutoFinishNumberOfVectorBasesAtBeginningOfAUniveralPrimerRead_ if ( pExp->nUnpaddedLeft_ > pExp->nUnpaddedRight_ ) { delete pExp; continue; } pExp->bTopStrandNotBottomStrand_ = bTopNotBottomStrandOfSuggestedRead; pExp->bDeNovoUniversalPrimerRead_ = true; setVariousErrorData( pExp ); if ( pExp->bHasThisExperimentOrOneLikeItBeenTriedBefore() ) { delete pExp; continue; } pExp->nPurpose_ = nFixWeakRegion; pExp->chooseExperiment(); // commented-out July, 2002 so that when reverses are chosen, // oligo walks into gap still must overlap consensus by enough. // possiblyUpdateContigHighQualitySegment( pExp ); pChosenExperiments_->insert( pExp ); pUniversalPrimerExperimentsForLowQualityRegions_->insert( pExp ); ConsEd::pGetAssembly()->pAllChosenExperiments_->insert( pExp ); ConsEd::pGetAssembly()->bAnyReadsToCloseGaps_ = true; adjustExpectedNumberOfErrors( pExp ); findWindowOfMostErrorsFixed( pExp ); pExp->print(); fprintf( pAO, "After this experiment,\ncalculated number of errors in extended contig: %.2f and consensus: %.2f\n\n", dExpectedNumberOfErrorsInExtendedContig_, dExpectedNumberOfErrorsInConsensus_ ); pSub->bChosenPriorToMakingListOfExperimentCandidates_ = true; adjustRemainingExperimentCandidates( pExp ); } } void Contig :: findNeighborForOneGap( contigEndPair*& pPair, const int nWhichGap, int& nNumberOfExistingForwardReversePairs ) { int nConsPosLeft; int nConsPosRight; if ( nWhichGap == nLeftGap ) { nConsPosLeft = nGetStartIndex(); nConsPosRight = nConsPosLeft + pCP->nPrimersMaxInsertSizeOfASubclone_; } else { nConsPosRight = nGetEndIndex(); nConsPosLeft = nConsPosRight - pCP->nPrimersMaxInsertSizeOfASubclone_; } const int nInitialNumberOfContigEnds = 20; // These arrays are like this: // aContigsOfMates aEndsOfContigsOfMates aNumberOfMatesPerContigEnd // Contig1 nLeftGap 1 // Contig1 nRightGap 1 // Contig2 nLeftGap 0 // Contig2 nRightGap 2 // . . . // // so the first 2 arrays are the keys and the value is the 3rd array RWTValOrderedVector aContigsOfMates( nInitialNumberOfContigEnds, "findNeighborForOneGap aContigsOfMates" ); RWTValOrderedVector aEndsOfContigsOfMates( nInitialNumberOfContigEnds, "findNeighborForOneGap aEndsOfContigsOfMates" ); RWTValOrderedVector aNumberOfMatesPerContigEnd( nInitialNumberOfContigEnds, "findNeighborForOneGap aNumberOfMatesPerContigEnd" ); // for each contig/end in these arrays, this give an *array* of templates // that are informative as to that contig/end being the one next to // the one we are considering now. Note that since we are considering // all reads near the end of the contig, including resequencing reads, // we may have multiple universal primers from the same template. // Thus we could have the following: // fwd1 ---> <--- (rev) // fwd2 ---> // (All from same template.) // This means that for the same contig/end, the arrayOfTemplatesNames // may have the same template more than once--but I'm going to change // that right now... RWTValOrderedVector aInformativeTemplates( nInitialNumberOfContigEnds, "findNeighborForOneGap aInformativeTemplates" ); bool bSomeReadsPrinted = false; // Get all reads that are within nPrimersMaxInsertSizeOfASubclone_ // of the end of the contig. ContigView* pContigView = pGetContigView( nConsPosLeft, nConsPosRight ); bool bWantBottomStrandReads = ( nWhichGap == nLeftGap ) ? true : false; for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bComp() != bWantBottomStrandReads ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; // according to Chad Nusbaum 6/16/99, a univ primer read/walking read // pair can capture a gap. Thus no check if the read is // a walk or not. if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; // get template of the read. // If performance ever becomes a problem, we could do this more // efficiently by putting the pointer pSub into each // locatedFragment and thus saving this expensive lookup. subcloneTTemplate* pSub = ConsEd::pGetAssembly()->pGetSubcloneTemplateByTemplateName( pLocFrag->soGetTemplateName() ); // I can't think of any reason that the template object would not be // there. if ( !pSub ) continue; // Ask the subclone template if this // template is ok. "OK"--a template should not be excluded // just because it is in the badTemplates.txt or badLibraries.txt // file. Thus I am not using bOKToUseThisTemplate_ if ( pSub->bReadsAreInconsistent_ || pSub->bReadPairTooFarApart_ || pSub->bShortInsert_ || pSub->bUnalignedHighQualityRegionTooLong_ || pSub->bIsChimeric_ || pSub->bInconsistentGapSpanning_ || /* unfortunately, this hasn't been set yet */ ( pSub->bHasSeriousHighQualityDiscrepancies_ && !pSub->bHasForwardAndReversePair_ ) ) continue; LocatedFragment* pLocFragMate = NULL; if ( !pSub->bHasAnotherReadInADifferentContig( pLocFrag, pLocFragMate ) ) continue; // so we've found another read in a different contig! Contig* pMateContig = pLocFragMate->pGetContig(); if ( !bSomeReadsPrinted ) { bSomeReadsPrinted = true; if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO,"read contig strand (start end) mate-read mate-contig strand (start end) mate-contig-length\n" ); } } if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO,"%-20s %-10s %2s %7d %7d %-20s %-10s %2s %7d %7d %8d\n", (char*) pLocFrag->soGetName().data(), pLocFrag->pGetContig()->soGetName().data(), ( pLocFrag->bComp() ? "<-" : "->" ), pLocFrag->nGetAlignStartUnpadded(), pLocFrag->nGetAlignEndUnpadded(), (char*) pLocFragMate->soGetName().data(), (char*) pLocFragMate->pGetContig()->soGetName().data(), ( pLocFragMate->bComp() ? "<-" : "->" ), pLocFragMate->nGetAlignStartUnpadded(), pLocFragMate->nGetAlignEndUnpadded(), pLocFragMate->pGetContig()->nGetUnpaddedSeqLength() ); } int nMateWhichEndOfContig; // this should return true since otherwise // bReadsAreInconsistent_ would be set. So why are we checking // this? (DG, 4/01) bool bMateOK; if ( pLocFragMate->bIsWalkingNotUniversalPrimerRead() ) { bMateOK = pLocFragMate->bIsReadNearEndOfContig( nMateWhichEndOfContig ); if ( !bMateOK ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " Mate not near end of contig\n\n" ); } } } else { bMateOK = pLocFragMate->bIsReadNearEndOfContigAndPointingOut( nMateWhichEndOfContig ); if ( !bMateOK ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " Mate not near end of contig and pointing out.\n\n" ); } } } bool bFound = false; int nFoundIndex; for( int nIndex = 0; nIndex < aContigsOfMates.entries() && !bFound; ++nIndex ) { if ( ( pMateContig == aContigsOfMates[ nIndex ] ) && ( nMateWhichEndOfContig == aEndsOfContigsOfMates[ nIndex ] ) ) { bFound = true; nFoundIndex = nIndex; } } if ( !bFound ) { aContigsOfMates.insert( pMateContig ); aEndsOfContigsOfMates.insert( nMateWhichEndOfContig ); aNumberOfMatesPerContigEnd.insert( 1 ); arrayOfTemplates* pArrayOfTemplates = new RWTValOrderedVector( 4, "findNeighborForOneGap" ); pArrayOfTemplates->insert( pSub ); aInformativeTemplates.insert( pArrayOfTemplates ); } else { // we might not count it--let's see if there is // already a fwd/rev pair from the same template // that is already counted... arrayOfTemplates* pArrayOfTemplates = aInformativeTemplates[ nFoundIndex ]; if ( pArrayOfTemplates->index( pSub ) == RW_NPOS ) { ++aNumberOfMatesPerContigEnd[ nFoundIndex ]; pArrayOfTemplates->insert( pSub ); } } } // for( int nRead = 0; ... Contig* pMateContig1 = NULL; // first choice of mate contig end Contig* pMateContig2 = NULL; // second choice of mate contig end int nContigEnd1; int nContigEnd2; int nNumberOfForwardReversePairs1 = 0; int nNumberOfForwardReversePairs2 = 0; arrayOfTemplates* pArrayOfTemplateNames1; arrayOfTemplates* pArrayOfTemplateNames2; // find the contig/end that has the most forward/reverse pairs with // the contig/end in question. Find the contig/end that has the next // most forward/reverse pairs with the contig/end in question. if ( aContigsOfMates.entries() != 0 ) { int nMateContig; for( nMateContig = 0; nMateContig < aContigsOfMates.entries(); ++nMateContig ) { if ( aNumberOfMatesPerContigEnd[ nMateContig ] > nNumberOfForwardReversePairs1 ) { nNumberOfForwardReversePairs2 = nNumberOfForwardReversePairs1; nContigEnd2 = nContigEnd1; pMateContig2 = pMateContig1; pArrayOfTemplateNames2 = pArrayOfTemplateNames1; nNumberOfForwardReversePairs1 = aNumberOfMatesPerContigEnd[ nMateContig ]; nContigEnd1 = aEndsOfContigsOfMates[ nMateContig ]; pMateContig1 = aContigsOfMates[ nMateContig ]; pArrayOfTemplateNames1 = aInformativeTemplates[ nMateContig ]; } else if ( aNumberOfMatesPerContigEnd[ nMateContig ] > nNumberOfForwardReversePairs2 ) { nNumberOfForwardReversePairs2 = aNumberOfMatesPerContigEnd[ nMateContig ]; nContigEnd2 = aEndsOfContigsOfMates[ nMateContig ]; pMateContig2 = aContigsOfMates[ nMateContig ]; pArrayOfTemplateNames2 = aInformativeTemplates[ nMateContig ]; } } // start code for eliminating inconsistent gap-spanning templates // If there is one contig that has the required number of // forward/reverse pairs, then any other template with // forward/reverse pairs to any other contig will be considered // bad--one of the reads from it is either mislabelled or // misassembled. // If there are 2 different contig ends that both have the // required number of fwd/rev pairs, we do not eliminate either // of them since we really don't know which is correct. Either // is available for walking and minilibraries. if ( nNumberOfForwardReversePairs1 >= pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) { for( nMateContig = 0; nMateContig < aContigsOfMates.entries(); ++nMateContig ) { if ( aNumberOfMatesPerContigEnd[ nMateContig ] < pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) { arrayOfTemplates* pArrayOfTemplates = aInformativeTemplates[ nMateContig ]; for( int nSub = 0; nSub < pArrayOfTemplates->length(); ++nSub ) { subcloneTTemplate* pSub = (*pArrayOfTemplates)[ nSub ]; pSub->bInconsistentGapSpanning_ = true; pSub->bOKToUseThisTemplate_ = false; } } } } // if ( nNumberOfForwardReversePairs1 >= // end of code for eliminating inconsistent gap-spanning templates if ( nNumberOfForwardReversePairs1 >= pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) { int nGapSize = nGetGapSize( pArrayOfTemplateNames1 ); if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "Enough existing fwd/rev pairs to establish:\n" ); fprintf( pAO, "%s end of %s has %d fwd/rev pairs connecting it to\n%s end of %s with gap size %d %s\n", ( ( nContigEnd1 == nLeftGap ) ? "Left" : "Right" ), (char*) pMateContig1->soGetName().data(), nNumberOfForwardReversePairs1, ( ( nWhichGap == nLeftGap ) ? "Left" : "Right" ), (char*) soGetName().data(), nGapSize, ( nGapSize < 0 ? "(contigs overlap)" : "" ) ); } // I put the contigs in alphabetical order so that // contigEndPairs are easy to compare to see if they // are for the same contig ends pPair = new contigEndPair( this, pMateContig1, nWhichGap, nContigEnd1 ); // now the ctor does all this // if ( soGetName() < pMateContig1->soGetName() ) { // pPair->nWhichEnd_[0] = nWhichGap; // pPair->pContig_[0] = this; // pPair->nWhichEnd_[1] = nContigEnd1; // pPair->pContig_[1] = pMateContig1; // } // else { // pPair->nWhichEnd_[0] = nContigEnd1; // pPair->pContig_[0] = pMateContig1; // pPair->nWhichEnd_[1] = nWhichGap; // pPair->pContig_[1] = this; // } pPair->nNumberOfForwardReversePairs_ = nNumberOfForwardReversePairs1; pPair->nGapSize_ = nGapSize; pPair->pArrayOfTemplates_ = pArrayOfTemplateNames1; if ( nNumberOfForwardReversePairs2 >= pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "Warning: another contig is a candidate:\n" ); fprintf( pAO, "%s end of %s has %d fwd/rev pairs connecting it to this end of this contig\n", ( ( nContigEnd2 == nLeftGap ) ? "Left" : "Right" ), (char*) pMateContig2->soGetName().data(), nNumberOfForwardReversePairs2 ); } } } // if ( nNumberOfForwardReversePairs1 >= } // if ( aContigsOfMates.entries() != 0 ) { nNumberOfExistingForwardReversePairs = nNumberOfForwardReversePairs1; } // this differs from findNeighborForOneGap (above) in that it returns // *all* neighbors (with sufficient linking information)--not just the // best one. void Contig :: findNeighborForOneGap2( const int nWhichGap, RWTPtrSortedVector aContigsConsideredSoFar, // this must be already created and we will add to it: RWTPtrOrderedVector*& pArrayOfContigEndPairs ) { int nConsPosLeft; int nConsPosRight; if ( nWhichGap == nLeftGap ) { nConsPosLeft = nGetStartIndex(); nConsPosRight = nConsPosLeft + pCP->nPrimersMaxInsertSizeOfASubclone_; } else { nConsPosRight = nGetEndIndex(); nConsPosLeft = nConsPosRight - pCP->nPrimersMaxInsertSizeOfASubclone_; } const int nInitialNumberOfContigEnds = 20; // These arrays are like this: // aContigsOfMates aEndsOfContigsOfMates aNumberOfMatesPerContigEnd // Contig1 nLeftGap 1 // Contig1 nRightGap 1 // Contig2 nLeftGap 0 // Contig2 nRightGap 2 // . . . // // so the first 2 arrays are the keys and the value is the 3rd array RWTValOrderedVector aContigsOfMates( nInitialNumberOfContigEnds, "findNeighborForOneGap aContigsOfMates" ); RWTValOrderedVector aEndsOfContigsOfMates( nInitialNumberOfContigEnds, "findNeighborForOneGap aEndsOfContigsOfMates" ); RWTValOrderedVector aNumberOfMatesPerContigEnd( nInitialNumberOfContigEnds, "findNeighborForOneGap aNumberOfMatesPerContigEnd" ); // for each contig/end in these arrays, this give an *array* of templates // that are informative as to that contig/end being the one next to // the one we are considering now. Note that since we are considering // all reads near the end of the contig, including resequencing reads, // we may have multiple universal primers from the same template. // Thus we could have the following: // fwd1 ---> <--- (rev) // fwd2 ---> // (All from same template.) // This means that for the same contig/end, the arrayOfTemplatesNames // may have the same template more than once--but I'm going to change // that right now... // Note: this array is indexed by the same index as the 3 arrays above. RWTValOrderedVector aInformativeTemplates( nInitialNumberOfContigEnds, "findNeighborForOneGap aInformativeTemplates" ); bool bSomeReadsPrinted = false; // Get all reads that are within nPrimersMaxInsertSizeOfASubclone_ // of the end of the contig. RWTPtrOrderedVector aReadsFromSameTemplateButInDifferentContigs( (size_t) 3 ); ContigView* pContigView = pGetContigView( nConsPosLeft, nConsPosRight ); bool bWantBottomStrandReads = ( nWhichGap == nLeftGap ) ? true : false; for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsUniversalPrimerRead() && ( pLocFrag->bComp() != bWantBottomStrandReads ) ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; // according to Chad Nusbaum 6/16/99, a univ primer read/walking read // pair can capture a gap. Thus no check if the read is // a walk or not. 5/04: Nuts to Chad Nusbaum: walks are not // as useful since we don't know which strand they are on: // contigA contigB // ---------- ---------- // ---> <--- // fwd walk // or is the orientation like this? // contigA contigB // ---------- --------- // ---> ---> // fwd walk // With a walk we can't tell. They occur as links so seldom (according // to Rachel), that I'm not going to worry about them. // missing some important PCR links, so I'm going to use // walks after all. Hat off to Chad Nusbaum. // if ( !pLocFrag->bIsUniversalPrimerRead() ) continue; if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; // get template of the read. subcloneTTemplate* pSub = pLocFrag->pSub_; if ( !pSub ) { if ( !pLocFrag->bIsAShortRead() ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( stderr, "read %s in contig %s has no pSub_\n", pLocFrag->soGetName().data(), this->soGetName().data() ); } } continue; } // Ask the subclone template if this // template is ok. "OK"--a template should not be excluded // just because it is in the badTemplates.txt or badLibraries.txt // file. Thus I am not using bOKToUseThisTemplate_ if ( pSub->bReadsAreInconsistent_ || pSub->bReadPairTooFarApart_ || pSub->bShortInsert_ || pSub->bUnalignedHighQualityRegionTooLong_ || pSub->bIsChimeric_ || pSub->bInconsistentGapSpanning_ || /* unfortunately, this hasn't been set yet */ ( pSub->bHasSeriousHighQualityDiscrepancies_ && !pSub->bHasForwardAndReversePair_ ) ) continue; aReadsFromSameTemplateButInDifferentContigs.clear(); if ( !pSub->bHasAnotherReadInADifferentContig2( pLocFrag, &aReadsFromSameTemplateButInDifferentContigs ) ) continue; // so we've found another read in a different contig! for( int nReadMate = 0; nReadMate < aReadsFromSameTemplateButInDifferentContigs.length(); ++nReadMate ) { LocatedFragment* pLocFragMate = aReadsFromSameTemplateButInDifferentContigs[ nReadMate ]; Contig* pMateContig = pLocFragMate->pGetContig(); // no point in processing this pair if we have already considered // this contig before if ( aContigsConsideredSoFar.bContains( pMateContig ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "already considered pair %s %s between contigs %s and %s\n", pLocFrag->soGetName().data(), pLocFragMate->soGetName().data(), this->soGetName().data(), pMateContig->soGetName().data() ); } continue; } if ( !bSomeReadsPrinted ) { bSomeReadsPrinted = true; if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO,"read contig strand (start end) mate-read mate-contig strand (start end) mate-contig-length\n" ); } } if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO,"%-20s %-10s %2s %7d %7d %-20s %-10s %2s %7d %7d %8d\n", (char*) pLocFrag->soGetName().data(), pLocFrag->pGetContig()->soGetName().data(), ( pLocFrag->bComp() ? "<-" : "->" ), pLocFrag->nGetAlignStartUnpadded(), pLocFrag->nGetAlignEndUnpadded(), (char*) pLocFragMate->soGetName().data(), (char*) pLocFragMate->pGetContig()->soGetName().data(), ( pLocFragMate->bComp() ? "<-" : "->" ), pLocFragMate->nGetAlignStartUnpadded(), pLocFragMate->nGetAlignEndUnpadded(), pLocFragMate->pGetContig()->nGetUnpaddedSeqLength() ); } int nMateWhichEndOfContig; // this should return true since otherwise // bReadsAreInconsistent_ would be set. So why are we checking // this? (DG, 4/01) bool bMateOK; if ( pLocFragMate->bIsWalkingNotUniversalPrimerRead() ) { bMateOK = pLocFragMate->bIsReadNearEndOfContig( nMateWhichEndOfContig ); if ( !bMateOK ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " Mate not near end of contig\n\n" ); } continue; } } else { bMateOK = pLocFragMate->bIsReadNearEndOfContigAndPointingOut( nMateWhichEndOfContig ); if ( !bMateOK ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " Mate not near end of contig and pointing out.\n\n" ); } continue; } } bool bFound = false; int nFoundIndex; for( int nIndex = 0; nIndex < aContigsOfMates.entries() && !bFound; ++nIndex ) { if ( ( pMateContig == aContigsOfMates[ nIndex ] ) && ( nMateWhichEndOfContig == aEndsOfContigsOfMates[ nIndex ] ) ) { bFound = true; nFoundIndex = nIndex; } } if ( !bFound ) { aContigsOfMates.insert( pMateContig ); aEndsOfContigsOfMates.insert( nMateWhichEndOfContig ); aNumberOfMatesPerContigEnd.insert( 1 ); arrayOfTemplates* pArrayOfTemplates = new RWTValOrderedVector( 4, "findNeighborForOneGap2::pArrayOfTemplates" ); pArrayOfTemplates->insert( pSub ); aInformativeTemplates.insert( pArrayOfTemplates ); } else { // we might not count it--let's see if there is // already a fwd/rev pair from the same template // that is already counted... This can occur, for example // when there are several fwds that correctly all assemble // near the end of a template. arrayOfTemplates* pArrayOfTemplates = aInformativeTemplates[ nFoundIndex ]; if ( pArrayOfTemplates->index( pSub ) == RW_NPOS ) { ++aNumberOfMatesPerContigEnd[ nFoundIndex ]; pArrayOfTemplates->insert( pSub ); } } } // for( int nReadMate = 0; } // for( int nRead = 0; ... if ( aContigsOfMates.entries() != 0 ) { int nMateContig; for( nMateContig = 0; nMateContig < aContigsOfMates.entries(); ++nMateContig ) { if ( aNumberOfMatesPerContigEnd[ nMateContig ] >= pCP->nAutoFinishCallHowManyReversesToFlankGaps_ ) { // this will be a contigEndPair contigEndPair* pPair = new contigEndPair( this, aContigsOfMates[ nMateContig ], nWhichGap, aEndsOfContigsOfMates[ nMateContig ] ); // warning--this is pointing to memory that // must not be deallocated pPair->pArrayOfTemplates_ = aInformativeTemplates[ nMateContig ]; pPair->nNumberOfForwardReversePairs_ = aNumberOfMatesPerContigEnd[ nMateContig ]; pPair->nGapSize_ = nGetGapSize( pPair->pArrayOfTemplates_ ); pPair->calculateHighQualityGapSize(); if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "gap from %s of %s to %s of %s high quality size %d based on %d links\n", szWhichGap( pPair->nWhichEnd_[0] ), pPair->pContig_[0]->soGetName().data(), szWhichGap( pPair->nWhichEnd_[1] ), pPair->pContig_[1]->soGetName().data(), pPair->nHighQualityGapSize_, pPair->nNumberOfForwardReversePairs_ ); } pPair->calculateHighQualityGapSize(); pArrayOfContigEndPairs->insert( pPair ); } else { // case in which this array is not being referenced // by a contigEndPair and thus won't get deleted // when the contigEndPairs are deleted arrayOfTemplates* pArrayOfTemplates = aInformativeTemplates[ nMateContig ]; delete pArrayOfTemplates; aInformativeTemplates[ nMateContig ] = NULL; } } } // if ( aContigsOfMates.entries() != 0 ) { }