/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "consed.h" #include "consedParameters.h" #include "subcloneTTemplate.h" #include "locatedFragment.h" #include "assembly.h" #include "contig.h" #include "min.h" #include "max.h" #include "abs.h" #include #include "assert.h" #include "fileDefines.h" #include "dErrorRateFromQuality.h" #include "library.h" #include "fwdRevPair2.h" #include "popupErrorMessage.h" subcloneTTemplate :: subcloneTTemplate( LocatedFragment* pLocFrag ) : bOKToUseThisTemplate_( true ), bBadTemplateFile_( false ), bBadLibraryFile_( false ), bShortInsert_( false ), bReadsAreInconsistent_( false ), bReadPairTooFarApart_( false ), bUnalignedHighQualityRegionTooLong_( false ), bHasSeriousHighQualityDiscrepancies_( false ), bIsChimeric_( false ), bInconsistentGapSpanning_( false ), bDoNotShowInAssemblyView_( false ), // bTemplateIsDoubleStranded_( false ), bOKToUseForTopStrandReads_( false ), bOKToUseForBottomStrandReads_( false ), bErrorRateCalculated_( false ), aContigs_( (size_t) 3 ), aUnpaddedLeft_( (size_t) 3 ), aUnpaddedRight_( (size_t) 3 ), aFlags_( (size_t) 3 ), nInsertSizeFromForwardReversePair_( -1 ), bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_( false ), bChosenPriorToMakingListOfExperimentCandidates_( false ), pLibrary_( NULL ) { soTemplateName_ = pLocFrag->soGetTemplateName(); // do not do this so soon--it is done later in // flagAllSubclonesIfBadTemplateOrBadLibrary // bBadTemplateFile_ = !ConsEd::pGetAssembly()->bIsThisTemplateOKNotBad( soTemplateName_ ); // use from library // bTemplateIsDoubleStranded_ = // pCP->bPrimersAssumeTemplatesAreDoubleStrandedUnlessSpecified_; aReads_.insert( pLocFrag ); pLocFrag->pSub_ = this; // add thyself to the assembly array ConsEd::pGetAssembly()->subcloneTTemplateArray_.insert( this ); } // copy constructor for newsub = new subcloneTTemplate( *this ); subcloneTTemplate :: subcloneTTemplate( const subcloneTTemplate& sub) { soTemplateName_ = sub.soTemplateName_; soLibrary_ = sub.soLibrary_; pLibrary_ = sub.pLibrary_; for( int nRead = 0; nRead < sub.aReads_.length(); ++nRead ) { aReads_.insert( sub.aReads_[ nRead ] ); // should the LocatedFragment::pSub_ be changed to point here? } bOKToUseThisTemplate_ = sub.bOKToUseThisTemplate_; bBadTemplateFile_ = sub.bBadTemplateFile_; bBadLibraryFile_ = sub.bBadLibraryFile_; bShortInsert_ = sub.bShortInsert_; bReadsAreInconsistent_ = sub.bReadsAreInconsistent_; bReadPairTooFarApart_ = sub.bReadPairTooFarApart_; bUnalignedHighQualityRegionTooLong_ = sub.bUnalignedHighQualityRegionTooLong_; bHasSeriousHighQualityDiscrepancies_ = sub.bHasSeriousHighQualityDiscrepancies_; bIsChimeric_ = sub.bIsChimeric_; bInconsistentGapSpanning_ = sub.bInconsistentGapSpanning_; // use from library // bTemplateIsDoubleStranded_ = sub.bTemplateIsDoubleStranded_; bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ = sub.bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_; nInsertSizeFromForwardReversePair_ = sub.nInsertSizeFromForwardReversePair_; bChosenPriorToMakingListOfExperimentCandidates_ = sub.bChosenPriorToMakingListOfExperimentCandidates_; // aContigs_ is the list of contigs for which there // is a universal primer read // In each such contig, aUnpaddedLeft_ and aUnpaddedRight_ give // the starting and ending location of the template. // aFlags indicates whether these locations are known exactly // or approximately. (See constants above for values.) for( int nContig = 0; nContig < sub.aContigs_.length(); ++nContig ) { aContigs_.insert( sub.aContigs_[ nContig ] ); aUnpaddedLeft_.insert( sub.aUnpaddedLeft_[ nContig ] ); aUnpaddedRight_.insert( sub.aUnpaddedRight_[ nContig ] ); aFlags_.insert( sub.aFlags_[ nContig ] ); } bHasForwardAndReversePair_ = sub.bHasForwardAndReversePair_; // used to search for a template for a primer // Note: this structure pContig_ = sub.pContig_; // Start means Left and End means Right nUnpaddedStart_ = sub.nUnpaddedStart_; nUnpaddedEnd_ = sub.nUnpaddedEnd_; bUnpaddedStartKnownPrecisely_ = sub.bUnpaddedStartKnownPrecisely_; bUnpaddedEndKnownPrecisely_ = sub.bUnpaddedEndKnownPrecisely_; bOKToUseForTopStrandReads_ = sub.bOKToUseForTopStrandReads_; bOKToUseForBottomStrandReads_ = sub.bOKToUseForBottomStrandReads_; bErrorRateCalculated_ = sub.bErrorRateCalculated_; dErrorRate_ = sub.dErrorRate_; } void subcloneTTemplate :: transferLibraryNameFromReadsToSubcloneTTemplate() { // determine the subclone library name for( int nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( soLibrary_.isNull() && !pLocFrag->soLibrary_.isNull() ) soLibrary_ = pLocFrag->soLibrary_; else if ( !soLibrary_.isNull() && !pLocFrag->soLibrary_.isNull() && pLocFrag->soLibrary_ != soLibrary_ ) { RWCString soError = "error: reads from the same template say different libraries. "; soError += "Will use library: \""; soError += soLibrary_; soError += "\""; for( int nRead2 = 0; nRead2 < aReads_.entries(); ++nRead2 ) { LocatedFragment* pLocFrag2 = aReads_[ nRead2 ]; soError += " read: "; soError += pLocFrag2->soGetName(); soError += " is from library: "; soError += pLocFrag2->soLibrary_; } // this used to throw an exception (prior to July 2005) which // then caused some templates to not be assigned to libraries // which caused seg fault popupErrorMessage( soError ); } } } void subcloneTTemplate :: processingAfterAllReadsAdded() { int nRead; for( nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWalkingNotUniversalPrimerRead() ) continue; if ( pLocFrag->bWholeReadIsUnaligned_ ) continue; // no information from this read Contig* pContig = pLocFrag->pGetContig(); int nIndex = aContigs_.index( pLocFrag->pGetContig() ); if ( nIndex == RW_NPOS ) { aContigs_.insert( pContig ); aUnpaddedLeft_.insert( -1 ); aUnpaddedRight_.insert( -1 ); aFlags_.insert( 0 ); nIndex = aContigs_.entries() - 1; } int nFirstVectorInsertJunctionConsPos; int nSecondVectorInsertJunctionConsPos; bool bFoundSecondVectorInsertJunction; pLocFrag->findVectorInsertJunctionsInUniversalPrimerRead( nFirstVectorInsertJunctionConsPos, nSecondVectorInsertJunctionConsPos, bFoundSecondVectorInsertJunction ); if ( pLocFrag->bComp() ) setRightUnpaddedExact( pContig->nUnpaddedIndex( nFirstVectorInsertJunctionConsPos ), nIndex ); else setLeftUnpaddedExact( pContig->nUnpaddedIndex( nFirstVectorInsertJunctionConsPos ), nIndex ); if ( bFoundSecondVectorInsertJunction ) { if ( pLocFrag->bComp() ) setLeftUnpaddedExact( pContig->nUnpaddedIndex( nSecondVectorInsertJunctionConsPos ), nIndex ); else setRightUnpaddedExact( pContig->nUnpaddedIndex( nSecondVectorInsertJunctionConsPos ), nIndex ); } } // now use walking reads // If the walking read is in the same contig as a universal primer, // and is 5' from it, then it is informative. for( nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( !pLocFrag->bIsWalkingNotUniversalPrimerRead() ) continue; if ( pLocFrag->bWholeReadIsUnaligned_ ) continue; // no information from this read int nIndex = aContigs_.index( pLocFrag->pGetContig() ); if ( nIndex == RW_NPOS ) { // This is the case in which we have a walking read // in a contig without a universal primer read of // the same template. If the read is close enough to // the end of the contig, then I will use the template // in this contig. // I am going to make the following // assumption--that the template extends as far into the // contig as the aligned portion of the read. And it extends // into the gap. walkingReadByItselfInContig( pLocFrag ); continue; } int nFlags = aFlags_[nIndex ]; if ( pLocFrag->bComp( ) ) { if ( nFlags & LEFT_CONS_POS_SET ) continue; } else { if ( nFlags & RIGHT_CONS_POS_SET ) continue; } // If reached here, one end of the template has not been set. // so the walking read may have some additional information // If the walking read walks into vector, then we know the other // vector insert junction. We know it precisely if the walk // gives good sequence for a while ( by default, more than 50 bases) // and then runs into xxxx's. On the other hand, if we run into // xxx's within the first 50 bases, then it is possible that the // walk int nLastBaseBeforeVector; bool bFoundXsAfterAligned; pLocFrag->findVectorInsertJunctionInWalkingRead( bFoundXsAfterAligned, nLastBaseBeforeVector ); if ( bFoundXsAfterAligned ) { if ( pLocFrag->bComp() ) { aFlags_[ nIndex ] = aFlags_[ nIndex ] | LEFT_CONS_POS_SET; aUnpaddedLeft_[ nIndex ] = pLocFrag->pContig_->nUnpaddedIndex( nLastBaseBeforeVector ); aFlags_[ nIndex ] |= LEFT_CONS_POS_EXACT; } else { aFlags_[ nIndex ] |= RIGHT_CONS_POS_SET; aUnpaddedRight_[ nIndex ] = pLocFrag->pContig_->nUnpaddedIndex( nLastBaseBeforeVector ); aFlags_[ nIndex ] |= RIGHT_CONS_POS_EXACT; } } else { // we already checked that the read has an aligned part if ( pLocFrag->bComp() ) { aFlags_[ nIndex ] = aFlags_[ nIndex ] | LEFT_CONS_POS_SET; aUnpaddedLeft_[ nIndex ] = pLocFrag->nGetAlignClipStartUnpadded(); aFlags_[ nIndex ] |= LEFT_CONS_POS_EQUAL_OR_FURTHER_LEFT; } else { aFlags_[ nIndex ] = aFlags_[ nIndex ] | RIGHT_CONS_POS_SET; aUnpaddedRight_[ nIndex ] = pLocFrag->nGetAlignClipEndUnpadded(); aFlags_[ nIndex ] |= RIGHT_CONS_POS_EQUAL_OR_FURTHER_RIGHT; } } } assert( aContigs_.entries() == aUnpaddedLeft_.entries() ); assert( aContigs_.entries() == aUnpaddedRight_.entries() ); // If the template endpoints are still not determined, defer setting // them until after the mean insert size for the library are // calculated. if ( pCP->bAutoFinishCheckThatReadsFromTheSameTemplateAreConsistent_ ) bReadsAreInconsistent_ = !bReadsAreConsistent(); else bReadsAreInconsistent_ = false; // check unaligned high quality and high quality discrepancies // Pat and I decided to only apply these checks to universal primer // reads at this point since walking reads are put into problem areas // and thus the walking read may be correct but the reads around it // may have deletions and thus the walking read may have high quality // discrepancies. In such a case you would still want to use the primer. for( nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWalkingNotUniversalPrimerRead() ) continue; if ( pLocFrag->bUnalignedHighQualityRegionTooLong( true ) ) { bUnalignedHighQualityRegionTooLong_ = true; // for time being, I'm not going to find any other reasons // to not use this template break; } if ( pLocFrag->bIsChimeric() ) { bIsChimeric_ = true; break; } if ( pLocFrag->bHasSeriousHighQualityDiscrepancies() ) { bHasSeriousHighQualityDiscrepancies_ = true; // for time being, I'm not going to find any other reasons // to not use this template break; } } bHasForwardAndReversePair_ = bTemplateHasForwardReversePair(); bool bRejectTemplateBasedOnHQD = false; if ( bHasSeriousHighQualityDiscrepancies_ ) { bRejectTemplateBasedOnHQD = true; if ( pCP->bAutoFinishAllowHighQualityDiscrepanciesInTemplateIfConsistentForwardReversePair_ && bHasForwardAndReversePair_ ) { bRejectTemplateBasedOnHQD = false; if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) { fprintf( pAO, "template: %s has serious high quality discrepancies but might still use it because it has a consistent fwd/rev pair\n", (char*) soTemplateName_.data() ); } } } if ( bReadsAreInconsistent_ || bUnalignedHighQualityRegionTooLong_ || bRejectTemplateBasedOnHQD || bIsChimeric_ ) bOKToUseThisTemplate_ = false; if ( !bOKToUseThisTemplate_ ) { RWCString soReason = ""; if ( bShortInsert_ ) soReason += "short insert "; if ( bReadsAreInconsistent_ ) soReason += "reads are inconsistent "; if ( bUnalignedHighQualityRegionTooLong_ ) soReason += "unaligned high quality region "; if ( bHasSeriousHighQualityDiscrepancies_ ) soReason += "high quality discrepancies "; if ( bIsChimeric_ ) soReason += "is chimeric "; if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template: %s because %s\n", (char*) soTemplateName_.data(), (char*) soReason.data() ); // I think that if the template is not ok, then should not go ahead and // insert it into any contig since we don't want to use it for anything. return; } } void subcloneTTemplate :: processingAfterAddedAllTemplatesInThisLibrary( ) { // now if the template endpoints are still not determined, // use the avg insert size // I am using a little less than the average insert size for // both walking reads and reverses since in both cases the // read will be more likely to succeed: // xx (problem region) // ------------------------ // ---> existing read // ^ guess of template insert size // <-- <-- // case1 case2 // // case 1 is when the insert is actually shorter. In that case, // the reverse will not work. In case2, the reverse will work, // but will just not fix the problem as well since the read will // be in higher base position where it covers the problem region. // Thus it is better to underestimate the size of the template // (it is better for the actual insert size to greater than your // guess.) // Similarly, walks are less likely to walk into vector if you // underestimate the insert size. int nContig; for( nContig = 0; nContig < aContigs_.entries(); ++nContig ) { int nFlag = aFlags_[ nContig ]; if ( !( nFlag & LEFT_CONS_POS_SET ) ) { if ( nFlag & RIGHT_CONS_POS_SET ) { aFlags_[ nContig ] |= LEFT_CONS_POS_SET; aUnpaddedLeft_[ nContig ] = aUnpaddedRight_[ nContig ] - pLibrary_->nALittleLessThanAverageInsertSizeToUse_ + 1; } } if ( !( nFlag & RIGHT_CONS_POS_SET ) ) { if ( nFlag & LEFT_CONS_POS_SET ) { aFlags_[ nContig ] |= RIGHT_CONS_POS_SET; aUnpaddedRight_[ nContig ] = aUnpaddedLeft_[ nContig ] + pLibrary_->nALittleLessThanAverageInsertSizeToUse_ - 1; } } } // Now check if the insert is too short. for( nContig = 0; nContig < aContigs_.entries(); ++nContig ) { assert( ( aFlags_[ nContig ] & LEFT_CONS_POS_SET ) && ( aFlags_[ nContig ] & RIGHT_CONS_POS_SET ) ); if ( ( aFlags_[ nContig ] & LEFT_CONS_POS_SET ) && ( aFlags_[ nContig ] & RIGHT_CONS_POS_SET ) ) { if ( ABS( aUnpaddedRight_[ nContig ] - aUnpaddedLeft_[ nContig ] ) < consedParameters::pGetConsedParameters()->nAutoFinishPotentialHighQualityPartOfReadEnd_ ) { bShortInsert_ = true; // one indication of a short insert is enough--no need to continue break; } } } if ( bShortInsert_ ) { bOKToUseThisTemplate_ = false; if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template: %s because short insert\n", (char*) soTemplateName_.data() ); } // if the template is not ok, do not add it to any contig. One reason // it might not be ok is if it is in the bad templates or bad libraries // files. Thus this list Contig::aSubcloneTemplates_ is not a good // list to use for Assembly View fwd/rev pair depth or showing fwd/rev pairs // which doesn't care about bad templates or bad libraries if ( !bOKToUseThisTemplate_ ) return; for( nContig = 0; nContig < aContigs_.entries(); ++nContig ) { subcloneTTemplate* pSub = NULL; if ( nContig != 0 ) { pSub = new subcloneTTemplate(*this); } else { pSub = this; } pSub->pContig_ = aContigs_[ nContig ]; pSub->nUnpaddedStart_ = aUnpaddedLeft_[ nContig ]; pSub->nUnpaddedEnd_ = aUnpaddedRight_[ nContig ]; if ( pSub->aFlags_[ nContig ] & LEFT_CONS_POS_EXACT ) pSub->bUnpaddedStartKnownPrecisely_ = true; else pSub->bUnpaddedStartKnownPrecisely_ = false; if ( pSub->aFlags_[ nContig ] & RIGHT_CONS_POS_EXACT ) pSub->bUnpaddedEndKnownPrecisely_ = true; else pSub->bUnpaddedEndKnownPrecisely_ = false; pSub->figureOutWhichStrandsTheTemplateHas( pSub->pContig_ ); pSub->pContig_->aSubcloneTemplates_.insert( pSub ); } } bool subcloneTTemplate :: bReadsAreConsistent() { // currently this does not check for how many different contigs these // reads are in if ( aReads_.entries() > 10 ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO,"warning: you must have misnamed reads since more than 10 seem to be from template %s--autofinish template calling is unpredictable\n", (char*) soTemplateName_.data() ); return( false ); } for( int nRead1 = 0; nRead1 < aReads_.entries() - 1; ++nRead1 ) { LocatedFragment* pLocFrag1 = aReads_[ nRead1 ]; if ( pLocFrag1->bIsWholeReadUnaligned() ) continue; for( int nRead2 = nRead1 + 1; nRead2 < aReads_.entries(); ++nRead2 ) { LocatedFragment* pLocFrag2 = aReads_[ nRead2 ]; if ( pLocFrag2->bIsWholeReadUnaligned() ) continue; // are these both universal primer reads? If they are both forwards // or both reverses, they must both be in the same contig, both // on the same strand, and must both be roughly the same location if ( !pLocFrag1->bIsWalkingNotUniversalPrimerRead() && !pLocFrag2->bIsWalkingNotUniversalPrimerRead() ) { if ( pLocFrag1->bIsForwardNotReverseRead() == pLocFrag2->bIsForwardNotReverseRead() ) { if ( pLocFrag1->pGetContig() != pLocFrag2->pGetContig() ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because reads %s and %s are both universal primer reads from this template and both are %s reads but are in different contigs\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data(), ( pLocFrag1->bIsForwardNotReverseRead() ? "forward" : "reverse" ) ); return( false ); } // if reached here, we have a fwd/fwd or rev/rev pair both // in the same contig. // Check that they are on the same strand if ( pLocFrag1->bComp() != pLocFrag2->bComp() ) { // this is the case in which they have opposite orientation on the // same contig. This indicates that one of them is missassembled. // Since we don't know which one, we don't know where the template // is, so don't use these reads as templates. if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because reads %s and %s are both %s universal primer reads but are on opposite strands\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data(), ( pLocFrag1->bIsForwardNotReverseRead() ? "forward" : "reverse" ) ); return( false ); } // check that they are roughly the same location int nConsPos1 = pLocFrag1->nGetConsPosOfBeginningOfRead(); int nConsPos2 = pLocFrag2->nGetConsPosOfBeginningOfRead(); if ( ABS( nConsPos1 - nConsPos2 ) > consedParameters::pGetConsedParameters()->nPrimersToleranceForDifferentBeginningLocationOfUniversalPrimerReads_ ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because reads %s and %s are both %s universal primer reads but start %d bases apart (%d is the maximum tolerance)\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data(), ( pLocFrag1->bIsForwardNotReverseRead() ? "forward" : "reverse" ), ABS( nConsPos1 - nConsPos2 ), consedParameters::pGetConsedParameters()->nPrimersToleranceForDifferentBeginningLocationOfUniversalPrimerReads_ ); return( false ); } } else { // we have a forward/reverse read pair if ( pLocFrag1->pGetContig() == pLocFrag2->pGetContig() ) { // the fwd/rev pair is in the same contig // check orientations--they must be pointing towards each // other if ( pLocFrag1->bComp() == pLocFrag2->bComp() ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because fwd/rev pair reads %s and %s should be on opposite strands but are not\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data() ); return( false ); } LocatedFragment* pLocFragTopStrand; LocatedFragment* pLocFragBottomStrand; if ( pLocFrag1->bComp() ) { pLocFragBottomStrand = pLocFrag1; pLocFragTopStrand = pLocFrag2; } else { pLocFragBottomStrand = pLocFrag2; pLocFragTopStrand = pLocFrag1; } int nUnpaddedConsPosTopStrand = pLocFragTopStrand->nGetAlignStartUnpadded(); int nUnpaddedConsPosBottomStrand = pLocFragBottomStrand->nGetAlignEndUnpadded(); if ( nUnpaddedConsPosTopStrand > nUnpaddedConsPosBottomStrand ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because fwd/rev pair reads %s and %s are pointing away from each other\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data() ); return( false ); } // At this point, we haven't yet calculated the // maximum insert size from the forward/reverse pairs // and I've found that the maximum insert size from // the librariesInfo.txt file is not reliable (since // people put in the wrong number). Thus let's not // eliminate any templates at this point due to // the forward/reverse pair being too far from each other. // (DG, 6/02) } // if ( pLocFrag1->pGetContig() ==... else } // if ( pLocFrag1->bIsForwardNotReverseRead() == ... else } // if ( !pLocFrag1->bIsWalkingNotUniversalPrimerRead() ... else { // case in which one of the reads is a walking read } // if ( !pLocFrag1->bIsWalkingNotUniversalPrimerRead() ... } // for( int nRead2 ... } // for( int nRead1 ... // if passed all these tests, the reads are consistent return( true ); } // bReadsAreConsistent bool subcloneTTemplate :: bHasAConsistentFwdRevPair( LocatedFragment*& pFwdRead, LocatedFragment*& pRevRead, bool& bHasFwdRevPair, char& cProblem ) { pFwdRead = NULL; pRevRead = NULL; bHasFwdRevPair = true; // we are just interested in find *one* forward and *one* reverse // that are consistent with each other. If there are other forwards // or other reverses that are inconsistent, we don't care. RWTPtrOrderedVector aForwardReads( aReads_.entries() ); RWTPtrOrderedVector aReverseReads( aReads_.entries() ); for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; // the rationale is that we are read pairs in which one (or both) // reads are unaligned gives no information about whether the assembly // is correct or incorrect if ( pLocFrag->nReadType_ == nUniversalForward ) aForwardReads.insert( pLocFrag ); else if ( pLocFrag->nReadType_ == nUniversalReverse ) aReverseReads.insert( pLocFrag ); } if ( aForwardReads.length() == 0 || aReverseReads.length() == 0 ) { bHasFwdRevPair = false; return( false ); } // now consider all pairs of forwards and reverses for( int nForwardRead = 0; nForwardRead < aForwardReads.length(); ++nForwardRead ) { LocatedFragment* pForwardLocFrag = aForwardReads[ nForwardRead ]; for( int nReverseRead = 0; nReverseRead < aReverseReads.length(); ++nReverseRead ) { LocatedFragment* pReverseLocFrag = aReverseReads[ nReverseRead ]; char cProblem2; if ( bIsForwardReversePairConsistent( pForwardLocFrag, pReverseLocFrag, cProblem2 ) ) { pFwdRead = pForwardLocFrag; pRevRead = pReverseLocFrag; return( true ); } } } // if reached here, none were consistent, but there is a fwd/rev // pair which ones shall we pick? The highest quality. This may // take a long time--if so we can just pick a fwd/rev pair at // random. assert( bHasAlignedForwardReversePair( pFwdRead, pRevRead ) ); // find what the problem is with that highest quality fwd/rev pair assert( !bIsForwardReversePairConsistent( pFwdRead, pRevRead, cProblem ) ); return( false ); } bool subcloneTTemplate :: bIsForwardReversePairConsistent( LocatedFragment* pFwdLocFrag, LocatedFragment* pRevLocFrag, char& cProblem ) { if ( pFwdLocFrag->pGetContig() == pRevLocFrag->pGetContig() ) { // check the orientations if ( pFwdLocFrag->bComp() == pRevLocFrag->bComp() ) { // bad news: the reads are both pointing in the same direction if ( pFwdLocFrag->bComp() ) { // both are pointing left cProblem = fwdRevPair2::cBOTH_POINT_LEFT; return( false ); } else { // both are pointing right cProblem = fwdRevPair2::cBOTH_POINT_RIGHT; return( false ); } } // if ( pFwdLocFrag->bComp() == pRevLocFrag->bComp() ) { else { // in same contig and pointing in opposite directions LocatedFragment* pLocFragTopStrand; LocatedFragment* pLocFragBottomStrand; if ( pFwdLocFrag->bComp() ) { pLocFragBottomStrand = pFwdLocFrag; pLocFragTopStrand = pRevLocFrag; } else { pLocFragTopStrand = pFwdLocFrag; pLocFragBottomStrand = pRevLocFrag; } int nUnpaddedConsPosTopStrand = pLocFragTopStrand->nGetAlignStartUnpadded(); int nUnpaddedConsPosBottomStrand = pLocFragBottomStrand->nGetAlignEndUnpadded(); if ( nUnpaddedConsPosTopStrand > nUnpaddedConsPosBottomStrand ) { // they are like this: <- -> cProblem = fwdRevPair2::cPOINT_AWAY_FROM_EACH_OTHER; return( false ); } if ( bIsSameContigFwdRevPairTooFarApart( pFwdLocFrag, pRevLocFrag ) ) { cProblem = fwdRevPair2::cTOO_FAR_APART; return( false ); } // passed all checks cProblem = fwdRevPair2::cNO_PROBLEM; return( true ); } // if ( pFwdLocFrag->bComp() == pRevLocFrag->bComp() ) { else } // if ( pFwdLocFrag->pGetContig() == pRevLocFrag->pGetContig() ) { else { // in different contigs bool bDoNotKnow; bool bTooFarApart = bIsGapSpanningFwdRevPairTooFarApart( pFwdLocFrag, pRevLocFrag, bDoNotKnow ); if ( bDoNotKnow ) { cProblem = fwdRevPair2::cDO_NOT_KNOW_IF_TOO_FAR_APART; return( false ); } else { if ( bTooFarApart ) { cProblem = fwdRevPair2::cTOO_FAR_FROM_END_OF_CONTIG; return( false ); } else { cProblem = fwdRevPair2::cNO_PROBLEM; return( true ); } } } } void subcloneTTemplate :: tryToAddTemplateToPrimer( primerType* pPrimer ) { if ( !pCP->bPrimersChooseTemplatesByPositionInsteadOfQuality_ ) calculateErrorRateIfNecessary(); int nTemplateIndexInPrimerType = 0; bool bKeepTryingToAddTemplate = true; while( bKeepTryingToAddTemplate ) { // if the next spot is open, stick this template there if ( ! pPrimer->pSubcloneTemplate_[ nTemplateIndexInPrimerType] ) { pPrimer->pSubcloneTemplate_[ nTemplateIndexInPrimerType ] = this; bKeepTryingToAddTemplate = false; } else if ( bPreferSelfTemplate( pPrimer, pPrimer->pSubcloneTemplate_[ nTemplateIndexInPrimerType ] ) ) { // found a better template // Slide the others down the list and insert this one for( int n = nNUMBER_OF_TEMPLATES - 1; n > nTemplateIndexInPrimerType; --n ) { pPrimer->pSubcloneTemplate_[ n ] = pPrimer->pSubcloneTemplate_[ n - 1]; } pPrimer->pSubcloneTemplate_[ nTemplateIndexInPrimerType ] = this; bKeepTryingToAddTemplate = false; } // else if ( pPrimer->pSubcloneTemplate_[ nTem... else { ++nTemplateIndexInPrimerType; if ( nTemplateIndexInPrimerType >= nNUMBER_OF_TEMPLATES ) { // case in which this template is of worse // quality than all the templates already in // the primerType structure bKeepTryingToAddTemplate = false; } } } // while( bKeepTryingToAddTemplate ) { } bool subcloneTTemplate :: bPreferSelfTemplate( primerType* pPrimer, subcloneTTemplate* pOtherSub ) { if ( pCP->bPrimersChooseTemplatesByPositionInsteadOfQuality_ ) { if ( bHasForwardAndReversePair_ && !pOtherSub->bHasForwardAndReversePair_ ) return( true ); else if ( !bHasForwardAndReversePair_ && pOtherSub->bHasForwardAndReversePair_ ) return( false ); else { // Both templates have fwd/rev pairs, or neither has // a fwd/rev pair so choose the one that // extends furthest in the direction of the read if ( pPrimer->bTopStrandNotBottomStrand_ ) { if ( pOtherSub->nUnpaddedEnd_ < nUnpaddedEnd_ ) return( true ); else return( false ); } else { // bottom strand primer so prefer the template // that starts furthest left if ( pOtherSub->nUnpaddedStart_ < nUnpaddedStart_ ) return( false ); else return( true ); } } } else { if ( dErrorRate_ > pOtherSub->dErrorRate_ ) return( true ); else return( false ); } } void subcloneTTemplate :: calculateErrorRateIfNecessary() { if ( bErrorRateCalculated_ ) return; bErrorRateCalculated_ = true; dErrorRate_ = 1.0; for( int nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; int nConsPosStart; int nConsPosEnd; pLocFrag->dErrorRate_ = 1.0; if ( pLocFrag->bComp() ) { nConsPosStart = pLocFrag->nGetAlignEnd() - consedParameters::pGetConsedParameters()->nAutoFinishPotentialHighQualityPartOfReadEnd_; nConsPosEnd = pLocFrag->nGetAlignEnd() - consedParameters::pGetConsedParameters()->nAutoFinishPotentialHighQualityPartOfReadStart_; } else { nConsPosStart = pLocFrag->nGetAlignStart() + consedParameters::pGetConsedParameters()->nAutoFinishPotentialHighQualityPartOfReadStart_; nConsPosEnd = pLocFrag->nGetAlignStart() + consedParameters::pGetConsedParameters()->nAutoFinishPotentialHighQualityPartOfReadEnd_; } if ( nConsPosStart < pLocFrag->nGetAlignStart() ) { continue; } if ( pLocFrag->nGetAlignEnd() < nConsPosEnd ) { continue; } double dQuality; Ntide nt; double dError; double dTotalErrors = 0.0; int nUnpaddedBases = 0; for( int nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { nt = pLocFrag->ntGetFragFromConsPos( nConsPos ); if ( nt.bIsPad() ) continue; Quality ucQual = nt.qualGetQuality(); if ( (ucQual == 98 ) || ( ucQual == 99 ) ) continue; if ( ucQual <= 0 ) dError = 1.0; else dError = dErrorRateFromQuality[ ucQual ]; dTotalErrors += dError; ++nUnpaddedBases; } if ( nUnpaddedBases > 0 ) { pLocFrag->dErrorRate_ = dTotalErrors / (double) nUnpaddedBases; dErrorRate_ = MIN( dErrorRate_, pLocFrag->dErrorRate_ ); } } // for( int nRead ... } void subcloneTTemplate :: figureOutWhichStrandsTheTemplateHas( Contig* pContig ) { if ( pLibrary_ && !pLibrary_->bSingleNotDoubleStranded_ ) { bOKToUseForTopStrandReads_ = true; bOKToUseForBottomStrandReads_ = true; } else { for( int nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->pGetContig() == pContig ) { if ( pLocFrag->bComp() ) bOKToUseForBottomStrandReads_ = true; else bOKToUseForTopStrandReads_ = true; } } } } void subcloneTTemplate :: setRightUnpaddedExact( const int nUnpadded, const int nIndex ) { if ( aFlags_[ nIndex ] & RIGHT_CONS_POS_SET ) { if ( aFlags_[ nIndex ] & RIGHT_CONS_POS_EXACT ) { if ( nUnpadded < aUnpaddedRight_[ nIndex ] ) aUnpaddedRight_[ nIndex ] = nUnpadded; } else { // if what it was before was some range, // then use the new exact value aUnpaddedRight_[ nIndex ] = nUnpadded; aFlags_[ nIndex ] |= RIGHT_CONS_POS_EXACT; } } else { aFlags_[ nIndex ] |= RIGHT_CONS_POS_SET; aFlags_[ nIndex ] |= RIGHT_CONS_POS_EXACT; aUnpaddedRight_[ nIndex ] = nUnpadded; } } void subcloneTTemplate :: setLeftUnpaddedExact( const int nUnpadded, const int nIndex ) { if ( aFlags_[ nIndex ] & LEFT_CONS_POS_SET ) { if ( aFlags_[ nIndex ] & LEFT_CONS_POS_EXACT ) { if ( aUnpaddedLeft_[ nIndex ] < nUnpadded ) aUnpaddedLeft_[ nIndex ] = nUnpadded; } else { aUnpaddedLeft_ = nUnpadded; aFlags_[ nIndex ] |= LEFT_CONS_POS_EXACT; } } else { aFlags_[ nIndex ] |= LEFT_CONS_POS_SET; aFlags_[ nIndex ] |= LEFT_CONS_POS_EXACT; aUnpaddedLeft_[ nIndex ] = nUnpadded; } } void subcloneTTemplate :: inventoryUniversalPrimerReadsForGapFlanking( bool& bUniversalForwardExists, bool& bUniversalForwardExistsInThisContig, bool& bUniversalForwardIsInBottomStrandOfThisContig, bool& bUniversalReverseExists, bool& bUniversalReverseExistsInThisContig, bool& bUniversalReverseIsInBottomStrandOfThisContig, Contig* pContig ) { bUniversalForwardExists = false; bUniversalReverseExists = false; bUniversalForwardExistsInThisContig = false; bUniversalReverseExistsInThisContig = false; for( int nRead = 0; nRead < aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWalkingNotUniversalPrimerRead() ) continue; if ( pLocFrag->bIsForwardNotReverseRead() ) { bUniversalForwardExists = true; if ( pContig == pLocFrag->pGetContig() ) { bUniversalForwardExistsInThisContig = true; bUniversalForwardIsInBottomStrandOfThisContig = pLocFrag->bComp(); } } else { bUniversalReverseExists = true; if ( pContig == pLocFrag->pGetContig() ) { bUniversalReverseExistsInThisContig = true; bUniversalReverseIsInBottomStrandOfThisContig = pLocFrag->bComp(); } } } } void subcloneTTemplate :: walkingReadByItselfInContig( LocatedFragment* pLocFrag) { // Talk to Phil about this if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "Warning: template %s has walking read %s in %s all by itself\n", (char*) soTemplateName_.data(), (char*) pLocFrag->soGetName().data(), (char*) pLocFrag->pGetContig()->soGetName().data() ); } void subcloneTTemplate :: dumpTemplate() { // fprintf( pFILE, "template: %s from %d to %d (size: %d) in %s lib: %s %s %s %s %s %s %s %s %s\n", // (char*) soTemplateName_.data(), // nUnpaddedStart_, // nUnpaddedEnd_, // nUnpaddedEnd_ - nUnpaddedStart_ + 1, // (char*) pContig_->soGetName().data(), // ( pLibrary_ ? pLibrary_->soName_.data() : "no lib" ), // ( bOKToUseForTopStrandReads_ ? "top" : " " ), // ( bOKToUseForBottomStrandReads_ ? "bot" : " " ), // ( bOKToUseThisTemplate_ ? "ok" : "not ok" ), // ( bShortInsert_ ? "short insert" : "" ), // ( bReadsAreInconsistent_ ? "reads incons" : "" ), // ( bReadPairTooFarApart_ ? "pair of reads too far apart for insert" : "" ), // ( bUnalignedHighQualityRegionTooLong_ ? "unaligned h qual" : "" ), // ( bHasSeriousHighQualityDiscrepancies_ ? "hqd" : "" ) // ); fprintf( pFILE, "template: %s from %d to %d (size from fwd/rev pair: %d ) in lib: %s %s %s %s %s %s %s %s %s fwd/rev calc: %s\n", soTemplateName_.data(), nUnpaddedStart_, nUnpaddedEnd_, nInsertSizeFromForwardReversePair_, ( pLibrary_ ? pLibrary_->soName_.data() : "no lib" ), ( bOKToUseForTopStrandReads_ ? "top" : " " ), ( bOKToUseForBottomStrandReads_ ? "bot" : " " ), ( bOKToUseThisTemplate_ ? "ok" : "not ok" ), ( bShortInsert_ ? "short insert" : "" ), ( bReadsAreInconsistent_ ? "reads incons" : "" ), ( bReadPairTooFarApart_ ? "pair of reads too far apart for insert" : "" ), ( bUnalignedHighQualityRegionTooLong_ ? "unaligned h qual" : "" ), ( bHasSeriousHighQualityDiscrepancies_ ? "hqd" : "" ), ( bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ? "yes" : "no" ) ); } bool subcloneTTemplate :: bCalculateInsertSizeFromForwardReversePair() { // note that I am disregarding bBadTemplateFile_ and bBadLibraryFile_ // I want to include those in calculating the insert size. // I'm torn with bInconsistentGapSpanning_. If the user sets // an incorrect maximum insert size, then we will be eliminating // some good templates and thus the mean insert size will be incorrect. // DG, 4/01. No, because only fwd/rev pairs that are within the same // contig (not gap spanning) will be used for calculating the mean // insert size. if ( bShortInsert_ || bReadsAreInconsistent_ || bUnalignedHighQualityRegionTooLong_ || bHasSeriousHighQualityDiscrepancies_ || bIsChimeric_ || bInconsistentGapSpanning_ ) { bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ = false; return( false ); } LocatedFragment* pLocFragForward = NULL; LocatedFragment* pLocFragReverse = NULL; if ( !bHasForwardReversePairInSameContig( pLocFragForward, pLocFragReverse ) ) { bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ = false; return( false ); } if ( pLocFragForward->bWholeReadIsUnaligned_ || pLocFragReverse->bWholeReadIsUnaligned_ ) { bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ = false; return( false ); } bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ = true; Contig* pContig = pLocFragForward->pContig_; // find this contig in the list int nFoundContig = -1; for( int nContig = 0; nContig < aContigs_.entries(); ++nContig ) { if ( aContigs_[ nContig ] == pContig ) { nFoundContig = nContig; break; } } // make a big error message if ( nFoundContig < 0 ) { printf( "nFoundContig = %d, template name = %s\n", nFoundContig, soTemplateName_.data() ); printf( "forward = %s, reverse = %s\n", pLocFragForward->soGetName().data(), pLocFragReverse->soGetName().data() ); printf( "contigs = %s , reverse contig = %s %d\n", pLocFragForward->pContig_->soGetName().data(), pLocFragReverse->pContig_->soGetName().data(), aContigs_.length() ); for( int nContig = 0; nContig < aContigs_.entries(); ++nContig ) { printf( "contig %d: %s\n", nContig, (aContigs_[ nContig ])->soGetName().data() ); } assert( false ); } // nConsPosStart_ and nConsPosEnd_ haven't been set yet (they will be // set in processingAfterAddedAllTemplatesInThisLibrary ) // so we have to get them from aConsPosLeft_ and aConsPosRight_ nInsertSizeFromForwardReversePair_ = aUnpaddedRight_[ nFoundContig ] - aUnpaddedLeft_[ nFoundContig ] + 1; // already set to false (in ctor) bOKToCallFirstForwardFromThisTemplate_ // and bOKToCallFirstReverseFromThisTemplate_ return( true ); } bool subcloneTTemplate :: bHasAnotherReadInADifferentContig( LocatedFragment* pOriginalRead, LocatedFragment*& pAnotherRead ) { Contig* pOriginalContig = pOriginalRead->pGetContig(); // note that there may be other reads in this other contig, // or in still additional contigs. The bReadsInconsistent_ flag // will be set if the reads are from more than 2 contigs. There // could be a univ primer from contig 1 and a walk and universal // primer read from contig2. for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->pGetContig() != pOriginalContig ) { pAnotherRead = pLocFrag; return( true ); } } return( false ); } bool subcloneTTemplate :: bHasAnotherReadInADifferentContig2( LocatedFragment* pOriginalRead, RWTPtrOrderedVector* pArrayOfLocatedFragments ) { Contig* pOriginalContig = pOriginalRead->pGetContig(); // note that there may be other reads in this other contig, // or in still additional contigs. The bReadsInconsistent_ flag // will be set if the reads are from more than 2 contigs. There // could be a univ primer from contig 1 and a walk and universal // primer read from contig2. for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->pGetContig() != pOriginalContig ) { // check if there is another read already in the list with // the same contig bool bAlreadyInList = false; for( int nRead = 0; nRead < pArrayOfLocatedFragments->length(); ++nRead ) { if ( (*pArrayOfLocatedFragments)[ nRead ]->pGetContig() == pLocFrag->pGetContig() ) { bAlreadyInList = true; break; } } if ( !bAlreadyInList ) { pArrayOfLocatedFragments->insert( pLocFrag ); } } } if ( pArrayOfLocatedFragments->length() > 0 ) return( true ); else return( false ); } bool subcloneTTemplate :: bHasAnotherUnivPrimerReadInADifferentContig( LocatedFragment* pOriginalRead, LocatedFragment*& pAnotherRead ) { Contig* pOriginalContig = pOriginalRead->pGetContig(); for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->pGetContig() == pOriginalContig ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->bIsUniversalPrimerRead() ) { pAnotherRead = pLocFrag; return( true ); } } return( false ); } // note: this does *not* take into account reads that may be in // the singlets file. This is ok since we use the fwd/rev pair to // figure out the location of the insert, and singlet reverses // are useless for this purpose bool subcloneTTemplate :: bTemplateHasForwardReversePair() { if ( aReads_.length() == 1 ) return( false ); bool bFoundForward = false; bool bFoundReverse = false; for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->nReadType_ == nUniversalForward ) bFoundForward = true; else if ( pLocFrag->nReadType_ == nUniversalReverse ) bFoundReverse = true; } if ( bFoundForward && bFoundReverse ) return( true ); else return( false ); } bool subcloneTTemplate :: bHasAlignedForwardReversePair( LocatedFragment*& pBestForwardRead, LocatedFragment*& pBestReverseRead ) { // fixed bug 6/1/2000 reported by Rose James (bus error with Autofinish) calculateErrorRateIfNecessary(); pBestForwardRead = NULL; pBestReverseRead = NULL; double dErrorRateOfBestForwardRead = 1.0; double dErrorRateOfBestReverseRead = 1.0; // fixed bug 12/01: some reads are totally quality 98 (or all x's) // and thus their error rate is 1.0 and thus this routine will // say there is no fwd/rev pair when there is. So change the // below to <= so it will find such reads. for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; // added Jan 2002 since there is no situation in which were // are interested in a fwd/rev pair in which one of the reads // is unaligned if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->nReadType_ == nUniversalForward ) { if ( pLocFrag->dErrorRate_ <= dErrorRateOfBestForwardRead ) { dErrorRateOfBestForwardRead = pLocFrag->dErrorRate_; pBestForwardRead = pLocFrag; } } else if ( pLocFrag->nReadType_ == nUniversalReverse ) { if ( pLocFrag->dErrorRate_ <= dErrorRateOfBestReverseRead ) { dErrorRateOfBestReverseRead = pLocFrag->dErrorRate_; pBestReverseRead = pLocFrag; } } } if ( !pBestForwardRead || !pBestReverseRead ) return( false ); else return( true ); } bool subcloneTTemplate :: bHasForwardReversePairSortByContigName( LocatedFragment*& pLocFrag1, LocatedFragment*& pLocFrag2 ) { if ( !bHasAlignedForwardReversePair( pLocFrag1, pLocFrag2 ) ) return( false ); if ( pLocFrag1->pContig_->soGetName() > pLocFrag2->pContig_->soGetName() ) { LocatedFragment* pTemp = pLocFrag2; pLocFrag2 = pLocFrag1; pLocFrag1 = pTemp; } return( true ); } bool subcloneTTemplate :: bHasForwardReversePairInSameContig( LocatedFragment*& pBestForwardRead, LocatedFragment*& pBestReverseRead ) { // fixed bug 6/1/2000 reported by Rose James (bus error with Autofinish) if ( !bHasAlignedForwardReversePair( pBestForwardRead, pBestReverseRead ) ) { return( false ); } // Note: If the following is the case: // contig1: ----> <---- // poor fwd reverse // contig2: // ----> good fwd // In this case, this routine will return false. However, I think // that in such a pathological case, we probably don't want to use // this forward reverse pair for calculating the insert size--there is // probably a misassembly and poor fwd may not be in the correct position. // So I think it is ok to return false. if ( pBestForwardRead->pContig_ == pBestReverseRead->pContig_ ) return( true ); else return( false ); } bool subcloneTTemplate :: bOKToCallDeNovoUniversalPrimerRead( bool& bTopNotBottomStrand, int& nReadType, Contig* pContig ) { // this figures out if there is a missing forward // or a missing reverse. It must also figure out which strand of // the current contig the missing forward or reverse is on. // bool bHasForwardRead = false; bool bHasReverseRead = false; bool bForwardReadIsTopNotBottomStrand; bool bReverseReadIsTopNotBottomStrand; bool bHasForwardInSameContig = false; bool bHasReverseInSameContig = false; for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->nReadType_ == nUniversalForward ) { bHasForwardRead = true; if ( pLocFrag->pContig_ == pContig ) { bHasForwardInSameContig = true; bForwardReadIsTopNotBottomStrand = !pLocFrag->bComp(); } } else if ( pLocFrag->nReadType_ == nUniversalReverse ) { bHasReverseRead = true; if ( pLocFrag->pContig_ == pContig ) { bHasReverseInSameContig = true; bReverseReadIsTopNotBottomStrand = !pLocFrag->bComp(); } } } // check in singlets file if ( !bHasForwardRead ) { bHasForwardRead = ConsEd::pGetAssembly()->bIsThisUniversalPrimerReadInSinglets( nUniversalForward, soTemplateName_ ); } if ( !bHasReverseRead ) { bHasReverseRead = ConsEd::pGetAssembly()->bIsThisUniversalPrimerReadInSinglets( nUniversalReverse, soTemplateName_ ); } // We must check this, since we don't want to make a reverse in // the following case: // there is a forward in the current contig and a reverse // in some other contig if ( bHasForwardRead && bHasReverseRead ) return( false ); // For calling a de novo universal primer read, one of the existing // universal primer reads must be in the same contig. If there is just // a walk, but no universal primer read, that isn't good enough. if ( bHasForwardInSameContig && !bHasReverseRead ) { nReadType = nUniversalReverse; bTopNotBottomStrand = !bForwardReadIsTopNotBottomStrand; return( true ); } else if ( !bHasForwardRead && bHasReverseInSameContig ) { nReadType = nUniversalForward; bTopNotBottomStrand = !bReverseReadIsTopNotBottomStrand; return( true ); } else // cases left: // since we already checked for bHasForwardRead && bHasReverseRead, // it must be that !bHasForwardRead || !bHasReverseRead // If !bHasReverseRead, then in case 1 above, we already checked // for bHasForwardInSameContig, so the only other case is when // there is no forward at all, or else it is somewhere else (another // contig, or in the singlets.) In this case, we don't want to // call a de novo universal primer read in this contig. Similarly // if !bHasForwardRead return( false ); } // this answers the question: is the bTopNotBottomStrandOfContig strand // of contig pContig in the forward or reverse direction of this template? bool subcloneTTemplate :: bIsForwardNotReverseStrand( const bool bTopNotBottomStrandOfContig, Contig* pContig ) { for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->pContig_ != pContig ) continue; if ( pLocFrag->nReadType_ != nUniversalForward && pLocFrag->nReadType_ != nUniversalReverse ) continue; // if reached here, we have a universal primer read in the same contig bool bForwardNotReverseStrand = ( pLocFrag->nReadType_ == nUniversalForward ); if ( pLocFrag->bComp() ) bForwardNotReverseStrand = !bForwardNotReverseStrand; if ( !bTopNotBottomStrandOfContig ) bForwardNotReverseStrand = !bForwardNotReverseStrand; return( bForwardNotReverseStrand ); } // should never reach here, but put this in just so doesn't crash return( true ); } bool subcloneTTemplate :: bThereIsAlreadySuchAnAutofinishUniversalPrimerRead( const bool bForwardNotReverse, LocatedFragment*& pExistingLocFrag ) { int nReadType = ( bForwardNotReverse ? nUniversalForward : nUniversalReverse ); for( int nRead = 0; nRead < aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads_[ nRead ]; if ( pLocFrag->nReadType_ != nReadType ) continue; // if got here, found universal primer read of specified type // Let's see if it is an Autofinish read. for( int nWR = 0; nWR < pLocFrag->aWholeReadItems_.length(); ++nWR ) { wholeReadItem* pWR = pLocFrag->aWholeReadItems_[ nWR ]; if ( pWR->soType_ == "expid" ) { pExistingLocFrag = pLocFrag; return( true ); } } } return( false ); } void subcloneTTemplate :: flagSubcloneIfBadTemplateOrBadLibrary() { bBadTemplateFile_ = !ConsEd::pGetAssembly()->bIsThisTemplateOKNotBad( soTemplateName_ ); if ( !soLibrary_.isNull() ) bBadLibraryFile_ = !ConsEd::pGetAssembly()->bIsThisLibraryOKNotBad( soLibrary_ ); if ( bBadTemplateFile_ || bBadLibraryFile_ ) { bOKToUseThisTemplate_ = false; if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) { RWCString soMessage = "not using template: " + soTemplateName_ + " because "; if ( bBadTemplateFile_ ) soMessage += " in bad templates file"; if ( bBadLibraryFile_ ) soMessage += " in bad libraries file"; fprintf( pAO, "%s\n", soMessage.data() ); } } } void subcloneTTemplate :: markTemplateBadIfInsertSeemsTooBig() { for( int nRead1 = 0; nRead1 < aReads_.entries() - 1; ++nRead1 ) { LocatedFragment* pLocFrag1 = aReads_[ nRead1 ]; if ( pLocFrag1->bIsWholeReadUnaligned() ) continue; for( int nRead2 = nRead1 + 1; nRead2 < aReads_.entries(); ++nRead2 ) { LocatedFragment* pLocFrag2 = aReads_[ nRead2 ]; if ( pLocFrag2->bIsWholeReadUnaligned() ) continue; if ( pLocFrag1->bIsWalkingNotUniversalPrimerRead() || pLocFrag2->bIsWalkingNotUniversalPrimerRead() ) { // at least one of the reads is a walking read if ( pLocFrag1->pGetContig() == pLocFrag2->pGetContig() ) { // reads are in the same contig // check that they are not too far from each other int nUnpadded1 = pLocFrag1->nGetFirstBaseOfInsertUnpadded(); int nUnpadded2 = pLocFrag2->nGetFirstBaseOfInsertUnpadded(); int nApart = ABS( nUnpadded1 - nUnpadded2 ) + 1; // I don't understand why I handle the case in which pLibrary_ // is not set, because I think it always will be set, but // leave it for safety. 6/02 if ( ( pLibrary_ && ( nApart > pLibrary_->nMaxInsertSize_ )) || ( !pLibrary_ && ( nApart > pCP->nPrimersMaxInsertSizeOfASubclone_ ) ) ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) fprintf( pAO, "not using template %s because pair of reads %s and %s are %d away from each other while the maximum is %d\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), (char*) pLocFrag2->soGetName().data(), nApart, ( pLibrary_ ? pLibrary_->nMaxInsertSize_ : pCP->nPrimersMaxInsertSizeOfASubclone_ ) ); bReadPairTooFarApart_ = true; bOKToUseThisTemplate_ = false; return; } } else { // reads are in different contigs // check that both reads are near the ends of the contigs int nWhichEnd; if ( !pLocFrag1->bIsReadNearEndOfContig( nWhichEnd ) || !pLocFrag2->bIsReadNearEndOfContig( nWhichEnd ) ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) { // fix for purify RWCString soLocFrag1 = pLocFrag1->soGetName(); RWCString soLocFrag2 = pLocFrag2->soGetName(); fprintf( pAO, "not using template %s because reads %s and %s are in different contigs but read %s is not near the end of its contig\n", (char*) soTemplateName_.data(), soLocFrag1.data(), soLocFrag2.data(), ( !pLocFrag1->bIsReadNearEndOfContig( nWhichEnd ) ? soLocFrag1.data() : soLocFrag2.data() ) ); } bReadPairTooFarApart_ = true; bOKToUseThisTemplate_ = false; return; } } // if ( pLocFrag1->pGetContig() == ... else } else { // universal primer pair // we are only interested in forward/reverse pairs (not // forward/forward or reverse/reverse pairs ) if ( pLocFrag1->bIsForwardNotReverseRead() == pLocFrag2->bIsForwardNotReverseRead() ) continue; // if reached here, we have a forward/reverse pair if ( pLocFrag1->pGetContig() == pLocFrag2->pGetContig() ) { // we have already checked the orientations // check how far they are away from each other if ( bIsSameContigFwdRevPairTooFarApart( pLocFrag1, pLocFrag2 ) ) { // insert size is too big! bReadPairTooFarApart_ = true; bOKToUseThisTemplate_ = false; // no need to check any others--no matter what // they turn up, we still won't use this template return; } } // if ( pLocFrag1->pGetContig() == pLocFrag2->pGetContig() ) { else { // different contigs bool bDoNotKnow; bool bTooFarApart = bIsGapSpanningFwdRevPairTooFarApart( pLocFrag1, pLocFrag2, bDoNotKnow ); if ( bDoNotKnow ) continue; // can't say whether consistent or inconsistent if ( bTooFarApart ) { bReadPairTooFarApart_ = true; bOKToUseThisTemplate_ = false; return; } } } } // for( int nRead2... } // for( int nRead1 // if made it here, there were no forward reverse pairs too far apart // since bReadPairTooFarApart_ was originally set to false, // just leave it that way } bool subcloneTTemplate :: bIsSameContigFwdRevPairTooFarApart( LocatedFragment* pLocFrag1, LocatedFragment* pLocFrag2 ) { int nVectorInsertJunction1ConsPos; int nVectorInsertJunction2ConsPos; // bFoundVectorInsertJunctionForUniversalPrimerRead will return // true unless read is unaligned. Check that the reads are aligned // before calling this. assert( pLocFrag1->bFoundVectorInsertJunctionForUniversalPrimerRead( nVectorInsertJunction1ConsPos ) ); assert( pLocFrag2->bFoundVectorInsertJunctionForUniversalPrimerRead( nVectorInsertJunction2ConsPos ) ); int nVectorInsertJunction1Unpadded = pLocFrag1->pContig_->nUnpaddedIndex( nVectorInsertJunction1ConsPos ); int nVectorInsertJunction2Unpadded = pLocFrag2->pContig_->nUnpaddedIndex( nVectorInsertJunction2ConsPos ); assert( pLibrary_ ); int nInsertSize = ABS( nVectorInsertJunction1Unpadded - nVectorInsertJunction2Unpadded ) + 1; if ( nInsertSize > pLibrary_->nMaxInsertSize_ ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) { fprintf( pAO, "not using template %s because fwd/rev pair reads %s and %s are %d apart while library %s says maximum is %d\n", soTemplateName_.data(), pLocFrag1->soGetName().data(), pLocFrag2->soGetName().data(), nInsertSize, pLibrary_->soName_.data(), pLibrary_->nMaxInsertSize_ ); } return( true ); } else return( false ); } bool subcloneTTemplate :: bIsGapSpanningFwdRevPairTooFarApart( LocatedFragment* pLocFrag1, LocatedFragment* pLocFrag2, bool& bDoNotKnow ) { bool bThereIsAHighQualitySegment; int nDistance1 = pLocFrag1->nHowFarIsReadFromEndOfHighQualitySegment( bThereIsAHighQualitySegment ); if ( !bThereIsAHighQualitySegment ) { bDoNotKnow = true; return( false ); } int nDistance2 = pLocFrag2->nHowFarIsReadFromEndOfHighQualitySegment( bThereIsAHighQualitySegment ); if ( !bThereIsAHighQualitySegment ) { bDoNotKnow = true; return( false ); } bDoNotKnow = false; // insert is likely larger than this, but this puts // a minimum distance, providing there are no misassemblies int nTotalSizeOfInsert = nDistance1 + nDistance2; if ( ( pLibrary_ && ( nTotalSizeOfInsert > pLibrary_->nMaxInsertSize_ ) ) || ( !pLibrary_ && ( nTotalSizeOfInsert > pCP->nPrimersMaxInsertSizeOfASubclone_ ) ) ) { if ( pCP->bPrimersPrintInfoOnRejectedTemplates_ && ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) ) { fprintf( pAO, "not using template %s because pair of reads %s (%d from end) and %s (%d from end) are at least %d away from each other while the maximum is %d\n", (char*) soTemplateName_.data(), (char*) pLocFrag1->soGetName().data(), nDistance1, (char*) pLocFrag2->soGetName().data(), nDistance2, nTotalSizeOfInsert, ( pLibrary_ ? pLibrary_->nMaxInsertSize_ : pCP->nPrimersMaxInsertSizeOfASubclone_ ) ); } return( true ); } else return( false ); } void subcloneTTemplate :: determineIfOKToUseTemplate() { bOKToUseThisTemplate_ = true; if ( bBadTemplateFile_ || bBadLibraryFile_ || bShortInsert_ || bReadsAreInconsistent_ || bReadPairTooFarApart_ || bUnalignedHighQualityRegionTooLong_ || ( bHasSeriousHighQualityDiscrepancies_ && ( !pCP->bAutoFinishAllowHighQualityDiscrepanciesInTemplateIfConsistentForwardReversePair_ || !bHasForwardAndReversePair_ ) ) || bIsChimeric_ || bInconsistentGapSpanning_ ) bOKToUseThisTemplate_ = false; }