/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "evalExpCluster.h" #include "singletInfo.h" #include "contig.h" #include "autoFinishExpTag.h" #include "abs.h" #include "rwcregexp.h" #include "rwctokenizer.h" #include "consed.h" #include "consedParameters.h" #include "fileDefines.h" void evalExpCluster :: addFoundReadInAssembly( const int nExpID, LocatedFragment* pLocFrag ) { aLocFrag_.insert( pLocFrag ); aExpID_.insert( nExpID ); aFlags_.insert( nInAssembly ); aSingletFromAutoFinish_.insert( NULL ); } void evalExpCluster :: addFoundReadInSinglets( const int nExpID, singletInfo* pSingletFromAutoFinish ) { aLocFrag_.insert( NULL ); aExpID_.insert( nExpID ); aFlags_.insert( nInSinglets ); aSingletFromAutoFinish_.insert( pSingletFromAutoFinish ); } void evalExpCluster :: addNotFoundRead( const int nExpID ) { aLocFrag_.insert( NULL ); aExpID_.insert( nExpID ); aFlags_.insert( nNotFound ); aSingletFromAutoFinish_.insert( NULL ); } void evalExpCluster :: evaluate() { pAutoFinishExpTag_->evaluateExperimentCluster(); // now let's look at the individual reads int nNumberNotFound = 0; int nNumberInSinglets = 0; int nNumberInAssembly = 0; int nExp; for( nExp = 0; nExp < aExpID_.length(); ++nExp ) { if ( aFlags_[ nExp ] == nNotFound ) ++nNumberNotFound; else if ( aFlags_[ nExp ] == nInSinglets ) ++nNumberInSinglets; else if ( aFlags_[ nExp ] == nInAssembly ) ++nNumberInAssembly; } // printf( "Reads that went into assembly: %d\n", nNumberInAssembly ); // printf( "Reads that became singlets: %d\n", nNumberInSinglets ); // printf( "Reads that were not found: %d\n", nNumberNotFound ); for( nExp = 0; nExp < aExpID_.length(); ++nExp ) { assert( aExpID_[ nExp ] == pAutoFinishExpTag_->aExpID_[ nExp ] ); fprintf( pAO, "expid: %d type: %s template: %s\n", aExpID_[ nExp ], szGetReadType2( pAutoFinishExpTag_->nReadType_ ), (char*) pAutoFinishExpTag_->aTemplates_[ nExp ].data() ); if ( aFlags_[ nExp ] == nNotFound ) { fprintf( pAO, "this experiment was not found!\n" ); } else if ( aFlags_[ nExp ] == nInSinglets ) { singletInfo* pSinglet = aSingletFromAutoFinish_[ nExp ]; fprintf( pAO, "phd file: %s found in singlets file. Bases: %d High quality bases: %d start: %d end: %d\n", (char*) pSinglet->filPHD_.data(), pSinglet->nNumberOfBases_, ( (pSinglet->nHighQualitySegmentStart_ == -1 ) ? 0 : pSinglet->nHighQualitySegmentEnd_ - pSinglet->nHighQualitySegmentStart_ + 1 ), pSinglet->nHighQualitySegmentStart_, pSinglet->nHighQualitySegmentEnd_ ); } else if ( aFlags_[ nExp ] == nInAssembly ) { // now I want to ask the question of whether this read went in the location // it was supposed to LocatedFragment* pLocFrag = aLocFrag_[ nExp ]; fprintf( pAO, "read name: %s found in assembly\n", (char*) pLocFrag->soGetName().data() ); // is this read the type of read it was supposed to be? if ( pLocFrag->nReadType_ != pAutoFinishExpTag_->nReadType_ ) { fprintf( pAO, "Error: autofinish read %s expid %d was supposed to be read type %s but was %s. Possible mislabelling of expids in phd file\n", (const char*) pLocFrag->soGetName(), aExpID_[ nExp ], szGetReadType2( pAutoFinishExpTag_->nReadType_ ), szGetReadType2( pLocFrag->nReadType_ ) ); } if ( pAutoFinishExpTag_->aTemplates_[ nExp ] != "whole clone" ) { if ( pLocFrag->soGetTemplateName() != pAutoFinishExpTag_->aTemplates_[ nExp ] ) { fprintf( pAO, "Error: autofinish read %s expid %d was supposed to be with template %s but was %s. Possible mislabelling of expids in phd file\n", (const char*) pLocFrag->soGetName(), aExpID_[ nExp ], (const char*) pAutoFinishExpTag_->aTemplates_[ nExp ], (const char*) pLocFrag->soGetTemplateName() ); } } if ( pAutoFinishExpTag_->nPurpose_ == nFlankGap ) { // nExp should be 0 but let's be safe in case // in future have multiple flank gap experiments per // experiment cluster evaluateFlankGapExperiment( nExp ); } else { if ( pLocFrag->pContig_ != pAutoFinishExpTag_->pContig_ ) { fprintf( pAO, "exp was supposed to go into contig %s but instead was assembled into contig %s\n", (char*) pAutoFinishExpTag_->pContig_->soGetName().data(), (char*) pLocFrag->pContig_->soGetName().data() ); } else { // so at least the read went into the correct contig bool bShouldHaveBeenTopStrand; if ( !pAutoFinishExpTag_->bComplementedFromWayPhrapCreatedContig_ && !pAutoFinishExpTag_->pContig_->bIsComplementedFromWayPhrapCreated() || pAutoFinishExpTag_->bComplementedFromWayPhrapCreatedContig_ && pAutoFinishExpTag_->pContig_->bIsComplementedFromWayPhrapCreated() ) bShouldHaveBeenTopStrand = true; else bShouldHaveBeenTopStrand = false; bool bIsStrandCorrect; if ( (!bShouldHaveBeenTopStrand && pLocFrag->bComp()) || ( !bShouldHaveBeenTopStrand && !pLocFrag->bComp() ) ) bIsStrandCorrect = true; else bIsStrandCorrect = false; if ( bIsStrandCorrect ) fprintf( pAO, "correct strand: %s\n", ( pLocFrag->bComp() ? "bottom" : "top" ) ); else fprintf( pAO, "incorrect strand. Should have been %s but is %s\n", ( bShouldHaveBeenTopStrand ? "top" : "bottom" ), ( pLocFrag->bComp() ? "bottom" : "top" ) ); if ( ABS( pAutoFinishExpTag_->nPaddedConsPosStart_ - pLocFrag->nGetAlignStart() ) < 50 ) { fprintf( pAO, "read went %d close to where it was supposed to go %d.\n", pAutoFinishExpTag_->pContig_->nUnpaddedIndex( pAutoFinishExpTag_->nPaddedConsPosStart_ ), pLocFrag->nGetAlignStartUnpadded() ); } else { fprintf( pAO, "read did not go %d where it was supposed to go %d\n", pAutoFinishExpTag_->pContig_->nUnpaddedIndex( pAutoFinishExpTag_->nPaddedConsPosStart_ ), pLocFrag->nGetAlignStartUnpadded() ); } } // if ( pLocFrag->pContig_ != pAutoFinishExpTag_->pContig_ ) else } // if ( pAutoFinishExpTag_ == nFlankGap ) { } // else if ( aFlags_[ nExp ] == nInAssembly ) { else assert( false ); } // for( int nExp = 0; nExp < aExpID_.length(); ++nExp ) { } void evalExpCluster :: evaluateFlankGapExperiment( const int nExp ) { LocatedFragment* pExistingUniversalPrimerRead = NULL; RWTPtrOrderedVector aExistingReads( (size_t) 4 ); RWCRegexp regExistingReads( "existing reads: [^\n]*\n" ); RWCString soExistingReadsWithTitle = pAutoFinishExpTag_->soMiscData_( regExistingReads ); if ( !soExistingReadsWithTitle.isNull() ) { RWCTokenizer tok( soExistingReadsWithTitle ); assert( tok() == "existing" ); assert( tok() == "reads:" ); RWCString soExistingRead; while( !( soExistingRead = tok() ).isNull() ) { LocatedFragment* pLocFrag = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( soExistingRead ); if ( !pLocFrag ) { // The read is no longer present in the assembly. Has // someone has removed this read? fprintf( pAO, "warning: the read %s which was present when autofinish ran before is no longer present. Thus Autofinish cannot evaluate how well related reads did\n", soExistingRead.data() ); // I originally tried to just continue here, but // then pExistingUniversalPrimerRead isn't set, which also // gives a segmentation fault. Thus just don't evaluate this // flank gap experiment. DG, 11/01 return; } aExistingReads.insert( pLocFrag ); // all this below is to set pExistingUniversalPrimerRead and if ( !pLocFrag->bIsWalkingNotUniversalPrimerRead() ) { if ( !pExistingUniversalPrimerRead ) { pExistingUniversalPrimerRead = pLocFrag; bool bExistingReadComplementedByPhrap = pExistingUniversalPrimerRead->bComp(); if ( pExistingUniversalPrimerRead->pContig_->bComplementedFromWayPhrapCreated_ ) bExistingReadComplementedByPhrap = !bExistingReadComplementedByPhrap; bool bExistingReadShouldBeComplementedAsIndicatedByAutoFinishTag = !pAutoFinishExpTag_->bComplementedFromWayPhrapCreatedContig_; if ( bExistingReadComplementedByPhrap != bExistingReadShouldBeComplementedAsIndicatedByAutoFinishTag ) { fprintf( pAO, "warning: autofinish tag indicates that existing read %s should be on %s strand but read is on %s strand as created by phrap\n", (const char*) pExistingUniversalPrimerRead->soGetName(), ( bExistingReadShouldBeComplementedAsIndicatedByAutoFinishTag ? "bottom" : "top" ), ( bExistingReadShouldBeComplementedAsIndicatedByAutoFinishTag ? "bottom" : "top" ) ); } } else { // quick check that the 2 existing universal primer reads agree if ( pExistingUniversalPrimerRead->pContig_ != pLocFrag->pContig_ ) fprintf( pAO, "warning--universal primer reads %s and %s are in different contigs\n", (const char*) pLocFrag->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); else { if ( pExistingUniversalPrimerRead->bComp() != pLocFrag->bComp() ) { fprintf( pAO, "warning--universal primer reads %s and %s are in opposite strands\n", (const char*) pLocFrag->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); } else if ( ABS( pExistingUniversalPrimerRead->nGetAlignStartUnpadded() - pLocFrag->nGetAlignStartUnpadded() ) > consedParameters::pGetConsedParameters()->nPrimersToleranceForDifferentBeginningLocationOfUniversalPrimerReads_ ) { fprintf( pAO, "warning--universal primer read %s starts at %d while universal primer read %s starts at %d--too much discrepancy: %d\n", (const char*) pLocFrag->soGetName(), pLocFrag->nGetAlignStartUnpadded(), (const char*) pExistingUniversalPrimerRead->soGetName(), pExistingUniversalPrimerRead->nGetAlignStartUnpadded(), ABS( pLocFrag->nGetAlignStartUnpadded() - pExistingUniversalPrimerRead->nGetAlignStartUnpadded() ) ); } else { // not important to discuss consistency of old universal // primer reads. We are interested (below) in how // the new read did. // printf( "the reads are in the same contig and are consistent\n" ); // the universal primer reads agree reasonably well } } // if ( pExistingUniversalPrimerRead->pContig_ != else... } // if ( !pExistingUniversalPrimerRead ) else ... } // if ( pLocFrag->bIsUniversalPrimerRead() ) { } } else { // error--missing existing reads: line // I had earlier fixed the cause of this, but it still persisted // in some old files so I fixed it here also. fprintf( pAO, "error: flank gap experiment with info %s did not have \"existing reads:\" line in the AutofinishExpTag", pAutoFinishExpTag_->soMiscData_.data() ); return; } LocatedFragment* pLocFragNew = aLocFrag_[ nExp ]; if ( pLocFragNew->bIsWalkingNotUniversalPrimerRead() ) { fprintf( pAO, "error: read %s expid %d was supposed to be a flank gap experiment so it should be a universal primer read but is a walk\n", (const char*) pLocFragNew->soGetName(), aExpID_[ nExp ] ); // no further consistency checks can be done if this experiment really // is a walk, can they? } else { if ( ( pLocFragNew->bIsForwardNotReverseRead() && pExistingUniversalPrimerRead->bIsForwardNotReverseRead() ) || ( !pLocFragNew->bIsForwardNotReverseRead() && !pExistingUniversalPrimerRead->bIsForwardNotReverseRead() ) ) { fprintf( pAO, "error: read %s expid %d was a %s read but existing read %s was also a %s read but they should have been opposites", (const char*) pLocFragNew->soGetName(), aExpID_[ nExp ], szGetReadType2( pLocFragNew->nReadType_ ), (const char*) pExistingUniversalPrimerRead->soGetName(), szGetReadType2( pExistingUniversalPrimerRead->nReadType_ ) ); } } if ( pLocFragNew->pContig_ == pExistingUniversalPrimerRead->pContig_ ) { // check that the orientations are consistent bool bReadsAreConsistent = false; if ( pLocFragNew->nGetAlignStart() < pExistingUniversalPrimerRead->nGetAlignStart() ) { // ---> <---- // pLocFragNew pExistingUniversalPrimerRead if ( !pLocFragNew->bComp() && pExistingUniversalPrimerRead->bComp() ) { fprintf( pAO, "Reads %s and %s are consistent: --> <--\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); bReadsAreConsistent = true; // great! } else if ( !pLocFragNew->bComp() && !pExistingUniversalPrimerRead->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like --> <--- but are like this ---> --->\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); else if ( pLocFragNew->bComp() && !pExistingUniversalPrimerRead->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like --> <--- but are like this <--- --->\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); else if ( pLocFragNew->bComp() && pExistingUniversalPrimerRead->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like ---> <--- but are like this <--- <---\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); else assert( false ); } else { // ---> <---- // pExistingUniversalPrimerRead pLocFragNew if ( !pExistingUniversalPrimerRead->bComp() && pLocFragNew->bComp() ) { fprintf( pAO, "Reads %s and %s are consistent: --> <--\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); bReadsAreConsistent = true; // great! } else if ( pExistingUniversalPrimerRead->bComp() && !pLocFragNew->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like ---> <--- but are like <--- --->\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); else if ( !pExistingUniversalPrimerRead->bComp() && !pLocFragNew->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like ---> <--- but are like ---> --->\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); else if ( pExistingUniversalPrimerRead->bComp() && pLocFragNew->bComp() ) fprintf( pAO, "Error: reads %s and %s should be like <--- <--- but are like ---> --->\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); } // According to Cindy, we should check that the new read has // extended the gap--I guess this can be accomplished by // checking the distance between the reads int nDistanceBetweenReads; // this only works if the reads are in the correct orientation if ( pExistingUniversalPrimerRead->bComp() ) { // ----> <---- // new existing nDistanceBetweenReads = ABS( pLocFragNew->nGetAlignStartUnpadded() - pExistingUniversalPrimerRead->nGetAlignEndUnpadded() ); } else { // ----> <---- // existing new nDistanceBetweenReads = ABS( pExistingUniversalPrimerRead->nGetAlignStartUnpadded() - pLocFragNew->nGetAlignEndUnpadded() ); } if ( nDistanceBetweenReads > consedParameters::pGetConsedParameters()->nPrimersMaxInsertSizeOfASubclone_ ) { fprintf( pAO, "Error: reads %s and %s are %d apart but the maximum size of a subclone (specified by you) is %d\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName(), nDistanceBetweenReads, consedParameters::pGetConsedParameters()->nPrimersMaxInsertSizeOfASubclone_ ); } else fprintf( pAO, "distance %d between reads within limits\n", nDistanceBetweenReads ); } // if ( pLocFragNew->pContig_ == ... else { // reads are in different contigs. This is better to flank the gap. int nWhichEndNewRead; int nWhichEndExistingRead; if ( pLocFragNew->bIsReadNearEndOfContigAndPointingOut( nWhichEndNewRead ) && pExistingUniversalPrimerRead->bIsReadNearEndOfContigAndPointingOut( nWhichEndExistingRead ) ) { if ( nWhichEndExistingRead == nRightGap ) { if ( nWhichEndNewRead == nLeftGap ) { fprintf( pAO, "( %s ->)(<- %s )\n", (const char*) pExistingUniversalPrimerRead->pContig_->soGetName(), (const char*) pLocFragNew->pContig_->soGetName() ); fprintf( pAO, "(%s->) (<-%s)\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); } else { fprintf( pAO, "( %s ->)(<- %s complemented )\n", (const char*) pExistingUniversalPrimerRead->pContig_->soGetName(), (const char*) pLocFragNew->pContig_->soGetName() ); fprintf( pAO, "( %s ->)(<- %s )\n", (const char*) pExistingUniversalPrimerRead->soGetName(), (const char*) pLocFragNew->soGetName() ); } } else { // so pExistingUniversalPrimerRead is pointing out the // left gap if ( nWhichEndNewRead == nLeftGap ) { fprintf( pAO, "( %s complemented ->)(<- %s )\n", (const char*) pLocFragNew->pContig_->soGetName(), (const char*) pExistingUniversalPrimerRead->pContig_->soGetName() ); fprintf( pAO, "( %s ->)(<- %s )\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); } else { fprintf( pAO, "( %s ->)(<- %s )\n", (const char*) pLocFragNew->pContig_->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); fprintf( pAO, "( %s ->)(<- %s )\n", (const char*) pLocFragNew->soGetName(), (const char*) pExistingUniversalPrimerRead->soGetName() ); } } } // if ( pLocFragNew->bIsReadNearEndOfContigAndPointingOut( ... else { if ( !pLocFragNew->bIsReadNearEndOfContigAndPointingOut( nWhichEndNewRead ) ) { fprintf( pAO, "Problem with new read: it is pointing from base %d to %d in a contig %d long\nso it is %d from the end of the contig which is too far for the possible\n insert size of this template. There must be a misassembly, \none of the reads must be misnamed, or the expid in this read is incorrect.\n", ( pLocFragNew->bComp() ? pLocFragNew->nGetAlignEndUnpadded() : pLocFragNew->nGetAlignStartUnpadded() ), ( pLocFragNew->bComp() ? pLocFragNew->nGetAlignStartUnpadded() : pLocFragNew->nGetAlignEndUnpadded() ), ( pLocFragNew->bComp() ? pLocFragNew->nGetAlignStartUnpadded() : pLocFragNew->pContig_->nGetUnpaddedEndIndex() - pLocFragNew->nGetAlignEndUnpadded() + 1 ) ); if ( pLocFragNew->bComp() ) fprintf( pAO, "( <- )\n" ); else fprintf( pAO, "( -> )\n" ); } if ( !pExistingUniversalPrimerRead->bIsReadNearEndOfContigAndPointingOut( nWhichEndExistingRead ) ) { fprintf( pAO, "Problem with existing read from this template: it \nis pointing from base %d to %d in a contig %d long \nso it is %d from the end of the contig which is too \nfar for the possible insert size of this template. \nThere must be a misassembly, one of the reads must \nbe misnamed, or the expid in the read must be incorrect.\n", ( pExistingUniversalPrimerRead->bComp() ? pExistingUniversalPrimerRead->nGetAlignEndUnpadded() : pExistingUniversalPrimerRead->nGetAlignStartUnpadded() ), ( pExistingUniversalPrimerRead->bComp() ? pExistingUniversalPrimerRead->nGetAlignStartUnpadded() : pExistingUniversalPrimerRead->nGetAlignEndUnpadded() ), pExistingUniversalPrimerRead->pContig_->nGetUnpaddedSeqLength(), ( pExistingUniversalPrimerRead->bComp() ? pExistingUniversalPrimerRead->nGetAlignStartUnpadded() : pExistingUniversalPrimerRead->pContig_->nGetUnpaddedEndIndex() - pExistingUniversalPrimerRead->nGetAlignEndUnpadded() + 1 ) ); } } // // if ( pLocFragNew->bIsReadNearEndOfContigAndPointingOut( ... else } // if ( pLocFragNew->pContig_ == ... else }