/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "contigwin.h" #include "locatedFragment.h" #include "bIntersect.h" #include "mbtValVector.h" #include "newContig.h" #include "bGuiGetAnswerYesNo.h" #include "consed.h" #include "bIntervalsIntersect.h" #include "popupInfoMessage.h" #include "guicontigwin.h" #include "please_wait.h" #include "popupErrorMessage.h" #include "readsSortedByReadName.h" void Contig :: putReadIntoItsOwnContig( LocatedFragment* pLocFragToRemove, ContigWin* pOldContigWin, const bool bOKToUseGui ) { int nOldConsPosDisplayed = ( pOldContigWin->nLeftEdgeConsensusPosition_ + pOldContigWin->nGetRightEdgeConsensusPosition() ) / 2; // careful--this contigwin will be closed by the code // in putReadsIntoTheirOwnContigs readsSortedByReadName aArrayOfReadsToRemove; aArrayOfReadsToRemove.insert( pLocFragToRemove ); RWTPtrOrderedVector aAdditionalReadsToRemove; putReadsIntoTheirOwnContigs( &aArrayOfReadsToRemove, bOKToUseGui, nOldConsPosDisplayed, &aAdditionalReadsToRemove ); } void Contig :: putReadsIntoTheirOwnContigs( readsSortedByReadName* pReadsToRemove, const bool bOKToUseGui, const int nOldConsPosDisplayed, RWTPtrOrderedVector* pAdditionalReadsPutIntoOwnContigs ) { // nOldConsPosDisplayed is only used if bOKToUseGui is true if ( pReadsToRemove->length() == 0 ) { popupErrorMessage( "you specified no reads to remove" ); return; } ConsEd::pGetConsEd()->whatToDoBeforeModifyAssembly(); // this will be used to pop up the contigwin later on. LocatedFragment* pSampleRemovedRead = (*pReadsToRemove)[0]; pReadsToRemove->resort(); // just for reporting results RWTPtrOrderedVector aOriginalReadsToRemove( (*pReadsToRemove ) ); aOriginalReadsToRemove.soName_ = "Contig::aOriginalReadsToRemove"; // all unaligned reads will also be removed. So put them // into pReadsToRemove RWTPtrOrderedVector aUnalignedReads; aUnalignedReads.soName_ = "Contig::aUnalignedReads"; // at the same time, count the number of aligned reads at // each location. We will use this later to decide whether // we have to recalculate consensus qualities. mbtValVector aNumberOfAlignedReadsInOriginalContig( nGetPaddedConsLength(), 1, 0 ); aNumberOfAlignedReadsInOriginalContig.soName_ = "Contig::aNumberOfAlignedReadsInOriginalContig"; int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) { if ( ! pReadsToRemove->pFindReadByName( pLocFrag->soGetName() ) ) { aUnalignedReads.insert( pLocFrag ); } } else { // in the past, there was a bug causing some aligned regions // to actually be off the consensus. This would cause // a subscript error here. To protect from that subscript // error, intersect with the consensus positions. int nOldAlignLeft = pLocFrag->nAlignClipStart_; int nOldAlignRight = pLocFrag->nAlignClipEnd_; if ( !bIntersect( pLocFrag->nAlignClipStart_, pLocFrag->nAlignClipEnd_, nGetStartConsensusIndex(), nGetEndConsensusIndex(), pLocFrag->nAlignClipStart_, pLocFrag->nAlignClipEnd_ ) ) { pLocFrag->bWholeReadIsUnaligned_ = true; RWCString soError = "read "; soError += pLocFrag->soGetName(); soError += " had a problem with its aligned position. I have corrected it. Please try again"; THROW_ERROR( soError ); } // just for reporting if we corrected the alignment positions if ( nOldAlignLeft != pLocFrag->nAlignClipStart_ || nOldAlignRight != pLocFrag->nAlignClipEnd_ ) { cerr << "align clipping changed for read " << pLocFrag->soGetName() << " old left: " << nOldAlignLeft << " new left: " << pLocFrag->nAlignClipStart_ << " old right: " << nOldAlignRight << " new right: " << pLocFrag->nAlignClipEnd_ << endl; } for( int nConsPos = pLocFrag->nAlignClipStart_; nConsPos <= pLocFrag->nAlignClipEnd_; ++nConsPos ) { ++aNumberOfAlignedReadsInOriginalContig[ nConsPos ]; } } } // add aUnalignedReads to pReadsToRemove and sort it again for( nRead = 0; nRead < aUnalignedReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aUnalignedReads[ nRead ]; pReadsToRemove->insert( pLocFrag ); // keep track of reads that the user didn't specify to be // pulled but we are pulling them anyway pAdditionalReadsPutIntoOwnContigs->insert( pLocFrag ); } pReadsToRemove->resort(); // now pReadsToRemove includes all reads to remove, including // unaligned reads RWTPtrOrderedVector aRemainingReads( (size_t) ( nGetNumberOfFragsInContig() - pReadsToRemove->length() ), "Contig::aRemainingReads" ); mbtValVectorOfBool aConsensusBitArray( nGetPaddedConsLength(), 1, // number of first element "Contig::putReadsIntoOwnContigs aConsensusBitArray" ); const int nAmountToClipAlignedRegion = 5; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // check if the read is one of the ones we want to remove if ( pReadsToRemove->pFindReadByName( pLocFrag->soGetName() ) ) continue; aRemainingReads.insert( pLocFrag ); int nAlignedRegionLeft = pLocFrag->nAlignClipStart_ + nAmountToClipAlignedRegion; int nAlignedRegionRight = pLocFrag->nAlignClipEnd_ - nAmountToClipAlignedRegion; // very small aligned region of the read if ( nAlignedRegionLeft > nAlignedRegionRight ) continue; // maybe I should keep track of the highest quality read at // each point so that I can create base segments, if I need to for( int nConsPos = nAlignedRegionLeft; nConsPos <= nAlignedRegionRight; ++nConsPos ) { aConsensusBitArray.setValue( nConsPos, true ); } } // now look for gaps between the contiguous regions of bits. they are // the new contigs newContig* pCurrentNewContig = NULL; RWTPtrOrderedVector aNewContigs( (size_t) 5, "Contig::aNewContigs" ); const unsigned char ALIGNED = 'a'; const unsigned char UNALIGNED = 'u'; unsigned char cOldState = UNALIGNED; // this sets it up for the first aligned unsigned char cNewState; // to make a state change a create a newContig int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aConsensusBitArray[ nConsPos ] ) cNewState = ALIGNED; else cNewState = UNALIGNED; // look for state changes if ( cNewState == ALIGNED && cOldState == UNALIGNED ) { // started a new contig pCurrentNewContig = new newContig( nConsPos, this ); aNewContigs.insert( pCurrentNewContig ); } else if ( cNewState == UNALIGNED && cOldState == ALIGNED ) { // just finished a newcontig at the position before this pCurrentNewContig->nConsPosRight_ = nConsPos - 1; } cOldState = cNewState; } // after moving off the consensus, we may have not yet completed a // newContig. So complete it: if ( cOldState == ALIGNED ) { pCurrentNewContig->nConsPosRight_ = nConsPos - 1; } PleaseWait* pPleaseWait = NULL; if ( bOKToUseGui ) { if ( aNewContigs.length() > 1 ) { RWCString soWarning = "removing read(s): "; for( int nReadToRemove = 0; nReadToRemove < pReadsToRemove->length(); ++nReadToRemove ) { LocatedFragment* pLocFragToRemove = (*pReadsToRemove)[ nReadToRemove ]; if ( nReadToRemove > 0 ) soWarning += ", "; soWarning += pLocFragToRemove->soGetName(); if ( pLocFragToRemove->bIsWholeReadUnaligned() ) { soWarning += " (because unaligned)"; } } soWarning += " will cause "; soWarning += soGetName(); // contig name soWarning += " to fall apart into "; soWarning += RWCString( (long) aNewContigs.length() ); soWarning += " new contigs since there are no aligned reads (after clipping aligned region back 5 bases) at the following region(s): "; for( int nNewContig = 1; nNewContig < aNewContigs.length(); ++nNewContig ) { int nHoleLeftUnpadded = nUnpaddedIndex( aNewContigs[ nNewContig - 1 ]->nConsPosRight_ ) + 1; int nHoleRightUnpadded = nUnpaddedIndex( aNewContigs[ nNewContig ]->nConsPosLeft_ ) - 1; soWarning += RWCString( (long) nHoleLeftUnpadded ); soWarning += "-"; soWarning += RWCString( (long) nHoleRightUnpadded ); soWarning += " "; } soWarning += " Do you still want to remove these reads? (y/n)"; if ( !bGuiGetAnswerYesNo( soWarning ) ) { // clean up and return aNewContigs.clearAndDestroy(); return; } } pPleaseWait = new PleaseWait(); // close existing contigwin's ConsEd::pGetConsEd()->deleteAllContigWinsForContig( this ); } // if ( bOKToUseGui ) { // point of no return // save into .wrk file what we intend to do so if there is a bug, // we can reproduce it: ConsEd::pGetAssembly()->writeEditToEditHistoryFile( "!putting following reads into their own contigs: " ); for( nRead = 0; nRead < aOriginalReadsToRemove.length(); ++nRead ) { LocatedFragment* pLocFrag = aOriginalReadsToRemove[ nRead ]; ConsEd::pGetAssembly()->writeEditToEditHistoryFile( " %s", pLocFrag->soGetName().data() ); } // now put reads into the new contigs RWTPtrOrderedVector aReadsNotGoingIntoAnyNewContig; aReadsNotGoingIntoAnyNewContig.soName_ = "Contig::aReadsNotGoingIntoAnyNewContig"; for( nRead = 0; nRead < aRemainingReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aRemainingReads[ nRead ]; // this read must be aligned. // Since the new contigs were constructed using the clipped // back aligned region of the reads, we should use the clipped // back aligned region of the reads to check which new contig the // reads should go into. Otherwise, if we use the unclipped // aligned clipped region of the reads, the read may actually overlap // more than one contig and this leads to a subscript error // with exception thrown: File 'rwtvalvector.h', line 83 In RWTValVector::operator[] ( 5870 ) out of bounds with nCurrentLength_ = 5870 Contig::aNumberOfAlignedReads // This happens like this: // read A ------tttttuuu // read B uuuttttt-------- // ------------ -------------- // new contig A new ContigB // where --- are aligned bases // ttt is clipped back aligned bases // uu are unaligned bases // In this case, notice that the aligned part of read B intersects // both new contig A and new Contig B. Thus it is important // that we just look at the trimmed back part of the aligned part of // read B which just intersects new ContigB. int nClippedAlignClipStart = pLocFrag->nAlignClipStart_ + nAmountToClipAlignedRegion; int nClippedAlignClipEnd = pLocFrag->nAlignClipEnd_ - nAmountToClipAlignedRegion; if ( nClippedAlignClipStart > nClippedAlignClipEnd ) { cerr << "warning: couldn't put read " << pLocFrag->soGetName() << " into any contig because aligned region so short" << endl; aReadsNotGoingIntoAnyNewContig.insert( pLocFrag ); continue; } bool bFoundContig = false; for( int nNewContig = 0; !bFoundContig && ( nNewContig < aNewContigs.length() ); ++nNewContig ) { newContig* pNewContig = aNewContigs[ nNewContig ]; if ( bIntervalsIntersect( nClippedAlignClipStart, nClippedAlignClipEnd, pNewContig->nConsPosLeft_, pNewContig->nConsPosRight_ ) ) { pNewContig->aReads_.insert( pLocFrag ); bFoundContig = true; } } // could it be that there is a read that goes into more than // one new contig? No, because if it did, then the new // contigs would have been merged into one. // could it be that there are any reads that do not go into // any of the contigs? I suppose that could happen if they had // very short aligned region (less than 10 bases). I'm not sure // currently what to do with these reads. We could pull them // out entirely, or else we could put them in the contig // where the aligned region is closest. if ( !bFoundContig ) aReadsNotGoingIntoAnyNewContig.insert( pLocFrag ); } // for( nRead = 0; nRead < aRemainingReads.length(); ++nRead ) { RWCString soMessage; for( nRead = 0; nRead < aReadsNotGoingIntoAnyNewContig.length(); ++nRead ) { LocatedFragment* pLocFragNotGoingIntoAnyNewContig = aReadsNotGoingIntoAnyNewContig[ nRead ]; pReadsToRemove->insert( pLocFragNotGoingIntoAnyNewContig ); pAdditionalReadsPutIntoOwnContigs->insert( pLocFragNotGoingIntoAnyNewContig ); if ( nRead != 0 ) { soMessage += ", "; } soMessage += pLocFragNotGoingIntoAnyNewContig->soGetName(); } if ( bOKToUseGui ) { if ( !soMessage.isNull() ) { soMessage += " are going removed because their aligned region is very short and they don't sufficiently overlap other aligned reads"; popupInfoMessage( GAPP->widGetTopLevel(), soMessage ); } } pReadsToRemove->resort(); // create new contigs int nNewContig; for( nNewContig = 0; nNewContig < aNewContigs.length(); ++nNewContig ) { newContig* pNewContig = aNewContigs[ nNewContig ]; bool bFoundBestAlternateReadsForBaseSegments = false; mbtValVector* pQualityOfBestReads; mbtPtrVector* pBestReads; // expand back to their original size pNewContig->nConsPosLeft_ -= nAmountToClipAlignedRegion; pNewContig->nConsPosRight_ += nAmountToClipAlignedRegion; RWCString soNewContigName; if ( nNewContig == 0 ) { // the leftmost new contig will use the name of the old contig. soNewContigName = soGetName(); } else { soNewContigName = ConsEd::pGetConsEd()->pGetAssembly()->soGetNewContigName(); } int nNewNumberOfReads = pNewContig->aReads_.length(); // nNewNumberOfBaseSegments can be estimated by the number of // base segments in the old contig where this new contig lies: int nBaseSegmentLeft = baseSegArray_.nGetBaseSegmentIndexByConsPos( pNewContig->nConsPosLeft_ ); int nBaseSegmentRight = baseSegArray_.nGetBaseSegmentIndexByConsPos( pNewContig->nConsPosRight_ ); int nNewNumberOfBaseSegments; if ( nBaseSegmentLeft != RW_NPOS && nBaseSegmentRight != RW_NPOS ) { nNewNumberOfBaseSegments = nBaseSegmentRight - nBaseSegmentLeft + 1; } else { nNewNumberOfBaseSegments = baseSegArray_.nGetNumSegments(); // set up for loop below which starts at nBaseSegmentLeft and // ends on nBaseSegmentRight: if ( nBaseSegmentLeft == RW_NPOS ) nBaseSegmentLeft = 0; if ( nBaseSegmentRight == RW_NPOS ) nBaseSegmentRight = baseSegArray_.dapBs_.length() - 1; } Contig* pRealContig = new Contig( soNewContigName.data(), nNewNumberOfReads, nNewNumberOfBaseSegments, bComplementedFromWayPhrapCreated_, ConsEd::pGetAssembly() ); // set aside room for the Ntides now--otherwise removing a read which // took 3 seconds in consed 16 takes 15 minutes (?) in consed 17 pRealContig->resize( pNewContig->nConsPosRight_ - pNewContig->nConsPosLeft_ + 1 ); pNewContig->pNewRealContig_ = pRealContig; int nNewConsPosFromOldConsPos = 1 - pNewContig->nConsPosLeft_; // reads must be sorted so we can use pFindReadByName (below) pNewContig->aReads_.resort(); // add new consensus bases and base segments. // Find the existing base segments, and then within each base segment, // add the bases to the new contig. int nExpectedConsPosOfNextBase = pNewContig->nConsPosLeft_; for( int nBaseSegment = nBaseSegmentLeft; nBaseSegment <= nBaseSegmentRight; ++nBaseSegment ) { BaseSegment* pOldBaseSeg = baseSegArray_[ nBaseSegment ]; int nBaseSegOnNewContigLeft; int nBaseSegOnNewContigRight; if ( !bIntersect( pOldBaseSeg->nStartInConsensus_, pOldBaseSeg->nEndInConsensus_, pNewContig->nConsPosLeft_, pNewContig->nConsPosRight_, nBaseSegOnNewContigLeft, nBaseSegOnNewContigRight ) ) { cerr << "base segment " << pOldBaseSeg->pLocFrag_->soGetName() << " from " << pOldBaseSeg->nStartInConsensus_ << " to " << pOldBaseSeg->nEndInConsensus_ << " does not intersect new contig from " << pNewContig->nConsPosLeft_ << " to " << pNewContig->nConsPosRight_ << endl; continue; // shouldn't ever happen } if ( nExpectedConsPosOfNextBase != nBaseSegOnNewContigLeft ) { cerr << "nExpectedConsPosOfNextBase = " << nExpectedConsPosOfNextBase << " but nBaseSegOnNewContigLeft = " << nBaseSegOnNewContigLeft << endl; assert( false ); } nExpectedConsPosOfNextBase = nBaseSegOnNewContigRight + 1; // let's see if the old base segment is from a read that // is in this new contig. If not, we will need to replace it. if ( pNewContig->aReads_.pFindReadByName( pOldBaseSeg->pLocFrag_->soGetName() ) ) { // good, the read is here. We will keep this read for // the base segment and we will move over the consensus // bases int nWillAddThisManyNtides = nBaseSegOnNewContigRight - nBaseSegOnNewContigLeft + 1; pRealContig->increaseMaxLengthIfNecessary( nWillAddThisManyNtides ); for( int nConsPos = nBaseSegOnNewContigLeft; nConsPos <= nBaseSegOnNewContigRight; ++nConsPos ) { pRealContig->append( ntGetCons( nConsPos ) ); } BaseSegment* pNewBaseSeg = new BaseSegment( pOldBaseSeg->pLocFrag_, nBaseSegOnNewContigLeft + nNewConsPosFromOldConsPos, nBaseSegOnNewContigRight + nNewConsPosFromOldConsPos ); pRealContig->baseSegArray_.addPtrToNewBaseSeg( pNewBaseSeg ); } else { // the tough case. For some reason the read that was used // for the old base segment is not part of the new contig. // (This might be because we are removing that read, it is // completely unaligned, or its base segment was in its // unaligned region, and thus in a different new contig.) // In any case, we need to find an alternative read. if ( !bFoundBestAlternateReadsForBaseSegments ) { bFoundBestAlternateReadsForBaseSegments = true; pQualityOfBestReads = new mbtValVector( pNewContig->nConsPosRight_ - pNewContig->nConsPosLeft_ + 1, pNewContig->nConsPosLeft_, 0 ); pQualityOfBestReads->soName_ = "Contig::*pQualityOfBestReads"; pBestReads = new mbtPtrVector( pNewContig->nConsPosRight_ - pNewContig->nConsPosLeft_ + 1, pNewContig->nConsPosLeft_ ); pBestReads->soName_ = "Contig::*pBestReads"; int nRead2; for( nRead2 = 0; nRead2 < pNewContig->aReads_.length(); ++nRead2 ) { LocatedFragment* pLocFrag2 = pNewContig->aReads_[ nRead2 ]; for( int nConsPos2 = pLocFrag2->nAlignClipStart_; nConsPos2 <= pLocFrag2->nAlignClipEnd_; ++nConsPos2 ) { if ( !(*pBestReads)[ nConsPos2 ] ) { (*pBestReads)[ nConsPos2 ] = pLocFrag2; (*pQualityOfBestReads)[ nConsPos2 ] = pLocFrag2->ntGetFragFromConsPos( nConsPos2 ).qualGetQualityWithout98or99(); } else { unsigned char ucQuality = pLocFrag2->ntGetFragFromConsPos( nConsPos2 ).qualGetQualityWithout98or99(); if ( ucQuality > (*pQualityOfBestReads)[ nConsPos2 ] ) { (*pQualityOfBestReads)[ nConsPos2 ] = ucQuality; (*pBestReads)[ nConsPos2 ] = pLocFrag2; } } } } // for( nRead2 = 0; nRead2 < ... } // if ( !bFoundBestAlternateReadsForBaseSegments for( int nConsPos = nBaseSegOnNewContigLeft; nConsPos <= nBaseSegOnNewContigRight; ++nConsPos ) { LocatedFragment* pAlternateLocFrag = (*pBestReads)[ nConsPos ]; assert( pAlternateLocFrag ); pRealContig->append( pAlternateLocFrag->ntGetFragFromConsPos( nConsPos ) ); BaseSegment* pNewBaseSeg = new BaseSegment( pAlternateLocFrag, nConsPos + nNewConsPosFromOldConsPos, nConsPos + nNewConsPosFromOldConsPos ); pRealContig->baseSegArray_.addPtrToNewBaseSeg( pNewBaseSeg ); } // for( int nConsPos = ... } // if ( pFindReadByName( } // for( int nBaseSegment // make sure that the last base used was the last base we expected // to be used assert( (nExpectedConsPosOfNextBase - 1 ) == pNewContig->nConsPosRight_ ); ConsEd::pGetAssembly()->addContig( pRealContig ); // In a minute, we will recalculate consensus quality in the // region where the number of aligned reads has changed. Before // we add the reads to the new contig (and change the nAlignClipStart_ // from old contig positions to new contig positions), count // how many aligned reads there will be in the new contig, but do // it in the old contig positions. size_t nCapacity = pNewContig->nConsPosRight_ - pNewContig->nConsPosLeft_ + 1; int nOffset = pNewContig->nConsPosLeft_; int nInitialValue = 0; mbtValVector aNumberOfAlignedReads( nCapacity, nOffset, nInitialValue ); aNumberOfAlignedReads.soName_ = "Contig::aNumberOfAlignedReads"; // calculate the new number of aligned reads at each location. // do this with respect to the original contig consensus positions. for( nRead = 0; nRead < pNewContig->aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = pNewContig->aReads_[ nRead ]; for( int nConsPos = pLocFrag->nAlignClipStart_; nConsPos <= pLocFrag->nAlignClipEnd_; ++nConsPos ) { ++aNumberOfAlignedReads[ nConsPos ]; } } // now add reads to this new contig int nRead; for( nRead = 0; nRead < pNewContig->aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = pNewContig->aReads_[ nRead ]; pLocFrag->moveReadAndReadTagsToNewContig( pRealContig, nNewConsPosFromOldConsPos ); } pRealContig->updateFirstAndLastDisplayableContigPos(); pRealContig->setUnpaddedPositionsArray(); pRealContig->setPaddedPositionsArray(); assert( pRealContig->baseSegArray_.bGetDataStructureOk( true // check that base segment go from // beginning to end of consensus ) ); // Compare the number of aligned reads in this new contig // to the number of aligned reads in the old contig. If it is fewer, // then recalculate the consensus quality at that location. // Note: recalculateConsensusQualitiesAndChange is optimized // to be able to work on a large area at once. Thus we should // find the regions that are changed, rather than a set of // consensus positions. // these are in the consensus positions of the original contig int nChangedRegionLeft; int nChangedRegionRight; const char cSAME_READS = 's'; const char cFEWER_READS = 'f'; // set up in case there is an immediate transition to a region // with fewer reads char cOldState = cSAME_READS; for( int nConsPos = pNewContig->nConsPosLeft_; nConsPos <= pNewContig->nConsPosRight_; ++nConsPos ) { char cCurrentState; if ( aNumberOfAlignedReads[ nConsPos ] == aNumberOfAlignedReadsInOriginalContig[ nConsPos ] ) cCurrentState = cSAME_READS; else if ( aNumberOfAlignedReads[ nConsPos ] < aNumberOfAlignedReadsInOriginalContig[ nConsPos ] ) cCurrentState = cFEWER_READS; else // the new contigs should never have more aligned reads // than there were in the old contig at the same location assert( false ); if ( ( cCurrentState == cSAME_READS && cOldState == cFEWER_READS ) || ( cCurrentState == cFEWER_READS && nConsPos == pNewContig->nConsPosRight_ ) ) { // just finished a region // changed Nov 2008 to fix bug in which last base // on right end of a contig would have its consensus // quality wrong if ( nConsPos == pNewContig->nConsPosRight_ ) nChangedRegionRight = nConsPos; else nChangedRegionRight = nConsPos - 1; // convert these positions to that of the new contig cerr << "recalculating consensus quality values in original contig " << soGetName() << " from " << nUnpaddedIndex( nChangedRegionLeft ) << " to " << nUnpaddedIndex( nChangedRegionRight ) << endl; nChangedRegionLeft = nChangedRegionLeft + 1 - pNewContig->nConsPosLeft_; nChangedRegionRight = nChangedRegionRight + 1 - pNewContig->nConsPosLeft_; cerr << "(corresponding to " << pRealContig->soGetName() << " from " << pRealContig->nUnpaddedIndex( nChangedRegionLeft ) << " to " << pRealContig->nUnpaddedIndex( nChangedRegionRight ) << ")" << endl; pRealContig->recalculateConsensusQualitiesAndChange( nChangedRegionLeft, nChangedRegionRight ); } else if ( cCurrentState == cFEWER_READS && cOldState == cSAME_READS ) { nChangedRegionLeft = nConsPos; } cOldState = cCurrentState; } // for( int nConsPos = pNewContig->nConsPosLeft_; if ( bFoundBestAlternateReadsForBaseSegments ) { delete pQualityOfBestReads; delete pBestReads; } } // for( int nNewContig = 0; nNewContig < ... // -------------------------------------------------------- // end of big contig loop // -------------------------------------------------------- // need to deal with the consensus tags. consensus tags may not // fit neatly into one of the new contigs. If it doesn't, delete // it. (Might it fit completely into more than one? No, because // that would mean that there was a region that had aligned reads // from more than one contig, so there wouldn't have been more than // one contig--they would have been the same contig.) RWTPtrOrderedVector aConsensusTagsToGoOnRemovedReads; RWTValOrderedVector aReadIndexForConsensusTags; RWTPtrOrderedVector aDeletedConsensusTags; aConsensusTagsToGoOnRemovedReads.soName_ = "Contig::aConsensusTagsToGoOnRemovedReads"; aReadIndexForConsensusTags.soName_ = "Contig::aReadIndexForConsensusTags"; aDeletedConsensusTags.soName_ = "Contig::aDeletedConsensusTags"; for( int nConsTag = aConsensusTags_.length() - 1; nConsTag >= 0; --nConsTag ) { tag* pConsTag = aConsensusTags_[ nConsTag ]; // find a new contig that completely contains this tag bool bFoundANewContigContainingTag = false; newContig* pNewContig; for( nNewContig = 0; nNewContig < aNewContigs.length() && !bFoundANewContigContainingTag; ++nNewContig ) { pNewContig = aNewContigs[ nNewContig ]; if ( pNewContig->nConsPosLeft_ <= pConsTag->nPaddedConsPosStart_ && pConsTag->nPaddedConsPosEnd_ <= pNewContig->nConsPosRight_ ) { bFoundANewContigContainingTag = true; } } if ( bFoundANewContigContainingTag ) { // I don't think I will bother removing the tags one by one from // the old big contig--we'll just remove them all at once. pConsTag->pContig_ = pNewContig->pNewRealContig_; // change the tag positions in terms of the new contig positions int nOffset = 1 - pNewContig->nConsPosLeft_; pConsTag->nPaddedConsPosStart_ += nOffset; pConsTag->nPaddedConsPosEnd_ += nOffset; pNewContig->pNewRealContig_->addConsensusTag( pConsTag ); } else { // check to see if the tag can go into any of the reads that are // being removed bool bFoundARead = false; for( int nRead = 0; nRead < pReadsToRemove->length() && !bFoundARead; ++nRead ) { LocatedFragment* pLocFragToRemove = (*pReadsToRemove)[ nRead ]; if ( pLocFragToRemove->nGetAlignStart() <= pConsTag->nPaddedConsPosStart_ && pConsTag->nPaddedConsPosEnd_ <= pLocFragToRemove->nGetAlignEnd() ) { bFoundARead = true; aConsensusTagsToGoOnRemovedReads.insert( pConsTag ); aReadIndexForConsensusTags.insert( nRead ); // this tag will be added to the contig of the removed // read later } } if ( !bFoundARead ) { // if the tag overlaps several new contigs, but isn't // contained in any single new contig, delete it. if ( !bFoundANewContigContainingTag ) { aDeletedConsensusTags.insert( pConsTag ); } } } } // for( int nConsTag = aConsensusTags_ // remove (bug do not delete) all consensus tags at once aConsensusTags_.clear(); // now create little contigs for the all the reads we are removing RWTPtrOrderedVector aNewLittleContigs( (size_t) pReadsToRemove->length(), "Contig::aNewLittleContigs" ); RWTValOrderedVector aOldAlignStartPositions( (size_t) pReadsToRemove->length(), "Contig::aOldAlignStartPositions" ); int nRemovedRead; for( nRemovedRead = 0; nRemovedRead < pReadsToRemove->length(); ++nRemovedRead ) { LocatedFragment* pLocFrag = (*pReadsToRemove)[ nRemovedRead ]; aOldAlignStartPositions.insert( pLocFrag->nGetAlignStart() ); aNewLittleContigs.insert( pLocFrag->pMakeAContigOutOfThisRead() ); } // add any consensus tags that should go with these removed reads int nTag; for( nTag = 0; nTag < aConsensusTagsToGoOnRemovedReads.length(); ++nTag ) { tag* pTag = aConsensusTagsToGoOnRemovedReads[ nTag ]; int nReadIndex = aReadIndexForConsensusTags[ nTag ]; // due to the order that the aNewLittleContigs are added, // the index of the reads to be removed is also the index // into the new contigs of those reads Contig* pContig = aNewLittleContigs[ nReadIndex ]; pTag->nPaddedConsPosStart_ += ( 1 - aOldAlignStartPositions[ nReadIndex ] ); pTag->nPaddedConsPosEnd_ += ( 1 - aOldAlignStartPositions[ nReadIndex ] ); pTag->pContig_ = pContig; pContig->addConsensusTag( pTag ); // If we ever squeeze pads out of the reads to be removed, // the consensus tag might protrude beyond the end of the read } // finally delete the old contig delete this; ContigWin* pSomeBigContigWin = NULL; ContigWin* pSomeLittleContigWin = NULL; if ( bOKToUseGui ) { // update the top level window's list of contigs and reads ConsEd::pGetAssembly()->sortContigsByName(); ConsEd::pGetConsEd()->updateContigListOnMainConsedWindow(); // popup Aligned Reads Window for all the "big" contigs // at approximately the same consensus positions as where the // old contig was (if possible) for( nNewContig = 0; nNewContig < aNewContigs.length(); ++nNewContig ) { newContig* pNewContig = aNewContigs[ nNewContig ]; // convert the old middle of the screen to the new contig coordinates int nConsPosNewContig = nOldConsPosDisplayed + 1 - pNewContig->nConsPosLeft_; // the only problem is that the old middle of the screen // might be off one end of the new contig. Fortunately, // pScrollExistingContigWinOrMakeNewContigWin takes care of // this, although it is many calls down. Contig* pNewRealContig = pNewContig->pNewRealContig_; pSomeBigContigWin = ConsEd::pGetConsEd()->pScrollExistingContigWinOrMakeNewContigWin( pNewRealContig, nConsPosNewContig ); } // put all of the removed reads into Aligned Reads Windows, // also. No--Pat said this would be too many. Just put a // sample into an Aligned Reads Window. This sample might be // the only one if we are just removing a single read. Contig* pContigToDisplay = pSampleRemovedRead->pContig_; assert( aOldAlignStartPositions.length() == aNewLittleContigs.length() ); bool bFoundContig = false; for( int nNewLittleContig = 0; nNewLittleContig < aNewLittleContigs.length() && !bFoundContig; ++nNewLittleContig ) { Contig* pContig = aNewLittleContigs[ nNewLittleContig ]; if ( pContig == pContigToDisplay ) { bFoundContig = true; int nNewConsPos = nOldConsPosDisplayed + 1 - aOldAlignStartPositions[ nNewLittleContig ]; pSomeLittleContigWin = ConsEd::pGetConsEd()->pScrollExistingContigWinOrMakeNewContigWin( pContig, nNewConsPos ); } } // we just want the error message to not pull the Consed Main Window // above all the ContigWin's, so connect it to a contigwin // this is because, theoretically, it is possible that there is // no large contigs remaining ContigWin* pSomeContigWin = ( pSomeBigContigWin ? pSomeBigContigWin : pSomeLittleContigWin ); RWCString soMessage = "removed "; soMessage += RWCString( (long) pReadsToRemove->length() ); soMessage += " reads including: "; for( nRead = 0; nRead < aOriginalReadsToRemove.length(); ++nRead ) { LocatedFragment* pLocFragToRemove = aOriginalReadsToRemove[ nRead ]; soMessage += pLocFragToRemove->soGetName(); soMessage += " "; } if ( pAdditionalReadsPutIntoOwnContigs->length() > 0 ) { soMessage += "\nIn addition, removing the following reads either because they were unaligned or they had too small an aligned region: "; for( nRead = 0; nRead < pAdditionalReadsPutIntoOwnContigs->length(); ++nRead ) { if ( nRead != 0 ) soMessage += ", "; soMessage += (*pAdditionalReadsPutIntoOwnContigs)[ nRead ]->soGetName(); } } soMessage += " You must save the assembly before doing any other edits or else the .wrk recovery file will not be usable and consed may crash if you try to undo any edits"; popupInfoMessage( pSomeContigWin->pGcw_->widGetGuiContigWinTopLevel(), soMessage ); delete pPleaseWait; } // if ( bOKToUseGui ) // this will cause the user to be prompted to save the assembly when // quitting ConsEd::pGetAssembly()->setChanged(); } // I created this for the purpose of deleting the fake read from // a miniassembly in fixContigEnd void Contig :: deleteReadFromContig( LocatedFragment* pLocFragToDelete ) { // remove any base segments that refer to this read. Just leave // a hole there (sorry, but will be fixed hopefully when consed // starts up again) int nBaseSegEnd = baseSegArray_.nGetBaseSegmentIndexByConsPos( pLocFragToDelete->nGetAlignEnd() ); if ( nBaseSegEnd != RW_NPOS ) { for( int nBaseSeg = nBaseSegEnd; 0 <= nBaseSeg; --nBaseSeg ) { BaseSegment* pSeg = baseSegArray_[ nBaseSeg ]; if ( pSeg->nEndInConsensus_ < pLocFragToDelete->nGetAlignStart() ) break; if ( pLocFragToDelete == pSeg->pLocFrag_ ) { baseSegArray_.dapBs_.removeAt( nBaseSeg ); } } } apLocatedFragment_.remove( pLocFragToDelete ); }