/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "fixContigEnd.h" #include "contigEnd.h" #include "getAlignment.h" #include "contig.h" #include "locatedFragment.h" #include using namespace std; #include "consed.h" #include "assembly.h" #include "paddedPieceOfRead.h" #include "soAddCommas.h" #include "szGetTime.h" #include "fileDefines.h" #include #include "min.h" #include "max.h" #include "fullSmithWaterman.h" const int nNumberOfMatchingBasesInARowInAMatchSegment = 5; void fixContigEnd :: doIt( Contig*& pNewContig ) { pNewContig = pBigContig_; // in case of premature error fprintf( pAO, "working on %s of %s\n", szWhichGap( nWhichEnd_ ), pBigContig_->soGetName().data() ); // which reads should be used? // find high quality segment of contig if ( nWhichEnd_ == nLeftGap ) { pBigContig_->complementContig(); } // from here on, we work on just the right end // HHHHHHHHHHHHHHHHHlllllllllll (consensus) // rrrrrrrrrrrrrrr (take this) // rrrrrrrrr (take this) // rrrrrrrrrrrr (do not take this) int nHighQualitySegmentRightConsPos = pBigContig_->nPaddedIndexFast( pBigContig_->nGetHighQualitySegmentEnd() ); // find all reads protruding to the right of high quality segment bool bStartedFindingLeftmost = false; int nRead; for( nRead = 0; nRead < pBigContig_->nGetNumberOfReads(); ++nRead ) { LocatedFragment* pLocFrag = pBigContig_->pLocatedFragmentGet( nRead ); if ( nHighQualitySegmentRightConsPos < pLocFrag->nGetAlignEnd() ) { aProtrudingReads_.insert( pLocFrag ); if ( !bStartedFindingLeftmost ) { bStartedFindingLeftmost = true; nLeftmostConsPosOfProtrudingReads_ = pLocFrag->nGetAlignStart(); } else { nLeftmostConsPosOfProtrudingReads_ = MIN( nLeftmostConsPosOfProtrudingReads_, pLocFrag->nGetAlignStart() ); } } } fprintf( pAO, "number of protruding reads for %s %s : %d\n", szWhichGap( nWhichEnd_ ), pBigContig_->soGetName().data(), aProtrudingReads_.length() ); if ( aProtrudingReads_.length() < 2 ) { fprintf( pAO, "ignoring %s %s because there were only %d reads protruding from the high quality segment\n", szWhichGap( nWhichEnd_ ), pBigContig_->soGetName().data(), aProtrudingReads_.length() ); if ( nWhichEnd_ == nLeftGap ) { pBigContig_->complementContig(); } return; } fflush( pAO ); // we don't want nLeftmostConsPosOfProtrudingReads_ to be off the consensus // because it is used for recalculateConsensusQuality and for pulling out // base segments and for making a fake read. if ( nLeftmostConsPosOfProtrudingReads_ < pBigContig_->nGetStartConsensusIndex() ) nLeftmostConsPosOfProtrudingReads_ = pBigContig_->nGetStartConsensusIndex(); // write the .fasta and .fasta.qual for use for miniassembly soBaseNameForTemporaryFiles_.increaseMaxLength( (size_t) 1000 ); soBaseNameForTemporaryFiles_.nCurrentLength_ = sprintf( soBaseNameForTemporaryFiles_.data(), "%s_%s_%s", szLeftOrRight( nWhichEnd_ ), pBigContig_->soGetName().data(), pCP->soDateTimeDotInMiddle_.data() ); FileName filReadsToPhrap = soBaseNameForTemporaryFiles_ + ".fasta"; FileName filReadsToPhrapQual = filReadsToPhrap + ".qual"; FILE* pReads = fopen( filReadsToPhrap.data(), "w" ); if ( !pReads ) { THROW_FILE_ERROR( filReadsToPhrap ); } FILE* pQual = fopen( filReadsToPhrapQual.data(), "w" ); if ( !pQual ) { THROW_FILE_ERROR( filReadsToPhrapQual ); } for( nRead = 0; nRead < aProtrudingReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aProtrudingReads_[ nRead ]; // believe should use edited bases pLocFrag->writeReadToFasta( pReads, pQual, false ); // bUseSeqPhredCalledBases } for( nRead = 0; nRead < aNewReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aNewReads_[ nRead ]; pLocFrag->writeReadToFasta( pReads, pQual, true ); // bUseSeqPhredCalledBases // because new reads only have bases here } fclose( pReads ); fclose( pQual ); // run phrap via a script // if ( nWhichEnd_ == nLeftGap ) { // pBigContig_->complementContig(); // } FileName filAceFileFOF = soBaseNameForTemporaryFiles_ + ".fixConsensusEndAceFile.fof"; RWCString soCommand = pCP->filFullPathnameOfFixContigEndScript_ + " " + filReadsToPhrap + " " + filAceFileFOF; cout << "about to run: " << soCommand << endl; int nRetStat = system( (char*) soCommand.data() ); if ( nRetStat != 0 ) { char* szErrorMessage = strerror( nRetStat ); RWCString soError; if ( szErrorMessage ) { soError = "Something went wrong running " + soCommand + "\n Gave error code " + RWCString( (long) nRetStat ) + " which means " + RWCString( szErrorMessage ); } else { soError = "Something went wrong running " + soCommand + "\n Gave error code " + RWCString( (long) nRetStat ) + " which could not be decoded."; } reportMessage( true, soError ); if ( nWhichEnd_ == nLeftGap ) { pBigContig_->complementContig(); } cleanUp(); return; } // now read the new assembly FILE* pAceFOF = fopen( filAceFileFOF.data(), "r" ); if ( !pAceFOF ) { THROW_FILE_ERROR( filAceFileFOF ); } const int nMaxAceFileNameSize = 2000; FileName filAceFileName( (size_t) nMaxAceFileNameSize ); if ( fgets( filAceFileName.data(), nMaxAceFileNameSize, pAceFOF ) == NULL ) { RWCString soError = filAceFileFOF + " was empty"; THROW_ERROR2( soError ); } filAceFileName.nCurrentLength_ = strlen( filAceFileName.data() ); filAceFileName.stripTrailingWhitespaceFast(); fclose( pAceFOF ); cerr << "now trying to read>" << filAceFileName << "<" << endl; cerr.flush(); bool bError; openAceFileAndChooseContig( filAceFileName, bError ); if ( bError ) { if ( nWhichEnd_ == nLeftGap ) { pBigContig_->complementContig(); } cleanUp(); return; } // let's print some statistics on the miniassembly RWCString soMessage = RWCString( (long) ( aNewReads_.length() + aProtrudingReads_.length() ) ) + " reads was given to phrap and the resulting assembly contained " + RWCString( (long) pMiniContig_->nGetNumberOfReads() ) + " reads in the contig"; reportMessage( false, soMessage ); moveMiniAssemblyIntoMainAssembly(); pBigContig_->setUnpaddedPositionsArray(); pMiniContig_->setUnpaddedPositionsArray(); pBigContig_->setPaddedPositionsArray(); pMiniContig_->setPaddedPositionsArray(); alignMiniContigToBigContig( bError ); if ( bError ) { if ( nWhichEnd_ == nLeftGap ) { pBigContig_->complementContig(); } cleanUp(); return; } addColumnsOfPadsToAlignContigs(); pBigContig_->setUnpaddedPositionsArray(); pMiniContig_->setUnpaddedPositionsArray(); pBigContig_->setPaddedPositionsArray(); pMiniContig_->setPaddedPositionsArray(); assert( bCheckAddColumnsOfPadsToAlignContigs() ); createNewContig(); if ( nWhichEnd_ == nLeftGap ) { pNewContig_->complementContig(); } // to return pNewContig = pNewContig_; cleanUp(); } // doIt void fixContigEnd :: openAceFileAndChooseContig( const FileName& filAceFileName, bool& bError ) { bool bUsingPhdFilesSaved = ConsEd::pGetConsEd()->bUsingPhdFiles_; ConsEd::pGetConsEd()->bUsingPhdFiles_ = false; int nContigs = nOpenAceFileAndGetNumberOfContigs( filAceFileName ); if ( nContigs == 0 ) { reportMessage( true, "miniassembly had 0 contigs so not fixing this end" ); bError = true; return; } try { pMiniAssembly_ = new Assembly( filAceFileName, false ); // bSetGlobalAssembly } catch( InputDataError ide ) { reportMessage( true, "miniassembly failed--ace file couldn't be read" ); bError = true; return; } ConsEd::pGetConsEd()->bUsingPhdFiles_ = bUsingPhdFilesSaved; Contig* pBestContig = NULL; int nSumOfQualitiesOfBestContig = -666; for( int nContig = 0; nContig < pMiniAssembly_->nGetNumberOfContigs(); ++nContig ) { Contig* pContig = pMiniAssembly_->pGetContig( nContig ); if ( pContig->nGetNumberOfReads() == 1 ) continue; int nSumOfQualities = 0; for( int nConsPos = pContig->nGetStartConsensusIndex(); nConsPos <= pContig->nGetEndConsensusIndex(); ++nConsPos ) { if ( pContig->ntGetCons( nConsPos ).bIsPad() ) continue; nSumOfQualities += pContig->ntGetCons( nConsPos ).qualGetQualityWithout98or99X0(); } if ( nSumOfQualities > nSumOfQualitiesOfBestContig ) { nSumOfQualitiesOfBestContig = nSumOfQualities; pBestContig = pContig; } } if ( !pBestContig ) { reportMessage( true, "miniassembly did not contain any contig with more than 1 read so not using it." ); bError = true; return; } // LOOK HERE!!!! pMiniContig_ = pBestContig; // does the new contig need to be flipped? int nNumberToBeFlipped = 0; int nNumberToNotBeFlipped = 0; for( int nRead = 0; nRead < pMiniContig_->nGetNumberOfReads(); ++nRead ) { LocatedFragment* pLocFragInMiniassembly = pMiniContig_->pGetLocatedFragment( nRead ); LocatedFragment* pLocFragInMainAssembly = pBigContig_->pLocFragGetByName( pLocFragInMiniassembly->soGetName() ); assert( pLocFragInMainAssembly ); if ( pLocFragInMiniassembly->bComp() == pLocFragInMainAssembly->bComp() ) { ++nNumberToNotBeFlipped; } else { ++nNumberToBeFlipped; } } if ( ( nNumberToBeFlipped != 0 ) && ( nNumberToNotBeFlipped == 0 ) ) { pMiniContig_->complementContig(); } else if ( ( nNumberToBeFlipped == 0 ) && ( nNumberToNotBeFlipped != 0 ) ) { // do nothing } else { reportMessage( true, "in the miniassembly some reads were on opposite strand and some reads were on the same strand as in the main assembly so not using the miniassembly" ); bError = true; return; } bError = false; } // void fixContigEnd :: openAceFileAndFindFakeRead( void fixContigEnd :: reportMessage( const bool bError, const RWCString& soError ) { RWCString soFullError; if ( bError ) { soFullError = "cannot use miniassembly for "; } soFullError += pBigContig_->soGetName(); soFullError += " "; soFullError += szLeftOrRight( nWhichEnd_ ); soFullError += " "; soFullError += soError; cerr << soFullError << endl; cerr.flush(); fprintf( pAO, "%s\n", soFullError.data() ); fflush( pAO ); } void fixContigEnd :: alignMiniContigToBigContig( bool& bError ) { // the output of this is 2 paddedPieceOfRead's // CCCCCCCCCCCCCCCCCCCccccccccccccccccc old consensus // CxCxCxCxCxCxCCCCCCCCCCCCCCCCCCCCCCCC miniassembly consensus // which will not always perfectly match the old // consensus // we will use the entire miniassembly consensus in the alignment // we will use as much of the old consensus up to the left part where // the leftmost part of the reads that were used for the miniassembly int nUnpaddedConsPosToAlignLeftBigC = pBigContig_->nUnpaddedIndex( nLeftmostConsPosOfProtrudingReads_ ); int nUnpaddedConsPosToAlignLeftMini = pMiniContig_->nGetUnpaddedStartIndex(); // handle case in which there are pads at the beginning of the contig so // the unpadded position could be 0 causing getPartOfUnpaddedConsensus (below) to give an exception: if ( nUnpaddedConsPosToAlignLeftBigC < pBigContig_->nGetUnpaddedStartIndex() ) nUnpaddedConsPosToAlignLeftBigC = pBigContig_->nGetUnpaddedStartIndex(); int nUnpaddedConsPosToAlignRightBigC = pBigContig_->nGetUnpaddedEndIndex(); int nUnpaddedConsPosToAlignRightMini = pMiniContig_->nGetUnpaddedEndIndex(); RWCString soUnpaddedBigContigToAlign; RWCString soUnpaddedMinContigToAlign; pBigContig_->getPartOfUnpaddedConsensus( nUnpaddedConsPosToAlignLeftBigC, nUnpaddedConsPosToAlignRightBigC, soUnpaddedBigContigToAlign, false, // bComplemented true ); // bTowardsIncreasingUnpaddedPositions pMiniContig_->getPartOfUnpaddedConsensus( nUnpaddedConsPosToAlignLeftMini, nUnpaddedConsPosToAlignRightMini, soUnpaddedMinContigToAlign, false, // bComplemented true ); // bTowardsIncreasingUnpaddedPositions int n0BestAlignStartBigC; int n0BestAlignEndBigC; int n0BestAlignStartMini; int n0BestAlignEndMini; int nBestScore; cerr << "doing Smith-Waterman alignment..."; cerr.flush(); const int nGapPenalty = 4; const int nMatchBoost = 1; const int nMismatchPenalty = 2; fullSmithWaterman( soUnpaddedBigContigToAlign.data(), soUnpaddedMinContigToAlign.data(), &n0BestAlignStartBigC, &n0BestAlignStartMini, &n0BestAlignEndBigC, &n0BestAlignEndMini, &nBestScore, nGapPenalty, nMatchBoost, nMismatchPenalty ); cerr << "done" << endl; cerr.flush(); if ( nBestScore < pCP->nFixContigEndsMinSmithWatermanScoreToMakeJoin_ ) { RWCString soError = "will ignore this contig end because alignment score " + RWCString( (long) nBestScore ) + " is less than min " + RWCString( (long) pCP->nFixContigEndsMinSmithWatermanScoreToMakeJoin_ ); reportMessage( true, soError ); bError = true; return; } soUnpaddedBigContigToAlign.nCurrentLength_ = strlen( soUnpaddedBigContigToAlign.data() ); soUnpaddedMinContigToAlign.nCurrentLength_ = strlen( soUnpaddedMinContigToAlign.data() ); if ( ( n0BestAlignStartBigC == n0BestAlignEndBigC ) || ( n0BestAlignStartMini == n0BestAlignEndMini ) ) { reportMessage( true, "alignment failed\n" ); bError = true; return; } int nScore; cerr << "computing Needleman-Wunsch alignment..."; cerr.flush(); RWCString soAlignmentBigC; RWCString soAlignmentMini; getAlignment( soUnpaddedBigContigToAlign.data(), soUnpaddedMinContigToAlign.data(), nGapPenalty, nMatchBoost, nMismatchPenalty, soAlignmentBigC, soAlignmentMini, nScore ); assert( soAlignmentBigC.length() == soAlignmentMini.length() ); assert( nScore == nBestScore ); RWCString soAlignmentMidd( (size_t) soAlignmentBigC.length() ); int nMaxMatchesInARow = 0; int nMatchesInARow = 0; int n; for( n = 0; n < soAlignmentBigC.length(); ++n ) { if ( soAlignmentBigC[n] == soAlignmentMini[n] ) { soAlignmentMidd.append( ' ' ); ++nMatchesInARow; } else { if ( nMatchesInARow > nMaxMatchesInARow ) { nMaxMatchesInARow = nMatchesInARow; } nMatchesInARow = 0; soAlignmentMidd.append( 'X' ); } } // for final matching base if ( nMatchesInARow > nMaxMatchesInARow ) { nMaxMatchesInARow = nMatchesInARow; } // let's print out the alignment const int nLineSize = 50; for( n = 0; n < soAlignmentBigC.length(); n += nLineSize ) { int nWrittenToLine = MIN( soAlignmentBigC.length() - n, nLineSize ); fprintf( pAO, "\n%d:\n", n ); fprintf( pAO, "%.*s\n", nWrittenToLine, soAlignmentBigC.data() + n ); fprintf( pAO, "%.*s\n", nWrittenToLine, soAlignmentMidd.data() + n ); fprintf( pAO, "%.*s\n", nWrittenToLine, soAlignmentMini.data() + n ); } fflush( pAO ); if ( nMaxMatchesInARow < nNumberOfMatchingBasesInARowInAMatchSegment ) { RWCString soError = "alignment failed because only " + RWCString( (long) nMaxMatchesInARow ) + " matches in a row but there must be at least " + RWCString( (long) nNumberOfMatchingBasesInARowInAMatchSegment ); reportMessage( true, soError ); bError = true; return; } int nUnpaddedAlignmentStartBigC = nUnpaddedConsPosToAlignLeftBigC + n0BestAlignStartBigC; int nUnpaddedAlignmentEndBigC = nUnpaddedConsPosToAlignLeftBigC + n0BestAlignEndBigC; int nUnpaddedAlignmentStartMini = nUnpaddedConsPosToAlignLeftMini + n0BestAlignStartMini; int nUnpaddedAlignmentEndMini = nUnpaddedConsPosToAlignLeftMini + n0BestAlignEndMini; int nPaddedAlignmentStartBigC = pBigContig_->nPaddedIndex( nUnpaddedAlignmentStartBigC ); int nPaddedAlignmentStartMini = pMiniContig_->nPaddedIndex( nUnpaddedAlignmentStartMini ); pPaddedPieceOfReadBigC_ = new paddedPieceOfRead( "fixContigEnd::pPaddedPieceOfReadBigC_", soAlignmentBigC, 0, nPaddedAlignmentStartBigC, pBigContig_, nUnpaddedAlignmentStartBigC, nUnpaddedAlignmentEndBigC, 0, // where in soAlignmentBigC the alignment starts soAlignmentBigC.length() - 1, // where in soAlignmentBigC the alignment ends false ); // bComplemented pPaddedPieceOfReadMini_ = new paddedPieceOfRead( "fixContigEnd::pPaddedPieceOfReadMini_", soAlignmentMini, 0, nPaddedAlignmentStartMini, pMiniContig_, nUnpaddedAlignmentStartMini, nUnpaddedAlignmentEndMini, 0, // where in soAlignmentMini the alignment starts soAlignmentMini.length() - 1, // where in soAlignmentMini the alignment ends false ); // bComplemented bError = false; } // void fixContigEnd :: alignMiniContigToBigContig( void fixContigEnd :: moveMiniAssemblyIntoMainAssembly() { // remove reassembled reads from Main Assembly for( int nRead = 0; nRead < pMiniContig_->nGetNumberOfReads(); ++nRead ) { LocatedFragment* pLocFragInMini = pMiniContig_->pGetLocatedFragment( nRead ); LocatedFragment* pLocFragInMain = pBigContig_->pLocFragGetByName( pLocFragInMini->soGetName() ); if ( !pLocFragInMain ) { // note: for adding new reads, don't do this check because // the read will not be in the main assembly RWCString soError = pLocFragInMini->soGetName() + " is in the miniassembly but not in the main contig for " + pBigContig_->soGetName() + " " + szWhichGap( nWhichEnd_ ); THROW_ERROR2( soError ); } // this would be the place to move over quality values, if // we wanted to pLocFragInMain->transferQualityValues( pLocFragInMini ); // is this going to be too slow? // also, this read might be used in the base segments, but // they will be fixed when consed is restarted pBigContig_->apLocatedFragment_.remove( pLocFragInMain ); delete pLocFragInMain; // probably unnecessary } // since we have removed reads from apLocatedFragment_, the read // lists used by pGetContigView, apVeryLongReads_ and apLocatedFragment2_ // are now wrong, so fix them. pBigContig_->fixReadLists(); // since reads have been removed, we should decrease the // consensus quality. This is important for when we make a join, // the new consensus should reflect whichever has higher true // quality which partially reflects which has more reads. pBigContig_->recalculateConsensusQualitiesAndChange( nLeftmostConsPosOfProtrudingReads_, pBigContig_->nGetEndConsensusIndex() ); // delete base segments that now probably refer to reads that // no longer exist: pBigContig_->baseSegArray_.removeBaseSegmentsToRight( nLeftmostConsPosOfProtrudingReads_ ); // now rename miniassembly contig and move it to main assembly Assembly* pMainAssembly = ConsEd::pGetAssembly(); int nIndexInMiniassembly = pMiniAssembly_->dapContigs_.index( pMiniContig_ ); pMiniContig_->soSequenceName_ = pMainAssembly->soGetNewContigName(); // notice that I am removing the contig by index // that is because dapContigs_.remove( pContig ) would // remove the first contig in the miniassembly that has the // same name as pContig. In miniassembiles with many // contigs, and since we just renamed it above, it is possible // that there is another contig of the same name and it is // delete instead. This actually happened. (July 2010 DG) pMainAssembly->addContig( pMiniContig_ ); Contig* pContigRemoved = pMiniAssembly_->dapContigs_.removeAt( nIndexInMiniassembly ); // check that we removed the right contig assert( pContigRemoved == pMiniContig_ ); delete pMiniAssembly_; } // void fixContigEnd :: moveMiniAssemblyIntoMainAssembly() { // Overview: when making a join, we must decide what the consensus // should be. This is *not* done by looking at the highest quality // base in each column. Rather it is done by taking one of the other // of the old consensus over a stretch of bases, some of which may be // of lower quality than the base in the other consensus. These // stretches of bases are defined as follows: we look for stretches of // the alignment in which (I think) 5 bases or more in a row match // precisely. These are "match segments". The regions in between are // "discrepant segments". It is a little more complicated because // discrepant segments include some of the matching bases on each end. // Then each such segment is scored by adding the quality values. The // high scoring segment is used for the consensus. const int nBackUpInMatchSegment = 3; // in closing a discrepant contig segment, after found 5 bases in a row, // back up 3 to put the end on the 2nd matching base, ready to put the // start of the next contig segment on the 3rd matching base const int nBackUpFromFirstDiscrepancy = 4; // in closing a match contig segment, when find a discrepancy, backup // 4 bases hence putting the discrepant contig segment starting with // 3 matching bases void fixContigEnd :: createNewContig() { // save some information for when we create a tag (far below) RWCString soCommentForTag = "old contigs:\n" + pBigContig_->soGetName() + " length: " + soAddCommas( pBigContig_->nGetUnpaddedSeqLength() ) + " reads: " + soAddCommas( pBigContig_->nGetNumberOfReads() ) + "\n" + pMiniContig_->soGetName() + " length: " + soAddCommas( pMiniContig_->nGetUnpaddedSeqLength() ) + " reads: " + soAddCommas( pMiniContig_->nGetNumberOfReads() ) + "\n" + "ace file: " + ConsEd::pGetAssembly()->filGetAceFileFullPathname(); cout << "finding contig segments" << endl; cout.flush(); // find contigSegments for each contig // terminate when reach end of alignment int nPaddedStart = pPaddedPieceOfReadBigC_->nPaddedAlignmentStart_; int nUnpaddedStart1 = 1; int nUnpaddedStart2 = 1; bool bFirstTime = true; bool bEndOfAlignment = false; while( !bEndOfAlignment ) { int nPaddedEnd; int nUnpaddedEnd1; int nUnpaddedEnd2; findEndOfDiscrepantSegment( nPaddedStart, nPaddedEnd, nUnpaddedEnd1, nUnpaddedEnd2, bEndOfAlignment ); if ( bEndOfAlignment && bFirstTime ) { RWCString soError = "The alignment is not sufficiently good to make a join " + pBigContig_->soGetName() + " " + szWhichGap( nWhichEnd_ ); THROW_ERROR2( soError ); } bFirstTime = false; addContigSegment( nUnpaddedStart1, nUnpaddedEnd1, nUnpaddedStart2, nUnpaddedEnd2 ); if ( !bEndOfAlignment ) { nUnpaddedStart1 = nUnpaddedEnd1 + 1; nUnpaddedStart2 = nUnpaddedEnd2 + 1; nPaddedStart = nPaddedEnd + 1; bool bMatchSegmentExists; findEndOfMatchSegment( nPaddedStart, nPaddedEnd, nUnpaddedEnd1, nUnpaddedEnd2, bMatchSegmentExists ); if ( bMatchSegmentExists ) { addContigSegment( nUnpaddedStart1, nUnpaddedEnd1, nUnpaddedStart2, nUnpaddedEnd2 ); } nUnpaddedStart1 = nUnpaddedEnd1 + 1; nUnpaddedStart2 = nUnpaddedEnd2 + 1; nPaddedStart = nPaddedEnd + 1; } } // while( !bEndOfAlignment ) cout << "calculating total qualities for join of " << pBigContig_->soGetName() << " and " << pMiniContig_->soGetName() << " working on " << szWhichGap( nWhichEnd_ ) << endl; cout.flush(); // calculate total qualities for each contig segment int nContigSeg; for( nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { calculateTotalQualitiesOfContigSegment( contigSegments_[ nContigSeg ] ); } cout << "done" << endl; cout.flush(); // which contig is the left contig? (This is necessary in order to // determine how to modify the read positions in the new contig.) // This is *not* determined by which contig sticks out furthest to // the left (although usually that will turn out to be the case. // Instead, it is determined by which contig has the highest sum of // quality values up to the first run of matches within the // alignment. I believe that "run" is defined as 5 matches in a // row, and count up to the 2nd match. // Similary, the rightmost contig is defined as the contig that has the // highest sum of quality values in the last contig segment. That // means the sum of quality values from the last run of 5 matches // within the alignment to the end of the contig. compareContigsTypes eLeftContig; compareContigsTypes eNotLeftContig; Contig* pLeftContig; Contig* pNotLeftContig; Contig* pBestContig; int nBestPaddedStart; int nBestPaddedEnd; getBestContigSegment( 0, pBestContig, nBestPaddedStart, nBestPaddedEnd ); if ( pBestContig == pBigContig_ ) { eLeftContig = eContig1; eNotLeftContig = eContig2; pLeftContig = pBigContig_; pNotLeftContig = pMiniContig_; } else { eLeftContig = eContig2; eNotLeftContig = eContig1; pLeftContig = pMiniContig_; pNotLeftContig = pBigContig_; } // if ( pGuiCompareContigs_->bHighlightRightReads() ) { // pLeftContig->unhighlightAllReads(); // pNotLeftContig->highlightAllReads(); // } // Which old contig is used at the end of the new contig? This is // important for trimming back the aligned region of reads--some will get // trimmed back more than others. compareContigsTypes eRightContig; compareContigsTypes eNotRightContig; Contig* pRightContig; Contig* pNotRightContig; getBestContigSegment( contigSegments_.length() - 1, pBestContig, nBestPaddedStart, nBestPaddedEnd ); if ( pBestContig == pBigContig_ ) { eRightContig = eContig1; eNotRightContig = eContig2; pRightContig = pBigContig_; pNotRightContig = pMiniContig_; } else { eRightContig = eContig2; eNotRightContig = eContig1; pRightContig = pMiniContig_; pNotRightContig = pBigContig_; } // This is the padded offset of the not left contig from the // left contig. // Presumably, the left-most aligned bases are both not pads and // they are aligned. Thus the difference of their padded positions // in their respective coordinate systems gives the offset between // the coordinate systems (padded cons pos) int nNewConsPosFromNotLeftConsPos = pGetPaddedPieceOfRead( eLeftContig )->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eLeftContig )->nUnpaddedAlignmentStart_ ) - pGetPaddedPieceOfRead( eNotLeftContig)->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eNotLeftContig)->nUnpaddedAlignmentStart_ ); makeContigSegmentsContiguous( pLeftContig, nNewConsPosFromNotLeftConsPos ); assert( bContigSegmentsOK( ) ); nInNewJoinedContigAlignedRegionLeftConsPos_ = pGetPaddedPieceOfRead( eLeftContig )->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eLeftContig )->nUnpaddedAlignmentStart_ ); nInNewJoinedContigAlignedRegionRightConsPos_ = pGetPaddedPieceOfRead( eLeftContig )->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eLeftContig )->nUnpaddedAlignmentEnd_ ); // make a new contig. How long should it be? int nPaddedBasesInNewContig = 0; for( nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { int nNewPaddedStart; int nNewPaddedEnd; getBestContigSegment( nContigSeg, pBestContig, nNewPaddedStart, nNewPaddedEnd ); int nPaddedBasesInContigSegment = nNewPaddedEnd - nNewPaddedStart + 1; nPaddedBasesInNewContig += nPaddedBasesInContigSegment; } RWCString soNewContigName = ConsEd::pGetConsEd()->pGetAssembly()->soGetNewContigName(); fprintf( pAO, "joining big contig %s to reassembled contig %s to form %s\n", pBigContig_->soGetName().data(), pMiniContig_->soGetName().data(), soNewContigName.data() ); int nNewNumberOfReads = pBigContig_->nGetNumberOfFragsInContig() + pMiniContig_->nGetNumberOfFragsInContig(); // I'm not going to work on the base segments int nNewNumberOfBaseSegments = 0; // try to not screw up most of the tags that depend on direction // fix me!!! bool bComplementedFromWayPhrapCreated; if (pBigContig_->bIsComplementedFromWayPhrapCreated() ) { if ( pMiniContig_->bIsComplementedFromWayPhrapCreated() ) bComplementedFromWayPhrapCreated = true; else { // ambiguous case--one contig is complemented and // the other is now. if ( pBigContig_->nGetNumberOfFragsInContig() > pMiniContig_->nGetNumberOfFragsInContig() ) bComplementedFromWayPhrapCreated = true; else bComplementedFromWayPhrapCreated = false; } } else { if ( pMiniContig_->bIsComplementedFromWayPhrapCreated() ) { // ambiguous case again if ( pBigContig_->nGetNumberOfFragsInContig() > pMiniContig_->nGetNumberOfFragsInContig() ) bComplementedFromWayPhrapCreated = false; else bComplementedFromWayPhrapCreated = true; } else bComplementedFromWayPhrapCreated = false; } cout << "ctor for new contig" << endl; cout.flush(); // now create space for new contig pNewContig_ = new Contig( (char*) soNewContigName.data(), nNewNumberOfReads, nNewNumberOfBaseSegments, bComplementedFromWayPhrapCreated, ConsEd::pGetAssembly() ); pNewContig_->resize( nPaddedBasesInNewContig ); ConsEd::pGetConsEd()->pGetAssembly()->addContig( pNewContig_ ); cout << " adding new bases " << endl; cout.flush(); // add bases to new contig for( nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { int nNewPaddedStart; int nNewPaddedEnd; getBestContigSegment( nContigSeg, pBestContig, nNewPaddedStart, nNewPaddedEnd ); // convert back to the padded positions of the old contigs int nPaddedStart; int nPaddedEnd; if ( pBestContig == pLeftContig ) { nPaddedStart = nNewPaddedStart; nPaddedEnd = nNewPaddedEnd; } else { nPaddedStart = nNewPaddedStart - nNewConsPosFromNotLeftConsPos; nPaddedEnd = nNewPaddedEnd - nNewConsPosFromNotLeftConsPos; } // where consensus bases are made for( int nConsPos = nPaddedStart; nConsPos <= nPaddedEnd; ++nConsPos ) { pNewContig_->append( pBestContig->ntGetCons( nConsPos ) ); } } cout << "transferring reads to new contig" << endl; cout.flush(); // transfer reads to new contig // At the same time, trim back the aligned region of each read, if // necessary. int nLeftContigAlignedRegionLeft = 1; int nLeftContigAlignedRegionRight; if ( pLeftContig == pRightContig ) nLeftContigAlignedRegionRight = pNewContig_->nGetEndConsensusIndex(); else nLeftContigAlignedRegionRight = nInNewJoinedContigAlignedRegionRightConsPos_; int nRead; for( nRead = 0; nRead < pLeftContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLeftContig->pLocatedFragmentGet( nRead ); pLocFrag->pContig_ = pNewContig_; pLocFrag->transferTagsToNewContig( pNewContig_, 0 ); pNewContig_->addLocatedFragment( pLocFrag ); // trim back the aligned clipped region, if necessary pLocFrag->trimBackAlignedRegion( nLeftContigAlignedRegionLeft, nLeftContigAlignedRegionRight ); } // move the reads in the "not left" contig. // At the same time, trim back the aligned region of each read. int nNotLeftContigAlignedRegionLeft = nInNewJoinedContigAlignedRegionLeftConsPos_; int nNotLeftContigAlignedRegionRight; if ( pNotLeftContig == pRightContig ) nNotLeftContigAlignedRegionRight = pNewContig_->nGetEndConsensusIndex(); else nNotLeftContigAlignedRegionRight = nInNewJoinedContigAlignedRegionRightConsPos_; for( nRead = 0; nRead < pNotLeftContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pNotLeftContig->pLocatedFragmentGet( nRead ); pLocFrag->moveReadAndReadTagsToNewContig( pNewContig_, nNewConsPosFromNotLeftConsPos ); // moveReadAndReadTagsToNewContig will change the nAlignClipStart_ // and nAlignClipEnd_ to the new coordinates. Now I need to // trim them back. pLocFrag->trimBackAlignedRegion( nNotLeftContigAlignedRegionLeft, nNotLeftContigAlignedRegionRight ); } pNewContig_->updateLastDisplayableContigPos(); // updateLastDisplayableContigPos must come before // setPaddedPositionsArray since the latter uses // ntGetCons which does a check that the nConsPos // is within the displayable contig positions pNewContig_->setUnpaddedPositionsArray(); pNewContig_->setPaddedPositionsArray(); cout << "moving consensus tags..." << endl; // transfer consensus tags // the left contig positions are the same as the new contig positions // Let's see if we need to change the // bComplementedWithRespectToWayPhrapCreatedContig_ flag in the // oligo and autoFinishExpTag objects: bool bComplementFlag = ( pLeftContig->bComplementedFromWayPhrapCreated_ == pNewContig_->bComplementedFromWayPhrapCreated_ ? false : true ); int nTag; for( nTag = 0; nTag < pLeftContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLeftContig->pGetTag( nTag ); pTag->pContig_ = pNewContig_; if ( bComplementFlag ) pTag->flipOrientation(); pNewContig_->addConsensusTag( pTag ); } // Let's see if we need to change the // bComplementedWithRespectToWayPhrapCreatedContig_ flag in the // oligo and autoFinishExpTag objects: bComplementFlag = ( pNotLeftContig->bComplementedFromWayPhrapCreated_ == pNewContig_->bComplementedFromWayPhrapCreated_ ? false : true ); // the not left contig positions must be adjusted for( nTag = 0; nTag < pNotLeftContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pNotLeftContig->pGetTag( nTag ); pTag->pContig_ = pNewContig_; pTag->nPaddedConsPosStart_ += nNewConsPosFromNotLeftConsPos; pTag->nPaddedConsPosEnd_ += nNewConsPosFromNotLeftConsPos; if ( bComplementFlag ) pTag->flipOrientation(); pNewContig_->addConsensusTag( pTag ); } // In some cases, the consensus tag may not be on the consensus // any longer. In such a case trim it back or delete the tag // altogether. Report this to the user. RWCString soConsensusTagErrors; for( nTag = pNewContig_->nGetNumberOfTags() - 1; nTag >= 0; --nTag ) { tag* pTag = pNewContig_->pGetTag( nTag ); int nIntersectLeftConsPos; int nIntersectRightConsPos; if (bIntersect( pNewContig_->nGetStartIndex(), pNewContig_->nGetEndIndex(), pTag->nPaddedConsPosStart_, pTag->nPaddedConsPosEnd_, nIntersectLeftConsPos, nIntersectRightConsPos ) ) { if ( nIntersectLeftConsPos != pTag->nPaddedConsPosStart_ || nIntersectRightConsPos != pTag->nPaddedConsPosEnd_ ) { soConsensusTagErrors += " consensus tag of type "; soConsensusTagErrors += pTag->soType_; soConsensusTagErrors += " had to be trimmed back from padded pos"; soConsensusTagErrors += RWCString( (long) pTag->nPaddedConsPosStart_ ); soConsensusTagErrors += "-"; soConsensusTagErrors += RWCString( (long) pTag->nPaddedConsPosEnd_ ); soConsensusTagErrors += " to "; soConsensusTagErrors += RWCString( (long) nIntersectLeftConsPos ); soConsensusTagErrors += "-"; soConsensusTagErrors += RWCString( (long) nIntersectRightConsPos ); soConsensusTagErrors += " in order to fit on the consensus\n\n"; } pTag->nPaddedConsPosStart_ = nIntersectLeftConsPos; pTag->nPaddedConsPosEnd_ = nIntersectRightConsPos; } else { // tag doesn't fit on consensus at all soConsensusTagErrors += "tag of type "; soConsensusTagErrors += pTag->soType_; soConsensusTagErrors += " data: "; soConsensusTagErrors += pTag->soMiscData_; soConsensusTagErrors += " mapped to padded cons pos "; soConsensusTagErrors += RWCString( (long) pTag->nPaddedConsPosStart_ ); soConsensusTagErrors += " to "; soConsensusTagErrors += RWCString( (long) pTag->nPaddedConsPosEnd_ ); soConsensusTagErrors += " which is completely off the new consensus so deleting this consensus tag.\n\n"; pNewContig_->aConsensusTags_.removeAt( nTag ); delete pTag; } } // Now calculate new quality values for the consensus. I will only // recalculate it over the region that is aligned (as indicated in // the CompareContigs window). For the regions to the left and // right of these, it is fine to just use the existing quality // values from the consensus of the leftmost and rightmost contigs // (the ones that had the highest total qualities over those // regions). After all, the reads of the opposite contigs are not // even aligned against the consensus in these regions, so why care // about them? (Are the alignment clipping of reads changed?) int nRegionToRecalculateConsPosLeft; int nRegionToRecalculateConsPosRight; int nTemp; Contig* pTempContig; getBestContigSegment( 0, pTempContig, nTemp, nRegionToRecalculateConsPosLeft ); getBestContigSegment( contigSegments_.length() - 1, pTempContig, nRegionToRecalculateConsPosRight, nTemp ); assert( nRegionToRecalculateConsPosLeft <= nRegionToRecalculateConsPosRight ); // trying not recalculating cons qualities...either use consensus quality // from the new assembly or from the old big contig. Both of these // are more accurate--the former because phrap knew the read qualities // and the latter because, after removing reads, recalculateConsensusQualities // was run on it and the reads there had correct qualities. // pNewContig_->recalculateConsensusQualitiesAndChange( // nRegionToRecalculateConsPosLeft, // nRegionToRecalculateConsPosRight ); // figure out some location in the middle of the alignment // to show user when done paddedPieceOfRead* pPaddedPieceOfReadLeft = pGetPaddedPieceOfRead( eLeftContig ); int nUnpaddedMiddleOfAlignment = ( pPaddedPieceOfReadLeft->nUnpaddedAlignmentStart_ + pPaddedPieceOfReadLeft->nUnpaddedAlignmentEnd_ ) / 2; int nMiddleOfAlignmentConsPos = pPaddedPieceOfReadLeft->pContig_->nPaddedIndexFast( nUnpaddedMiddleOfAlignment ); // add a join tag at this location soCommentForTag = soCommentForTag + "\nnew contig " + pNewContig_->soGetName() + " " + " length: " + soAddCommas( pNewContig_->nGetUnpaddedSeqLength() ) + " reads: " + soAddCommas( pNewContig_->nGetNumberOfReads() ); tag* pTag = new tag( NULL, pNewContig_, "join", nMiddleOfAlignmentConsPos, nMiddleOfAlignmentConsPos, false, // bWriteToPhdFileNotAceFile soCommentForTag, // soComment, soConsed, // consed soEmptyString, // date will be current date false ); // bNoTrans pNewContig_->addConsensusTag( pTag ); // I do not think that we want to either make adding this tag // undoable or write it to the edit history file since it is // an indivisible part of a join. Thus we add it to the contig // above. // EditAction* pEditAction = new editAddConsensusTag( pTag ); //ConsEd::pGetConsEd()->doEditAction( pEditAction, // true ); // bWriteToEditHistoryFile cout << "deleting old contigs..." << endl; RWCString soOldContigName = pBigContig_->soGetName(); delete pBigContig_; delete pMiniContig_; pBigContig_ = NULL; pMiniContig_ = NULL; pNewContig_->soSequenceName_ = soOldContigName; } // void fixContigEnd :: createNewContig() { void fixContigEnd :: findEndOfMatchSegment( const int nStartPadded, int& nEndPadded, int& nEndUnpadded1, int& nEndUnpadded2, bool& bMatchSegmentExists ) { bool bEndOfAlignment = false; int nPadded = nStartPadded; // two ways to terminate: 1) end of alignment // 2) find discrepancy // xxmmmmmxmmxmmmmmxmmmmmmxmm (end ) // ---||-------||----|||----- // In either case, back up 4 bases. This may be before nStartPadded, // in which case there is no matching segment. If this is at nStartPadded // or after, this finishes a matching segment (which may be just one // length). bool bContinue = true; while( bContinue ) { if ( nPadded > pPaddedPieceOfReadBigC_->nPaddedAlignmentEnd_ ) { bContinue = false; bEndOfAlignment = true; } else { if ( pPaddedPieceOfReadBigC_->soBasesAndPads_[ nPadded ] != pPaddedPieceOfReadMini_->soBasesAndPads_[ nPadded ] ) { // found discrepancy bContinue = false; } else ++nPadded; } } // only ways to leave loop are that are now past the end of the // alignment or have found a discrepancy. Being off the end // of the alignment is considered a discrepancy, so they are // handled identically. if ( nStartPadded <= (nPadded - nBackUpFromFirstDiscrepancy ) ) { nEndPadded = nPadded - nBackUpFromFirstDiscrepancy; bMatchSegmentExists = true; } else { // set up for next findEndOfDiscrepantSegment nEndPadded = nStartPadded - 1; bMatchSegmentExists = false; } nEndUnpadded1 = pPaddedPieceOfReadBigC_->nUnpaddedContigPosFromZeroBasedPaddedPos( nEndPadded ); nEndUnpadded2 = pPaddedPieceOfReadMini_->nUnpaddedContigPosFromZeroBasedPaddedPos( nEndPadded ); } void fixContigEnd :: findEndOfDiscrepantSegment( const int nPaddedStart, // zero-based paddedPieceOfRead coordinates int& nPaddedEnd, // zero-based paddedPieceOfRead coordinates int& nUnpaddedEnd1, // in unpadded pBigContig_ coordinates int& nUnpaddedEnd2, // in unpadded pMiniContig_ coordinates bool& bEndOfAlignment ) { int nPadded = nPaddedStart; // ways to terminate: // 1) end of alignment // 2) matching segment 5 chars long int nMatchingBasesInARow = 0; bEndOfAlignment = false; bool bContinue = true; while( bContinue ) { if ( nPadded > pPaddedPieceOfReadBigC_->nPaddedAlignmentEnd_ ) { bContinue = false; bEndOfAlignment = true; } else { if ( pPaddedPieceOfReadBigC_->soBasesAndPads_[ nPadded ] == pPaddedPieceOfReadMini_->soBasesAndPads_[ nPadded ] ) { ++nMatchingBasesInARow; if ( nMatchingBasesInARow >= nNumberOfMatchingBasesInARowInAMatchSegment ) { bContinue = false; } } else { nMatchingBasesInARow = 0; } } if (bContinue ) ++nPadded; } if (bEndOfAlignment ) { nPaddedEnd = 0; // not used nUnpaddedEnd1 = pBigContig_->nGetUnpaddedEndIndex(); nUnpaddedEnd2 = pMiniContig_->nGetUnpaddedEndIndex(); } else { nPaddedEnd = nPadded - nBackUpInMatchSegment; nUnpaddedEnd1 = pGetPaddedPieceOfRead( eContig1 )->nUnpaddedContigPosFromZeroBasedPaddedPos( nPaddedEnd ); nUnpaddedEnd2 = pGetPaddedPieceOfRead( eContig2 )->nUnpaddedContigPosFromZeroBasedPaddedPos( nPaddedEnd ); } } void fixContigEnd :: addContigSegment( const int nUnpaddedStart1, const int nUnpaddedEnd1, const int nUnpaddedStart2, const int nUnpaddedEnd2 ) { contigSegment cs; cs.nUnpaddedStart1_ = nUnpaddedStart1; cs.nUnpaddedEnd1_ = nUnpaddedEnd1; cs.nUnpaddedStart2_ = nUnpaddedStart2; cs.nUnpaddedEnd2_ = nUnpaddedEnd2; contigSegments_.append( cs ); } void fixContigEnd :: getBestContigSegment( const int nContigSeg, Contig*& pBestContig, int& nBestPaddedStart, int& nBestPaddedEnd ) { contigSegment cs = contigSegments_[ nContigSeg ]; if ( cs.bBestContigIsContig1NotContig2_ ) { pBestContig = pBigContig_; } else { pBestContig = pMiniContig_; } nBestPaddedStart = cs.nNewPaddedStart_; nBestPaddedEnd = cs.nNewPaddedEnd_; } void fixContigEnd :: calculateTotalQualitiesOfContigSegment( contigSegment& cs ) { int nTotalQuality1 = 0; int nUnpadded; for( nUnpadded = cs.nUnpaddedStart1_; nUnpadded <= cs.nUnpaddedEnd1_; ++nUnpadded ) { int nConsPos = pBigContig_->nPaddedIndexFast( nUnpadded ); if ( pBigContig_->ntGetCons( nConsPos ).cGetBase() != 'x' ) { int nQuality = pBigContig_->ntGetCons( nConsPos ).qualGetQualityWithout98or99(); // this differs from the calculateTotalQualitiesOfContigSegment( // in compareContigs2 in that we don't want long stretches // of quality 0 consensus where there are no bases // if ( nQuality == 0 ) nQuality = 1; nTotalQuality1 += nQuality; } } int nTotalQuality2 = 0; for( nUnpadded = cs.nUnpaddedStart2_; nUnpadded <= cs.nUnpaddedEnd2_; ++nUnpadded ) { int nConsPos = pMiniContig_->nPaddedIndexFast( nUnpadded ); if ( pMiniContig_->ntGetCons( nConsPos ).cGetBase() != 'x' ) { int nQuality = pMiniContig_->ntGetCons( nConsPos ).qualGetQualityWithout98or99(); // if ( nQuality == 0 ) nQuality = 1; nTotalQuality2 += nQuality; } } if ( nTotalQuality1 > nTotalQuality2 ) cs.bBestContigIsContig1NotContig2_ = true; else cs.bBestContigIsContig1NotContig2_ = false; } bool fixContigEnd :: bContigSegmentsOK( ) { // This is the padded offset of the not left contig from the // left contig. // Presumably, the left-most aligned bases are both not pads and // they are aligned. Thus the difference of their padded positions // in their respective coordinate systems gives the offset between // the coordinate systems (padded cons pos) int nConsPos1FromConsPos2 = pGetPaddedPieceOfRead( eContig1 )->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eContig1 )->nUnpaddedAlignmentStart_ ) - pGetPaddedPieceOfRead( eContig2 )->pContig_->nPaddedIndexFast( pGetPaddedPieceOfRead( eContig2 )->nUnpaddedAlignmentStart_ ); // check that starting locations of each contig correspond // check that ending locations of each contig correspond int nOldUnpaddedEnd1; int nOldUnpaddedEnd2; contigSegment csOld; bool bOK = true; for( int nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { contigSegment cs = contigSegments_[ nContigSeg ]; int nConsPosStart1 = pBigContig_->nPaddedIndexFast( cs.nUnpaddedStart1_ ); int nConsPosStart2 = pMiniContig_->nPaddedIndexFast( cs.nUnpaddedStart2_ ); if ( ( nConsPosStart1 - nConsPosStart2 ) != nConsPos1FromConsPos2 ) { if ( nContigSeg != 0 ) { cerr << "boink 1 nContigSeg = " << nContigSeg << " cs.nUnpaddedStart1_ = " << cs.nUnpaddedStart1_ << " cs.nUnpaddedStart2_ = " << cs.nUnpaddedStart2_ << " nConsPosStart1 = " << nConsPosStart1 << " nConsPosStart2 = " << nConsPosStart2 << " nConsPos1FromConsPos2 = " << nConsPos1FromConsPos2 << endl; bOK = false; } } int nConsPosEnd1 = pBigContig_->nPaddedIndexFast( cs.nUnpaddedEnd1_ ); int nConsPosEnd2 = pMiniContig_->nPaddedIndexFast( cs.nUnpaddedEnd2_ ); if ( ( nConsPosEnd1 - nConsPosEnd2 ) != nConsPos1FromConsPos2 ) { if ( nContigSeg != ( contigSegments_.length() - 1 ) ) { cerr << "boink 2 nContigSeg = " << nContigSeg << " cs.nUnpaddedEnd1_ = " << cs.nUnpaddedEnd1_ << " cs.nUnpaddedEnd2_ = " << cs.nUnpaddedEnd2_ << " nConsPosEnd1 = " << nConsPosEnd1 << " nConsPosEnd2 = " << nConsPosEnd2 << " nConsPos1FromConsPos2 = " << nConsPos1FromConsPos2 << endl; bOK = false; } } if ( nContigSeg != 0 ) { if ( ! ( ( ( nOldUnpaddedEnd1 + 1 ) == cs.nUnpaddedStart1_ ) && ( ( nOldUnpaddedEnd2 + 1 ) == cs.nUnpaddedStart2_ ) && ( ( csOld.nNewPaddedEnd_ + 1 ) == cs.nNewPaddedStart_ ) ) ) { cerr << "boink not contiguous nContigSeg = " << nContigSeg << " cs.nUnpaddedEnd1_ = " << cs.nUnpaddedEnd1_ << " cs.nUnpaddedEnd2_ = " << cs.nUnpaddedEnd2_ << " nConsPosEnd1 = " << nConsPosEnd1 << " nConsPosEnd2 = " << nConsPosEnd2 << " nConsPos1FromConsPos2 = " << nConsPos1FromConsPos2 << " nOldUnpaddedEnd1 = " << nOldUnpaddedEnd1 << " nOldUnpaddedEnd2 = " << nOldUnpaddedEnd2 << " cs.nUnpaddedStart1_ = " << cs.nUnpaddedStart1_ << " cs.nUnpaddedStart2_ = " << cs.nUnpaddedStart2_ << " csOld.nUnpaddedStart1_ = " << csOld.nUnpaddedStart1_ << " csOld.nUnpaddedStart2_ = " << csOld.nUnpaddedStart2_ << " csOld.nUnpaddedEnd1_ = " << csOld.nUnpaddedEnd1_ << " csOld.nUnpaddedEnd2_ = " << csOld.nUnpaddedEnd2_ << endl; bOK = false; } } // if ( nContigSeg != 0 ) ... csOld = cs; nOldUnpaddedEnd1 = cs.nUnpaddedEnd1_; nOldUnpaddedEnd2 = cs.nUnpaddedEnd2_; } return( bOK ); } // consensus // A*B contig source, CD contig dest // alignment // AB contig source // CD contig dest // nSourceUnpadded is the position of A // nDestUnpadded is the position of C // nDestPadded is the position of C // Hence must insert just before nDestPadded + 1 // final alignment should be: // A*B // C*D // If consensus is A*BC and DE // and alignment is ABC // D*E // then final alignment should be: // A*BC // D**E // The first column of pads cannot be removed because there is // some read in the A*BC contig that has a base in that column and // the pad is necessary in order to make room for that base. The 2 // pads in D**E are necessary so that A aligns with D and C aligns with // E // If the consensus is A*B*C and DE // and the alignment is ABC // D*E // then the final alignment should be: // A*B*C // D***E // If the consensus is A*B and CDE // and the alignment is A*B // CDE // then the final alignment is (?) // A*B // CDE // or alternatively: // A**B // C*DE // where the first column contains the base from the read in the AB // contig that has an extra base and the 3rd column has all pads in // the AB contig and the high quality bases in the CDE contig. void fixContigEnd :: addColumnsOfPadsToAlignContigs() { assert( pBigContig_->baseSegArray_.bGetDataStructureOk() ); assert( pMiniContig_->baseSegArray_.bGetDataStructureOk() ); int nAlignmentIndexOfRightNonPadAlignment; bool bFirstTime = true; for( int nAlignmentIndex = pPaddedPieceOfReadBigC_->nPaddedAlignmentEnd_; nAlignmentIndex >= pPaddedPieceOfReadBigC_->nPaddedAlignmentStart_; --nAlignmentIndex ) { if ( ( pPaddedPieceOfReadBigC_->soBasesAndPads_[ nAlignmentIndex ] != '*' ) && ( pPaddedPieceOfReadMini_->soBasesAndPads_[ nAlignmentIndex ] != '*' ) ) { // found an alignment of two non-pads if (bFirstTime ) { bFirstTime = false; nAlignmentIndexOfRightNonPadAlignment = nAlignmentIndex; } else { int nAlignmentIndexOfLeftNonPadAlignment = nAlignmentIndex; int nUnpaddedLeft1 = pPaddedPieceOfReadBigC_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfLeftNonPadAlignment ); int nPaddedLeft1 = pBigContig_->nPaddedIndexFast( nUnpaddedLeft1 ); int nUnpaddedRight1 = pPaddedPieceOfReadBigC_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfRightNonPadAlignment ); int nPaddedRight1 = pBigContig_->nPaddedIndexFast( nUnpaddedRight1 ); int nUnpaddedLeft2 = pPaddedPieceOfReadMini_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfLeftNonPadAlignment ); int nPaddedLeft2 = pMiniContig_->nPaddedIndexFast( nUnpaddedLeft2 ); int nUnpaddedRight2 = pPaddedPieceOfReadMini_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfRightNonPadAlignment ); int nPaddedRight2 = pMiniContig_->nPaddedIndexFast( nUnpaddedRight2 ); // check if the 2 contigs already have the same number // of bases and pads between the two aligned non-pads int nContig1MoreBasesThanContig2 = ( nPaddedRight1 - nPaddedLeft1 ) - ( nPaddedRight2 - nPaddedLeft2 ); if ( nContig1MoreBasesThanContig2 > 0 ) { // better add some pads to contig2 to even them up for( int n = 1; n <= nContig1MoreBasesThanContig2; ++n ) { pMiniContig_->insertColOfPads( nPaddedLeft2 + 1 ); cout << "."; cout.flush(); } } else if ( nContig1MoreBasesThanContig2 < 0 ) { // better add some pads to contig1 to even them up for( int n = 1; n <= -nContig1MoreBasesThanContig2; ++n ) { pBigContig_->insertColOfPads( nPaddedLeft1 + 1 ); cout << "."; cout.flush(); } } else { // they are even--great. Nothing to do. } // set up for next time nAlignmentIndexOfRightNonPadAlignment = nAlignmentIndexOfLeftNonPadAlignment; } } // if ( ( pPaddedPieceOfReadBigC_->soBases... } // for( int nAlignmentIndex ... // this is important because, in the case of the big contig at least, // the reads that extend to the end have been removed. But insertColOfPads // calls updateLastDisplayableContigPos which probably uses the reads // to determine where the last base should be. pBigContig_->nLastDisplayableContigPos_ = MAX( pBigContig_->nGetEndConsensusIndex(), pBigContig_->nLastDisplayableContigPos_ ); pMiniContig_->nLastDisplayableContigPos_ = MAX( pMiniContig_->nGetEndConsensusIndex(), pMiniContig_->nLastDisplayableContigPos_ ); cout << "done" << endl; } bool fixContigEnd :: bCheckAddColumnsOfPadsToAlignContigs() { bool bOK = true; int nAlignmentIndexOfRightNonPadAlignment; bool bFirstTime = true; for( int nAlignmentIndex = pPaddedPieceOfReadBigC_->nPaddedAlignmentEnd_; nAlignmentIndex >= pPaddedPieceOfReadBigC_->nPaddedAlignmentStart_; --nAlignmentIndex ) { if ( ( pPaddedPieceOfReadBigC_->soBasesAndPads_[ nAlignmentIndex ] != '*' ) && ( pPaddedPieceOfReadMini_->soBasesAndPads_[ nAlignmentIndex ] != '*' ) ) { // found an alignment of two non-pads if (bFirstTime ) { bFirstTime = false; nAlignmentIndexOfRightNonPadAlignment = nAlignmentIndex; } else { int nAlignmentIndexOfLeftNonPadAlignment = nAlignmentIndex; int nUnpaddedLeft1 = pPaddedPieceOfReadBigC_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfLeftNonPadAlignment ); int nPaddedLeft1 = pBigContig_->nPaddedIndexFast( nUnpaddedLeft1 ); int nUnpaddedRight1 = pPaddedPieceOfReadBigC_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfRightNonPadAlignment ); int nPaddedRight1 = pBigContig_->nPaddedIndexFast( nUnpaddedRight1 ); int nUnpaddedLeft2 = pPaddedPieceOfReadMini_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfLeftNonPadAlignment ); int nPaddedLeft2 = pMiniContig_->nPaddedIndexFast( nUnpaddedLeft2 ); int nUnpaddedRight2 = pPaddedPieceOfReadMini_->nUnpaddedContigPosFromZeroBasedPaddedPos( nAlignmentIndexOfRightNonPadAlignment ); int nPaddedRight2 = pMiniContig_->nPaddedIndexFast( nUnpaddedRight2 ); // check if the 2 contigs already have the same number // of bases and pads between the two aligned non-pads int nContig1MoreBasesThanContig2 = ( nPaddedRight1 - nPaddedLeft1 ) - ( nPaddedRight2 - nPaddedLeft2 ); if ( nContig1MoreBasesThanContig2 != 0 ) { bOK = false; cout << "boink 3 " << " unpadded range " << pBigContig_->soGetName() << " " << nUnpaddedLeft1 << " " << nUnpaddedRight1 << " " << pMiniContig_->soGetName() << " " << nUnpaddedLeft2 << " " << nUnpaddedRight2; cout << "padded range: " << nPaddedLeft1 << " " << nPaddedRight1 << " " << nPaddedLeft2 << " " << nPaddedRight2 << endl; } // set up for next time nAlignmentIndexOfRightNonPadAlignment = nAlignmentIndexOfLeftNonPadAlignment; } } // if ( ( pPaddedPieceOfReadBigC_->soBases... } // for( int nAlignmentIndex ... return( bOK ); } void fixContigEnd :: dumpContigSegments() { for( int nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { printf( "----------------------------------\n" ); printf( "contig seg %5d %10d %10d %s\n", nContigSeg, contigSegments_[ nContigSeg ].nUnpaddedStart1_, contigSegments_[ nContigSeg ].nUnpaddedEnd1_, szPrintBool( contigSegments_[ nContigSeg ].bBestContigIsContig1NotContig2_ ) ); printf( " %10d %10d %s\n", contigSegments_[ nContigSeg ].nUnpaddedStart2_, contigSegments_[ nContigSeg ].nUnpaddedEnd2_, szPrintBool( contigSegments_[ nContigSeg ].bBestContigIsContig1NotContig2_ ) ); printf( "new padded: %d to %d\n", contigSegments_[ nContigSeg].nNewPaddedStart_, contigSegments_[ nContigSeg].nNewPaddedEnd_ ); } } void fixContigEnd :: makeContigSegmentsContiguous( Contig* pLeftContig, const int nNewConsPosFromNotLeftConsPos ) { // first calculate the padded positions of the contig segments // The starting position of the first contig segment is a special case // since the starting bases are not usually aligned. Thus we need // to know which contig to use for the first one. We've already // determined that--it is pLeftContig. Thus we use its starting // location for calculating the starting padded position of the // first contig segment. No, the first and last are no longer // special cases. Now we just use the starting and ending of // whichever is the best contig for a particular contig segment. int nContigSeg; for( nContigSeg = 0; nContigSeg < contigSegments_.length(); ++nContigSeg ) { int nPaddedStart; int nPaddedEnd; bool bBestContigLeftContig; if ( contigSegments_[ nContigSeg ].bBestContigIsContig1NotContig2_ ) { // this is a pBigContig_ contig segment, so calculate // the padded start and padded end relative to pBigContig_ nPaddedStart = pBigContig_->nPaddedIndexFast( contigSegments_[ nContigSeg ].nUnpaddedStart1_ ); nPaddedEnd = pBigContig_->nPaddedIndexFast( contigSegments_[ nContigSeg ].nUnpaddedEnd1_ ); bBestContigLeftContig = ( pLeftContig == pBigContig_ ); } else { // this is a pMiniContig_ contig segment, so calculate // the padded start and padded end relative to pMiniContig_ nPaddedStart = pMiniContig_->nPaddedIndexFast( contigSegments_[ nContigSeg ].nUnpaddedStart2_ ); nPaddedEnd = pMiniContig_->nPaddedIndexFast( contigSegments_[ nContigSeg ].nUnpaddedEnd2_ ); bBestContigLeftContig = ( pLeftContig == pMiniContig_ ); } if ( nContigSeg == 0 ) assert( bBestContigLeftContig ); if ( !bBestContigLeftContig ) { nPaddedStart += nNewConsPosFromNotLeftConsPos; nPaddedEnd += nNewConsPosFromNotLeftConsPos; } contigSegments_[ nContigSeg ].nNewPaddedStart_ = nPaddedStart; contigSegments_[ nContigSeg ].nNewPaddedEnd_ = nPaddedEnd; } // now make contig segments contiguous for( nContigSeg = 1; nContigSeg < contigSegments_.length(); ++nContigSeg ) { if ( ( contigSegments_[ nContigSeg - 1 ].nNewPaddedEnd_ + 1 ) != contigSegments_[ nContigSeg ].nNewPaddedStart_ ) { assert( contigSegments_[ nContigSeg - 1 ].nNewPaddedEnd_ < contigSegments_[ nContigSeg ].nNewPaddedStart_ ); contigSegments_[ nContigSeg - 1 ].nNewPaddedEnd_ = contigSegments_[ nContigSeg ].nNewPaddedStart_ - 1; } } // if the best 1st consensus starts with a pad, then the first contigSegment // will not start at position 1, which will cause a bug since the first // base segment will not start at position 1 so an exception will // be thrown. (July 2002). contigSegment& csFirstCS = contigSegments_[0]; if ( csFirstCS.nNewPaddedStart_ != 1 ) { Contig* pContig = ( csFirstCS.bBestContigIsContig1NotContig2_ ? pBigContig_ : pMiniContig_ ); // I believe this is the only way this could happen assert( pContig->ntGetCons( 1 ).bIsPad() ); csFirstCS.nNewPaddedStart_ = 1; } // I am not doing the same with the last contigSegment since // the new consensus doesn't include terminal pads } void fixContigEnd :: cleanUp() { if ( !pCP->bFixContigEndsCleanUpTemporaryFiles_ ) return; unlink( (soBaseNameForTemporaryFiles_ + ".fasta" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.qual" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.ace").data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.contigs" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.contigs.qual" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.log" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.problems" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.problems.qual" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fasta.singlets" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".phrap.out" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".phrap.err" ).data() ); unlink( (soBaseNameForTemporaryFiles_ + ".fixConsensusEndAceFile.fof" ).data() ); // e.g., // left_Contig10_100618.112445.fasta // left_Contig10_100618.112445.fasta.ace // left_Contig10_100618.112445.fasta.contigs // left_Contig10_100618.112445.fasta.contigs.qual // left_Contig10_100618.112445.fasta.log // left_Contig10_100618.112445.fasta.problems // left_Contig10_100618.112445.fasta.problems.qual // left_Contig10_100618.112445.fasta.qual // left_Contig10_100618.112445.fasta.singlets // left_Contig10_100618.112445.phrap.out } int fixContigEnd :: nOpenAceFileAndGetNumberOfContigs( const FileName& filAceFileName ) { FILE* pAceFile = fopen( filAceFileName.data(), "r" ); if ( !pAceFile ) { THROW_FILE_ERROR( filAceFileName ); } // only read first line int nContigs; int nReads; int nTokens = fscanf( pAceFile, "AS %d %d", &nContigs, &nReads ); if ( nTokens != 2 ) { THROW_ERROR2( "first line of ace file " + filAceFileName + " was not of form AS (number of contigs) (number of reads)" ); } fclose( pAceFile ); return nContigs; }