/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // basesegment.cpp // // implementation for BaseSegment, BaseSegArray classes // #include "numutil.h" #include "basesegment.h" #include "contig.h" #include "assert.h" #include "tagTypes.h" #include "fileDefines.h" #include "soAddCommas.h" void consolidateChanges(BaseSegChanges* pBsChanges); bool BaseSegArray :: bIsInBaseSegment(const int nConsPos, const LocatedFragment* pLf) const { // create dummy for use in find fun BaseSegment bsTemp(nConsPos); // find the base segment for this consensus position BaseSegment* pBsFound = dapBs_.findMatchOrPredecessor(&bsTemp); // it's possible that there might not be one, if the fragment // hangs off either side of the consensus if (!pBsFound) return false; // equality determined by pointer compare. if (pBsFound->pGetLocFrag() != pLf) return false; // check the alignment. if ((nConsPos < pBsFound->nGetStartInConsensus()) || (nConsPos > pBsFound->nGetEndInConsensus())) return false; return true; } BaseSegment* BaseSegArray :: pGetBaseSegmentByConsPos( const int nConsPos ) { BaseSegment bsTemp( nConsPos ); BaseSegment* pBsFound = dapBs_.findMatchOrPredecessor( &bsTemp ); if ( !pBsFound ) return NULL; if ( ( pBsFound->nStartInConsensus_ <= nConsPos ) && ( nConsPos <= pBsFound->nEndInConsensus_ ) ) { return pBsFound; } else { return NULL; } } int BaseSegArray :: nGetBaseSegmentIndexByConsPos( const int nConsPos ) { BaseSegment bsTemp( nConsPos ); int nBaseSeg = dapBs_.nFindIndexOfMatchOrPredecessor( &bsTemp ); if ( nBaseSeg != RW_NPOS ) { BaseSegment* pBaseSeg = dapBs_[ nBaseSeg ]; if ( (pBaseSeg->nStartInConsensus_ <= nConsPos ) && ( nConsPos <= pBaseSeg->nEndInConsensus_ ) ) return( nBaseSeg ); else return( RW_NPOS ); } else return( RW_NPOS ); } // changes the base segment array used to determine the consensus. // // note that in our current editing paradigm, selecting // a segment of a read and "forcing" the consensus to be // generated from that segment is the most desirable and // hopefully the most common kind of edit. It is possible // to edit the fragment or consensus afterwards, but this // is considered a different edit action. Thus there should // never be any chance of the _alignment_ changing as a // result of calling setBaseSegToFragRegion. // // here's the way it gets done. [this supports undo and also // gets around any screwiness because of the RTWPtrSortedVector // moving things around while you're working on them.] // // 1) new an object for the new base segment // 2) new a BaseSegChanges object to record our changes // 2) make copies of all affected (old) base segments // 3) figure out which old segments will survive after edits // i.e. cause they partly overlapped the new one // 4) make new versions of surviving segments with corrections // 5) delete all affected (old) base segs, recorded in BaseSegChanges // 6) insert the new segments, recorded in BaseSegChanges // 7) return the BaseSegChanges object to caller for (possible) // use by undo function later // // chrisa 17-apr-95 // BaseSegChanges* BaseSegArray :: setBaseSegToFragRegion(LocatedFragment* pLFrag, const int nStartConsPos, const int nEndConsPos) { // for debugging assert (bGetDataStructureOk()); // sanity check the passed params assert ((nStartConsPos <= nEndConsPos) && pLFrag); // first make sure the edit is possible. does this fragment // intersect the consensus within these bounds? assert ((pLFrag->nGetAlignStart() <= nStartConsPos) && (pLFrag->nGetAlignEnd() >= nEndConsPos)); // is there "real" consensus here? test it against the actual // length of the consensus, not including "overhanging" frags if (nStartConsPos < pLFrag->pGetContig()->nGetStartConsensusIndex()) { ostrstream ost; ost << "You cannot extend the consensus on the left end, but you can extend it on the right end. So complement, extend, and complement back." << ends; InputDataError ide(ost.str()); throw ide; } // // create the new base segment that will go into the // array. there is at this point no rule that adjacent // base segments that point to the same fragment be // collapsed. // // this gets used first for searching. BaseSegment bsNew(pLFrag, // fragment nStartConsPos, // start pos in cons nEndConsPos); // end pos in cons // // create the BaseSegChanges object that will record what // happens here // BaseSegChanges* pBsChanges = new BaseSegChanges; // // and save a copy of the new one. _always_ gets inserted // pBsChanges->daInsertedBaseSegs_.insert(bsNew); // // get the index to the base segment (if any) whose // starting consensus align position is the highest but not // greater than nStartConsPos. we can ask the mbt pointer sorted // vector because the array of base segments is sorted // by start pos in consensus. // int nFirstDeadSeg = dapBs_.nFindIndexOfMatchOrPredecessor(&bsNew); // // if we found nothing, there is only one possiblility // and that is that the new segment extends past the // previously defined start of the consensus. This is // possible in cases where a region of a low quality // read that aligns in front of the start of the consensus // has been manually upgraded to be a base segment, and // hence part of the new extended consensus. if the new // seg entirely preceeds the old ones, we catch that later. // if (nFirstDeadSeg == RW_NPOS) nFirstDeadSeg = 0; // // we know the first one affected, but what is the last one // affected? allow the new segment to hang off the endm // for same reason as above. // int nLastDeadSeg = nFirstDeadSeg; bool bFound = false; while ( (nLastDeadSeg < dapBs_.length()) && (! bFound) ) { // does the new segment extend beyond this old seg? BaseSegment* pBs = dapBs_[nLastDeadSeg]; if (dapBs_[nLastDeadSeg]->nGetEndInConsensus() < nEndConsPos) { nLastDeadSeg++; // if at end of array, } else { bFound = true; } } // loop above must have terminated when went off the end of the list // So return the last base segment. This happens when the the // consensus is being extended. if (!bFound) --nLastDeadSeg; // // there is a chance that the new seg does not intersect // any of the old ones in which case no deletions are // required, only the insertion of the new one. // bool bNoIntersectingOldSegs = ( (nEndConsPos < dapBs_[nFirstDeadSeg]->nGetStartInConsensus()) || (nStartConsPos > dapBs_[nLastDeadSeg]->nGetEndInConsensus()) ); // handle the trivial case of no intersecting old segs if (bNoIntersectingOldSegs) { // just insert new one makeBsChanges(pBsChanges); // changes may get consolidated return pBsChanges; // done, return description of what we did } // make copies of all the ones we're going to delete later for (int nBseg = nFirstDeadSeg; nBseg <= nLastDeadSeg; nBseg++) { pBsChanges->daRemovedBaseSegs_.insert(*(dapBs_[nBseg])); } // // special case. new boundaries == old boundaries // if ( (nStartConsPos == dapBs_[nFirstDeadSeg]->nGetStartInConsensus()) && (nEndConsPos == dapBs_[nFirstDeadSeg]->nGetEndInConsensus()) ) { // just insert the new one makeBsChanges(pBsChanges); // changes may get consolidated // in this case there can be only one old affected, so return return pBsChanges; // return description of what we did } // // special case. new contained within old // if ( (nStartConsPos > dapBs_[nFirstDeadSeg]->nGetStartInConsensus()) && (nEndConsPos < dapBs_[nFirstDeadSeg]->nGetEndInConsensus()) ) { // make left one BaseSegment bsNewLeft(dapBs_[nFirstDeadSeg]->pGetLocFrag(), dapBs_[nFirstDeadSeg]->nGetStartInConsensus(), nStartConsPos - 1); pBsChanges->daInsertedBaseSegs_.insert(bsNewLeft); // save copy // make right one BaseSegment bsNewRight(dapBs_[nFirstDeadSeg]->pGetLocFrag(), nEndConsPos + 1, dapBs_[nFirstDeadSeg]->nGetEndInConsensus() ); pBsChanges->daInsertedBaseSegs_.insert(bsNewRight); // save copy // do the work and leave makeBsChanges(pBsChanges); // changes may get consolidated return pBsChanges; } // // at this point there are one or two old segments of // possible concern to us: the ones indexed at nFirstDeadSeg // and nLastDeadSeg (any in between simply are deleted). // handle both (if there are two) the same way in a loop. // // dump the struct //cout << "fixing base segs:" << endl; //cout << "new base seg start = " << nStartConsPos // << " new base seg end = " << nEndConsPos << endl; //for (int n = nFirstDeadSeg; n <= nLastDeadSeg; n++) { // cout << "dapBs_[" << n << "] = " << *(dapBs_[n]) << endl; //} //cout.flush(); int nFixThisSeg = nFirstDeadSeg; bool bFinished = false; while (! bFinished) { // overlap on right side if (nEndConsPos < dapBs_[nFixThisSeg]->nGetEndInConsensus()) { // make shorter version clipped on left BaseSegment bsRightLeft(dapBs_[nFixThisSeg]->pGetLocFrag(), nEndConsPos + 1, dapBs_[nFixThisSeg]->nGetEndInConsensus() ); pBsChanges->daInsertedBaseSegs_.insert(bsRightLeft); // save copy } // overlap on the left side else if (dapBs_[nFixThisSeg]->nGetStartInConsensus() < nStartConsPos ) { // new up shorter version clipped on left BaseSegment bsNewLeft(dapBs_[nFixThisSeg]->pGetLocFrag(), dapBs_[nFixThisSeg]->nGetStartInConsensus(), nStartConsPos - 1); pBsChanges->daInsertedBaseSegs_.insert(bsNewLeft); // save copy } if (nFixThisSeg == nLastDeadSeg) { bFinished = true; } else { nFixThisSeg = nLastDeadSeg; } } // while not finished // do the work (apply these changes to the contig's array) makeBsChanges(pBsChanges); // for debugging assert (bGetDataStructureOk()); return pBsChanges; } // make all changes to the array of base segments at once. // avoids indexing hassles, allows for trivial undo void BaseSegArray::makeBsChanges(BaseSegChanges* pBsChanges) { // consolidate new segments - i.e. if two adjacent segments // refer to the same fragment, collapse them together before // cluttering the base segment array extra segments consolidateChanges(pBsChanges); // debug //cout << "before changes: array length = " << // dapBs_.length() << " nFirsDeadSeg = " << nFirstDeadSeg << // " nLastDeadSeg = " << nLastDeadSeg << endl; //cout << "Base segment array:" << endl; //for (int n = 0; n < dapBs_.length(); n++) { // cout << "segment index " << n << " " << *(dapBs_[n]); //} //cout << "array of changes:" << endl << *pBsChanges; //cout.flush(); // delete the dead. int nBseg; for( nBseg = 0; nBseg < pBsChanges->daRemovedBaseSegs_.length(); nBseg++) { // find index of base seg array element equal in value // to this copy (marked for deletion) int nRemoveMe = dapBs_.index(&pBsChanges->daRemovedBaseSegs_[nBseg]); // get the pointer to it BaseSegment* pBsDeadSeg = dapBs_[nRemoveMe]; // remove it from the array dapBs_.removeAt(nRemoveMe); // and delete it delete pBsDeadSeg; } // and insert the newborn for (nBseg = 0; nBseg < pBsChanges->daInsertedBaseSegs_.length(); nBseg++) { // allocate the copy the contig will use and // initialize it to equal the old one // we still keep a copy for undo() calls // in the BaseSegChanges object BaseSegment* pNewBs = new BaseSegment(pBsChanges->daInsertedBaseSegs_[nBseg]); dapBs_.insert(pNewBs); } // debug //cout << "after changes: array length = " << dapBs_.length() << endl; //cout << "Base segment array:" << endl; //for (n = 0; n < dapBs_.length(); n++) { // cout << "segment index " << n << " " << *(dapBs_[n]); //} //cout.flush(); } // reads through the array of changes to be made and // collapses any adjacent segments that refer to the same // fragment into one. also handles special case of // segment being replaced with itself. void consolidateChanges(BaseSegChanges* pBsChanges) { // if only one there's nothing to do int nInserts = pBsChanges->daInsertedBaseSegs_.length(); if (nInserts < 2) return; int nSeg = 0; // stop with second to last, which gets compared to last while (nSeg < (pBsChanges->daInsertedBaseSegs_.length() - 1)) { // are this one and the next one in the same located fragment? if (pBsChanges->daInsertedBaseSegs_[nSeg].pGetLocFrag() == pBsChanges->daInsertedBaseSegs_[nSeg+1].pGetLocFrag()) { // yes, consolidate BaseSegment bsNew(pBsChanges->daInsertedBaseSegs_[nSeg].pGetLocFrag(), pBsChanges->daInsertedBaseSegs_[nSeg].nGetStartInConsensus(), pBsChanges->daInsertedBaseSegs_[nSeg+1].nGetEndInConsensus()); // remove the second one pBsChanges->daInsertedBaseSegs_.removeAt(nSeg+1); // change the value of the first one pBsChanges->daInsertedBaseSegs_[nSeg] = bsNew; // do NOT bump the counter, next one to test has moved up } else { nSeg++; // not same, so bump counter } } // while not bStop // check for segment replacing itself // not really necessary, but useful in debugging if ((pBsChanges->daInsertedBaseSegs_.length() == 1) && (pBsChanges->daRemovedBaseSegs_.length() == 1)) { if (pBsChanges->daInsertedBaseSegs_[0] == pBsChanges->daRemovedBaseSegs_[0]) { // no effective change would be made, clear array //cout << "null change array" << endl; cout.flush(); pBsChanges->daInsertedBaseSegs_.clear(); pBsChanges->daRemovedBaseSegs_.clear(); } } } void BaseSegArray::undoBaseSegChanges(BaseSegChanges* pBsChanges) { // the dead shall live again int nBseg; for( nBseg = 0; nBseg < pBsChanges->daInsertedBaseSegs_.length(); nBseg++) { // find index of base seg array element equal in value // to this copy (marked for deletion) int nRemoveMe = dapBs_.index(&pBsChanges->daInsertedBaseSegs_[nBseg]); // get the pointer to it BaseSegment* pBsDeadSeg = dapBs_[nRemoveMe]; // remove it from the array dapBs_.removeAt(nRemoveMe); // and delete it delete pBsDeadSeg; } // and the presumptuous be chastised for (nBseg = 0; nBseg < pBsChanges->daRemovedBaseSegs_.length(); nBseg++) { // allocate the copy the contig will use and // initialize it to equal the old one // we still keep a copy for undo() calls // in the BaseSegChanges object BaseSegment* pNewBs = new BaseSegment(pBsChanges->daRemovedBaseSegs_[nBseg]); dapBs_.insert(pNewBs); } // for debugging assert (bGetDataStructureOk()); } // check the internal consistency of the BaseSegArray. Returns true // ("ok") if the indices are in the right order (start <= end), // the base segments are strictly increasing // (e.g, do not overlap ) bool BaseSegArray :: bGetDataStructureOk( const bool bCheckThatBaseSegmentsGoFromBeginningToEndOfConsensus ) { // handle case in which we are processing an assembly from a different // assembler such as Velet. March 2009 (DG) if ( dapBs_.length() == 0 ) return true; for (int nSeg = 0; nSeg < dapBs_.length(); nSeg++) { if (dapBs_[nSeg]->nGetStartInConsensus() > dapBs_[nSeg]->nGetEndInConsensus() ) { cerr << "base segment " << nSeg << " has nGetStartInConsensus() = " << dapBs_[nSeg]->nGetStartInConsensus() << " and nGetEndInConsensus() = " << dapBs_[nSeg]->nGetEndInConsensus() << " but start should be <= end" << endl; cerr << "base segment for read " << dapBs_[nSeg]->pLocFrag_->soGetName() << " in contig " << dapBs_[nSeg]->pLocFrag_->pContig_->soGetName() << endl; cerr.flush(); return false; } // check that the base segment for a read lies within the read LocatedFragment* pLocFrag = dapBs_[ nSeg ]->pLocFrag_; int nConsPosStart = dapBs_[ nSeg ]->nGetStartInConsensus(); int nConsPosEnd = dapBs_[ nSeg ]->nGetEndInConsensus(); if ( ! ( pLocFrag->nGetAlignStart() <= nConsPosStart && nConsPosStart <= nConsPosEnd && nConsPosEnd <= pLocFrag->nGetAlignEnd() ) ) { cerr << "base segment from padded cons pos " << nConsPosStart << " to " << nConsPosEnd << " is not within read " << pLocFrag->soGetName() << " which lies within padded cons pos " << pLocFrag->nGetAlignStart() << " to " << pLocFrag->nGetAlignEnd() << endl; return( false ); } // skip test the first time if (nSeg != 0) { // check that this one starts to the right of where last left off, // or further right if ( dapBs_[nSeg]->nGetStartInConsensus() <= dapBs_[nSeg-1]->nGetEndInConsensus() ) { cerr << "not strictly increasing at " << nSeg << endl; cerr.flush(); for (int j = nSeg - 1; ( j < nSeg + 2) && (j < dapBs_.length() ); j++) { cerr << "index " << j << " " <<*(dapBs_[j]); } Contig* pContig = dapBs_[ nSeg ]->pLocFrag_->pGetContig(); int nConsPosStart = dapBs_[ nSeg - 1 ]->nGetStartInConsensus(); int nConsPosEnd = dapBs_[ nSeg + 1 ]->nGetEndInConsensus(); cerr << "base from " << nConsPosStart << " to " << nConsPosEnd << endl; for( int nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { cerr << pContig->cGetConsensusBase( nConsPos ); } cerr << endl; return false; } } } // loop through all base segments // check the first and last base segments to be sure that the // entire consensus is covered with base segments if ( bCheckThatBaseSegmentsGoFromBeginningToEndOfConsensus ) { if ( dapBs_[0]->nGetStartInConsensus() != 1 ) { cerr << "Base segment 0 of contig " << pContig_->soGetName() << " is not at position 1--it should be" << endl; return false; } if ( dapBs_[ dapBs_.length() - 1 ]->nGetEndInConsensus() != pContig_->nGetEndConsensusIndex() ) { cerr << "In contig " << pContig_->soGetName() << " last base segment (" << dapBs_.length() - 1 << ") should end on the last padded consensus base (" << pContig_->nGetEndConsensusIndex() << ") but instead ends on " << dapBs_[ dapBs_.length() - 1 ]->nGetEndInConsensus() << endl; return false; } } // if you got here, the array is ok return true; } // an artifact of phrap is discontinuity in the base segments // under certain circumstances. this call fixes any "gaps" // between base segs. void BaseSegArray::forceSegsToBeContiguous() { for (int n = 1; n < dapBs_.length(); n++) { // if the end of the prev does not but // right up against start of next int nShouldEnd = dapBs_[n]->nGetStartInConsensus() - 1; if (dapBs_[n-1]->nGetEndInConsensus() != nShouldEnd) { // adjust endpoint of previous one. // Can't always do this (DG, 10/00) since the previous // read may have ended. LocatedFragment* pLocFragOfPreviousBS = dapBs_[n-1]->pLocFrag_; if ( nShouldEnd <= pLocFragOfPreviousBS->nGetAlignEnd() ) dapBs_[n-1]->setEndInConsensus(nShouldEnd); else { // we have a problem. Hopefully the read of the next base segment // can cover this region. Otherwise, I'm // going to just leave a hole here. int nShouldStart = dapBs_[n-1]->nGetEndInConsensus() + 1; LocatedFragment* pLocFragOfCurrentBS = dapBs_[n]->pLocFrag_; if ( pLocFragOfCurrentBS->nGetAlignStart() <= nShouldStart ) dapBs_[n]->setStartInConsensus( nShouldStart ); else { cerr << "could not make base segments contiguous at padded pos " << nShouldStart << " after read " << pLocFragOfPreviousBS->soGetName() << endl; } } } } } void BaseSegment :: complementBaseSegment() { pLocFrag_->pGetContig()->complementEndPoints( nStartInConsensus_, nEndInConsensus_ ); } void BaseSegArray :: reverseBaseSegArray() { for( int nIndex = 0; nIndex < dapBs_.length() / 2; ++nIndex ) { BaseSegment* pTempBaseSegment = dapBs_[nIndex]; dapBs_[nIndex] = dapBs_[ dapBs_.length() - nIndex - 1 ]; dapBs_[ dapBs_.length() - nIndex - 1 ] = pTempBaseSegment; } } bool BaseSegArray :: bIsBaseSegArraySorted( int& nBadPosition ) { for( int nIndex = 0; nIndex <= ((int)dapBs_.length()) - 2; ++nIndex ) { if ( ! (*dapBs_[ nIndex ] < *dapBs_[ nIndex + 1 ] ) ) { nBadPosition = nIndex; return( false ); } } return( true ); } void BaseSegArray :: complementBaseSegArray() { // ask all base segments to complement themselves int nIndex; for( nIndex = 0; nIndex < dapBs_.length(); ++nIndex ) dapBs_[nIndex]->complementBaseSegment(); // and put them back into order (don't use re_sort for efficiency) for( nIndex = 0; nIndex < dapBs_.length() / 2; ++nIndex ) { BaseSegment* pTempBaseSegment = dapBs_[nIndex]; dapBs_[nIndex] = dapBs_[ dapBs_.length() - nIndex - 1 ]; dapBs_[ dapBs_.length() - nIndex - 1 ] = pTempBaseSegment; } // note that dapBs_.length() is unsigned int so // if dapBs_.length() = 1, then // dapBs_.length() - 2 = 4294967295 // This gives a RW indexing error. To fix this bug, // convert dapBs_.length() to signed int // This is the reason for the (int) // check that base segment array is now sorted for( nIndex = 0; nIndex <= ((int)dapBs_.length()) - 2; ++nIndex ) { assert( *dapBs_[ nIndex ] < *dapBs_[ nIndex + 1 ] ); } } // repair the base segment array after an insertion in the // consensus has caused all frags at or after that pos // to adjust their alignment void BaseSegArray::adjustSegsForInsertionAtPos(const int nConsInsertPos) { // loop through all base segments for (int nSeg = 0; nSeg < nGetNumSegments(); nSeg++) { // pick up pointer to this base segment BaseSegment* pSeg = (*this)[nSeg]; // are we in the affected region at all? if (nConsInsertPos <= pSeg->nGetEndInConsensus() ) { // yes. is this position within this base seg? if ( pSeg->nGetStartInConsensus() <= nConsInsertPos ) { // yes. fix the end alignment point pSeg->shiftEndAlignmentPlusOne(); // add one to end point } else { // the insertion pos is not within the base segment, // but since this segment is aligned after that pos, the // alignment must be fixed. pSeg->shiftAlignmentPlusOne(); } } // base segment is in affected region } // loop through all base segments } // repair the base segment array after a deletion in the // consensus has caused all frags at or after that pos // to adjust their alignment void BaseSegArray::adjustSegsForDeletionAtPos(const int nConsDeletePos) { // loop through all base segments for (int nSeg = 0; nSeg < nGetNumSegments(); nSeg++) { // pick up pointer to this base segment BaseSegment* pSeg = (*this)[nSeg]; // are we in the affected region at all? if (nConsDeletePos <= pSeg->nGetEndInConsensus() ) { // yes. is this position within this base seg? if ( pSeg->nGetStartInConsensus() <= nConsDeletePos ) { // yes. fix the end alignment point pSeg->shiftEndAlignmentMinusOne(); // add one to end point } else { // the insertion pos is not within the base segment, // but since this segment is aligned after that pos, the // alignment must be fixed. pSeg->shiftAlignmentMinusOne(); } } // base segment is in affected region } // loop through all base segments } void BaseSegArray :: removeBaseSegmentsToRight( const int nStartPaddedConsPos ) { int nStartBaseSegIndex = nGetBaseSegmentAtOrAfterConsPos( nStartPaddedConsPos ); if ( nStartBaseSegIndex == RW_NPOS ) return; BaseSegment* pBaseSegment = dapBs_[ nStartBaseSegIndex ]; // for cases of the base segment straddling the nStartPaddedConsPos // like this: // ---------- // ^ nStartPaddedConsPos // we want to truncate the base segment to this: // ---- // ^ nStartPaddedConsPos // the "<" below is important--if it looks like this: // ----- // ^ nStartPaddedConsPos // then this base segment should be removed--not shortened if ( ( pBaseSegment->nStartInConsensus_ < nStartPaddedConsPos ) && ( nStartPaddedConsPos <= pBaseSegment->nEndInConsensus_ ) ) { // case in which a base segment starting before the region // to be deleted ends in the region to be deleted. // In this case, clip off the base // segment before the region to be deleted. pBaseSegment->nEndInConsensus_ = nStartPaddedConsPos - 1; ++nStartBaseSegIndex; } // remove the base segments in reverse order for efficiency for( int nBaseSeg = dapBs_.length() - 1; nBaseSeg >= nStartBaseSegIndex; --nBaseSeg ) { assert ( dapBs_.removeAt( nBaseSeg ) ); } } void BaseSegArray :: fixBaseSegments() { // This code is to remove the base segments that phrap has screwed up. // These are base segments that are beyond the ends of the read, // or where the bases in the read do not match the consensus. for (int nSeg = dapBs_.length() - 1; nSeg >= 0; nSeg-- ) { // check that the base segment for a read lies within the read LocatedFragment* pLocFrag = dapBs_[ nSeg ]->pLocFrag_; int nConsPosStart = dapBs_[ nSeg ]->nGetStartInConsensus(); int nConsPosEnd = dapBs_[ nSeg ]->nGetEndInConsensus(); if ( ! ( pLocFrag->nGetAlignStart() <= nConsPosStart && nConsPosStart <= nConsPosEnd && nConsPosEnd <= pLocFrag->nGetAlignEnd() ) ) { cerr << "Base segment in contig " << pContig_->soGetName() << " from padded cons pos " << nConsPosStart << " to " << nConsPosEnd << " is not within read " << pLocFrag->soGetName() << " which lies within padded cons pos " << pLocFrag->nGetAlignStart() << " to " << pLocFrag->nGetAlignEnd() << " in contig " << pContig_->soGetName() << endl; cerr << "Consed is repairing base segments." << endl; fixBaseSegment( nSeg ); } else if ( nConsPosStart < pContig_->nGetStartConsensusIndex() || pContig_->nGetEndConsensusIndex() < nConsPosEnd ) { // this will fix the cshl bug of OSJNBa0042K08 // and BSJNBa0094H10 in which there are a lot of little // base segments beyond the end of the consensus cerr << "Base segment in contig " << pContig_->soGetName() << " from padded cons pos " << nConsPosStart << " to " << nConsPosEnd << " is not within the contig which goes from " << pContig_->nGetStartConsensusIndex() << " to " << pContig_->nGetEndConsensusIndex() << endl; cerr << "Consed is repairing base segments." << endl; fixBaseSegment( nSeg ); } // I've commented this out because it happens frequently, and // does not harm. Just let it happen silently and not worry people. // (Oct 2000) // else { // // check that the read matches the consensus at this position // bool bFoundDifference = false; // int nConsPos; // for( nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { // if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != // pContig_->ntGetCons( nConsPos ).cGetBase() ) { // bFoundDifference = true; // break; // } // } // if ( bFoundDifference ) { // cerr << "The base segment at unpadded position " << // pContig_->nUnpaddedIndex( nConsPos ) << // " is read " << pLocFrag->soGetName() << // " with base " << pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() << // " but the consensus has base " << // pContig_->ntGetCons( nConsPos ).cGetBase() << // " base segment index: " << // nSeg << " from padded " << nConsPosStart << " to " << // nConsPosEnd << " or unpadded " << // pContig_->nUnpaddedIndex( nConsPosStart ) << " to " << // pContig_->nUnpaddedIndex( nConsPosEnd ) << endl; // } // } // if ( ! ( pLocFrag->nGetAlignStart ... ) ) else } // for (int nSeg = dapBs_.length() - 1; nSeg >= 0; nSeg-- ) { // dare I fix the first and last base segments so that they // go to the ends of the consensus? Yes, this is a problem at // St Louis H_NH0358G16 if ( dapBs_[0]->nGetStartInConsensus() != 1 ) { cerr << "Base segment 0 of contig " << pContig_->soGetName() << " is from " << dapBs_[0]->nStartInConsensus_ << " to " << dapBs_[0]->nEndInConsensus_ << " but should start at 1 so attempting to fix this" << endl; if ( dapBs_[0]->nGetStartInConsensus() > 1 ) fixBaseSegmentAtBeginning(); } int nLastBaseSegment = dapBs_.length() - 1; if ( dapBs_[ nLastBaseSegment ]->nGetEndInConsensus() != pContig_->nGetEndIndex() ) { cerr << "Last base segment " << nLastBaseSegment << " in contig " << pContig_->soGetName() << " in read "<< dapBs_[ nLastBaseSegment ]->pLocFrag_->soGetName() << " is from " << dapBs_[ nLastBaseSegment ]->nStartInConsensus_ << " to " << dapBs_[ nLastBaseSegment ]->nEndInConsensus_ << " but contig ends at " << pContig_->nGetEndIndex() << " so attempting to fix this" << endl; if ( dapBs_[ nLastBaseSegment ]->nGetEndInConsensus() < pContig_->nGetEndIndex() ) fixBaseSegmentAtEnd(); } } void BaseSegArray :: fixBaseSegment( const int nSeg ) { BaseSegment* pOldBaseSegment = dapBs_[ nSeg ]; LocatedFragment* pLocFrag = pOldBaseSegment->pLocFrag_; int nOldBaseSegmentOnContigStart; int nOldBaseSegmentOnContigEnd; if ( !bIntersect( pOldBaseSegment->nGetStartInConsensus(), pOldBaseSegment->nGetEndInConsensus(), pContig_->nGetStartConsensusIndex(), pContig_->nGetEndConsensusIndex(), nOldBaseSegmentOnContigStart, nOldBaseSegmentOnContigEnd ) ) { // the base segment is completely off the consensus--delete it! // And there is no need to create any others, either dapBs_.removeAt( nSeg ); return; } int nEmptyRegionConsPosLeft; int nEmptyRegionConsPosRight; int nOverlapConsPosLeft; int nOverlapConsPosRight; if ( bIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nOldBaseSegmentOnContigStart, nOldBaseSegmentOnContigEnd, nOverlapConsPosLeft, nOverlapConsPosRight ) ) { // clip base segment back so it is on the consensus and // on the read pOldBaseSegment->nStartInConsensus_ = nOverlapConsPosLeft; pOldBaseSegment->nEndInConsensus_ = nOverlapConsPosRight; // what is left? There are two possible places if ( nOldBaseSegmentOnContigStart < nOverlapConsPosLeft ) { nEmptyRegionConsPosLeft = nOldBaseSegmentOnContigStart; nEmptyRegionConsPosRight = nOverlapConsPosLeft - 1; findReadsToMatchConsensus( nEmptyRegionConsPosLeft, nEmptyRegionConsPosRight ); } if ( nOverlapConsPosRight < nOldBaseSegmentOnContigEnd ) { nEmptyRegionConsPosLeft = nOverlapConsPosRight + 1; nEmptyRegionConsPosRight = nOldBaseSegmentOnContigEnd; findReadsToMatchConsensus( nEmptyRegionConsPosLeft, nEmptyRegionConsPosRight ); } } else { // the base segment is completely off the read--delete it! dapBs_.removeAt( nSeg ); nEmptyRegionConsPosLeft = nOldBaseSegmentOnContigStart; nEmptyRegionConsPosRight = nOldBaseSegmentOnContigEnd; findReadsToMatchConsensus( nEmptyRegionConsPosLeft, nEmptyRegionConsPosRight ); } } void BaseSegArray :: fixBaseSegmentAtBeginning() { int nEmptyRegionConsPosLeft = pContig_->nGetStartIndex(); // 1 int nEmptyRegionConsPosRight = dapBs_[0]->nStartInConsensus_ - 1; findReadsToMatchConsensus( nEmptyRegionConsPosLeft, nEmptyRegionConsPosRight ); } void BaseSegArray :: fixBaseSegmentAtEnd() { int nLastBaseSeg = dapBs_.length() - 1; int nEmptyRegionConsPosLeft = dapBs_[ nLastBaseSeg ]->nEndInConsensus_ + 1; int nEmptyRegionConsPosRight = pContig_->nGetEndIndex(); findReadsToMatchConsensus( nEmptyRegionConsPosLeft, nEmptyRegionConsPosRight ); } void BaseSegArray :: findReadsToMatchConsensus( const int nConsPosLeft, const int nConsPosRight ) { // find a read that matches the consensus. If can't find one, // then at least find a read that intersects the consensus at // this location. if ( ! ( ( pContig_->nGetStartConsensusIndex() <= nConsPosLeft ) && ( nConsPosLeft <= nConsPosRight ) && ( nConsPosRight <= pContig_->nGetEndConsensusIndex() ) ) ) { cerr << "severe warning: pContig_->nGetStartConsensusIndex() = " << pContig_->nGetStartConsensusIndex() << " nConsPosLeft = " << nConsPosLeft << " nConsPosRight = " << nConsPosRight << " pContig_->nGetEndConsensusIndex() = " << pContig_->nGetEndConsensusIndex() << " while trying to fix corrupted base segments. Aborting trying to fix base segments. Good luck!" << endl; return; } int nConsPos = nConsPosLeft; while( nConsPos <= nConsPosRight ) { LocatedFragment* pLocFrag = NULL; int nLocFrag = 0; for( int LocFrag = 0; nLocFrag < pContig_->apLocatedFragment_.length(); ++nLocFrag ) { // find a read that intersects this position if ( ( pContig_->apLocatedFragment_[ nLocFrag ]->nGetAlignStart() <= nConsPos ) && ( nConsPos <= pContig_->apLocatedFragment_[ nLocFrag ]->nGetAlignEnd() ) ) { pLocFrag = pContig_->apLocatedFragment_[ nLocFrag ]; if ( pContig_->ntGetCons( nConsPos ).cGetBase() == pLocFrag->ntGetFragFromConsPos( nConsPos ) ) break; } } if ( !pLocFrag ) { cerr << "severe warning--cannot fix corrupted base segment at padded consensus base" << nConsPos << endl; return; } BaseSegment* pNewBaseSeg = new BaseSegment( pLocFrag, nConsPos, nConsPos ); dapBs_.insert( pNewBaseSeg ); // see how far we can extend this base segment ++nConsPos; // note: nConsPosRight damn well better be before the end of the contig while( nConsPos <= nConsPosRight && nConsPos <= pLocFrag->nGetAlignEnd() && ( pContig_->ntGetCons( nConsPos ).cGetBase() == pLocFrag->ntGetFragFromConsPos( nConsPos ) ) ) { pNewBaseSeg->nEndInConsensus_ = nConsPos; ++nConsPos; } } // it might be a good idea to put a tag indicating that consed has // fixed base segments here. bool bUserPushedCancel; // not used tag* pTag = tagTypes::pGetTagTypes()->pCreateConsensusTag( "consedFixedGoldenPath", "", // comment nConsPosLeft, nConsPosRight, pContig_, false, // no NoTrans bUserPushedCancel, "" ); pContig_->addConsensusTag( pTag ); } void BaseSegArray :: dumpBaseSegments() { for (int nSeg = 0; nSeg < dapBs_.length(); nSeg++) { BaseSegment* pBS = dapBs_[ nSeg ]; fprintf( pAO, "%10s %10s %s %d\n", soAddCommas( pBS->nStartInConsensus_ ).data(), soAddCommas( pBS->nEndInConsensus_ ).data(), pBS->pLocFrag_->soGetName().data(), nSeg ); } fflush( pAO ); } int BaseSegArray :: nGetBaseSegmentAtOrAfterConsPos( const int nConsPos ) { // code from readsSortedByLeftEndPosition.h which was written Jan 2008 if ( dapBs_.isEmpty() ) return( RW_NPOS ); // region A // ----------- nTooSmallIndex // region B // ----------- nTestIndex // region C // ----------- nMaxIndex // region D // an index becomes nMaxIndex if it is greater than nMatch // an index becomes nTooSmallIndex if it is less than nMatch int nTooSmallIndex = 0; int nMaxIndex = dapBs_.length() - 1; if ( dapBs_[ nMaxIndex ]->nEndInConsensus_ < nConsPos ) return( RW_NPOS ); // if reached here, nConsPos <= dapBs_[ nMaxIndex ]->nEndInConsensus_ if ( nConsPos <= dapBs_[ nTooSmallIndex ]->nEndInConsensus_ ) return( nTooSmallIndex ); // if reached here, // operator[]( nTooSmallIndex)->nEndInConsensus_ < // nConsPos <= // operator[]( nMaxIndex )->nEndInConsensus_ // Thus the names nTooSmallIndex and nMaxIndex apply. while( true ) { if ( nMaxIndex - nTooSmallIndex <= 1 ) return( nMaxIndex ); else { int nTestIndex = ( nTooSmallIndex + nMaxIndex ) / 2; if ( dapBs_[ nTestIndex ]->nEndInConsensus_ < nConsPos ) nTooSmallIndex = nTestIndex; else nMaxIndex = nTestIndex; } } }