/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // contig.cpp // // implementation file for the Contig class // // #define DEPTH_OF_COVERAGE_VARIABLE int #include #include using namespace std; #include #include "dictionary.h" #include #include #include // for chmod #include #include #include "rwcregexp.h" #include "contig.h" #include "contigview.h" #include "numutil.h" #include "string.h" #include "assert.h" #include "quality.h" #include "basesegment.h" #include "mbt_errors.h" #include "mbt_val_ord_offset_vec.h" #include "consed.h" #include "locatedFragment.h" #include "assembly.h" //#include "collectableLocatedFragmentPtr.h" #include "unpaddedFromPadded.h" #include "mbt_exception.h" #include "nConvertFrom4Mer.h" #include "nLengthOf4Mer.h" #include "setMatchLookupTable.h" #include "paddedFromUnpadded.h" #include "consedParameters.h" #include "popupErrorMessage.h" #include "basesegment.h" #include "mbtPtrVector.h" #include "popupInfoMessage.h" #include "guicontigwin.h" #include "bGuiGetAnswerYesNo.h" #include "dErrorRateFromQuality.h" #include "soGetDateTime.h" #include "whyIsPrimerNotAcceptableTypes.h" #include "unalignedHighQualityRegionsGotoList.h" #include "lowconsqual.h" #include "findRepeat.h" #include "fileDefines.h" #include "mbtValOrderedVectorOfInt.h" #include "nFindScoreOfMatch.h" #include "exclusive_or.h" #include "bIsNumericMaybeWithWhitespace.h" #include "rwt2dvalvector.h" #include "nNumberFromBase3.h" #include "soAddCommas.h" #include "mbtValVectorOfBool.h" #include "rwtvalvector.h" #include "rwtptrvector.h" void Contig :: addLocatedFragment( LocatedFragment* plocfrag) { int nFragLen = plocfrag->nGetPaddedSeqLength(); int nConsLen = nGetPaddedSeqLength(); // if this located fragment extends past either end of the // consensus, keep track of the first (or last) aligned // base's position. that way when displaying the alignment, // we can start at the beginning (end) of the first (last) // aligned fragment even though the consensus sequence has no // bases at that relative position. this happens when the // fragment gets aligned but part of it's data that might // have been used to extend the consensus is of too low // a quality if (plocfrag->nGetAlignStart() < nFirstDisplayableContigPos_) { nFirstDisplayableContigPos_ = plocfrag->nGetAlignStart(); } if (plocfrag->nGetAlignEnd() > nLastDisplayableContigPos_) { nLastDisplayableContigPos_ = plocfrag->nGetAlignEnd(); } // the fragment and the consensus must have at least 1 base // of intersection. so it nPos cannot be more than // (its own length - 1) below 0 // assert(nPos >= -(nFragLen - 1)); // and it cannot be past the end of the consensus // assert(nPos < nConsLen); // create an attachment object and add to the // array of attachments so that it is in order of // left clone end // not that we do not enforce uniqueness - for all // we know right now, the same fragment could be // attached to more than one place. apLocatedFragment_.insert( plocfrag ); bReadListsNeedFixing_ = true; ConsEd::pGetAssembly()->bReadsSortedByReadNameNeedsFixing_ = true; // now apLocatedFragment_ is used to find reads by read name // instead of this dictionary // bool bDuplicate; // dictOfReads_.insertKeyValuePair( plocfrag->soGetName(), // (void*) plocfrag, // bDuplicate ); // if ( bDuplicate ) { // cerr << // "warning: there is more than one read in the ace file with name " << // plocfrag->soGetName() << // ". This will cause lots of problems." << endl; // } } // this is really the ctor for ContigView. should be changed. ContigView* Contig :: pGetContigView( const int nStartConsPos, const int nEndConsPos) { assert( nStartConsPos <= nEndConsPos ); // client is responsible for deleting this ContigView* pContigView = new ContigView( this, nStartConsPos, nEndConsPos); if ( bReadListsNeedFixing_ ) fixReadLists(); // iterate thru list of attachments, looking for those // that intersect passed range for (int n = 0; n < apVeryLongReads_.length(); n++) { if (bIntervalsIntersect(apVeryLongReads_[n]->nGetAlignStart(), apVeryLongReads_[n]->nGetAlignEnd(), nStartConsPos, nEndConsPos)) { LocatedFragment* pLocFrag = apVeryLongReads_[n]; pContigView->addLocatedFragment( pLocFrag ); if ( pLocFrag->soReadPrefix_.length() > pContigView->nSizeOfLargestReadPrefix_ ) pContigView->nSizeOfLargestReadPrefix_ = pLocFrag->soReadPrefix_.length(); // set the flags indicating whether any forward // or reverse reads are present if ( pLocFrag->bComp()) { pContigView->incRevFrags(); } else { pContigView->incForFrags(); } } } int nFirstReadToTry = apLocatedFragment2_.nFindIndexOfMatchOrSuccessor( nStartConsPos - pCP->nMaxLengthOfReadsInapLocatedFragment2_ ); if ( nFirstReadToTry != RW_NPOS ) { for( int n = nFirstReadToTry; n < apLocatedFragment2_.length(); ++n ) { if ( nEndConsPos < apLocatedFragment2_[n]->nGetAlignStart() ) break; LocatedFragment* pLocFrag = apLocatedFragment2_[n]; if ( bIntervalsIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nStartConsPos, nEndConsPos ) ) { pContigView->addLocatedFragment( pLocFrag ); if ( pLocFrag->soReadPrefix_.length() > pContigView->nSizeOfLargestReadPrefix_ ) pContigView->nSizeOfLargestReadPrefix_ = pLocFrag->soReadPrefix_.length(); // set the flags indicating the number of forward and // reverse reads present if ( pLocFrag->bComp() ) { pContigView->incRevFrags(); } else { pContigView->incForFrags(); } } } // for( int n = nFirstReadToTry; ;++n ) { } // if ( nFirstReadToTry != RW_NPOS ) { if ( pContigView->nSizeOfLargestReadPrefix_ > pCP->nMaxCharsDisplayedForReadPrefix_ ) pContigView->nSizeOfLargestReadPrefix_ = pCP->nMaxCharsDisplayedForReadPrefix_; pContigView->sortReads(); return pContigView; } int Contig :: nDumpContig() { cerr << "Consensus from " << nGetStartConsensusIndex() << " to " << nGetEndConsensusIndex() << endl; for( int n = 0; n < nGetNumberOfFragsInContig(); ++n ) { cerr << "fragment " << n << " from " << apLocatedFragment_[n]->nGetAlignStart() << " to " << apLocatedFragment_[n]->nGetAlignEnd() << endl; } return(ERR_NONE); } // this wrapper around the inherited Sequence member function // accounts for the overhanging fragment situation. // there are no bases in the consensus beyond the // endpoints indicated by nGet[Start|Stop]Index(), but // there are no pads either. all "blanks" in these // overhanging regions are (for convenience) counted // as "negative non-pads" if the sequence starts at // 1, the unpadded index of -5 is int Contig :: nUnpaddedIndex(const int nPaddedConsPos) { if (nPaddedConsPos < Sequence::nGetStartIndex()) { return nPaddedConsPos; } else if (nPaddedConsPos > Sequence::nGetEndIndex()) { return( nUnpaddedIndex(nGetEndIndex()) + nPaddedConsPos - Sequence::nGetEndIndex() ); } else { return( unpaddedFromPadded_.nUnpaddedFromPadded( nPaddedConsPos ) ); } } void Contig :: setUnpaddedPositionsArray() { unpaddedFromPadded_.setUnpaddedPositionsArray( this ); } // overrides Sequence::nGetUnpaddedEndIndex to be more efficient int Contig :: nGetUnpaddedEndIndex() { return( nUnpaddedIndex( nGetEndConsensusIndex() ) ); } // modified this to never exceed array boundary, just silently // truncate. chrisa 10-mar-95 char* Contig :: szGetPartOfConsensus(const int nStartPos, const int nLength) const { assert( nLength > 0 ); char* szNewStr = new char [nLength+1]; assert(szNewStr); // changed to while loop so I can use n to terminate char array int n = 0; while ( (n < nLength) && ( (n + nStartPos) <= nGetLastDisplayableContigPos() ) ) { szNewStr[n] = ntGetCons(nStartPos + n); // casts to char n++; } szNewStr[n] = '\0'; return szNewStr; } // same as above, except uses RWCString void Contig :: getPartOfConsensus( const int nStartConsPos, const int nLength, RWCString& soPartOfConsensus ) { soPartOfConsensus = ""; soPartOfConsensus.increaseButNotDecreaseMaxLength( nLength ); assert( nLength > 0 ); int nEndConsPos = nStartConsPos + nLength - 1; assert( nGetStartConsensusIndex() <= nStartConsPos ); assert( nStartConsPos <= nEndConsPos ); assert( nEndConsPos <= nGetEndConsensusIndex() ); for( int nConsPos = nStartConsPos; nConsPos <= nEndConsPos; ++nConsPos ) { soPartOfConsensus += cGetConsensusBaseUnsafe( nConsPos ); } } void Contig :: getPartOfComplementedConsensus( const int nStartConsPos, const int nLength, RWCString& soPartOfConsensus ) { soPartOfConsensus = ""; soPartOfConsensus.increaseButNotDecreaseMaxLength( nLength ); // nStartPos is given in *uncomplemented* coordinates. The string // is returned in reverse order. For example, // if nStartPos == 10 and nLength == 4, then the following // bases are returned: the complement of base at 10, then the // complement of the base at 9, then 8, then 7. int nEndConsPos = nStartConsPos - nLength + 1; assert( nGetEndConsensusIndex() >= nStartConsPos ); assert( nStartConsPos >= nEndConsPos ); assert( nEndConsPos >= nGetStartConsensusIndex() ); // after doing all that, we no longer have to check each subscript for( int nConsPos = nStartConsPos; nConsPos >= nEndConsPos; --nConsPos ) { soPartOfConsensus += cComplementBase( cGetConsensusBaseUnsafe( nConsPos ) ); } } void Contig :: getPartOfUnpaddedConsensus( const int nUnpaddedStart, const int nUnpaddedLength, RWCString& soPartOfUnpaddedConsensus, const bool bComplemented, const bool bTowardsIncreasingUnpaddedPositions ) { bool bToLeftNotToRightCompareContigsWindow = bComplemented && bTowardsIncreasingUnpaddedPositions || !bComplemented && !bTowardsIncreasingUnpaddedPositions; getPartOfUnpaddedConsensusForCompareContigs( nUnpaddedStart, nUnpaddedLength, soPartOfUnpaddedConsensus, bComplemented, bToLeftNotToRightCompareContigsWindow ); } void Contig :: getPartOfUnpaddedConsensusForCompareContigs( const int nUnpaddedStart, const int nUnpaddedLength, RWCString& soPartOfUnpaddedConsensus, const bool bComplemented, const bool bToLeftNotToRight ) { // warning: bToLeftNotToRight applies to the compareContigs Window. // It does *NOT* apply to the Aligned Reads Window. // So if you want to get complemented bases in reverse orientation // (towards lower unpadded positions), then you must ask // this routine for bToLeftNotRight = false (towards right in the // compareContigs Window) and complemented. When one of the // contigs is complemented in the compareContigs window, positions // go lower to the right. bool bIncrementNotDecrement = bComplemented && bToLeftNotToRight || !bComplemented && !bToLeftNotToRight; // the caller is asking for a piece of the consensus that is // off the consensus if ( bIncrementNotDecrement && nGetUnpaddedEndIndex() < nUnpaddedStart ) { soPartOfUnpaddedConsensus = ""; return; } if ( !bIncrementNotDecrement && nUnpaddedStart < nGetUnpaddedStartIndex() ) { soPartOfUnpaddedConsensus = ""; return; } // There are two more cases: both are off the consensus, but // pointing into the consensus. I think these probably indicate // a programming error, so I will throw an exception. assert( nGetUnpaddedStartIndex() <= nUnpaddedStart ); assert( nUnpaddedStart <= nGetUnpaddedEndIndex() ); soPartOfUnpaddedConsensus = ""; soPartOfUnpaddedConsensus.increaseButNotDecreaseMaxLength( nUnpaddedLength ); int nUnpaddedEnd = ( bIncrementNotDecrement ? nUnpaddedStart + nUnpaddedLength - 1 : nUnpaddedStart - nUnpaddedLength + 1 ); // we can't assume that nUnpaddedEnd hasn't run off the end. Trim // it if necessary, and then check that everything is in order. if ( bIncrementNotDecrement ) { if ( nUnpaddedEnd > nGetUnpaddedEndIndex() ) nUnpaddedEnd = nGetUnpaddedEndIndex(); assert( nGetUnpaddedStartIndex() <= nUnpaddedStart ); assert( nUnpaddedStart <= nUnpaddedEnd ); assert( nUnpaddedEnd <= nGetUnpaddedEndIndex() ); } else { if ( nUnpaddedEnd < nGetUnpaddedStartIndex() ) nUnpaddedEnd = nGetUnpaddedStartIndex(); assert( nGetUnpaddedStartIndex() <= nUnpaddedEnd ); assert( nUnpaddedEnd <= nUnpaddedStart ); assert( nUnpaddedStart <= nGetUnpaddedEndIndex() ); } int nPaddedStart = nPaddedIndexFast( nUnpaddedStart ); int nPaddedEnd = nPaddedIndexFast( nUnpaddedEnd ); if ( bIncrementNotDecrement ) { // I'm doing this so I feel ok about using the "unsafe" routines // below assert( nGetStartIndex() <= nPaddedStart ); assert( nPaddedStart <= nPaddedEnd ); assert( nPaddedEnd <= nGetEndIndex() ); if ( bComplemented ) { for( int nConsPos = nPaddedStart; nConsPos <= nPaddedEnd; ++nConsPos ) { if ( !ntGetConsUnsafe( nConsPos ).bIsPad() ) soPartOfUnpaddedConsensus += cComplementBase( cGetConsensusBaseUnsafe( nConsPos ) ); } } else { for( int nConsPos = nPaddedStart; nConsPos <= nPaddedEnd; ++nConsPos ) { if ( !ntGetConsUnsafe( nConsPos ).bIsPad() ) soPartOfUnpaddedConsensus += cGetConsensusBaseUnsafe( nConsPos ); } } } // if ( bIncrementNotDecrement ) { else { // decrementing assert( nGetStartIndex() <= nPaddedEnd ); assert( nPaddedEnd <= nPaddedStart ); assert( nPaddedStart <= nGetEndIndex() ); if ( bComplemented ) { for( int nConsPos = nPaddedStart; nConsPos >= nPaddedEnd; --nConsPos ) { if ( !ntGetConsUnsafe( nConsPos ).bIsPad() ) soPartOfUnpaddedConsensus += cComplementBase( cGetConsensusBaseUnsafe( nConsPos ) ); } } else { // not complemented for( int nConsPos = nPaddedStart; nConsPos >= nPaddedEnd; --nConsPos ) { if ( !ntGetConsUnsafe( nConsPos ).bIsPad() ) soPartOfUnpaddedConsensus += cGetConsensusBaseUnsafe( nConsPos ); } } // if ( bComplemented ) { else } // if ( bIncrementNotDecrement ) { else } RWCString Contig :: soGetPartOfConsensusUpperOrLower( const int nConsPosStart, const int nConsPosEnd ) { assert( nConsPosStart <= nConsPosEnd ); RWCString soPartOfConsensus; for( int nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { Ntide nt = ntGetCons( nConsPos ); char cBase = nt.cGetBaseUpperOrLower(); RWCString soBase( cBase ); soPartOfConsensus += soBase; } return( soPartOfConsensus ); } char* Contig :: szGetUnpaddedConsensusToRightEnd( const int nConsPosPadded ) { assert( nConsPosPadded <= nGetEndConsensusIndex() ); assert( nGetStartConsensusIndex() <= nConsPosPadded ); // count how many bases there are int nUnpaddedConsensusToRightLen = 0; int nConsPos; for( nConsPos = nConsPosPadded; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { ++nUnpaddedConsensusToRightLen; } } // if there aren't any bases to the right, return a pointer to a null if (nUnpaddedConsensusToRightLen <= 0 ) { char* pChar = new char[1]; pChar[0] = '\0'; return( pChar ); } char* szUnpaddedConsensusToRight = new char[ nUnpaddedConsensusToRightLen + 1 ]; assert( szUnpaddedConsensusToRight ); // move the bases over int nUnpaddedConsPos = 0; for(nConsPos = nConsPosPadded; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { szUnpaddedConsensusToRight[ nUnpaddedConsPos ] = cGetConsensusBase( nConsPos ); ++nUnpaddedConsPos; } } szUnpaddedConsensusToRight[ nUnpaddedConsPos ] = '\0'; return( szUnpaddedConsensusToRight ); } char* Contig :: szGetPartUnpaddedConsensusToRight( const int nStartConsPosPadded, const int nLength ) { assert( nStartConsPosPadded <= nGetEndConsensusIndex() ); assert( nGetStartConsensusIndex() <= nStartConsPosPadded ); char* szUnpaddedConsensusToRight = new char[ nLength + 1 ]; assert( szUnpaddedConsensusToRight ); // move the bases over int nUnpaddedIndex = 0; for(int nConsPos = nStartConsPosPadded; ( nConsPos <= nGetEndConsensusIndex()) && ( nUnpaddedIndex < nLength ); ++nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { szUnpaddedConsensusToRight[ nUnpaddedIndex ] = cGetConsensusBase( nConsPos ); ++nUnpaddedIndex; } } szUnpaddedConsensusToRight[ nUnpaddedIndex ] = '\0'; return( szUnpaddedConsensusToRight ); } char* Contig :: szGetUnpaddedConsensusToLeftEnd( const int nConsPosPadded ) { assert( nGetStartConsensusIndex() <= nConsPosPadded ); assert( nConsPosPadded <= nGetEndConsensusIndex() ); // count how many bases there are int nUnpaddedConsensusToLeftLen = 0; int nConsPos; for( nConsPos = nConsPosPadded; nConsPos >= nGetEndConsensusIndex(); --nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { ++nUnpaddedConsensusToLeftLen; } } // if there aren't any bases to the left, return a pointer to a null if (nUnpaddedConsensusToLeftLen <= 0 ) { char* pChar = new char[1]; pChar[0] = '\0'; return( pChar ); } char* szUnpaddedConsensusToLeft = new char[ nUnpaddedConsensusToLeftLen + 1 ]; assert( szUnpaddedConsensusToLeft ); // move the bases over int nUnpaddedConsPos = 0; for(nConsPos = nConsPosPadded; nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { szUnpaddedConsensusToLeft[ nUnpaddedConsPos ] = cGetConsensusBase( nConsPos ); ++nUnpaddedConsPos; } } szUnpaddedConsensusToLeft[ nUnpaddedConsPos ] = '\0'; return( szUnpaddedConsensusToLeft ); } // this will not try to return further back than the beginning of the // sequence, even if nLength is set too large char* Contig :: szGetPartUnpaddedConsensusToLeft( const int nConsPosPadded, const int nLength ) { assert( nGetStartConsensusIndex() <= nConsPosPadded ); assert( nConsPosPadded <= nGetEndConsensusIndex() ); char* szUnpaddedConsensusToLeft = new char[ nLength + 1 ]; assert( szUnpaddedConsensusToLeft ); // move the bases over int nUnpaddedConsPos = 0; for(int nConsPos = nConsPosPadded; nConsPos >= nGetStartConsensusIndex() && ( nUnpaddedConsPos < nLength ); --nConsPos ) { if (!ntGetCons( nConsPos ).bIsPad() ) { szUnpaddedConsensusToLeft[ nUnpaddedConsPos ] = cGetConsensusBase( nConsPos ); ++nUnpaddedConsPos; } } szUnpaddedConsensusToLeft[ nUnpaddedConsPos ] = '\0'; return( szUnpaddedConsensusToLeft ); } RWCString Contig :: soGetUnpaddedConsensus() { RWCString soUnpaddedConsensus; getUnpaddedSequence( soUnpaddedConsensus ); // for( int nPadded = nGetStartIndex(); nPadded <= nGetEndIndex(); ++nPadded ){ // Ntide nt = ntGetCons( nPadded ); // if (!nt.bIsPad() ) { // RWCString soBase( nt.cGetBase() ); // soUnpaddedConsensus.append( soBase ); // } // } return( soUnpaddedConsensus ); } char* Contig :: szGetUnpaddedConsensus( int& nUnpaddedConsensusLength, const int nExtraRoom ) { // count how many bases there are nUnpaddedConsensusLength = nGetUnpaddedSeqLength(); // if there aren't any bases to the right, return a pointer to a null if (nUnpaddedConsensusLength <= 0 ) { char* pChar = new char[1]; pChar[0] = '\0'; return( pChar ); } char* szUnpaddedConsensus = new char[ nUnpaddedConsensusLength + 1 + nExtraRoom ]; assert( szUnpaddedConsensus ); // move the bases over int nZeroBasedIndex = 0; for(int nPaddedConsPos = nGetStartConsensusIndex(); nPaddedConsPos <= nGetEndConsensusIndex(); ++nPaddedConsPos ) { if (!ntGetCons( nPaddedConsPos ).bIsPad() ) { szUnpaddedConsensus[ nZeroBasedIndex ] = tolower( cGetConsensusBase( nPaddedConsPos ) ); ++nZeroBasedIndex; } } szUnpaddedConsensus[ nZeroBasedIndex ] = '\0'; return( szUnpaddedConsensus ); } // return pointer to located fragment with same name as passed arg // or NULL if not found LocatedFragment* Contig :: pLocFragGetByName(const RWCString soFragName) { // // This commented-out code takes considerably less memory than the // RWBTreeDictionary scheme. However, it is also much slower. // I found that for MBAC20, this scheme used roughly 3.9Mb out of // a total of 29Mb (with a 9.5Mb size of the executable without //loading in any data). That means roughly 13% of the memory consed takes //is from this // // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); // ++nRead ) { // LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // if (pLocFrag->soGetFragmentName() == soFragName ) // return( pLocFrag ); // } // return( NULL ); LocatedFragment* pLocFrag = apLocatedFragment_.pFindReadByName( soFragName ); if ( pLocFrag ) assert( pLocFrag->soGetName() == soFragName ); return( pLocFrag ); } // PURPOSE: complement the contig void Contig :: complementContig() { ConsEd::pGetConsEd()->whatToDoBeforeModifyAssembly(); // complement the consensus first Sequence :: complementSequence(); complementEndPoints( nFirstDisplayableContigPos_, nLastDisplayableContigPos_ ); // ask all located fragments to complement themselves int nIndex; for( nIndex = 0; nIndex < apLocatedFragment_.length() ; ++nIndex ) { apLocatedFragment_[ nIndex ]->complementLocatedFragment(); } // However, now they are out of order. So re-sort them. // no longer sorted--the sorting was never used (DG, 7/99) // apLocatedFragment_.re_sort(); baseSegArray_.complementBaseSegArray(); complementConsensusTags(); setUnpaddedPositionsArray(); bComplementedFromWayPhrapCreated_ = !bComplementedFromWayPhrapCreated_; bReadListsNeedFixing_ = true; } void Contig :: complementConsensusTags() { for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); pTag->complementSelf(); } } // // PURPOSE: for complementing LocatedFragment (and the contig clipping // positions) // // Takes this: // // ______________________________________________ // ^ ^ ^ ^ // nFirstContigIndex nStart nEnd nLastContigIndex // // and flips it end for end, still labelling the end coordinates // nFirstContigIndex and nLastContigIndex // // // // ______________________________________________ // ^ ^ ^ ^ // nFirstContigIndex nLastContigIndex // nStart nEnd // void Contig :: complementEndPoints( int& nStart, int& nEnd ) { int nOrigEnd = nEnd; nEnd = nGetEndConsensusIndex() - nStart + nGetStartConsensusIndex(); nStart = nGetEndConsensusIndex() - nOrigEnd + nGetStartConsensusIndex(); } void Contig :: complement_coordinate( int& nConsensusPosition ) { nConsensusPosition = nGetEndConsensusIndex() - nConsensusPosition + nGetStartConsensusIndex(); } // inserts a column of pads into consensus and any frags aligned // at the passed consensus relative positon // just before the passed consensus relative position // Thus the new column of pads becomes at nConsPos and what // used to be at nConsPos now becomes at nConsPos + 1 void Contig :: insertColOfPads(const int nConsInsertPos) { if (nConsInsertPos < nGetStartConsensusIndex() ) { GuiApp::popupErrorMessage("Currently you can't insert before the beginning of the consensus\n (You may complement the contig and then insert at the end.)" ); InputDataError ide; ide.setUserNotified(); throw ide; // throw exception } // // form the nucleotide for the new consensus position // Ntide ntNew('*', ucDefaultQualityEdited ); // why do this check? Because we may be inserting a column into // a fragment that is past the end of the consensus or before the // beginning of the consensus. // Only insert the pads into the consensus if there is a consensus // at that point. if ((nGetStartIndex() <= nConsInsertPos) && (nConsInsertPos <= nGetEndIndex()) ) { // insert it into the consensus sequence // ASSUMES NO POINT POSITION IN CONSENSUS NTIDES Sequence::insertNtideAtPos(ntNew, nConsInsertPos ); // we need to recalculate the conversion table for unpadded to padded // positions setUnpaddedPositionsArray(); } // don't use the contig view, because there may be multiple // windows. run down the entire array of frags in the contig // checking each one to see if it need be edited for (int nLocFrag = 0; nLocFrag < nGetNumberOfFragsInContig(); nLocFrag++) { // pick up pointer to this located fragment LocatedFragment* pLocFrag = pLocatedFragmentGet(nLocFrag); // are we in the affected region at all? if ( pLocFrag->nGetAlignEnd() < nConsInsertPos ) { // nothing to do--pad lies after the fragment } else { // The insertion is either before the read or in the read. pLocFrag->moveTagsToRight( nConsInsertPos ); // Aug 2008 changed this from < to <= so that if // the insertion is precisely before the start of the // read, there is no longer a * added to the // beginning of the read. No, Mike McClellan convinced me // to leave this to cover this case: // original alignment // > > bbcgcgcgbbbbbbbbbbbbbbb (read A) // > > agcgcgbbbbbbbbbbbbbbb (read B) // the true sequence is: // > > bbcagcgcgbbbbbbbbbbbbbbb (read A) // we want to edit readB so it says: // > > cagcgcgbbbbbbbbbbbbbbb (read B) // if we can add a column of pads: // > > bb*cgcgcgbbbbbbbbbbbbbbb (read A) // > > *agcgcgbbbbbbbbbbbbbbb (read B) // we can edit this. But if we have <, then we can't edit read B: // > > bb*cgcgcgbbbbbbbbbbbbbbb (read A) // > > agcgcgbbbbbbbbbbbbbbb (read B) if (nConsInsertPos < pLocFrag->nGetAlignStart() ) { // pad lies before the entire fragment // the insertion pos is not within the located fragment, // but since this frag is aligned after that pos, the // alignment must be fixed. pLocFrag->shiftAlignmentPlusOne(); } else { // Case in which the pad lies within the fragment // insert the pad into this fragment. frag // will fix its own alignment and interpolate point pos pLocFrag->insertNtideAtConsPos( nConsInsertPos, ntNew); } pLocFrag->adjustQualAndAlignClippingWhenInsertColumnOfPads( nConsInsertPos ); } // frag is in affected region } // loop through all located fragments // now tell the base segment array to do essentially the // same thing. baseSegArray_.adjustSegsForInsertionAtPos(nConsInsertPos); assert( baseSegArray_.bGetDataStructureOk() ); // now adjust consensus tags for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); pTag->adjustTagPosition( nConsInsertPos, 1 ); } // now update the last displayable contig position, since the last // fragment has been moved to the right updateLastDisplayableContigPos(); } // deletes a column of pads in consensus and any frags aligned // at the passed consensus relative positon. this is only // intended to be used as an undo version of insertColOfPads, // in that the user is never allowed to delete an entire column // simply for it's own sake. void Contig :: undoInsertColOfPads(const int nConsDeletePos) { deleteColumn( nConsDeletePos, true ); } // deletes a column of pads in consensus and any frags aligned // at the passed consensus relative positon. this is only // intended to be used as an undo version of insertColOfPads, // in that the user is never allowed to delete an entire column // simply for it's own sake. void Contig :: deleteColumn(const int nConsDeletePos, const bool bColumnOfPads ) { // only remove a pad from the consensus if there is a // consensus at that position (we could be before or after // the consensus ) if ((nGetStartIndex() <= nConsDeletePos) && (nConsDeletePos <= nGetEndIndex()) ) { // insert it into the consensus sequence // ASSUMES NO POINT POSITION IN CONSENSUS NTIDES Ntide ntShouldBePad = Sequence::deleteNtideAtPos(nConsDeletePos, false ); // bDeletePointPos // if you're NOT a pad something is screwed up if ( bColumnOfPads ) assert(ntShouldBePad.bIsPad()); setUnpaddedPositionsArray(); } // don't use the contig view, because there may be multiple // windows. run down the entire array of frags in the contig // checking each one to see if it need be edited for (int nLocFrag = 0; nLocFrag < nGetNumberOfFragsInContig(); nLocFrag++) { // pick up pointer to this located fragment LocatedFragment* pLocFrag = pLocatedFragmentGet(nLocFrag); // are we in the affected region at all? if ( pLocFrag->nGetAlignEnd() < nConsDeletePos ) { // nothing to do--pad lies after the fragment } else { pLocFrag->setChanged(); pLocFrag->moveTagsToLeft( nConsDeletePos ); // yes. is the entire fragment after the insert position? if (nConsDeletePos < pLocFrag->nGetAlignStart() ) { // pad lies before the entire fragment // the insertion pos is not within the located fragment, // but since this frag is aligned after that pos, the // alignment must be fixed. pLocFrag->shiftAlignmentMinusOne(); } else { // Case in which the pad lies within the fragment // delete the pad from this fragment. frag // will fix its own alignment and interpolate point pos pLocFrag->deleteNtide(nConsDeletePos ); } } // frag is in affected region pLocFrag->adjustQualAndAlignClippingWhenRemoveColumnOfPads( nConsDeletePos ); } // loop through all located fragments // now tell the base segment array to do essentially the // same thing. baseSegArray_.adjustSegsForDeletionAtPos(nConsDeletePos); assert( baseSegArray_.bGetDataStructureOk() ); // now adjust consensus tags for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); pTag->adjustTagPosition( nConsDeletePos, -1 ); } // now update the last displayable contig position, since the last // fragment has been moved to the right updateLastDisplayableContigPos(); } void Contig :: updateLastDisplayableContigPos() { // made this so it is never less than // the last consensus base. (DG, May 2010) // Otherwise, when running down the consensus, Contig::ntGetCons will // throw an exception when getting beyond nLastDisplayableContigPos_ // This occurred in fixContigEnds for regions where there are no reads. nLastDisplayableContigPos_ = nGetEndConsensusIndex(); int nNumFrags = nGetNumberOfFragsInContig(); for (int nFrag = 0; nFrag < nNumFrags; nFrag++) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = pLocatedFragmentGet( nFrag ); if (pLocFrag->nGetAlignEnd() > nLastDisplayableContigPos_) { nLastDisplayableContigPos_ = pLocFrag->nGetAlignEnd(); } } } void Contig :: updateFirstDisplayableContigPos() { nFirstDisplayableContigPos_ = 1; int nNumFrags = nGetNumberOfFragsInContig(); for( int nRead = 0; nRead < nNumFrags; ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->nGetAlignStart() < nFirstDisplayableContigPos_ ) { nFirstDisplayableContigPos_ = pLocFrag->nGetAlignStart(); } } } void Contig :: updateFirstAndLastDisplayableContigPos() { // this was crazily inefficient: // updateFirstDisplayableContigPos(); //updateLastDisplayableContigPos(); nFirstDisplayableContigPos_ = 1; nLastDisplayableContigPos_ = nGetEndConsensusIndex(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->nGetAlignStart() < nFirstDisplayableContigPos_ ) { nFirstDisplayableContigPos_ = pLocFrag->nGetAlignStart(); } if ( nLastDisplayableContigPos_ < pLocFrag->nGetAlignEnd() ) { nLastDisplayableContigPos_ = pLocFrag->nGetAlignEnd(); } } } void Contig :: setChanged() { bChanged_ = true; ConsEd::pGetAssembly()->setChanged(); bOKToUsePaddedIndexFast_ = false; // force calling setPaddedPositionsArray // in navigateByHighlyDiscrepantPositions, force recalculating // paDiscrepanciesNotIndels_ and paDiscrepanciesJustIndels_ when // the user changes the contig, such as when the consensus is extended. // This fixes a bug in which the user would have assembly view up, // extend the consensus, and then Assembly View would spew errors looking // like this: //Gordon C++ class toolkit error: File 'mbtValVectorOfBool.h', line 47 In mbtValVectorOfBool::operator[] ( 121519 ) out of bounds with nCurrentLength_ = 121518 name: Contig::pDiscrepanciesNotIndels_ Version 17.45 (beta) (081029) bDiscrepancyListsCalculated_ = false; } int Contig :: nPaddedIndex( const int nUnpaddedConsPos ) { assert( nGetFirstDisplayableContigPos() <= nUnpaddedConsPos ); // if the position is less than the beginning of the contig bases // then unpadded = padded if ( nUnpaddedConsPos < Sequence::nGetStartIndex() ) return( nUnpaddedConsPos ); int nUnpaddedPosOfLastContigBase = nUnpaddedIndex( nGetEndConsensusIndex() ); if ( nUnpaddedConsPos <= nUnpaddedPosOfLastContigBase ) return( Sequence::nPaddedIndex( nUnpaddedConsPos ) ); else { // case in which the base is off the right edge int nBasesPastRightMostContigBase = nUnpaddedConsPos - nUnpaddedPosOfLastContigBase; int nPaddedIndex = nGetEndConsensusIndex() + nBasesPastRightMostContigBase; assert( nPaddedIndex <= nGetLastDisplayableContigPos() ); return( nPaddedIndex ); } } void Contig :: setContigMatchTables() { if ( pUnpaddedContig_ ) { freeContigMatchTables( pUnpaddedContig_ ); } if ( pUnpaddedContigComplemented_ ) { freeContigMatchTables( pUnpaddedContigComplemented_ ); } pUnpaddedContig_ = (contigMatchTablesType*) malloc( sizeof( contigMatchTablesType ) ); if ( !pUnpaddedContig_ ) { THROW_NOT_ENOUGH_MEMORY; } pUnpaddedContigComplemented_ = (contigMatchTablesType*) malloc( sizeof( contigMatchTablesType ) ); if ( !pUnpaddedContigComplemented_ ) { THROW_NOT_ENOUGH_MEMORY; } int nUnpaddedContigLength = nGetUnpaddedSeqLength(); pUnpaddedContig_-> nUnpaddedBases_ = nUnpaddedContigLength; pUnpaddedContigComplemented_->nUnpaddedBases_ = nUnpaddedContigLength; pUnpaddedContig_-> szUnpaddedBases_ = (char*) malloc( sizeof(char) * ( nUnpaddedContigLength + 1 ) ); if ( !pUnpaddedContig_->szUnpaddedBases_ ) { THROW_NOT_ENOUGH_MEMORY; } pUnpaddedContigComplemented_->szUnpaddedBases_ = (char*) malloc( sizeof(char) * ( nUnpaddedContigLength + 1 ) ); if ( !pUnpaddedContigComplemented_->szUnpaddedBases_ ) { THROW_NOT_ENOUGH_MEMORY; } pUnpaddedContig_-> pUnpaddedQualityArray_ = (unsigned char*) malloc( sizeof( unsigned char ) * nUnpaddedContigLength ); if ( !pUnpaddedContig_->pUnpaddedQualityArray_ ) { THROW_NOT_ENOUGH_MEMORY; } pUnpaddedContigComplemented_->pUnpaddedQualityArray_ = (unsigned char*) malloc( sizeof( unsigned char ) * nUnpaddedContigLength ); if ( !pUnpaddedContigComplemented_->pUnpaddedQualityArray_ ) { THROW_NOT_ENOUGH_MEMORY; } int nZeroBasedUnpadded = 0; for( int nPaddedConsPos = nGetStartConsensusIndex(); nPaddedConsPos <= nGetEndConsensusIndex(); ++nPaddedConsPos ) { Ntide nt = ntGetCons( nPaddedConsPos ); if (! nt.bIsPad() ) { pUnpaddedContig_->szUnpaddedBases_[ nZeroBasedUnpadded ] = nt.cGetBase(); pUnpaddedContig_->pUnpaddedQualityArray_[ nZeroBasedUnpadded ] = nt.qualGetQuality(); ++nZeroBasedUnpadded; } } assert( nZeroBasedUnpadded == nUnpaddedContigLength ); // now that we have gotten the unpadded array, the complemented // unpadded array can get gotten more efficiently for( nZeroBasedUnpadded = 0; nZeroBasedUnpadded < nUnpaddedContigLength; ++nZeroBasedUnpadded ) { int nZeroBasedComplement = nUnpaddedContigLength - 1 - nZeroBasedUnpadded; pUnpaddedContigComplemented_->szUnpaddedBases_[ nZeroBasedUnpadded ] = cComplementBase( pUnpaddedContig_->szUnpaddedBases_[ nZeroBasedComplement ] ); pUnpaddedContigComplemented_->pUnpaddedQualityArray_[ nZeroBasedUnpadded ] = pUnpaddedContig_->pUnpaddedQualityArray_[ nZeroBasedComplement ]; } setMatchLookupTable( pUnpaddedContig_ ); setMatchLookupTable( pUnpaddedContigComplemented_ ); } void Contig :: addConsensusTag( tag* pConsensusTag ) { aConsensusTags_.insert( pConsensusTag ); } void Contig :: dumpConsensusTags() { for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); cout << (long) pTag << " type: " << pTag->soType_ << " source: " << pTag->soSource_ << " from " << pTag->nPaddedConsPosStart_ << " to " << pTag->nPaddedConsPosEnd_ << " date: " << pTag->soDate_ << endl; } } void Contig :: removeConsensusTag( tag* pConsensusTag ) { assert( pConsensusTag == aConsensusTags_.remove( pConsensusTag ) ); } void Contig :: writeConsensusTagsToAceFile( ofstream& ofsAceFile ) { for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); pTag->writeTagToAceFile( ofsAceFile ); } } void Contig :: setPaddedPositionsArray() { if (!pPaddedFromUnpadded_ ) { pPaddedFromUnpadded_ = new paddedFromUnpadded( this ); } pPaddedFromUnpadded_->setPaddedPositionsArray(); bOKToUsePaddedIndexFast_ = true; } void Contig :: writeConsensusToFastaFile2( const bool bWriteBothBasesAndQualFiles, const FileName& soBasesFileName, const FileName& soQualFileName, const bool bWriteWholeContig, const int nStartUnpaddedConsPos, const int nEndUnpaddedConsPos ) { FILE* pfilBases = fopen( (char*) soBasesFileName.data(), "w" ); if ( pfilBases == NULL ) { ostringstream ost; ost << "Unable to open file " << soBasesFileName << " for writing bases " << endl << ends; SysRequestFailed srf( ost.str().c_str() ); srf.includeErrnoDescription(); throw srf; } FILE* pfilQualities; if ( bWriteBothBasesAndQualFiles ) { pfilQualities = fopen( (char*) soQualFileName.data(), "w" ); if ( pfilQualities == NULL ) { ostringstream ost; ost << "Unable to open file " << soQualFileName << " for writing qualities " << endl << ends; SysRequestFailed srf( ost.str().c_str() ); srf.includeErrnoDescription(); throw srf; } } writeConsensusToFastaFile3( pfilBases, pfilQualities, bWriteBothBasesAndQualFiles, soBasesFileName, soQualFileName, bWriteWholeContig, nStartUnpaddedConsPos, nEndUnpaddedConsPos ); fclose( pfilBases ); if ( bWriteBothBasesAndQualFiles ) { fclose( pfilQualities ); } } void Contig :: writeConsensusToFastaFile3( FILE *pfilBases, FILE *pfilQualities, const bool bWriteBothBasesAndQualFiles, const FileName& soBasesFileName, const FileName& soQualFileName, const bool bWriteWholeContig, const int nStartUnpaddedConsPos, const int nEndUnpaddedConsPos, const RWCString& soHeaderLine ) { if ( soHeaderLine.bIsNull() ) { fprintf( pfilBases, ">%s %s", (char*) soGetName().data(), (char*) ConsEd::pGetAssembly()->soGetAceFileName().data() ); if ( bWriteWholeContig ) { fprintf( pfilBases, " (whole contig)\n" ); } else { fprintf( pfilBases, " from %d to %d\n", nStartUnpaddedConsPos, nEndUnpaddedConsPos ); } if ( bWriteBothBasesAndQualFiles ) { fprintf( pfilQualities, ">%s %s", (char*) soGetName().data(), (char*) ConsEd::pGetAssembly()->soGetAceFileName().data() ); if ( bWriteWholeContig ) { fprintf( pfilQualities, " (whole contig)\n" ); } else { fprintf( pfilQualities, " from %d to %d\n", nStartUnpaddedConsPos, nEndUnpaddedConsPos ); } } } else { fprintf( pfilBases, ">%s\n", soHeaderLine.data() ); if ( bWriteBothBasesAndQualFiles ) { fprintf( pfilQualities, ">%s\n", soHeaderLine.data() ); } } int nStartUnpadded = nStartUnpaddedConsPos; int nEndUnpadded = nEndUnpaddedConsPos; if ( bWriteWholeContig ) { nStartUnpadded = nGetUnpaddedStartIndex(); nEndUnpadded = nGetUnpaddedEndIndex(); } setPaddedPositionsArray(); const int nFastaMaxLineLen = 50; int nCharsOnLine = 0; for( int nUnpadded = nStartUnpadded; nUnpadded <= nEndUnpadded; ++ nUnpadded ) { int nPadded = nPaddedIndexFast( nUnpadded ); Ntide nt = ntGetCons( nPadded ); fprintf( pfilBases, "%c", toupper( nt.cGetBase() ) ); if ( bWriteBothBasesAndQualFiles ) fprintf( pfilQualities, "%d", (int) nt.qualGetQuality() ); ++nCharsOnLine; if ( nCharsOnLine < nFastaMaxLineLen ) { if ( bWriteBothBasesAndQualFiles ) fprintf( pfilQualities, " " ); } else { nCharsOnLine = 0; fprintf( pfilBases, "\n" ); if ( bWriteBothBasesAndQualFiles ) fprintf( pfilQualities, "\n" ); } } if ( nCharsOnLine ) { fprintf( pfilBases, "\n" ); if ( bWriteBothBasesAndQualFiles ) fprintf( pfilQualities, "\n" ); } } void Contig :: writeConsensusToPhdFile( const FileName& soPhdPathname, const bool bWriteWholeContig, const int nStartUnpaddedConsPos, const int nEndUnpaddedConsPos ) { ofstream ofsPHD; RWCString soLine( (RWSize_2T) 200 ); #ifdef OFSTREAM_OPEN_WITHOUT_PERMISSIONS ofsPHD.open( soPhdPathname.data(), ios:: out ); #else ofsPHD.open( soPhdPathname.data(), ios:: out, 0666 ); #endif if ( !ofsPHD ) { ostringstream ost; ost << "could not open file " << soPhdPathname << " for output" << endl << ends; SysRequestFailed srf(ost.str().c_str() ); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } // This used to make the phd file have: // BEGIN_SEQUENCE myname.phd.1 // but addReads2Consed.perl had a problem with this because it would // append .phd.1 to this name and thus look for the file // myname.phd.1.phd.1 which it wouldn't find (according to Don Bovee, // Aug 2002) so make the BEGIN_SEQUENCE line not include this. RWCString soSequenceName = soPhdPathname.soGetBasename(); RWCRegexp regPhd = "\\.phd\\.[0-9]+$"; soSequenceName( regPhd ) = ""; ofsPHD << "BEGIN_SEQUENCE " << soSequenceName << endl; ofsPHD << endl; ofsPHD << "BEGIN_COMMENT" << endl; ofsPHD << endl; ofsPHD << "CHROMAT_FILE: none" << endl; ofsPHD << "ABI_THUMBPRINT: none" << endl; ofsPHD << "PHRED_VERSION: not called by phred" << endl; ofsPHD << "CALL_METHOD: consed export consensus" << endl; ofsPHD << "QUALITY_LEVELS: 99" << endl; time_t tm = time( NULL ); ofsPHD << "TIME: " << ctime( &tm ) << endl; ofsPHD << "END_COMMENT" << endl; ofsPHD << endl; ofsPHD << "BEGIN_DNA" << endl; int nStartUnpadded = nStartUnpaddedConsPos; int nEndUnpadded = nEndUnpaddedConsPos; if ( bWriteWholeContig ) { nStartUnpadded = nGetUnpaddedStartIndex(); nEndUnpadded = nGetUnpaddedEndIndex(); } setPaddedPositionsArray(); for( int nUnpadded = nStartUnpadded; nUnpadded <= nEndUnpadded; ++ nUnpadded ) { int nPadded = nPaddedIndexFast( nUnpadded ); ofsPHD << ntGetCons( nPadded ).cGetBase() << " " << ( (int) ntGetCons( nPadded ).qualGetQuality() ) << " 0" << endl; } ofsPHD << "END_DNA" << endl; ofsPHD << endl; ofsPHD << "END_SEQUENCE" << endl; ofsPHD << endl; ofsPHD << "WR{" << endl; // add item type and source ofsPHD << "fake consed " << soGetDateTime( nNoSlashes ) << endl; ofsPHD << "}" << endl; ofsPHD << endl; #ifdef OFSTREAM_OPEN_WITHOUT_PERMISSIONS // I'm putting the close inside the ifdef because there never // used to be a close at all. ofsPHD.close(); int nError = chmod( soPhdPathname.data(), 0666 ); #endif } Contig :: ~Contig() { ConsEd::pGetConsEd()->deleteAllCompareContigWindowsForContig( this ); ConsEd::pGetConsEd()->deleteAllContigWinsForContig( this ); // assumes that reads have been put somewhere else // so don't delete them. Just delete pointers to them. apLocatedFragment_.clear(); baseSegArray_.dapBs_.clearAndDestroy(); delete pUnpaddedContig_; delete pUnpaddedContigComplemented_; // free unpaddedFromPadded_.pUnpaddedPositions_; // This is already freed by ~unpaddedFromPadded which is called by // ~Contig automatically since unpaddedFromPadded is memory data of // Contig delete pPaddedFromUnpadded_; ConsEd::pGetAssembly()->removeContig( this ); } double Contig :: dGetDepthOfCoverage() { int nSumOfPaddedReadLengths = 0; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); nSumOfPaddedReadLengths += pLocFrag->nGetPaddedSeqLength(); } return( (double) nSumOfPaddedReadLengths / (double) nGetPaddedSeqLength() ); } void Contig :: dumpTemplates() { for( int nTemplate = 0; nTemplate < aSubcloneTemplates_.length(); ++nTemplate ) { subcloneTTemplate* pSub = aSubcloneTemplates_[ nTemplate ]; pSub->dumpTemplate(); } } void Contig :: getErrorInfo( const int nUnpaddedLeftPos, const int nUnpaddedRightPos, double& dTotalErrors, int& nTotalNonEditedBases, double& dErrorRate ) { int nLeftConsPos = nPaddedIndex( nUnpaddedLeftPos ); int nRightConsPos = nPaddedIndex( nUnpaddedRightPos ); dTotalErrors = 0.0; nTotalNonEditedBases = 0; double dError; Quality qual; double dQuality; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { Ntide nt = ntGetCons( nConsPos ); if ( !nt.bIsPad() ) { qual = nt.qualGetQuality(); // do not include edited bases in the quality // calculation if (qual == ucQualityLowEdited ) continue; if (qual == ucQualityHighEdited ) continue; if ( qual <= 0 ) dError = 1.0; else dError = dErrorRateFromQuality[ qual ]; ++nTotalNonEditedBases; dTotalErrors += dError; } } if ( nTotalNonEditedBases > 0 ) dErrorRate = dTotalErrors / ( double ) nTotalNonEditedBases; else dErrorRate = -1; } void Contig ::getSingleSubcloneInfo( const int nUnpaddedLeftPos, const int nUnpaddedRightPos, int& nSingleSubcloneBases ) { autoFinishSetSingleSubcloneArrays( true ); // bObserveDoNotFinishTags nSingleSubcloneBases = 0; for( int nUnpadded = nUnpaddedLeftPos; nUnpadded <= nUnpaddedRightPos; ++nUnpadded ) { if ( (*pNumberOfSubclonesCoveringEachBase_)[ nUnpadded ] < 2 ) ++nSingleSubcloneBases; } } int Contig :: nGetCurrentSingleSubcloneBases( const int nUnpaddedLeftPos, const int nUnpaddedRightPos ) { int nUnpaddedLeftOnConsensus = MAX( nUnpaddedLeftPos, nGetUnpaddedStartIndex() ); int nUnpaddedRightOnConsensus = MIN( nUnpaddedRightPos, nGetUnpaddedEndIndex() ); if ( nUnpaddedLeftOnConsensus > nUnpaddedRightOnConsensus ) { // region is completely off the consensus so count each // base as a single subclone base return( nUnpaddedRightPos - nUnpaddedLeftPos + 1 ); } int nSingleSubcloneBases = 0; for( int nUnpadded = nUnpaddedLeftOnConsensus; nUnpadded <= nUnpaddedRightOnConsensus; ++nUnpadded ) { if ( (*pNumberOfSubclonesCoveringEachBaseWithoutSuggestedReads_)[ nUnpadded ] < 2 ) ++nSingleSubcloneBases; } // ---------------------- (consensus) // ---------------- (high quality part of read) // ^ ^ nCurrentSingleSubcloneBases (from above) // ^ ^ (additional single subclone bases) // b a // where a = nUnpaddedHighQualityRegionLeftOnConsensus - 1 // b = nHighQualityRegionLeft_ // so additional is a - b + 1 or just // nUnpaddedHighQualityRegionLeftOnConsensus - nHighQualityRegionLeft_) if ( nUnpaddedLeftPos < nUnpaddedLeftOnConsensus ) nSingleSubcloneBases += ( nUnpaddedLeftOnConsensus - nUnpaddedLeftPos ); if ( nUnpaddedRightPos > nUnpaddedRightOnConsensus ) nSingleSubcloneBases += ( nUnpaddedRightPos - nUnpaddedRightOnConsensus ); return( nSingleSubcloneBases ); } // quality 13 has an error rate of 0.050118723 so // 0.05 - (error rate of quality 13 base) = neg number // Hence quality 13 bases on the end of a high quality segment will // not be included in it. static double dAmountToSubtractFrom = 0.05; void Contig :: setHighQualitySegment() { bool bHighQualitySegmentExists; int nUnpaddedHighQualitySegmentStart; int nUnpaddedHighQualitySegmentEnd; int nHighQualitySegmentConsPosStart; int nHighQualitySegmentConsPosEnd; findHighQualitySegment( bHighQualitySegmentExists, nUnpaddedHighQualitySegmentStart, nUnpaddedHighQualitySegmentEnd, nHighQualitySegmentConsPosStart, nHighQualitySegmentConsPosEnd ); if ( bHighQualitySegmentExists ) { nUnpaddedHighQualitySegmentStart_ = nUnpaddedHighQualitySegmentStart; nUnpaddedHighQualitySegmentEnd_ = nUnpaddedHighQualitySegmentEnd; } else { // note that nUnpaddedHighQualitySegmentStart_ and // nUnpaddedHighQualitySegmentEnd_ may both be -1, indicating that // there is no high quality segment nUnpaddedHighQualitySegmentStart_ = -1; nUnpaddedHighQualitySegmentEnd_ = -1; } assert( nUnpaddedHighQualitySegmentStart_ <= nUnpaddedHighQualitySegmentEnd_ ); // prepare these just in case we are autofinish and going to choose reads nUpdatedUnpaddedHighQualitySegmentStart_ = nUnpaddedHighQualitySegmentStart_; nUpdatedUnpaddedHighQualitySegmentEnd_ = nUnpaddedHighQualitySegmentEnd_; } RWCString Contig :: soGetFancyName() { RWCString soFancyName; for( int nTag = 0; nTag < aConsensusTags_.length(); ++nTag ) { if( aConsensusTags_[ nTag ]->soType_ == "contigName" ) { tag* pTag = aConsensusTags_[ nTag ]; if ( pTag->soMiscData_.length() > 1 ) { if ( !soFancyName.isNull() ) soFancyName += " "; soFancyName += pTag->soMiscData_.soGetRestOfString( 1 ); } if ( pTag->soMiscData_.length() > 0 ) { soFancyName += " "; if ( pTag->soMiscData_[0] == 'U' ) soFancyName += "->"; else soFancyName += "<-"; } } } if ( !soFancyName.isNull() ) soFancyName += " "; soFancyName += soGetName(); return( soFancyName ); } void Contig :: getUnpaddedRangeForMakingPCRPrimers( const int nWhichEnd, int& nUnpaddedLeft, int& nUnpaddedRight, bool& bProblems ) { if ( nWhichEnd == nLeftGap ) { nUnpaddedLeft = nGetHighQualitySegmentStart(); // handles case // of no high quality segment nUnpaddedLeft += pCP->nPrimersMakePCRPrimersThisManyBasesBackFromEndOfHighQualitySegment_; nUnpaddedRight = nUnpaddedLeft + pCP->nPrimersWindowSizeInLookingToUse_; } else { nUnpaddedRight = nGetHighQualitySegmentEnd(); nUnpaddedRight -= pCP->nPrimersMakePCRPrimersThisManyBasesBackFromEndOfHighQualitySegment_; nUnpaddedLeft = nUnpaddedRight - pCP->nPrimersWindowSizeInLookingToUse_; } if ( bIntersect( nUnpaddedLeft, nUnpaddedRight, nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(), nUnpaddedLeft, nUnpaddedRight ) ) bProblems = false; else bProblems = true; } int Contig :: nGetPaddedConsPosForMakingPCRPrimers( const int nWhichEnd ) { int nUnpadded; if ( nWhichEnd == nLeftGap ) { if ( bIsThereAHighQualitySegment() ) nUnpadded = nUnpaddedHighQualitySegmentStart_; else nUnpadded = nGetUnpaddedStartIndex(); nUnpadded += pCP->nPrimersMakePCRPrimersThisManyBasesBackFromEndOfHighQualitySegment_; if ( nUnpadded > nGetUnpaddedEndIndex() ) nUnpadded = nGetUnpaddedEndIndex(); } else { if ( bIsThereAHighQualitySegment() ) nUnpadded = nUnpaddedHighQualitySegmentEnd_; else nUnpadded = nGetUnpaddedEndIndex(); nUnpadded -= pCP->nPrimersMakePCRPrimersThisManyBasesBackFromEndOfHighQualitySegment_; if ( nUnpadded < nGetUnpaddedStartIndex() ) nUnpadded = nGetUnpaddedStartIndex(); } return( nPaddedIndex( nUnpadded ) ); } void Contig :: findProblemsInContig( const bool bFindSingleSubcloneRegions, const bool bFindHighQualityDiscrepancies, const bool bFindUnalignedHighQualityRegions) { if ( pProblemsInUnpaddedContig_ ) delete pProblemsInUnpaddedContig_; pProblemsInUnpaddedContig_ = new mbtValVector( nGetUnpaddedSeqLength(), 1, 0 ); if ( bFindSingleSubcloneRegions ) addSingleSubcloneBasesToProblemsArray(); if ( bFindHighQualityDiscrepancies ) addHighQualityDiscrepanciesToProblemsArray(); if ( bFindUnalignedHighQualityRegions ) addUnalignedHighQualityRegionsToProblemsArray(); } void Contig :: addSingleSubcloneBasesToProblemsArray() { mbtPtrVector aPaddedContigOfReadPtrs( nGetPaddedSeqLength(), nGetStartIndex() ); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; RWCString soTemplateName = pLocFrag->soGetTemplateName(); int nConsPosStart = pLocFrag->nGetAlignClipStart(); int nConsPosEnd = pLocFrag->nGetAlignClipEnd(); // we are just interested in the part of the read that // is on the contig (overlaps the consensus) if ( !bIntersect( nConsPosStart, nConsPosEnd, nGetStartIndex(), nGetEndIndex(), nConsPosStart, nConsPosEnd ) ) continue; for( int nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { if ( aPaddedContigOfReadPtrs[ nConsPos ] == (LocatedFragment*) -1 ) continue; if ( !aPaddedContigOfReadPtrs[ nConsPos ] ) aPaddedContigOfReadPtrs[ nConsPos ] = pLocFrag; else { if ( aPaddedContigOfReadPtrs[ nConsPos ]->soGetTemplateName() != soTemplateName ) aPaddedContigOfReadPtrs[ nConsPos ] = (LocatedFragment*) -1; } } } for( int nConsPos = nGetStartIndex(); nConsPos <= nGetEndIndex(); ++nConsPos ) { if ( aPaddedContigOfReadPtrs[ nConsPos ] != (LocatedFragment*) -1 ) { int nUnpadded = nUnpaddedIndex( nConsPos ); // this check is to fix case in which the first base // of the consensus is a pad if ( nUnpadded >= nGetUnpaddedStartIndex() ) { (*pProblemsInUnpaddedContig_)[ nUnpadded ] |= uSINGLE_SUBCLONE_BASE; } } } } void Contig :: addHighQualityDiscrepanciesToProblemsArray() { for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nConsPosStartOnConsensus; int nConsPosEndOnConsensus; if ( bIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nGetStartIndex(), nGetEndIndex(), nConsPosStartOnConsensus, nConsPosEndOnConsensus ) ) { for( int nConsPos = nConsPosStartOnConsensus; nConsPos <= nConsPosEndOnConsensus; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != cGetConsensusBase( nConsPos ) ) { if ( !pLocFrag->bIsInAlignedPartOfReadAndNotTooCloseToUnalignedPart( nConsPos ) ) continue; if ( pLocFrag->bIsWithinACompressionOrG_dropoutTag( nConsPos ) ) continue; int nUnpadded = nUnpaddedIndex( nConsPos ); (*pProblemsInUnpaddedContig_)[ nUnpadded ] |= uHIGH_QUALITY_DISCREPANCY; } // if ( pLocFrag->ntGetFragFromConsPos( ... } // if ( pLocFrag->ntGetFragFromConsPos( nConsPos )... } // for( int nConsPos = ... } // if ( bIntersect( pLocFrag->nGetAlignStart(), ... } // for( int nRead = ... } void Contig :: addUnalignedHighQualityRegionsToProblemsArray() { gotoList* pGotoList = new unalignedHighQualityRegionsGotoList( this, false ); // do *not* // add totally unaligned reads to the problems array (which will // be used to pick primers) for( int nGotoItem = 0; nGotoItem < pGotoList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = (*pGotoList)[ nGotoItem ]; int nUnpaddedOnConsensusStart; int nUnpaddedOnConsensusEnd; if ( bIntersect( pGotoItem->nUnpaddedGotoItemStart_, pGotoItem->nUnpaddedGotoItemEnd_, nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(), nUnpaddedOnConsensusStart, nUnpaddedOnConsensusEnd ) ) { for( int nUnpadded = nUnpaddedOnConsensusStart; nUnpadded <= nUnpaddedOnConsensusEnd; ++nUnpadded ) { (*pProblemsInUnpaddedContig_)[ nUnpadded ] |= uUNALIGNED_HIGH_QUALITY_REGION; } } } delete pGotoList; } bool Contig :: bIsThereAProblemHere( const int nUnpaddedLeft, const int nUnpaddedRight, int& nWhichProblem ) { if ( !pProblemsInUnpaddedContig_ ) findProblemsInContig( !pCP->bPrimersOKToChoosePrimersInSingleSubcloneRegion_, !pCP->bPrimersOKToChoosePrimersWhereHighQualityDiscrepancies_, !pCP->bPrimersOKToChoosePrimersWhereUnalignedHighQualityRegion_ ); for( int nUnpadded = nUnpaddedLeft; nUnpadded <= nUnpaddedRight; ++nUnpadded ) { if ( (*pProblemsInUnpaddedContig_)[ nUnpadded ] ) { if ( (*pProblemsInUnpaddedContig_)[ nUnpadded ] & uSINGLE_SUBCLONE_BASE ) { nWhichProblem = BAD_PRIMER_IN_SINGLE_SUBCLONE_REGION; } else if ( (*pProblemsInUnpaddedContig_)[ nUnpadded ] & uHIGH_QUALITY_DISCREPANCY ) nWhichProblem = BAD_PRIMER_WHERE_HIGH_QUALITY_DISCREPANCIES; else if ( (*pProblemsInUnpaddedContig_)[ nUnpadded ] & uUNALIGNED_HIGH_QUALITY_REGION ) nWhichProblem = BAD_PRIMER_WHERE_UNALIGNED_HIGH_QUALITY_REGION; else assert( false ); return( true ); } } return( false ); } static int nTable[256]; static bool bTableIsSet = false; void Contig :: addMononucleotideRichRegionsToList( gotoList* pGotoList, const int nUnpaddedStartOfRegion, const int nUnpaddedEndOfRegion, const bool bNeedToSetPaddedPositionsArrays, bool& bFoundARegion ) { bFoundARegion = false; if ( bNeedToSetPaddedPositionsArrays ) setPaddedPositionsArray(); int nPaddedStart = nPaddedIndexFast( nUnpaddedStartOfRegion ); int nUnpaddedLengthOfRegion = nUnpaddedEndOfRegion - nUnpaddedStartOfRegion; char* szBases = szGetPartUnpaddedConsensusToRight( nPaddedStart, nUnpaddedLengthOfRegion ); unsigned char* pUnpaddedRegion = (unsigned char*) calloc( nUnpaddedLengthOfRegion + 1, // save room for sentinel sizeof( unsigned char ) ); assert( pUnpaddedRegion ); const int nWindow = 20; const int nMinNumberOfSingleNucleotides = 16; if ( nUnpaddedLengthOfRegion < nWindow ) return; int nACGT[5]; nACGT[0] = 0; nACGT[1] = 0; nACGT[2] = 0; nACGT[3] = 0; nACGT[4] = 0; if ( !bTableIsSet ) { for( int n = 0; n < 256; ++n) nTable[n] = 0; nTable['a'] = 1; nTable['c'] = 2; nTable['g'] = 3; nTable['t'] = 4; // everything else is zero bTableIsSet = true; } // count the bases in the first window int nZeroBased; for( nZeroBased = 0; nZeroBased < nWindow; ++nZeroBased ) { ++nACGT[ nTable[ szBases[ nZeroBased ] ] ]; } // check that first window and see if it is a mononucleotide-rich region if ( nACGT[1] >= nMinNumberOfSingleNucleotides || nACGT[2] >= nMinNumberOfSingleNucleotides || nACGT[3] >= nMinNumberOfSingleNucleotides || nACGT[4] >= nMinNumberOfSingleNucleotides ) { // pepper the bit-string for( nZeroBased = 0; nZeroBased < nWindow; ++nZeroBased ) { pUnpaddedRegion[ nZeroBased ] = 1; } } int nZeroBasedBeforeWindow; for( int nZeroBasedEndOfWindow = nWindow; nZeroBasedEndOfWindow < nUnpaddedLengthOfRegion; ++nZeroBasedEndOfWindow ) { ++nACGT[ nTable[ szBases[ nZeroBasedEndOfWindow ] ] ]; nZeroBasedBeforeWindow = nZeroBasedEndOfWindow - nWindow; --nACGT[ nTable[ szBases[ nZeroBasedBeforeWindow ] ] ]; if ( nACGT[1] >= nMinNumberOfSingleNucleotides || nACGT[2] >= nMinNumberOfSingleNucleotides || nACGT[3] >= nMinNumberOfSingleNucleotides || nACGT[4] >= nMinNumberOfSingleNucleotides ) { // pepper the region for( nZeroBased = nZeroBasedEndOfWindow - nWindow + 1; nZeroBased <= nZeroBasedEndOfWindow; ++nZeroBased ) { pUnpaddedRegion[ nZeroBased ] = 1; } } } bool bInRegion = false; int nZeroBasedStartingPositionOfRegion; // using nUnpaddedLengthOfRegion + 1 so it will check the 0-element of // pUnpaddedRegion after the end of the region. Thus if a // mononucleotide-reich region goes right up to the end of the // region, we can add the goto item with the same code for( nZeroBased = 0; nZeroBased < ( nUnpaddedLengthOfRegion + 1 ); ++nZeroBased ) { if ( pUnpaddedRegion[ nZeroBased ] ) { if ( !bInRegion ) nZeroBasedStartingPositionOfRegion = nZeroBased; bInRegion = true; } else { if ( bInRegion ) { // we just left a region int nUnpaddedLeft = nZeroBasedStartingPositionOfRegion + nUnpaddedStartOfRegion; int nUnpaddedRight = nZeroBased + nUnpaddedStartOfRegion; int nPaddedLeft = nPaddedIndexFast( nUnpaddedLeft ); int nPaddedRight = nPaddedIndexFast( nUnpaddedRight ); RWCString soNote( (long) ( nUnpaddedRight - nUnpaddedLeft + 1 ) ); soNote += " mononucleotide-rich region"; gotoItem* pGotoItem = new gotoItem( this, NULL, nPaddedLeft, nPaddedRight, nUnpaddedLeft, nUnpaddedRight, soNote ); pGotoList->addToList( pGotoItem ); bFoundARegion = true; } bInRegion = false; } } delete szBases; free( pUnpaddedRegion ); } void Contig :: addDinucleotideRichRegionsToList( gotoList* pGotoList, const int nUnpaddedStartOfRegion, const int nUnpaddedEndOfRegion, const bool bNeedToSetPaddedPositionsArrays, bool& bFoundARegion ) { bFoundARegion = false; if ( bNeedToSetPaddedPositionsArrays ) setPaddedPositionsArray(); int nPaddedStart = nPaddedIndexFast( nUnpaddedStartOfRegion ); int nUnpaddedLengthOfRegion = nUnpaddedEndOfRegion - nUnpaddedStartOfRegion; char* szBases = szGetPartUnpaddedConsensusToRight( nPaddedStart, nUnpaddedLengthOfRegion ); unsigned char* pUnpaddedRegion = (unsigned char*) calloc( nUnpaddedLengthOfRegion + 1, // save room for sentinel sizeof( unsigned char ) ); assert( pUnpaddedRegion ); const int nWindow2 = 60; const int nMinNumberOfTwoNucleotides = 57; if ( nUnpaddedLengthOfRegion < nWindow2 ) return; int nACGT[5]; nACGT[0] = 0; nACGT[1] = 0; nACGT[2] = 0; nACGT[3] = 0; nACGT[4] = 0; if ( !bTableIsSet ) { for( int n = 0; n < 256; ++n) nTable[n] = 0; nTable['a'] = 1; nTable['c'] = 2; nTable['g'] = 3; nTable['t'] = 4; // everything else is zero bTableIsSet = true; } // count the bases in the first window int nZeroBased; for( nZeroBased = 0; nZeroBased < nWindow2; ++nZeroBased ) { ++nACGT[ nTable[ szBases[ nZeroBased ] ] ]; } // check that first window and see if it is a mononucleotide-rich region if ( ( nACGT[1] + nACGT[2] ) >= nMinNumberOfTwoNucleotides || ( nACGT[1] + nACGT[3] ) >= nMinNumberOfTwoNucleotides || ( nACGT[1] + nACGT[4] ) >= nMinNumberOfTwoNucleotides || ( nACGT[2] + nACGT[3] ) >= nMinNumberOfTwoNucleotides || ( nACGT[2] + nACGT[4] ) >= nMinNumberOfTwoNucleotides || ( nACGT[3] + nACGT[4] ) >= nMinNumberOfTwoNucleotides ) { // pepper the bit-string for( nZeroBased = 0; nZeroBased < nWindow2; ++nZeroBased ) { pUnpaddedRegion[ nZeroBased ] = 1; } } int nZeroBasedBeforeWindow; for( int nZeroBasedEndOfWindow = nWindow2; nZeroBasedEndOfWindow < nUnpaddedLengthOfRegion; ++nZeroBasedEndOfWindow ) { ++nACGT[ nTable[ szBases[ nZeroBasedEndOfWindow ] ] ]; nZeroBasedBeforeWindow = nZeroBasedEndOfWindow - nWindow2; --nACGT[ nTable[ szBases[ nZeroBasedBeforeWindow ] ] ]; if ( ( nACGT[1] + nACGT[2] ) >= nMinNumberOfTwoNucleotides || ( nACGT[1] + nACGT[3] ) >= nMinNumberOfTwoNucleotides || ( nACGT[1] + nACGT[4] ) >= nMinNumberOfTwoNucleotides || ( nACGT[2] + nACGT[3] ) >= nMinNumberOfTwoNucleotides || ( nACGT[2] + nACGT[4] ) >= nMinNumberOfTwoNucleotides || ( nACGT[3] + nACGT[4] ) >= nMinNumberOfTwoNucleotides ) { // pepper the region for( nZeroBased = nZeroBasedEndOfWindow - nWindow2 + 1; nZeroBased <= nZeroBasedEndOfWindow; ++nZeroBased ) { pUnpaddedRegion[ nZeroBased ] = 1; } } } bool bInRegion = false; int nZeroBasedStartingPositionOfRegion; // using nUnpaddedLengthOfRegion + 1 so it will check the 0-element of // pUnpaddedRegion after the end of the region. Thus if a // mononucleotide-reich region goes right up to the end of the // region, we can add the goto item with the same code for( nZeroBased = 0; nZeroBased < ( nUnpaddedLengthOfRegion + 1 ); ++nZeroBased ) { if ( pUnpaddedRegion[ nZeroBased ] ) { if ( !bInRegion ) nZeroBasedStartingPositionOfRegion = nZeroBased; bInRegion = true; } else { if ( bInRegion ) { // we just left a region int nUnpaddedLeft = nZeroBasedStartingPositionOfRegion + nUnpaddedStartOfRegion; int nUnpaddedRight = nZeroBased + nUnpaddedStartOfRegion; int nPaddedLeft = nPaddedIndexFast( nUnpaddedLeft ); int nPaddedRight = nPaddedIndexFast( nUnpaddedRight ); RWCString soNote( (long) ( nUnpaddedRight - nUnpaddedLeft + 1 ) ); soNote += " dinucleotide-rich region"; gotoItem* pGotoItem = new gotoItem( this, NULL, nPaddedLeft, nPaddedRight, nUnpaddedLeft, nUnpaddedRight, soNote ); pGotoList->addToList( pGotoItem ); bFoundARegion = true; } bInRegion = false; } } delete szBases; free( pUnpaddedRegion ); } void Contig :: addStopsCausingLCQs( gotoList* pGotoList, const bool bSetPaddedPositionsArray ) { // now let's run through and see which reads are shorter than the mean // minus 2 standard deviations...well, now we are increasing this read // length double dMean; double dStandardDeviation; ConsEd::pGetAssembly()->getReadLengthStatistics( dMean, dStandardDeviation ); int nTooSmall = dMean - 0.9 * dStandardDeviation; if ( bSetPaddedPositionsArray ) setPaddedPositionsArray(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // we are only interested if a dye terminator read // ends too short before a low consensus quality region if ( pLocFrag->bIsDyePrimerNotDyeTerminator() ) continue; if ( pLocFrag->bWholeReadIsLowQuality_ ) continue; // read doesn't belong here, so no information from it // about the structure of the consensus if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nEndOfHighQualityReadPos; if ( pLocFrag->bComp() ) { // <----------------------------- // lllllHHHHHHHHHHHHHHHHHHlllllllll // ^ // nGetHighQualityStart() // ^ // nGetAlignEnd() nEndOfHighQualityReadPos = nUnpaddedIndex( pLocFrag->nGetAlignEnd() ) - nUnpaddedIndex( pLocFrag->nGetHighQualityStart() ) + 1; } else { // --------------------------------> // llllllHHHHHHHHHHHHHHHllllllllllll // ^ // nGetHighQualityEnd() // ^ // nGetAlignStart() nEndOfHighQualityReadPos = nUnpaddedIndex( pLocFrag->nGetHighQualityEnd() ) - nUnpaddedIndex( pLocFrag->nGetAlignStart() ) + 1; } // check that the read ends prematurely--this is probably the // check that eliminates most reads if ( nEndOfHighQualityReadPos > nTooSmall ) { // one more way we might be interested--look for n's after then // high quality segment int nConsPosLeft; int nConsPosRight; const int nMinimumNumberOfNs = 10; if ( pLocFrag->bComp() ) { nConsPosLeft = pLocFrag->nGetAlignStart(); nConsPosRight = pLocFrag->nGetHighQualityStart(); } else { nConsPosLeft = pLocFrag->nGetHighQualityEnd(); nConsPosRight = pLocFrag->nGetAlignEnd(); } if ( nConsPosLeft > nConsPosRight ) continue; // I don't see how this could ever happen, // but don't crash if it does int nNumberOfNs = 0; // now count n's for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'n' ) ++nNumberOfNs; } if ( nNumberOfNs < nMinimumNumberOfNs ) { // this read is both too long and doesn't have enough n's. continue; } } // now check that there is a low consensus quality region // (or a gap) downstream from the read const int nWindowDownstream = 25; // was 40, then 50, then 25 const int nWindowUpstream = 5; // was 20, then 5 int nAfterHighQualityConsPosLeft; int nAfterHighQualityConsPosRight; // top strand reads: // ---------^---------------------- // ^ nGetHighQualityEnd() // ^ nGetHighQualityEnd - nWindowUpstream + 1 // ^ // nGetHighQualityEnd + nWindowDownstream - 1 // bottom strand reads: // ----------------------^--------- // ^ nGetHighQualityStart() // ^ nGetHighQualityStart() - nWindowDownstream + 1 // ^ // nGetHighQualityStart() + nWindowUpstream - 1 if ( pLocFrag->bComp() ) { nAfterHighQualityConsPosRight = pLocFrag->nGetHighQualityStart() + nWindowUpstream - 1; nAfterHighQualityConsPosLeft = pLocFrag->nGetHighQualityStart() - nWindowDownstream + 1; } else { nAfterHighQualityConsPosLeft = pLocFrag->nGetHighQualityEnd() - nWindowUpstream + 1; nAfterHighQualityConsPosRight = pLocFrag->nGetHighQualityEnd() + nWindowDownstream - 1; } int nOnConsensusConsPosLeft; int nOnConsensusConsPosRight; bool bTooLowQuality = false; bool bRunsIntoGap = false; if ( !bIntersect( nGetStartIndex(), // contig nGetEndIndex(), // contig nAfterHighQualityConsPosLeft, nAfterHighQualityConsPosRight, nOnConsensusConsPosLeft, nOnConsensusConsPosRight ) ) bRunsIntoGap = true; else { if ( pLocFrag->bComp() && nAfterHighQualityConsPosLeft < nOnConsensusConsPosLeft ) { bRunsIntoGap = true; // case in which the read is pointing left and sticks // out the left end of the contig // (We do not consider reads like this to be stops // or runs unless there is a low consensus quality region: // --------- (consensus) // <----- read coming into consensus from right } else if ( !pLocFrag->bComp() && nOnConsensusConsPosRight < nAfterHighQualityConsPosRight ) { // case in which the read is pointing right and sticks out // the right end of the contig bRunsIntoGap = true; } else { // case in which the window is not in a gap // Shall I average error probabilities or // quality values? Or should I take the minimum? const int nMinNumberOfLowQualityBases = 5; int nTotalNumberOfLowQualityBases = 0; const Quality qMinAcceptableQuality = 25; for( int nConsPos = nOnConsensusConsPosLeft; nConsPos <= nOnConsensusConsPosRight; ++nConsPos ) { if ( ntGetCons( nConsPos ).qualGetQuality() < qMinAcceptableQuality ) { ++nTotalNumberOfLowQualityBases; if ( nTotalNumberOfLowQualityBases >= nMinNumberOfLowQualityBases ) { bTooLowQuality = true; break; } } } // for( int nConsPos ... } // if ( ( nOnConsensusConsPosRight ... else } // if ( !bIntersect.... else if ( !bRunsIntoGap && !bTooLowQuality ) { // cerr << "rejecting read " << pLocFrag->soGetName() << // " due to not being next to a low quality region or next to a gap" << endl; continue; } // now look for vector in a window following the high quality // region int nOnReadConsPosLeft; int nOnReadConsPosRight; bool bFoundVector = false; if ( bIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nAfterHighQualityConsPosLeft, nAfterHighQualityConsPosRight, nOnReadConsPosLeft, nOnReadConsPosRight ) ) { for( int nConsPos = nOnReadConsPosLeft; nConsPos <= nOnReadConsPosRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'x' ) { bFoundVector = true; break; } } if ( bFoundVector ) { // cerr << "rejecting read " << pLocFrag->soGetName() << // " due to running into vector" << endl; continue; } } // If reached here, the read has a low quality region // or a gap following it. // if ( pLocFrag->bUnalignedHighQualityRegionTooLong() ) { // cerr << "rejecting read " << pLocFrag->soGetName() << " due to unaligned high quality region too long" << endl; // continue; // } // If the read is just a crummy read, we don't want it to // indicate a need for special chemistry. So check that // there are enough high quality bases in the read. // let's check how many bases there are above a particular // threshold. const unsigned char ucMinimumQualityToBeCounted = 30; const int nMinimumNumberOfSuchBases = 20; bool bFoundEnoughHighQualityBases = false; int nTotalHighQualityBases = 0; int nConsPos; for( nConsPos = pLocFrag->nGetHighQualityStart(); nConsPos <= pLocFrag->nGetHighQualityEnd(); ++nConsPos ) { Quality q = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); // don't count pads or edited bases if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; if ( q == 98 || q == 99 ) continue; if ( q >= ucMinimumQualityToBeCounted ) { ++nTotalHighQualityBases; if ( nTotalHighQualityBases >= nMinimumNumberOfSuchBases ) { bFoundEnoughHighQualityBases = true; break; } } } if ( !bFoundEnoughHighQualityBases ) { continue; } // Check that this is not a compression. Special chemistries // do not work for compressions. The way we can check is to // see that there are enough high quality bases again after the // end of the high quality region. int nIncrement; int nBaseToStartAt; if ( pLocFrag->bComp() ) { nIncrement = -1; nBaseToStartAt = pLocFrag->nGetHighQualityStart() - 1; } else { nIncrement = 1; nBaseToStartAt = pLocFrag->nGetHighQualityEnd() + 1; } const unsigned char ucMinimumQualityToBeCounted2 = 30; const int nMinimumNumberOfSuchBases2 = 20; bool bFoundEnoughHighQualityBases2 = false; int nTotalHighQualityBases2 = 0; for( nConsPos = nBaseToStartAt; ( pLocFrag->bComp() ? ( nConsPos >= pLocFrag->nGetAlignStart() ) : ( nConsPos <= pLocFrag->nGetAlignEnd() ) ); nConsPos += nIncrement ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() >= ucMinimumQualityToBeCounted2 ) { ++nTotalHighQualityBases2; if ( nTotalHighQualityBases2 >= nMinimumNumberOfSuchBases2 ) { bFoundEnoughHighQualityBases2 = true; break; } } } if ( bFoundEnoughHighQualityBases2 ) { // cerr << "rejecting read " << pLocFrag->soGetName() << // " because there are at least " << nMinimumNumberOfSuchBases2 // << " bases of at least quality " << (int) ucMinimumQualityToBeCounted2 // << " after the putative stop or run so this can't be a real stop or run" << endl; continue; } // we are considering that the stop occurs at a single base-- // the end of the high quality region int nPaddedConsPos = ( pLocFrag->bComp() ? pLocFrag->nGetHighQualityStart() : pLocFrag->nGetHighQualityEnd() ); int nUnpaddedConsPos = pLocFrag->pContig_->nUnpaddedIndex( nPaddedConsPos ); RWCString soNote = ( pLocFrag->bComp() ? " stop left of here" : " stop right of here" ); gotoItem* pGotoItem = new gotoItem( this, NULL, nPaddedConsPos, nPaddedConsPos, nUnpaddedConsPos, nUnpaddedConsPos, soNote ); pGotoList->addToList( pGotoItem ); } //for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); } void Contig :: addRunsAndMicrosatellitesCausingLCQs( gotoList* pGotoList, const bool bSetPaddedPositionsArray) { if ( bSetPaddedPositionsArray ) setPaddedPositionsArray(); const int nConsolidateIfLessThanOrEqualToThisDistance = 20; gotoList* pLowConsQualGotoList = new LowConsQualGotoList( this ); // add lcq items at first and last bases of the contig so that // if there is a run near a gap, special chemistry is called. gotoItem* pGotoItemFirst = new gotoItem( this, NULL, 1, 1, 1, 1, "left gap" ); pLowConsQualGotoList->apGotoItem_.insertAt( 0, pGotoItemFirst ); gotoItem* pGotoItemLast = new gotoItem( this, NULL, nGetEndIndex(), nGetEndIndex(), nGetUnpaddedEndIndex(), nGetUnpaddedEndIndex(), "right gap" ); pLowConsQualGotoList->addToList( pGotoItemLast ); pLowConsQualGotoList->consolidateGotoItems( nConsolidateIfLessThanOrEqualToThisDistance ); for( int nGotoItem = 0; nGotoItem < pLowConsQualGotoList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = pLowConsQualGotoList->pGetGotoItem( nGotoItem ); const int nLookThisFarUpAndDownConsensusForRun = 40; int nUnpaddedLeftOfRegion = pGotoItem->nUnpaddedGotoItemStart_ - nLookThisFarUpAndDownConsensusForRun; int nUnpaddedRightOfRegion = pGotoItem->nUnpaddedGotoItemEnd_ + nLookThisFarUpAndDownConsensusForRun; // look in this region for both a repeat and a mononucleotide-rich // region // I don't know how this region could not intersect the // consensus, but let's be safe. It definitely could be // partially off the consensus, so I need to check that. if ( !bIntersect( nUnpaddedLeftOfRegion, nUnpaddedRightOfRegion, nGetUnpaddedStartIndex(), // contig nGetUnpaddedEndIndex(), // contig nUnpaddedLeftOfRegion, nUnpaddedRightOfRegion ) ) continue; int nUnpaddedLength = nUnpaddedRightOfRegion - nUnpaddedLeftOfRegion + 1; int nPaddedLeft = nPaddedIndexFast( nUnpaddedLeftOfRegion ); char* szUnpaddedRegion = szGetPartUnpaddedConsensusToRight( nPaddedLeft, nUnpaddedLength ); bool bFoundARegion; addDinucleotideRichRegionsToList( pGotoList, nUnpaddedLeftOfRegion, nUnpaddedRightOfRegion, false, bFoundARegion ); const int nLookThisFarUpAndDownConsensusForMicrosatellite = 80; int nEndingRowOfBestScoredAlignment; int nEndingColOfBestScoredAlignment; int nBestScore; findRepeat( szUnpaddedRegion, nUnpaddedLength, &nBestScore, &nEndingRowOfBestScoredAlignment, &nEndingColOfBestScoredAlignment ); if ( nBestScore >= pCP->nAutoFinishMinSmithWatermanScoreOfARun_ ) { // find the alignment RWCString soAlignment1; RWCString soAlignment2; RWCString soAlignment3; int nBeginningRowOfBestScoredAlignment; int nBeginningColOfBestScoredAlignment; findAlignment( szUnpaddedRegion, nUnpaddedLength, nBestScore, nEndingRowOfBestScoredAlignment, nEndingColOfBestScoredAlignment, &nBeginningRowOfBestScoredAlignment, &nBeginningColOfBestScoredAlignment, soAlignment1, soAlignment2, soAlignment3 ); // report this run // the unpadded position is 1-based while the // location is zero-based int nUnpaddedLeft = nBeginningColOfBestScoredAlignment + nUnpaddedLeftOfRegion; int nUnpaddedRight = nEndingColOfBestScoredAlignment + nUnpaddedLeftOfRegion; int nPaddedLeft = nPaddedIndexFast( nUnpaddedLeft ); int nPaddedRight = nPaddedIndexFast( nUnpaddedRight ); RWCString soNote = "repeat with score " + RWCString( (long) nBestScore ); gotoItem* pGotoItem = new gotoItem( this, NULL, nPaddedLeft, nPaddedRight, nUnpaddedLeft, nUnpaddedRight, soNote ); pGotoList->addToList( pGotoItem ); } delete szUnpaddedRegion; } // for( int nGotoItem = 0; } // return true if we want to show this base in the hqd list bool Contig :: bDoesPadIndicateAPossibleMisassemblyHere( LocatedFragment* pLocFragOfHQDPad, int nConsPosOfPad ) { // I believe this is checking for the following condition: // consensus aa // read 1 *a // read 2 *a // read 3 a* // read 4 a* // so the conclusion is that the consensus is wrong--there is // really should be only a single "a" // Note that this problem can cause hqd's if ( ntGetCons( nConsPosOfPad ).qualGetQuality() < 40 ) return( true ); // the high quality discrepancy is a pad. Let's see // what the consensus has here and if it is a repeated base int nLeftMostBase = nConsPosOfPad; int nRightMostBase = nConsPosOfPad; char cConsensusBase = cGetConsensusBase( nConsPosOfPad ); // back up as far as the consensus base matches this // base int nConsPos; for( nConsPos = nConsPosOfPad - 1; nConsPos >= nGetStartIndex(); --nConsPos ) { if ( cConsensusBase != cGetConsensusBase( nConsPos ) && cGetConsensusBase( nConsPos ) != '*' ) { // we've found the left-most boundard nLeftMostBase = nConsPos + 1; break; } } // go forward as far as the consensus base matches // the consensus base for( nConsPos = nConsPosOfPad + 1; nConsPos <= nGetEndIndex(); ++nConsPos ) { if ( cConsensusBase != cGetConsensusBase( nConsPos ) && cGetConsensusBase( nConsPos ) != '*' ) { nRightMostBase = nConsPos - 1; break; } } // look at the other aligned reads at this location // and see if any of them have pads in this range, but // not in this same column bool bFoundPadInAnotherColumn = false; ContigView* pContigView = pGetContigView( nLeftMostBase, nRightMostBase ); for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag == pLocFragOfHQDPad ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedConsPosStart = pLocFrag->nGetAlignClipStart(); int nAlignedConsPosEnd = pLocFrag->nGetAlignClipEnd(); int nIntersectLeft; int nIntersectRight; if ( bIntersect( nLeftMostBase, nRightMostBase, nAlignedConsPosStart, nAlignedConsPosEnd, nIntersectLeft, nIntersectRight ) ) { for( nConsPos = nIntersectLeft; nConsPos <= nIntersectRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == '*' && nConsPos != nConsPosOfPad ) { // one more check--if the consensus at this point is a pad, // we don't care about this instance of a pad in the read if ( cGetConsensusBase( nConsPos ) != '*' ) return( true ); } } } } return( false ); } bool Contig :: bIAmARealContig() { if ( nGetUnpaddedSeqLength() <= pCP->nAutoFinishExcludeContigIfThisManyBasesOrLess_ ) { fprintf( pAO, "%s has only %d bases \n( consed.autoFinishExcludeContigIfThisManyBasesOrLess = %d ) \nso not processing it\n\n", soGetName().data(), nGetUnpaddedSeqLength(), pCP->nAutoFinishExcludeContigIfThisManyBasesOrLess_ ); bThisContigIsExcludedFromAutofinishFinishing_ = true; return( false ); } if ( nGetNumberOfFragsInContig() <= pCP->nAutoFinishExcludeContigIfOnlyThisManyReadsOrLess_ ) { fprintf( pAO, "contig %s has only %d reads so not processing it\n", soGetName().data(), nGetNumberOfFragsInContig() ); bThisContigIsExcludedFromAutofinishFinishing_ = true; return( false ); } if ( dGetDepthOfCoverage() > pCP->dAutoFinishExcludeContigIfDepthOfCoverageGreaterThanThis_ ) { fprintf( pAO, "contig %s has depth of coverage %.1f but the user-settable parameter consed.autoFinishExcludeContigIfDepthOfCoverageGreaterThanThis is set to %.1f so not processing this contig\n", soGetName().data(), dGetDepthOfCoverage(), pCP->dAutoFinishExcludeContigIfDepthOfCoverageGreaterThanThis_ ); bThisContigIsExcludedFromAutofinishFinishing_ = true; return( false ); } // if passed all checks, then this must be a real contig return( true ); } Contig* Contig :: pGetNextContigInScaffold( const int nPreviousContigWasConnectedAtWhichEndOfThisContig, bool& bThisContigIsComplementedInScaffold, int& nThisContigIsConnectedToWhichEndOfNextContig ) { if ( nPreviousContigWasConnectedAtWhichEndOfThisContig == nLeftGap ) { bThisContigIsComplementedInScaffold = false; nThisContigIsConnectedToWhichEndOfNextContig = nWhichEnd_[ nRightGap ]; return( pContig_[ nRightGap ] ); } else if ( nPreviousContigWasConnectedAtWhichEndOfThisContig == nRightGap ) { bThisContigIsComplementedInScaffold = true; nThisContigIsConnectedToWhichEndOfNextContig = nWhichEnd_[ nLeftGap ]; return( pContig_[ nLeftGap ] ); } else { RWCString soMessage( "Contig::pGetNextContigInScaffold " ); soMessage += " pContig_[0] which is "; if ( pContig_[0] ) soMessage += pContig_[0]->soGetName(); else soMessage += " null "; soMessage += " and pContig_[1] is "; if ( pContig_[1] ) soMessage += pContig_[1]->soGetName(); else soMessage += " null "; soMessage += " in this contig = "; soMessage += soGetName(); THROW_ERROR2( soMessage ); // just for compiler return( NULL ); } } RWCString Contig :: soGetAbbreviatedName() { // this code will take Contig15 and return 15 RWCString soContigName = soGetName(); if ( soContigName.length() > strlen( "Contig" ) ) { if ( soContigName( 0, 6 ) == "Contig" ) { soContigName = soContigName.soGetRestOfString( 6 ); } } return( soContigName ); } void Contig :: appendUnpaddedHighQualitySegment( RWCString& soStringToAppend, const bool bComplement ) { // added Aug 2002 setPaddedPositionsArray(); int nConsPosLeft; int nConsPosRight; if ( nUnpaddedHighQualitySegmentStart_ == -2 ) { setHighQualitySegment(); } if ( nUnpaddedHighQualitySegmentStart_ == -1 ) { nConsPosLeft = nGetStartIndex(); nConsPosRight = nGetEndIndex(); } else { nConsPosLeft = nPaddedIndexFast( nUnpaddedHighQualitySegmentStart_ ); nConsPosRight = nPaddedIndexFast( nUnpaddedHighQualitySegmentEnd_ ); } int nEstimateOfBasesToAppend = nConsPosRight - nConsPosLeft + 1; int nSpaceNeeded = soStringToAppend.length() + nEstimateOfBasesToAppend; soStringToAppend.increaseMaxLength( (size_t) nSpaceNeeded ); if ( bComplement ) { for( int nConsPos = nConsPosRight; nConsPos >= nConsPosLeft; --nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; c = cComplementBase( c ); soStringToAppend.append( c ); } } else { for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; soStringToAppend.append( c ); } } // if ( bComplement ) { else } void Contig :: appendPartOfUnpaddedHighQualitySegment( RWCString& soAppendToThis, const bool bComplement, const int nFirstOrLastUnpadded, const bool bPreviousArgumentIsFirstNotLast ) { // setPaddedPositionsArray is called by Contig::setHighQualitySegment() // no longer! July 2002 so must call it here to use // nPaddedIndexFast setPaddedPositionsArray(); int nConsPosLeft; int nConsPosRight; if ( nUnpaddedHighQualitySegmentStart_ == -1 ) { nConsPosLeft = nGetStartIndex(); nConsPosRight = nGetEndIndex(); } else { nConsPosLeft = nPaddedIndexFast( nUnpaddedHighQualitySegmentStart_ ); nConsPosRight = nPaddedIndexFast( nUnpaddedHighQualitySegmentEnd_ ); } int nFirstOrLastPadded = nPaddedIndexFast( nFirstOrLastUnpadded ); // shouldn't nFirstOrLastUnpadded lie between nConsPosLeft and nConsPosRight? // Well, yes, they should. If they aren't, that indicates that // the user is trying to get some of the low quality region // for doing the digest (which this is currently used for). if ( nFirstOrLastPadded < nConsPosLeft ) { nConsPosLeft = nGetStartIndex(); } if ( nConsPosRight < nFirstOrLastPadded ) { nConsPosRight = nGetEndIndex(); } assert( nConsPosLeft <= nFirstOrLastPadded ); assert( nFirstOrLastPadded <= nConsPosRight ); // not complemented: // _______________________________________ // ^nConsPosLeft ^nFirstOrLastPadded ^nConsPosRight // bPreviousArgumentIsFirstNotLast == true // ^nConsPosLeft ^nConsPosRight // bPreviousArgumentIsFirstNotLast == false // ^nConsPosLeft ^nConsPosRight // complemented // // _______________________________________ // ^nConsPosLeft ^nFirstOrLastPadded ^nConsPosRight // bPreviousArgumentIsFirstNotLast == true // ^nConsPosLeft ^nConsPosRight // (This is because we will order the bases from right // to left so we want nFirstOrLastPadded to come first.) // // bPreviousArgumentIsFirstNotLast == false // ^nConsPosLeft ^nConsPosRight // So false, true = right part // false, false = left part // true, true = left part // true, false = right part bool bLeftNotRightSection; if ( bComplement ) { if ( bPreviousArgumentIsFirstNotLast ) bLeftNotRightSection = true; else bLeftNotRightSection = false; } else { if ( bPreviousArgumentIsFirstNotLast ) bLeftNotRightSection = false; else bLeftNotRightSection = true; } if ( bLeftNotRightSection ) nConsPosRight = nFirstOrLastPadded; else nConsPosLeft = nFirstOrLastPadded; if ( nConsPosLeft > nConsPosRight ) { popupErrorMessage( "internal logic error: appendPartOfUnpaddedHighQualitySegment nConsPosLeft = %d nConsPosRight = %d %s %d %s\n", nConsPosLeft, nConsPosRight, ( bComplement ? "true" : "false" ), nFirstOrLastUnpadded, ( bPreviousArgumentIsFirstNotLast ? "prev is first" : "prev is last" ) ); return; } if ( bComplement ) { for( int nConsPos = nConsPosRight; nConsPos >= nConsPosLeft; --nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; c = cComplementBase( c ); soAppendToThis.append( c ); } } else { for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; soAppendToThis.append( c ); } } } void Contig :: appendPartOfUnpaddedConsensus( RWCString& soAppendToThis, const bool bComplement, const int nLeftUnpaddedd, const int nRightUnpaddedd ) { // this routine is sometimes used by seqMatch in which case // the nUnpaddedStart and nUnpaddedEnd points are not necessarily // in the correct order. int nLeftUnpadded; int nRightUnpadded; if ( nLeftUnpaddedd <= nRightUnpaddedd ) { nLeftUnpadded = nLeftUnpaddedd; nRightUnpadded = nRightUnpaddedd; } else { nLeftUnpadded = nRightUnpaddedd; nRightUnpadded = nLeftUnpaddedd; } // if bComplement, this will add the bases in reverse order, // starting at nRightUnpadded and working down to nLeftUnpadded // added Aug 2002 setPaddedPositionsArray(); int nConsPosLeft = nPaddedIndexFast( nLeftUnpadded ); int nConsPosRight = nPaddedIndexFast( nRightUnpadded ); if ( bComplement ) { for( int nConsPos = nConsPosRight; nConsPos >= nConsPosLeft; --nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; c = cComplementBase( c ); soAppendToThis.append( c ); } } else { for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { char c = ntGetCons( nConsPos ); if ( c == '*' ) continue; soAppendToThis.append( c ); } } } int Contig :: nGetHighQualitySegmentLengthUnpadded() { if ( nUnpaddedHighQualitySegmentStart_ == -2 ) setHighQualitySegment(); if ( !bIsThereAHighQualitySegment() ) return( 0 ); return( nGetHighQualitySegmentEnd() - nGetHighQualitySegmentStart() + 1 ); } int Contig :: nGetUnpaddedClippedLowQualityBases( const int nWhichEnd ) { if ( nUnpaddedHighQualitySegmentStart_ == -2 ) setHighQualitySegment(); // bug--this used to be without the ! so would normally // return 0 if ( !bIsThereAHighQualitySegment() ) return( 0 ); if ( nWhichEnd == nLeftGap ) { return( nUnpaddedHighQualitySegmentStart_ - 1 ); } else { return( nGetUnpaddedEndIndex() - nUnpaddedHighQualitySegmentEnd_ ); } } int Contig :: nGetUnpaddedConsPosFromScaffoldPos( const int nScaffoldPos ) { if ( ! ( nClonePosLeft_ <= nScaffoldPos && nScaffoldPos <= nClonePosRight_ ) ) { RWCString soError = "found contig " + soGetName() + " but could not find contig pos for scaffold position " + RWCString( (long) nScaffoldPos ); THROW_ERROR2( soError ); } return( nGetUnpaddedConsPosFromScaffoldPosUnsafe( nScaffoldPos ) ); } // this allows the scaffold pos to be off of the contig and just // extrapolates to where it would be int Contig :: nGetUnpaddedConsPosFromScaffoldPosUnsafe( const int nScaffoldPos ) { if ( bThisContigIsComplementedInTheScaffold_ ) { // uncomplemented contig: // llllllllllllHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHlllllllllllll // ^ ^ // nClonePosRight_ nClonePosLeft_ // (scaffold positions, has no pads) // nUnpaddedHighQualitySegmentStart_ // nUnpaddedHighQualitySegmentEnd_ // (unpadded contig positions) // <-------- // nScaffoldPos - nClonePosLeft_ // if ( nUnpaddedHighQualitySegmentEnd_ == -2 ) setHighQualitySegment(); int nHighestUnpadded = ( nUnpaddedHighQualitySegmentEnd_ == -1 ) ? nGetUnpaddedEndIndex() : nUnpaddedHighQualitySegmentEnd_; return( nHighestUnpadded - nScaffoldPos + nClonePosLeft_ ); } else { // llllllllllllHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHlllllllllllll // ^ ^ // nClonePosLeft_ nClonePosRight_ // (scaffold positions, has no pads) // nUnpaddedHighQualitySegmentStart_ // nUnpaddedHighQualitySegmentEnd_ // (unpadded contig positions) // we also need to consider the case in which there is no // high quality segment if ( nUnpaddedHighQualitySegmentStart_ == -2 ) setHighQualitySegment(); int nOffset = ( nUnpaddedHighQualitySegmentStart_ == -1 ) ? 1 : nUnpaddedHighQualitySegmentStart_; return( nScaffoldPos - nClonePosLeft_ + nOffset ); } } int Contig :: nGetScaffoldPosFromUnpaddedConsPos( const int nUnpadded ) { // I am doing no check on nUnpadded. It could be outside the high quality // segment, or even off the consensus completely. This is necessary // if we want to find the scaffold position of a read that might // start off the consensus. Note that it is theoretically possible // for this to return a scaffold position that is not on the scaffold. if ( bThisContigIsComplementedInTheScaffold_ ) { // uncomplemented contig: // llllllllllllHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHlllllllllllll // ^ ^ // nClonePosRight_ nClonePosLeft_ // (scaffold positions, has no pads) // nUnpaddedHighQualitySegmentStart_ // nUnpaddedHighQualitySegmentEnd_ // (unpadded contig positions) // <-------- // nScaffoldPos - nClonePosLeft_ // int nHighestUnpadded = nGetHighQualitySegmentEnd(); return( nClonePosLeft_ + nHighestUnpadded - nUnpadded ); } else { // llllllllllllHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHlllllllllllll // ^ ^ // nClonePosLeft_ nClonePosRight_ // (scaffold positions, has no pads) // nUnpaddedHighQualitySegmentStart_ // nUnpaddedHighQualitySegmentEnd_ // (unpadded contig positions) // we also need to consider the case in which there is no // high quality segment int nLowestUnpadded = nGetHighQualitySegmentStart(); return( nUnpadded + nClonePosLeft_ - nLowestUnpadded ); } } bool Contig :: bFindWhichEndOfVectorIsConnectedToContigEnd( const int nWhichEndOfContig, int& nWhichEndOfVector, const RWCString& soVectorSequence, const RWCString& soComplementedVectorSequence ) { if ( bAlreadyCheckedEnd_[ nWhichEndOfContig ] ) { nWhichEndOfVector = nWhichEndOfVector_[ nWhichEndOfContig ]; // if this returns false, then the above setting of // nWhichEndOfVector doesn't matter return( bAbleToFindEndOfVector_[ nWhichEndOfContig ] ); } // get the position of the left or right end of the consensus int nConsPos; if ( nWhichEndOfContig == nLeftGap ) nConsPos = nGetStartIndex(); else nConsPos = nGetEndIndex(); ContigView* pContigView = pGetContigView( nConsPos, nConsPos ); // for each read, calculate the high quality segment. This isn't // the same as the one calculated by phrap since phrap changed x's // to quality 0. RWTPtrOrderedVector aReads( (size_t) pContigView->nGetNumberOfFragments() ); RWTValOrderedVector aLengthOfHighQualitySegmentOfReadIntoVector( (size_t) pContigView->nGetNumberOfFragments() ); RWTValOrderedVector aReadPosHighQualityVectorLeft( (size_t) pContigView->nGetNumberOfFragments() ); RWTValOrderedVector aReadPosHighQualityVectorRight( (size_t) pContigView->nGetNumberOfFragments() ); int nRead; for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; // CCCCCCCCCCCCCC (consensus) // BBBBBBBBBBBBBBXXXXXXXX (read) // check that most of the B's are aligned and that most of the X's // are X's (that this read agrees that this is the end of the // consensus. int nCheckAlignedLeft; int nCheckAlignedRight; const int nAlignedWindow = 15; const int nMinimumNumberOfAlignedBases = 13; if ( nWhichEndOfContig == nLeftGap ) { nCheckAlignedLeft = nGetStartIndex(); nCheckAlignedRight = nCheckAlignedLeft + nAlignedWindow - 1; } else { nCheckAlignedRight = nGetEndIndex(); nCheckAlignedLeft = nCheckAlignedRight - nAlignedWindow + 1; } int nAlignedPartOfWindowLeft; int nAlignedPartOfWindowRight; if ( !bIntersect( nCheckAlignedLeft, nCheckAlignedRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nAlignedPartOfWindowLeft, nAlignedPartOfWindowRight ) ) continue; if ( ( nAlignedPartOfWindowRight - nAlignedPartOfWindowLeft + 1 ) < nMinimumNumberOfAlignedBases ) continue; int nCheckXsLeft; int nCheckXsRight; const int nXsWindow = 15; const int nMinimumNumberOfXBases = 13; if ( nWhichEndOfContig == nLeftGap ) { nCheckXsRight = nGetStartIndex() - 1; nCheckXsLeft = nCheckXsRight - nXsWindow + 1; } else { nCheckXsLeft = nGetEndIndex() + 1; nCheckXsRight = nCheckXsLeft + nXsWindow - 1; } int nWindowOnReadLeft; int nWindowOnReadRight; if ( !bIntersect( nCheckXsLeft, nCheckXsRight, pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nWindowOnReadLeft, nWindowOnReadRight ) ) continue; // count the number of x's in this region int nNumberOfXs = 0; for( int nConsPos = nWindowOnReadLeft; nConsPos <= nWindowOnReadRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'x' ) ++nNumberOfXs; } if ( nNumberOfXs < nMinimumNumberOfXBases ) continue; // now check to see how much of the high quality segment // protrudes into vector. To do this, we need the actual phred // calls since we need their qualities. int nLengthOfHighQualityVectorSegment; int nReadPosHighQualityVectorLeft; int nReadPosHighQualityVectorRight; try { if ( !pLocFrag->bThereIsHighQualitySegmentOffConsensus( nWhichEndOfContig, nLengthOfHighQualityVectorSegment, nReadPosHighQualityVectorLeft, nReadPosHighQualityVectorRight ) ) continue; } catch( InputDataError eb ) { // case in which the phd file couldn't be read so // this read can't be used to help find which end is connected // to the vector continue; } aReads.insert( pLocFrag ); aLengthOfHighQualitySegmentOfReadIntoVector.insert( nLengthOfHighQualityVectorSegment ); aReadPosHighQualityVectorLeft.insert( nReadPosHighQualityVectorLeft ); aReadPosHighQualityVectorRight.insert( nReadPosHighQualityVectorRight ); } // for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments()... if ( aReads.length() == 0 ) { // what can we do? Return something at random? RWCString soErrorMessage = "can't determine which end of vector is connected to the "; soErrorMessage += szWhichGap( nWhichEndOfContig ); soErrorMessage += " of contig "; soErrorMessage += soGetName(); soErrorMessage += " so you might need to complement the vector"; popupErrorMessage( soErrorMessage ); // don't try again bAlreadyCheckedEnd_[ nWhichEndOfContig ] = true; bAbleToFindEndOfVector_[ nWhichEndOfContig ] = false; return( false ); } RWTValVector aTriedToMatchThisReadAgainstVector( (size_t) aReads.length(), false ); int nMatchesWhichEndOfVector; bool bFoundAMatchingRead = false; int nBestRead; do { // find the read whose high scoring segment protrudes furthest // in vector int nLengthOfHighQualitySegmentOfBestRead = 0; nBestRead = -666; for( nRead = 0; nRead < aReads.length(); ++nRead ) { if ( aTriedToMatchThisReadAgainstVector[ nRead ] ) continue; if ( aLengthOfHighQualitySegmentOfReadIntoVector[ nRead ] > nLengthOfHighQualitySegmentOfBestRead ) { nBestRead = nRead; nLengthOfHighQualitySegmentOfBestRead = aLengthOfHighQualitySegmentOfReadIntoVector[ nRead ]; } } if ( nBestRead != -666 ) { aTriedToMatchThisReadAgainstVector[ nBestRead ] = true; // ok, now try it. LocatedFragment* pLocFrag = aReads[ nBestRead ]; int nReadPosHighQualityVectorLeft = aReadPosHighQualityVectorLeft[ nBestRead ]; int nReadPosHighQualityVectorRight = aReadPosHighQualityVectorRight[ nBestRead ]; bFoundAMatchingRead = pLocFrag->bDoesEitherEndOfVectorMatchThisPartOfRead( nWhichEndOfContig, soVectorSequence, soComplementedVectorSequence, nReadPosHighQualityVectorLeft, nReadPosHighQualityVectorRight, nMatchesWhichEndOfVector ); } } while( !bFoundAMatchingRead && nBestRead != -666 ); if ( !bFoundAMatchingRead ) { popupErrorMessage( "I could not figure out which end of vector the %s end of contig %s is attached to. I tried every read that protruded into the vector and none of these reads matched either end of the vector. Could it be that your vector sequence is incorrect or that you do not have the vector starting where you cut the vector to insert the insert? Or could it be that this is really not the cloning vector/insert junction?", szWhichGap( nWhichEndOfContig ), soGetName().data() ); bAlreadyCheckedEnd_[ nWhichEndOfContig ] = true; bAbleToFindEndOfVector_[ nWhichEndOfContig ] = false; // make a random decision nWhichEndOfVector_[ nWhichEndOfContig ] = nLeftGap; return( false ); } bAlreadyCheckedEnd_[ nWhichEndOfContig ] = true; bAbleToFindEndOfVector_[ nWhichEndOfContig ] = true; nWhichEndOfVector_[ nWhichEndOfContig ] = nMatchesWhichEndOfVector; nWhichEndOfVector = nMatchesWhichEndOfVector; return( true ); } const unsigned char uLowConsensusBase = 1; const unsigned char uTooManyLowConsensusBasesInWindow = 2; const unsigned char uEnoughAlignedReads = 4; const unsigned char uTopStrandReads = 8; const unsigned char uBottomStrandReads = 16; const unsigned char uDyeTerminatorReads = 32; const unsigned char uDyePrimerReads = 64; static int nPaddedConsensusMax; static void pepperWindow( unsigned char* pPaddedConsensus, const int nConsPosLeft, const int nConsPosRight ) { assert( 1 <= nConsPosLeft && nConsPosRight <= nPaddedConsensusMax ); for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { pPaddedConsensus[ nConsPos ] |= uTooManyLowConsensusBasesInWindow; } } void Contig :: addEditableSites( gotoList* pEditableLCQList ) { unsigned char* pPaddedConsensus = (unsigned char*) calloc( nGetPaddedConsLength() + 1, sizeof( unsigned char ) ); nPaddedConsensusMax = nGetPaddedConsLength(); assert( pPaddedConsensus ); unsigned char* pNumberOfAlignedReads = (unsigned char*) calloc( nGetPaddedConsLength() + 1, sizeof( unsigned char ) ); assert( pNumberOfAlignedReads ); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nConsPosLeft = pLocFrag->nGetAlignClipStart(); int nConsPosRight = pLocFrag->nGetAlignClipEnd(); if ( !bIntersect( nConsPosLeft, nConsPosRight, 1, nPaddedConsensusMax, nConsPosLeft, nConsPosRight ) ) continue; unsigned char uStrandAndChemistry = 0; if ( pLocFrag->bIsDyePrimerNotDyeTerminator() ) uStrandAndChemistry |= uDyeTerminatorReads; else uStrandAndChemistry |= uDyePrimerReads; if ( pLocFrag->bComp() ) uStrandAndChemistry |= uBottomStrandReads; else uStrandAndChemistry |= uTopStrandReads; for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { ++pNumberOfAlignedReads[ nConsPos ]; pPaddedConsensus[ nConsPos ] |= uStrandAndChemistry; } } const int nEnoughAlignedReads = 3; int nConsPos; for( nConsPos = nGetStartIndex(); nConsPos <= nGetEndIndex(); ++nConsPos ) { if ( pNumberOfAlignedReads[ nConsPos ] >= nEnoughAlignedReads ) { pPaddedConsensus[ nConsPos ] |= uEnoughAlignedReads; } } LowConsQualGotoList* pLCQGotoList = new LowConsQualGotoList( this ); for( int nRegion = 0; nRegion < pLCQGotoList->nGetNumGotoItems(); ++nRegion ) { gotoItem* pGotoItem = pLCQGotoList->pGetGotoItem( nRegion ); int nMinConsPos; int nMaxConsPos; if ( !bIntersect( pGotoItem->nGotoItemStart_, pGotoItem->nGotoItemEnd_, 1, nPaddedConsensusMax, nMinConsPos, nMaxConsPos ) ) continue; for( int nConsPos = nMinConsPos; nConsPos <= nMaxConsPos; ++nConsPos ) { pPaddedConsensus[ nConsPos ] |= uLowConsensusBase; } } // now look for 50-base regions that contain 10 or more lcq bases. // These regions show *not* be tagged since they should have additional // data. const int nWindowOfLCQ = 50; const int nMaxLCQsInWindow = 15; int nLCQsInWindow = 0; int nMinConsPos; int nMaxConsPos; if ( bIntersect( nGetStartIndex(), nWindowOfLCQ, 1, nPaddedConsensusMax, nMinConsPos, nMaxConsPos ) ) { // initialize the window for( nConsPos = nMinConsPos; nConsPos <= nMaxConsPos; ++nConsPos ) { if ( pPaddedConsensus[ nConsPos ] & uLowConsensusBase ) ++nLCQsInWindow; } if ( nLCQsInWindow >= nMaxLCQsInWindow ) { pepperWindow( pPaddedConsensus, nMinConsPos, nMaxConsPos ); } } for( int nConsPosOfEndOfWindow = nWindowOfLCQ + 1; nConsPosOfEndOfWindow <= nGetEndIndex(); ++nConsPosOfEndOfWindow ) { if ( pPaddedConsensus[ nConsPosOfEndOfWindow - nWindowOfLCQ ] & uLowConsensusBase ) { --nLCQsInWindow; } // the loop conditions guarantee that nConsPosOfEndOfWindow // <= nGetEndIndex() so no seg fault here if ( pPaddedConsensus[ nConsPosOfEndOfWindow ] & uLowConsensusBase ) { ++nLCQsInWindow; } if ( nLCQsInWindow >= nMaxLCQsInWindow ) { pepperWindow( pPaddedConsensus, nConsPosOfEndOfWindow - nWindowOfLCQ + 1, nConsPosOfEndOfWindow ); } } gotoItem* pPreviousGotoItem = NULL; for( nConsPos = nGetStartIndex() + pCP->nAutoFinishDoNotIgnoreLCQIfThisManyBasesFromEndOfContigForLCQTagger_ - 1; nConsPos <= nGetEndIndex() - pCP->nAutoFinishDoNotIgnoreLCQIfThisManyBasesFromEndOfContigForLCQTagger_ + 1; ++nConsPos ) { // if high quality base, ignore if ( ! (pPaddedConsensus[ nConsPos ] & uLowConsensusBase ) ) continue; // this asks if there is a low consensus base in a window // in which there are lots of low consensus bases if ( ( pPaddedConsensus[ nConsPos ] & uLowConsensusBase ) && ( pPaddedConsensus[ nConsPos ] & uTooManyLowConsensusBasesInWindow ) ) continue; // if fewer than 3 aligned reads, ignore if ( ! (pPaddedConsensus[ nConsPos ] & uEnoughAlignedReads ) ) continue; // if reached here, then there is a low consensus base which // is isolated and the depth of coverage is high bool bTagEditable = false; bool bDoubleChemistry = ( pPaddedConsensus[ nConsPos ] & uDyeTerminatorReads ) && ( pPaddedConsensus[ nConsPos ] & uDyePrimerReads ); bool bDoubleStranded = ( pPaddedConsensus[ nConsPos ] & uTopStrandReads ) && ( pPaddedConsensus[ nConsPos ] & uBottomStrandReads ); if ( bDoubleChemistry ) bTagEditable = true; else if ( bDoubleStranded ) bTagEditable = true; else { // single stranded, single chemistry if ( pPaddedConsensus[ nConsPos ] & uDyeTerminatorReads ) bTagEditable = true; } if ( bTagEditable ) { if ( pPreviousGotoItem ) { if ( ( pPreviousGotoItem->nGotoItemEnd_ + 1 ) == nConsPos ) { pPreviousGotoItem->setNewEnd( nConsPos, nUnpaddedIndex( nConsPos ) ); continue; } } gotoItem* pGotoItem = new gotoItem( this , NULL, nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), "isolated lcq" ); pEditableLCQList->addToList( pGotoItem ); pPreviousGotoItem = pGotoItem; } } free( pPaddedConsensus ); free( pNumberOfAlignedReads ); } class consensusSegment { public: consensusSegment( const int nConsPosLeft, const int nConsPosRight ) : nConsPosLeft_( nConsPosLeft ), nConsPosRight_( nConsPosRight ), nWillContainThisManyReads_( 0 ) {} bool operator==( const consensusSegment& con ) const { return( this == &con ); } public: int nConsPosLeft_; int nConsPosRight_; int nWillContainThisManyReads_; RWTPtrOrderedVector aReads_; }; void Contig :: recalculateConsensusQualitiesNoChange( const int nRegionConsPosLeft, const int nRegionConsPosRight, RWTValOrderedVector& aNewQualities ) { ContigView* pContigView = pGetContigView( nRegionConsPosLeft, nRegionConsPosRight ); aNewQualities.clear(); long lGuessOfNumberOfBoundariesNeeded = ( ( (double) nRegionConsPosRight - (double) nRegionConsPosLeft ) * (double) nGetNumberOfFragsInContig() ) / (double) nGetUnpaddedSeqLength(); lGuessOfNumberOfBoundariesNeeded *= 3 / 2; // the division site will be .5 base to the left of the number // put into this array mbtValOrderedVectorOfInt aReadBoundaries( (size_t) lGuessOfNumberOfBoundariesNeeded ); aReadBoundaries.soName_ = "recalculateConsensus::aReadBoundaries"; aReadBoundaries.insert( nRegionConsPosLeft ); aReadBoundaries.insert( nRegionConsPosRight + 1 ); int nRead; for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; int nIntersectLeft; int nIntersectRight; if ( !bIntersect( nRegionConsPosLeft, nRegionConsPosRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) continue; // if reached here, this read (partly) overlaps the region aReadBoundaries.insert( nIntersectLeft ); aReadBoundaries.insert( nIntersectRight + 1 ); } // now sort this. (I wish C had a sort that eliminated duplicates, // of which there might be many.) aReadBoundaries.resort(); // take the time and eliminate duplicates int nSegment; for( nSegment = aReadBoundaries.length() - 1; nSegment >= 1; --nSegment ) { if ( aReadBoundaries[nSegment] == aReadBoundaries[nSegment - 1] ) { aReadBoundaries.removeAt( nSegment ); } } RWTPtrOrderedVector aConsensusSegments( (size_t) aReadBoundaries.length() ); aConsensusSegments.soName_ = "Contig::recalculateConsensus aConsensusSegments"; for( nSegment = 0; nSegment < ( aReadBoundaries.length() - 1 ); ++nSegment ) { int nConsPosLeft = aReadBoundaries[ nSegment ]; int nConsPosRight = aReadBoundaries[ nSegment + 1 ] - 1; aConsensusSegments.insert( new consensusSegment( nConsPosLeft, nConsPosRight ) ); } // go through all the reads again, this time figuring out how many // reads go into each consensusSegment for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; int nIntersectLeft; int nIntersectRight; if ( !bIntersect( nRegionConsPosLeft, nRegionConsPosRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) continue; int nFirstConsSeg = aReadBoundaries.nFindIndexOfMatchOrPredecessor( nIntersectLeft ); // -------- ------- --------- ---------- consensus segments // -------------------- read // ^ nFirstConsSeg int nConsSegStart; if ( nFirstConsSeg == RW_NPOS ) nConsSegStart = 0; else nConsSegStart = nFirstConsSeg; bool bContinue; int nConsSeg = nConsSegStart; bContinue = true; // first figure out how large aReads_ must be so it doesn't keep resizing for( ; nConsSeg < aConsensusSegments.length() && bContinue; ++nConsSeg ) { consensusSegment* pConsSeg = aConsensusSegments[ nConsSeg ]; if ( bIntervalsIntersect( nIntersectLeft, nIntersectRight, pConsSeg->nConsPosLeft_, pConsSeg->nConsPosRight_ ) ) { ++pConsSeg->nWillContainThisManyReads_; } else { if ( nIntersectRight < pConsSeg->nConsPosLeft_ ) { // ------- ------- ------- // ----------- ^this consensusSegment // read // So continuing to examine more consensusSegment 's will // not help since then will continue to be off the read // to the right. bContinue = false; } } } // for( ; nConsSeg < aConsensusSegments.length() && bContinue; } // for( nRead = 0; nRead < // resize aReads_ in all consensusSegments for( int nConsSeg = 0; nConsSeg < aConsensusSegments.length(); ++nConsSeg ) { consensusSegment* pConsSeg = aConsensusSegments[ nConsSeg ]; pConsSeg->aReads_.resize( pConsSeg->nWillContainThisManyReads_ ); } // go through all the reads a 3rd time, this time putting each one into // each appropriate consensusSegment for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; int nIntersectLeft; int nIntersectRight; if ( !bIntersect( nRegionConsPosLeft, nRegionConsPosRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) continue; int nFirstConsSeg = aReadBoundaries.nFindIndexOfMatchOrPredecessor( nIntersectLeft ); // -------- ------- --------- ---------- consensus segments // -------------------- read // ^ nFirstConsSeg int nConsSeg; if ( nFirstConsSeg == RW_NPOS ) nConsSeg = 0; else nConsSeg = nFirstConsSeg; bool bContinue = true; for( ; nConsSeg < aConsensusSegments.length() && bContinue; ++nConsSeg ) { consensusSegment* pConsSeg = aConsensusSegments[ nConsSeg ]; if ( bIntervalsIntersect( nIntersectLeft, nIntersectRight, pConsSeg->nConsPosLeft_, pConsSeg->nConsPosRight_ ) ) { pConsSeg->aReads_.insert( pLocFrag ); } else { if ( nIntersectRight < pConsSeg->nConsPosLeft_ ) { // ------- ------- ------- // ----------- ^this consensusSegment // read // So continuing to examine more consensusSegment 's will // not help since then will continue to be off the read // to the right. bContinue = false; } } } // for( ; nConsSeg < aConsensusSegments.length() && bContinue; } // for( nRead typedef RWTPtrOrderedVector arrayOfPointersToLocatedFragments; arrayOfPointersToLocatedFragments aAgreeingReads[4]; // put up here just so this array need not be created with each // consensus position RWTValOrderedVector aTemplates( (size_t) 30 ); aTemplates.soName_ = "Contig::recalculateConsensus aTemplates"; // now work way through the consensusSegments, and within each of those, // through the consensus positions. What is the highest quality read // at one base within a consensusSegment may be different than the highest // quality read at another base within the same consensusSegment. for( nSegment = 0; nSegment < aConsensusSegments.length(); ++nSegment ) { consensusSegment* pConsSeg = aConsensusSegments[ nSegment ]; for( int nConsPos = pConsSeg->nConsPosLeft_; nConsPos <= pConsSeg->nConsPosRight_; ++nConsPos ) { // we want to find a window about the consensus which includes // 5 non-pads. I guess I will allow the consensus to be a pad // and just look for 2 non-pads on each side of the consensus base. // putting default values in case we are close to the beginning // or end of the sequence. (I believe Purify found this problem // Jan 2002). int nWindowLeft = nConsPos - 2; int nWindowRight = nConsPos + 2; int nNonpads = 0; int nConsPos3; for( nConsPos3 = nConsPos - 1; nConsPos3 >= nGetStartIndex(); --nConsPos3 ) { if ( !ntGetCons( nConsPos3 ).bIsPad() ) { ++nNonpads; if ( nNonpads == 2 ) { nWindowLeft = nConsPos3; break; } } } nNonpads = 0; for( nConsPos3 = nConsPos + 1; nConsPos3 <= nGetEndIndex(); ++nConsPos3 ) { if ( !ntGetCons( nConsPos3 ).bIsPad() ) { ++nNonpads; if ( nNonpads == 2 ) { nWindowRight = nConsPos3; break; } } } // this was added Nov 2008 to fix bug in which // bases at beginning of consensus were low quality, even if there // was a high quality read sticking out. the reason was that // the window, which extended off of the consensus, would // not be contained in any read's aligned region if ( !bIntersect( nWindowLeft, nWindowRight, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nWindowLeft, nWindowRight ) ) { THROW_ERROR2( "window completely off consensus--how could this happen?" ); } // consider reads that agree with the // consensus in this window LocatedFragment* pBestAgreeingRead = NULL; Quality qualBestAgreeingRead = 0; aTemplates.clear(); int nNumberOfWholeCloneReads = 0; for( int nGroup = 0; nGroup < 4; ++nGroup ) { aAgreeingReads[ nGroup ].clear(); } for( int nRead = 0; nRead < pConsSeg->aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = pConsSeg->aReads_[ nRead ]; // check first that the window is completely within the // aligned portion of the read if ( ! (pLocFrag->nAlignClipStart_ <= nWindowLeft && nWindowRight <= pLocFrag->nAlignClipEnd_ ) ) { continue; } bool bAgreeSoFar = true; for( int nConsPos2 = nWindowLeft; nConsPos2 <= nWindowRight && bAgreeSoFar; ++nConsPos2 ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos2 ).cGetBase() != ntGetCons( nConsPos2 ).cGetBase() ) { bAgreeSoFar = false; } } if ( bAgreeSoFar ) { int nStrandAndChemistry = pLocFrag->nGetStrandAndChemistry(); aAgreeingReads[ nStrandAndChemistry ].insert( pLocFrag ); Quality qual = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( !pBestAgreeingRead || qual > qualBestAgreeingRead ) { qualBestAgreeingRead = qual; pBestAgreeingRead = pLocFrag; } if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) ++nNumberOfWholeCloneReads; else { // changed Dec 2008 since otherwise all solexa reads // will have null template name and thus all will be // considered to come from the same template so top // and bottom strand reads will not confirm each other. RWCString soTemplate = pLocFrag->soGetTemplateName(); if ( aTemplates.index( soTemplate ) == RW_NPOS ) { aTemplates.insert( soTemplate ); } } } // if ( bAgreeSoFar ) ... } // for( int nRead = 0; ... // we've already found the highest quality *single* read (not part // of a pair) LocatedFragment* pBestRead1 = pBestAgreeingRead; LocatedFragment* pBestRead2 = NULL; Quality qualBest = qualBestAgreeingRead; // now look over pairs of reads to see if we can // find a pair of reads that does better // pick reads from each of the 1st 3 groups for( int nStrandChem1 = 0; nStrandChem1 <= 2; ++nStrandChem1 ) { // pick all the reads in the group for( int nRead1 = 0; nRead1 < aAgreeingReads[ nStrandChem1 ].length(); ++nRead1 ) { LocatedFragment* pLocFragStrandChem1 = (aAgreeingReads[ nStrandChem1 ])[nRead1]; // pick reads from each of the remaining groups for( int nStrandChem2 = nStrandChem1 + 1; nStrandChem2 <= 3; ++nStrandChem2 ) { // pick all reads in the group for( int nRead2 = 0; nRead2 < aAgreeingReads[ nStrandChem2 ].length(); ++nRead2 ) { LocatedFragment* pLocFragStrandChem2 = (aAgreeingReads[ nStrandChem2 ])[nRead2]; if ( pLocFragStrandChem1->bIsThisReadFromTheSameTemplate( pLocFragStrandChem2 ) ) continue; // if reach here, the reads are from different // templates Quality qual = pLocFragStrandChem1->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pLocFragStrandChem2->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( qual > qualBest ) { pBestRead1 = pLocFragStrandChem1; pBestRead2 = pLocFragStrandChem2; qualBest = qual; } } // for( int nRead2 = 0; } // for( int nStrandChem2 ... } // for( int nRead1 = 0; } // for( int nStrandChem1 ... // when reached here, we have tried each individual read // and we have tried all pairs of reads from different // chem/strand groups. The pair pBestRead1, pBestRead2 // represents the best pair or best single read ( if pBestRead2 // is null). // check if we quality for a +5 boost. If there is a pair // of reads that gives the best quality, then this pair already // has 2 templates, so there must be 3 templates or more to // give a +5 boost. If there is a single read, then there must // be 2 or more templates to give a +5 boost. int nNumberOfTemplates = nNumberOfWholeCloneReads + aTemplates.length(); if ( pBestRead2 ) { if ( nNumberOfTemplates >= 3 ) qualBest += 5; } else { if ( nNumberOfTemplates >= 2 ) qualBest += 5; } // there is an exception. If there is only a single template, // and all the reads are in the low quality segment, then // set the consensus to quality 0. // The reason we don't check for == 0 is that in this case the // quality will already be set to zero. The reason we don't need // to check the case of nNumberOfTemplates > 1 is that in this case // there are > 1 aligned templates perfectly matching a 5 base window, // so there are clearly > 1 aligned templates if ( nNumberOfTemplates == 1 ) { // are all of the reads in the low quality segment? bool bAllInLowQualitySegment = true; // we are just interested in aligned reads--not whether // they agree with the consensus in a 5 base window for( int nRead = 0; nRead < pConsSeg->aReads_.length() && bAllInLowQualitySegment; ++nRead ) { LocatedFragment* pLocFrag = pConsSeg->aReads_[ nRead ]; if ( pLocFrag->bIsInHighQualitySegmentOfRead( nConsPos ) ) bAllInLowQualitySegment = false; } if ( bAllInLowQualitySegment ) { // now check if there is only one template that is aligned // When we checked before nNumberOfTemplates == 1, we // were counting the number of templates with 5 base // window agreeing with the consensus. Now let's do // a more careful check that there is only one that is // aligned: RWTValOrderedVector aAlignedTemplates; bool bOnlyOneAlignedTemplate = true; for( int nRead = 0; nRead < pConsSeg->aReads_.length() && bOnlyOneAlignedTemplate; ++nRead ) { LocatedFragment* pLocFrag = pConsSeg->aReads_[ nRead ]; if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) { bOnlyOneAlignedTemplate = false; } else { if ( aAlignedTemplates.index( pLocFrag->soTemplate_ ) == RW_NPOS ) { aAlignedTemplates.insert( pLocFrag->soTemplate_ ); if ( aAlignedTemplates.length() > 1 ) { bOnlyOneAlignedTemplate = false; } } } } if ( bOnlyOneAlignedTemplate ) { qualBest = 0; } } // if ( bAllInLowQualitySegment ) { } // if ( nNumberOfTemplates == 1 ) { // now we have found the answer: qualBest. We need to put the // answer in the Sequence if ( qualBest > 90 ) qualBest = 90; aNewQualities.append( qualBest ); } // nConsPos } // for( int nSegment = 0; nSegment < aConsensusSegments.length(); } // recalculateConsensusQualitiesNoChange void Contig :: recalculateConsensusQualitiesAndChange( const int nConsPosLeft, const int nConsPosRight ) { RWTValOrderedVector aNewQualities( (size_t) ( nConsPosRight - nConsPosLeft + 1 ) ); recalculateConsensusQualitiesNoChange( nConsPosLeft, nConsPosRight, aNewQualities ); for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { Quality qualOld = ntGetCons( nConsPos ).qualGetQuality(); Quality qualNew = aNewQualities[ nConsPos - nConsPosLeft ]; setQualityAtSeqPos( nConsPos, qualNew ); } } void Contig :: appendListOfDifferencesBetweenRecalculatedAndOriginalConsensus( gotoList* pGotoList ) { RWTValOrderedVector aNewQualities( (size_t) length() ); recalculateConsensusQualitiesNoChange( nGetStartIndex(), nGetEndIndex(), aNewQualities ); // now compare the new consensus to the old consensus for( int nConsPos = nGetStartIndex(); nConsPos <= nGetEndIndex(); ++nConsPos ) { int nPhrapQuality = ntGetCons( nConsPos ).qualGetQuality(); // aNewQualities is 0-based int nConsedQuality = aNewQualities[ nConsPos - nGetStartIndex() ]; int nDifference = ABS( nPhrapQuality - nConsedQuality ); if ( nPhrapQuality != nConsedQuality ) { // if ( nPhrapQuality > 50 && // nConsedQuality > 50 ) continue; // if ( nPhrapQuality > 30 && // nConsedQuality > 30 && // nDifference < 10 ) continue; // if ( nPhrapQuality > 10 && // nConsedQuality > 10 && // nDifference < 4 ) continue; RWCString soDescription = "quality difference. phrap: "; soDescription += RWCString( (long) nPhrapQuality ); soDescription += " consed: "; soDescription += RWCString( (long) nConsedQuality ); soDescription += " difference: "; soDescription += RWCString( (long) nDifference ); pGotoList->addToList( new gotoItem( this, NULL, // pLocFrag nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), soDescription ) ); } } } void Contig :: highlightAllReads() { highlightAllReads( true ); } void Contig :: unhighlightAllReads() { highlightAllReads( false ); } void Contig :: highlightAllReads( const bool bHighlight ) { for (int nRead = 0; nRead < nGetNumberOfFragsInContig(); nRead++) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); pLocFrag->bReadCurrentlyHighlighted_ = bHighlight; } ConsEd::pGetConsEd()->refreshAllContigWinsAndAllTeditors(); } void Contig :: addAllSeriousFalseMatchesInOneContig( primerType* pPrimer, RWTValOrderedVector& aFalseMatchLocations, RWTValOrderedVector& aFalseMatchContigsOfLocations, RWTValOrderedVector& aFalseMatchStrandsOfLocations ) { addAllSeriousFalseMatchesInOneContigOneStrand( pPrimer, aFalseMatchLocations, aFalseMatchContigsOfLocations, aFalseMatchStrandsOfLocations, true ); // bTopStrandNotBottomStrand addAllSeriousFalseMatchesInOneContigOneStrand( pPrimer, aFalseMatchLocations, aFalseMatchContigsOfLocations, aFalseMatchStrandsOfLocations, false ); // bTopStrandNotBottomStrand } void Contig :: addAllSeriousFalseMatchesInOneContigOneStrand( primerType* pPrimer, RWTValOrderedVector& aFalseMatchLocations, RWTValOrderedVector& aFalseMatchContigsOfLocations, RWTValOrderedVector& aFalseMatchStrandsOfLocations, const bool bLookInTopStrandNotBottomStrandOfContig ) { int n4MerToLookup = pPrimer->n3Prime4Bases_; contigMatchTablesType* pUnpaddedContig = bLookInTopStrandNotBottomStrandOfContig ? pUnpaddedContig_ : pUnpaddedContigComplemented_; blockOfLocationsOfA4MerType* pBlockOfLocationsOfA4Mer = &( pUnpaddedContig->blockOfLocationsOfA4Mer_[ n4MerToLookup ] ); bool bAgain = true; while( bAgain ) { for( int nIndexWithinBlock = 0; nIndexWithinBlock < pBlockOfLocationsOfA4Mer->nCurrentLength_; ++nIndexWithinBlock ) { int nUnpaddedPosOfMatchingContig4Mer = pBlockOfLocationsOfA4Mer->nUnpaddedPosition_[ nIndexWithinBlock ]; int nContigZeroBasedUnpaddedOfFirstBase = nUnpaddedPosOfMatchingContig4Mer + nLengthOf4Mer - pPrimer->nUnpaddedLength_ - 1; if ( nContigZeroBasedUnpaddedOfFirstBase < 0 ) continue; // target is too close to beginning of contig if ( bLookInTopStrandNotBottomStrandOfContig && pPrimer->bTopStrandNotBottomStrand_ && ( this == pPrimer->pContig_ ) && ( nContigZeroBasedUnpaddedOfFirstBase == ( pPrimer->nUnpaddedStart_ - 1 ) ) ) continue; // primer is the same as the target if ( !bLookInTopStrandNotBottomStrandOfContig && !pPrimer->bTopStrandNotBottomStrand_ && ( this == pPrimer->pContig_ ) ) { int nComplementedStartOfPrimer = nComplementUnpaddedIndex( pPrimer->nUnpaddedEnd_ ); if ( nContigZeroBasedUnpaddedOfFirstBase == nComplementedStartOfPrimer - 1 ) continue; // primer is the same as the target } char* szFalseMatch = pUnpaddedContig->szUnpaddedBases_ + nContigZeroBasedUnpaddedOfFirstBase; int nPosOfMaxCumulativeScore; int nScoreOfFalseMatch = nFindScoreOfMatch( pPrimer->nUnpaddedLength_, szFalseMatch, nPosOfMaxCumulativeScore ); // at least the last 4 bases much match assert( nScoreOfFalseMatch >= 8 ); if ( nScoreOfFalseMatch > pCP->nPrimersMaxMatchElsewhereScoreToUse_ ) { aFalseMatchContigsOfLocations.append( this ); aFalseMatchStrandsOfLocations.append( bLookInTopStrandNotBottomStrandOfContig ); int nUnpadded3PrimeEndOfMatch = nUnpaddedPosOfMatchingContig4Mer + nLengthOf4Mer - 1; if ( !bLookInTopStrandNotBottomStrandOfContig ) { nUnpadded3PrimeEndOfMatch = nComplementUnpaddedIndex( nUnpadded3PrimeEndOfMatch ); } aFalseMatchLocations.append( nUnpadded3PrimeEndOfMatch ); } } // for( int nIndexWithinBlock = 0; if ( pBlockOfLocationsOfA4Mer->pNextBlockOfLocationsOfA4Mer_ ) pBlockOfLocationsOfA4Mer = pBlockOfLocationsOfA4Mer->pNextBlockOfLocationsOfA4Mer_; else bAgain = false; } // while( bAgain ) } void Contig :: freeContigMatchTables( contigMatchTablesType* pContigMatchTables ) { free( pContigMatchTables->szUnpaddedBases_ ); free( pContigMatchTables->pUnpaddedQualityArray_ ); for( int n4Mer = 0; n4Mer < nNumberOf4Mers; ++n4Mer ) { blockOfLocationsOfA4MerType* pBlock = (pContigMatchTables->blockOfLocationsOfA4Mer_[ n4Mer ]).pNextBlockOfLocationsOfA4Mer_; while( pBlock ) { blockOfLocationsOfA4MerType* pNextBlock = pBlock->pNextBlockOfLocationsOfA4Mer_; free( pBlock ); pBlock = pNextBlock; } } // finally, deallocated the whole structure free( pContigMatchTables ); } void Contig :: figureOutConsistentForwardReversePairDepth() { aConsistentFwdRevPairDepth_.clear(); aConsistentFwdRevPairDepth_.resize( nGetUnpaddedSeqLength() ); for( int nUnpadded = nGetUnpaddedStartIndex(); nUnpadded <= nGetUnpaddedEndIndex(); ++nUnpadded ) { aConsistentFwdRevPairDepth_.insert( 0 ); } for( int nSub = 0; nSub < pSubcloneTemplatesForFwdRevPairDepth_->length(); ++nSub ) { subcloneTTemplate* pSub = (*pSubcloneTemplatesForFwdRevPairDepth_)[ nSub ]; LocatedFragment* pFwdRead = NULL; LocatedFragment* pRevRead = NULL; bool bHasFwdRevPair = false; char cProblem = ' '; assert( pSub->bHasAConsistentFwdRevPair( pFwdRead, pRevRead, bHasFwdRevPair, cProblem ) ); assert( bHasFwdRevPair ); // so we have a consistent fwd/rev pair and both are in this // contig // note: due to the way // figureOutConsistentForwardReversePairDepthOfAllContigs // picked the subclones for pSubcloneTemplatesForFwdRevPairDepth_, // all of the templates have an *aligned* fwd/rev pair. // Thus I will use the aligned clipping positions to indicate // the extent of the template insert. int nUnpaddedLeft; int nUnpaddedRight; if ( pFwdRead->bComp() ) { nUnpaddedLeft = nUnpaddedIndex( pRevRead->nGetAlignClipStart() ); nUnpaddedRight = nUnpaddedIndex( pFwdRead->nGetAlignClipEnd() ); } else { nUnpaddedLeft = nUnpaddedIndex( pFwdRead->nGetAlignClipStart() ); nUnpaddedRight = nUnpaddedIndex( pRevRead->nGetAlignClipEnd() ); } // put on the consensus if ( !bIntersect( nUnpaddedLeft, nUnpaddedRight, nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(), nUnpaddedLeft, nUnpaddedRight ) ) continue; // completely off consensus for( int nUnpadded = nUnpaddedLeft; nUnpadded <= nUnpaddedRight; ++nUnpadded ) { // I don't want to exceed the maximum # that an unsigned char // can hold if ( aConsistentFwdRevPairDepth_[ nUnpadded ] < 250 ) ++aConsistentFwdRevPairDepth_[ nUnpadded ]; } } } // void Contig :: figureOutConsistentForwardReversePairDepth() { // this is used to autoEdit so it writes to the .out file // it does not write to the .wrk file or to any Gui void Contig :: changeToXsInAllReads( const int nFirstBaseToMakeAnX, const char cChangeToXsRightOrLeft ) { int nLeftMostPosition; int nRightMostPosition; if ( cChangeToXsRightOrLeft == 'r' ) { nLeftMostPosition = nFirstBaseToMakeAnX; nRightMostPosition = nGetLastDisplayableContigPos(); } else if ( cChangeToXsRightOrLeft == 'l' ) { nLeftMostPosition = nGetFirstDisplayableContigPos(); nRightMostPosition = nFirstBaseToMakeAnX; } else assert( false ); ContigView* pContigView = pGetContigView( nLeftMostPosition, nRightMostPosition ); for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); int nConsPosLeftInRead; int nConsPosRightInRead; if ( !bIntersect( nLeftMostPosition, nRightMostPosition, pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nConsPosLeftInRead, nConsPosRightInRead ) ) continue; // I don't see how this could ever happen pLocFrag->changeToXs( nConsPosLeftInRead, nConsPosRightInRead ); pLocFrag->setChanged(); // record the change in the .out file fprintf( pAO, "read %s changed to Xs from %d to %d\n", pLocFrag->soGetName().data(), nUnpaddedIndex( pLocFrag->nGetAlignStart() ), nUnpaddedIndex( pLocFrag->nGetAlignEnd() ) ); } } class linkedHQD { public: LocatedFragment* pLocFrag_; char cDiscrepantBase_; linkedHQD* pNextLinkedHQD_; public: bool operator==( const linkedHQD& lhqd ) const { return( this == &lhqd ); } }; int nCompareLinkedHQDs( linkedHQD** ppHQD1, linkedHQD** ppHQD2 ) { if ( (*ppHQD1)->cDiscrepantBase_ < (*ppHQD2)->cDiscrepantBase_ ) return( -1 ); else if ( (*ppHQD1)->cDiscrepantBase_ > (*ppHQD2)->cDiscrepantBase_ ) return( 1 ); else { // same discrepant base if ( (*ppHQD1)->pLocFrag_->pSub_->soGetName() < (*ppHQD2)->pLocFrag_->pSub_->soGetName() ) { return( -1 ); } else if ( (*ppHQD1)->pLocFrag_->pSub_->soGetName() > (*ppHQD2)->pLocFrag_->pSub_->soGetName() ) return( 1 ); else return( 0 ); } } void Contig :: findMultipleHighQualityDiscrepancies( gotoList* pGotoList ) { // at each consensus base, mark it with the read if // there is a read that has a high quality discrepancy and // has a 9 base high quality window about that position // If there is more than 1 such read, just record the first such. // Comments added June 2007 // Look at aligned region of read, // find high quality discrepancies that // are not pads and the consensus is not a pad and that are not too close // to the unaligned part of the read and that are not within a tagged // compression or G dropout and the discrepancy must lie within a 9-base // window all of which is high quality and aligned (it must not have any // low quality bases close by) // Then go through the list of reads again and find any other read at the // same position that // is from a different subclone, is also aligned against the consensus, // and matches the discrepant read found above (they are both discrepancy // and they both agree). mbtPtrVector aDiscrepantRead( nGetPaddedSeqLength(), 1 ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nConsPosStartOnConsensus; int nConsPosEndOnConsensus; if ( bIntersect( pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nGetStartIndex(), nGetEndIndex(), nConsPosStartOnConsensus, nConsPosEndOnConsensus ) ) { for( int nConsPos = nConsPosStartOnConsensus; nConsPos <= nConsPosEndOnConsensus; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == cGetConsensusBase( nConsPos ) ) continue; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { if ( ntGetCons( nConsPos ).bIsPad() ) continue; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; if ( !pLocFrag->bIsInAlignedPartOfReadAndNotTooCloseToUnalignedPart( nConsPos ) ) continue; if ( pLocFrag->bIsWithinACompressionOrG_dropoutTag( nConsPos ) ) continue; // no need to record more than one discrepant read if ( aDiscrepantRead[ nConsPos ] ) continue; // check that it lies within a 9-base high quality window int nUnpaddedLeft = nUnpaddedIndex( nConsPos ) - 4; int nUnpaddedRight = nUnpaddedLeft + 8; int nPaddedLeft = nPaddedIndexFast( nUnpaddedLeft ); int nPaddedRight = nPaddedIndexFast( nUnpaddedRight ); // this window might go off the read. To avoid subscript // errors, intersect it with the read. It also might extend // off the consensus, so intersect it with the consensus. int nConsPosIntersectLeft; int nConsPosIntersectRight; if (!bIntersect( nPaddedLeft, nPaddedRight, pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nConsPosIntersectLeft, nConsPosIntersectRight ) ) continue; if (!bIntersect( nConsPosIntersectLeft, nConsPosIntersectRight, nGetStartIndex(), // consensus left nGetEndIndex(), // consensus right nConsPosIntersectLeft, nConsPosIntersectRight ) ) continue; bool bAllHighQuality = true; for( int nWindowConsPos = nConsPosIntersectLeft; nWindowConsPos <= nConsPosIntersectRight && bAllHighQuality; ++nWindowConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nWindowConsPos ).qualGetQualityWithout98or99() < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) bAllHighQuality = false; } if ( !bAllHighQuality ) continue; aDiscrepantRead[ nConsPos ] = pLocFrag; } // if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() } // for( int nConsPos = nConsPosStartOnConsensus; } // if ( bIntersect( pLocFrag->nGetAlignClipStart(), } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); // now go through each consensus position and see if there are other // discrepant reads from different subclones mbtValVectorOfBool aAlreadyRecorded( nGetPaddedSeqLength(), 1, "Contig::findMultipleHighQualityDiscrepancies2 aAlreadyRecorded" ); for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nConsPosStartOnConsensus; int nConsPosEndOnConsensus; if ( bIntersect( pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nGetStartIndex(), nGetEndIndex(), nConsPosStartOnConsensus, nConsPosEndOnConsensus ) ) { for( int nConsPos = nConsPosStartOnConsensus; nConsPos <= nConsPosEndOnConsensus; ++nConsPos ) { // we are not interested in discrepant reads where // there is no high quality discrepant read with a 10-base // high quality window if ( !aDiscrepantRead[ nConsPos ] ) continue; // this read must match the high quality discrepant read if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != aDiscrepantRead[ nConsPos ]->ntGetFragFromConsPos( nConsPos ).cGetBase() ) continue; // so we have a discrepant read where there is a hqd and // they share the same discrepant base. Check that it // isn't the same subclone (might even be the same read!) if ( pLocFrag->pSub_ == aDiscrepantRead[ nConsPos ]->pSub_ ) continue; if ( aAlreadyRecorded[ nConsPos ] ) continue; aAlreadyRecorded.setValue( nConsPos, true ); RWCString soMessage = "confirmed by: "; soMessage += pLocFrag->soGetName(); gotoItem* pGotoItem = new gotoItem( this, aDiscrepantRead[ nConsPos ], // LocatedFragment nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), soMessage.data(), true, // bPrefixContigToDescription NULL ); // pOtherData pGotoList->addToList( pGotoItem ); } // for( int nConsPos = nConsPosStartOnConsensus; } // if ( bIntersect( ... } // nRead = 0 ... } // (DG July 2009) It appears that x in a read is converted to quality // 0 so will not count as a discrepancy. n in a read is not converted // to quality 0 so will count as a discrepancy. void Contig :: navigateByHighlyDiscrepantPositions( const int nMinDiscrepantReads, const int nMaxDepthOfCoverage, const int nMinQuality, const bool bJustListIndels, const bool bIgnoreOtherReadsStartingAtSameLocation, const bool bIgnoreIfListedBasesInConsensus, const RWCString& soIgnoreIfTheseBasesInConsensus, const bool bGotoListNotArrays, gotoList* pGotoList ) { if ( bGotoListNotArrays ) { assert( pGotoList ); } else { if ( bDiscrepancyListsCalculated_ && ( nMinDiscrepantReads == nMinDiscrepantReads_ ) && ( nMaxDepthOfCoverage == nMaxDepthOfCoverage_ ) && ( nMinQuality == nMinQuality_ ) && ( bIgnoreOtherReadsStartingAtSameLocation == bIgnoreOtherReadsStartingAtSameLocation_ ) ) { return; } if ( paDiscrepanciesNotIndels_ ) delete paDiscrepanciesNotIndels_; if ( paDiscrepanciesJustIndels_ ) delete paDiscrepanciesJustIndels_; paDiscrepanciesNotIndels_ = new mbtValVectorOfBool( nGetPaddedSeqLength(), 1, "Contig::paDiscrepanciesNotIndels_" ); paDiscrepanciesJustIndels_ = new mbtValVectorOfBool( nGetPaddedSeqLength(), 1, "Contig::paDiscrepanciesJustIndels_" ); } const int nNumberOfBaseTypesAtEachConsPos = 6; RWT2DValVector aACGTAtEachConsPos( nGetPaddedSeqLength(), 6, // A, C, G, T, other, pad 1, // nConsPosStart "navigateByHighlyDiscrepantPositions::aACGTAtEachConsPos" ); int nStart; int nEnd; if ( bIgnoreOtherReadsStartingAtSameLocation ) { nStart = nGetDisplayableContigLength(); nEnd = nGetFirstDisplayableContigPos(); } else { // I really don't want the array at all, but to keep the compiler // happy, make it short nStart = 0; nEnd = 0; } const int nMANY_READS = 100000; if ( nGetNumberOfFragsInContig() > nMANY_READS ) fprintf( stdout, "navigateByHighlyDiscrepant positions" ); // this is just so that aSomeReadsStartAtThisPosition is // freed up when no longer needed { mbtValVectorOfBool aSomeReadsStartAtThisPosition( nStart, nEnd, "allHighQualityReadsDisagreeWithConsensus::aNumberOfReadsStartingAtThisPosition" ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( bIgnoreOtherReadsStartingAtSameLocation ) { if ( aSomeReadsStartAtThisPosition[ pLocFrag->nGetAlignStart() ] ) continue; aSomeReadsStartAtThisPosition.setValue( pLocFrag->nGetAlignStart(), true ); } if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag == pReferenceSequence_ ) continue; int nConsPosStartOnConsensus; int nConsPosEndOnConsensus; if ( ! bIntersect( pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nGetStartConsensusIndex(), nGetEndConsensusIndex(), nConsPosStartOnConsensus, nConsPosEndOnConsensus ) ) continue; // for time being, do to eliminate a base just because // it is in the low quality segment of the read (Feb 2008) // if ( ! bIntersect( nConsPosStartOnConsensus, // nConsPosEndOnConsensus, // pLocFrag->nGetHighQualityStart(), // pLocFrag->nGetHighQualityEnd(), // nConsPosStartOnConsensus, // nConsPosEndOnConsensus ) ) continue; if ( ( nRead != 0 ) && ( nRead % nMANY_READS == 0 ) ) { if ( pCP->nWhatIsRunning_ == nGraphicalConsedIsRunning ) { fprintf( stdout, "." ); fflush( stdout ); } } for( int nConsPos = nConsPosStartOnConsensus; nConsPos <= nConsPosEndOnConsensus; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99X0() < nMinQuality ) continue; ++aACGTAtEachConsPos.valGet( nConsPos, nNumberFromBase3[ pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ] ); } } // for( nRead = 0; nRead < nGetNumberOfFragsInContig(); } // // this is just so that aSomeReadsStartAtThisPosition is // freed up when no longer needed if ( nGetNumberOfFragsInContig() > nMANY_READS ) { fprintf( stdout, "done\n" ); fflush( stdout ); } RWCString soDescription( (size_t) 10000 ); const int n256 = 256; unsigned char ucConsensusIsOKToUse[ n256 ]; int n; for( n = 0; n < n256; ++n ) { ucConsensusIsOKToUse[n] = 1; } if ( bIgnoreIfListedBasesInConsensus ) { for( n = 0; n < soIgnoreIfTheseBasesInConsensus.length(); ++n ) { char cBase = soIgnoreIfTheseBasesInConsensus[n]; ucConsensusIsOKToUse[ tolower( cBase ) ] = 0; ucConsensusIsOKToUse[ toupper( cBase ) ] = 0; } } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !ucConsensusIsOKToUse[ cGetConsensusBase( nConsPos ) ] ) continue; int nConsensusBase = nNumberFromBase3[ cGetConsensusBase( nConsPos ) ]; int nDisagreeingReads = 0; int nTotalHighQualityReads = 0; for( int nBase = 0; nBase < nNumberOfBaseTypesAtEachConsPos; ++nBase ) { nTotalHighQualityReads += aACGTAtEachConsPos.valGet( nConsPos, nBase ); if ( bJustListIndels && ( nConsensusBase != nPAD ) && ( nBase != nPAD ) ) continue; if ( nBase != nConsensusBase ) { nDisagreeingReads += aACGTAtEachConsPos.valGet( nConsPos, nBase ); } } if ( nTotalHighQualityReads == 0 ) { continue; } if ( ( nMinDiscrepantReads <= nDisagreeingReads ) && ( nTotalHighQualityReads <= nMaxDepthOfCoverage ) ) { if ( bGotoListNotArrays ) { double dPerCentA = aACGTAtEachConsPos.valGet( nConsPos, nA ) * 100.0 / (double) nTotalHighQualityReads; double dPerCentC = aACGTAtEachConsPos.valGet( nConsPos, nC ) * 100.0 / (double) nTotalHighQualityReads; double dPerCentG = aACGTAtEachConsPos.valGet( nConsPos, nG ) * 100.0 / (double) nTotalHighQualityReads; double dPerCentT = aACGTAtEachConsPos.valGet( nConsPos, nT ) * 100.0 / (double) nTotalHighQualityReads; double dPerCentPad = aACGTAtEachConsPos.valGet( nConsPos, nPAD ) * 100.0 / (double) nTotalHighQualityReads; soDescription.nCurrentLength_ = sprintf( soDescription.data(), "%3d %5.1f%%%c %3d %5.1f%%%c %3d %5.1f%%%c %3d %5.1f%%%c %3d %5.1f%%%c %11s %s", aACGTAtEachConsPos.valGet( nConsPos, nA ), dPerCentA, ( nConsensusBase == nA ? 'r' : ' ' ), aACGTAtEachConsPos.valGet( nConsPos, nC ), dPerCentC, ( nConsensusBase == nC ? 'r' : ' ' ), aACGTAtEachConsPos.valGet( nConsPos, nG ), dPerCentG, ( nConsensusBase == nG ? 'r' : ' ' ), aACGTAtEachConsPos.valGet( nConsPos, nT ), dPerCentT, ( nConsensusBase == nT ? 'r' : ' ' ), aACGTAtEachConsPos.valGet( nConsPos, nPAD ), dPerCentPad, ( nConsensusBase == nPAD ? 'r' : ' ' ), soAddCommas( nUnpaddedIndex( nConsPos ) ).data(), this->soGetName().data() ); gotoItem* pGotoItem = new gotoItem( this, NULL, // LocatedFragment nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), soDescription, false ); // bPrefixContigToDescription pGotoList->addToList( pGotoItem ); } // if ( bGotoListNotArrays ) { else { if ( ( nConsensusBase == nPAD ) || ( nConsensusBase != nPAD && aACGTAtEachConsPos.valGet( nConsPos, nPAD ) >= nMinDiscrepantReads ) ) { paDiscrepanciesJustIndels_->setValue( nConsPos, true ); } else { paDiscrepanciesNotIndels_->setValue( nConsPos, true ); } } // if ( bGotoListNotArrays ) { else } // ( nMinDiscrepantReads <= nDisagreeingReads ) } // for( nConsPos = nGetStartConsensusIndex(); if ( !bGotoListNotArrays ) { bDiscrepancyListsCalculated_ = true; nMinDiscrepantReads_ = nMinDiscrepantReads; nMaxDepthOfCoverage_ = nMaxDepthOfCoverage; nMinQuality_ = nMinQuality; bIgnoreOtherReadsStartingAtSameLocation_ = bIgnoreOtherReadsStartingAtSameLocation; } } // void Contig :: navigateByHighlyDiscrepantPositions static int nCompareReadsByPosition( const LocatedFragment** ppLocFrag1, const LocatedFragment** ppLocFrag2 ) { if ( (*ppLocFrag1)->nAlignStartPos_ < (*ppLocFrag2)->nAlignStartPos_ ) return( -1 ); else if ( (*ppLocFrag1)->nAlignStartPos_ == (*ppLocFrag2)->nAlignStartPos_ ) return( 0 ); else return( 1 ); } void Contig :: sortReadsByPosition() { qsort( apLocatedFragment_.data(), apLocatedFragment_.length(), sizeof( LocatedFragment* ), ( ( int(*) ( const void*, const void*) ) nCompareReadsByPosition ) ); // check that qsort worked bool bSorted = true; for( int nRead = 1; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag1 = apLocatedFragment_[ nRead - 1 ]; LocatedFragment* pLocFrag2 = apLocatedFragment_[ nRead ]; if ( pLocFrag1->nAlignStartPos_ > pLocFrag2->nAlignStartPos_ ) { cerr << "reads " << nRead - 1 << " " << pLocFrag1->soGetName() << " and " << nRead << " " << pLocFrag2->soGetName() << " are out of order." << endl; bSorted = false; } } if ( !bSorted ) { THROW_ERROR( "qsort failed" ); } // note that the reads are no longer in alphabetical order so the // next pFindReadByName must sort them alphabetically first apLocatedFragment_.bIsSorted_ = false; } void Contig :: tellPhrapNotToOverlapReadsDiscrepantAtThisLocation( const int nConsPosWhereToSplit ) { // need to get a ContigView of reads ContigView* pContigView = pGetContigView( nConsPosWhereToSplit - ContigView::nWINDOW_SIZE, nConsPosWhereToSplit + ContigView::nWINDOW_SIZE ); pContigView->tellPhrapNotToOverlapReadsDiscrepantAtThisLocation( nConsPosWhereToSplit, false, // using Gui NULL ); delete pContigView; } void Contig :: searchForHighQualityDiscrepanciesWithReference( LocatedFragment* pLocFragRef, gotoList* pGotoList ) { int nLeftmostPos = pLocFragRef->nGetAlignStart(); int nRightmostPos = pLocFragRef->nGetAlignEnd(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag == pLocFragRef ) continue; int nIntersectLeft; int nIntersectRight; if ( !bIntersect( nLeftmostPos, nRightmostPos, pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nIntersectLeft, nIntersectRight ) ) continue; // the read did // overlap the reference sequence for( int nConsPos = nIntersectLeft; nConsPos <= nIntersectRight; ++nConsPos ) { // do not allow x's to count as discrepancies if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != pLocFragRef->ntGetFragFromConsPos( nConsPos ).cGetBase() && pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != 'x' ) { // discrepancy! But is it high quality? if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { gotoItem* pGotoItem = new gotoItem( this, pLocFrag, nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), "", // message true, // // bPrefixContigToDescription NULL ); // pOtherData pGotoList->addToList( pGotoItem ); } } } // for( int nConsPos = nIntersectLeft; ... } // for( int nRead = 0; pGotoList->sortByPosition(); } const int nNumberOfBasesForContigEndPairTags = 10; int Contig :: nGetGoodPlaceForContigEndPairTag( const int nWhichEnd ) { int nUnpadded; if ( nWhichEnd == nRightGap ) { if ( bIsThereAHighQualitySegment() ) { nUnpadded = nGetHighQualitySegmentEnd() - nNumberOfBasesForContigEndPairTags + 1; } else { nUnpadded = nGetUnpaddedEndIndex() - nNumberOfBasesForContigEndPairTags + 1; } if ( nUnpadded < nGetUnpaddedStartIndex() ) nUnpadded = nGetUnpaddedStartIndex(); } else { assert( nWhichEnd == nLeftGap ); // left gap if ( bIsThereAHighQualitySegment() ) { nUnpadded = nGetHighQualitySegmentStart() + nNumberOfBasesForContigEndPairTags - 1; } else { nUnpadded = nGetUnpaddedStartIndex() + nNumberOfBasesForContigEndPairTags - 1; } if ( nUnpadded > nGetUnpaddedEndIndex() ) nUnpadded = nGetUnpaddedEndIndex(); } int nConsPos = nPaddedIndexFast( nUnpadded ); assert( nGetStartIndex() <= nConsPos ); assert( nConsPos <= nGetEndIndex() ); return( nConsPos ); } void Contig :: complementContigIfNecessary( RWCString& soErrorMessage ) { soErrorMessage = ""; if ( aContigOrientationWholeReadItems_.length() == 0 ) return; // check for consistency among the wholeReadItems bool bNeedToComplementContig = false; bool bError = false; for( int nWRI = 0; nWRI < aContigOrientationWholeReadItems_.length(); ++nWRI ) { wholeReadItem* pWRI = aContigOrientationWholeReadItems_[ nWRI ]; LocatedFragment* pLocFrag = pWRI->pLocFrag_; bool bContigComplementedRelativeToRead = pLocFrag->bComp(); bool bContigShouldBeComplementedRelativeToRead; if ( pWRI->soData_.bStartsWith( szSAME_AS_READ ) ) bContigShouldBeComplementedRelativeToRead = false; else if ( pWRI->soData_.bStartsWith( szOPPOSITE_READ ) ) bContigShouldBeComplementedRelativeToRead = true; else { RWCString soError = "Error in whole read item in read "; soError += pLocFrag->soGetName(); soError += " whole read item contains "; soError += pWRI->soData_; soError += " but should start with either "; soError += szSAME_AS_READ; soError += " or "; soError += szOPPOSITE_READ; THROW_ERROR( soError ); } if ( nWRI == 0 ) { bNeedToComplementContig = exclusive_or( bContigComplementedRelativeToRead, bContigShouldBeComplementedRelativeToRead ); } else { if ( bNeedToComplementContig != exclusive_or( bContigComplementedRelativeToRead, bContigShouldBeComplementedRelativeToRead ) ) { bError = true; } } } // for( int nWRI = 0; ... if ( bError ) { soErrorMessage = "contigOrientation whole read items are inconsistent in reads "; for( int nWRI = 0; nWRI < aContigOrientationWholeReadItems_.length(); ++nWRI ) { wholeReadItem* pWRI = aContigOrientationWholeReadItems_[ nWRI ]; soErrorMessage += " "; soErrorMessage += pWRI->pLocFrag_->soGetName(); } } else { if ( bNeedToComplementContig ) complementContig(); return; } } int Contig :: nGetMeanAlignedRegionOfReads() { double dSum = 0.0; double dSumOfSquares = 0.0; int nRead; for( nRead = 0; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag = apLocatedFragment_[ nRead ]; int nUnpaddedAlignedLength; if ( pLocFrag->bIsWholeReadUnaligned() ) { nUnpaddedAlignedLength = 0; } else { nUnpaddedAlignedLength = pLocFrag->nGetAlignClipEndUnpadded() - pLocFrag->nGetAlignClipStartUnpadded(); assert( nUnpaddedAlignedLength >= 0 ); } dSum += nUnpaddedAlignedLength; dSumOfSquares += ( nUnpaddedAlignedLength * nUnpaddedAlignedLength ); } double dMean = dSum / apLocatedFragment_.length(); double dStandardDeviation = sqrt( dSumOfSquares / apLocatedFragment_.length() - dMean * dMean ); int nLowerBound = dMean - 3 * dStandardDeviation; int nUpperBound = dMean + 3 * dStandardDeviation; // now go through again, calculating the statistics for // the reads, but ignoring outliers dSum = 0.0; // dSumOfSquares = 0.0; int nNumberOfReads = 0; for( nRead = 0; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag = apLocatedFragment_[ nRead ]; int nUnpaddedAlignedLength; if ( pLocFrag->bIsWholeReadUnaligned() ) { nUnpaddedAlignedLength = 0; } else { nUnpaddedAlignedLength = pLocFrag->nGetAlignClipEndUnpadded() - pLocFrag->nGetAlignClipStartUnpadded(); assert( nUnpaddedAlignedLength >= 0 ); } if ( nLowerBound <= nUnpaddedAlignedLength && nUnpaddedAlignedLength <= nUpperBound ) { dSum += nUnpaddedAlignedLength; // dSumOfSquares += ( nUnpaddedAlignedLength * nUnpaddedAlignedLength ); ++nNumberOfReads; } } int nMean = dSum / nNumberOfReads; return( nMean ); } void Contig :: fixOrientation( RWCString& soErrorMessage ) { soErrorMessage = ""; unfixOrientation(); // look for a read that is near the middle of the contig // and has a long aligned region (per Phil) int nMeanAlignedLength = nGetMeanAlignedRegionOfReads(); // What is the "middle" of the contig? The center of the consensus? // Let's say the center of the high quality region. And we want // to try to find a read that has its center near there. // And where is the center of the read? How about the middle of // the high quality segment? int nCenterUnpaddedConsPos = ( nGetHighQualitySegmentStart() + nGetHighQualitySegmentEnd() ) / 2; LocatedFragment* pBestLocFrag = NULL; int nBestReadDistance = 2 * nGetUnpaddedSeqLength(); // a number guaranteed to be larger than any found for( int nRead = 0; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag = apLocatedFragment_[ nRead ]; if ( pLocFrag->nGetAlignedLengthUnpadded() >= nMeanAlignedLength ) { int nDistance = ABS( pLocFrag->nGetCenterOfAlignedRegionUnpadded() - nCenterUnpaddedConsPos ); if ( nDistance < nBestReadDistance ) { nDistance = nBestReadDistance; pBestLocFrag = pLocFrag; } } } if ( !pBestLocFrag ) { soErrorMessage.increaseMaxLengthIfNecessary( 1000 ); soErrorMessage.appendFormat( "could not fix the orientation of contig %s (mean aligned length = %d)", soGetName().data(), nMeanAlignedLength ); return; } // we found a read. Let's add a wholeReadItem to it RWCString soWRIData = ( pBestLocFrag->bComp() ? szOPPOSITE_READ : szSAME_AS_READ ); wholeReadItem* pWR = new wholeReadItem( pBestLocFrag, (char*) szCONTIG_ORIENTATION, "consed", soGetDateTime( nNoSlashes ), soWRIData ); pWR->appendThyself(); cerr << "contigOrientation wholeReadItem added to " << pBestLocFrag->soGetName() << endl; pBestLocFrag->setChanged(); // so this read will be written out } void Contig :: unfixOrientation() { for( int nContigOrientation = aContigOrientationWholeReadItems_.length() - 1; nContigOrientation >= 0; --nContigOrientation ) { wholeReadItem* pWRI = aContigOrientationWholeReadItems_[ nContigOrientation ]; aContigOrientationWholeReadItems_.removeAt( nContigOrientation ); LocatedFragment* pLocFrag = pWRI->pLocFrag_; pLocFrag->aWholeReadItems_.remove( pWRI ); delete pWRI; pLocFrag->setChanged(); } } void Contig :: calculateReadDepth() { if ( pReadDepthArray_ ) { delete pReadDepthArray_; } double dErrorProbability = dErrorRateFromQuality[ pCP->nAssemblyViewReadDepthQuality_ ]; // 1-based array so 0th position is unused and make array 1 longer pReadDepthArray_ = new RWTValVector( nGetUnpaddedSeqLength() + 1, 0, "Contig::pReadDepthArray_" ); for( int nRead = 0; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag = apLocatedFragment_[ nRead ]; int nUnpaddedConsPosLeft; int nUnpaddedConsPosRight; if ( !pLocFrag->bGetUserDefinedHighQualitySegment( dErrorProbability, nUnpaddedConsPosLeft, nUnpaddedConsPosRight ) ) { // there is no high quality segment for this read continue; } if ( !bIntersect( nUnpaddedConsPosLeft, nUnpaddedConsPosRight, nGetUnpaddedStartIndex(), nGetUnpaddedEndIndex(), nUnpaddedConsPosLeft, nUnpaddedConsPosRight ) ) { // the high quality segment is completely off the consensus continue; } for( int nUnpadded = nUnpaddedConsPosLeft; nUnpadded <= nUnpaddedConsPosRight; ++nUnpadded ) { ++((*pReadDepthArray_)[ nUnpadded ]); } } bReadDepthArrayCalculated_ = true; nReadDepthArrayCalculatedWithQuality_ = pCP->nAssemblyViewReadDepthQuality_; } void Contig :: calculateReadDepthIfNecessary() { if ( bReadDepthArrayCalculated_ && nReadDepthArrayCalculatedWithQuality_ == pCP->nAssemblyViewReadDepthQuality_ && pReadDepthArray_ && ( nGetUnpaddedEndIndex() == pReadDepthArray_->nGetEndIndex() ) ) return; calculateReadDepth(); } // for assemblyView. similar to lookForCloneEndTags in contig2.cpp // which is for Autofinish void Contig :: lookForCloneEndTags2( RWTPtrOrderedVector* pCloneEndTags ) { for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pTag->soGetTagType() != "cloneEnd" ) continue; // found a cloneEnd tag! pCloneEndTags->insert( pTag ); } } // now look in consensus tags for( int nConsensusTag = 0; nConsensusTag < nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pGetTag( nConsensusTag ); if ( pTag->soGetTagType() != "cloneEnd" ) continue; // found a cloneEnd tag! pCloneEndTags->insert( pTag ); } } void Contig :: setStartNumberingUnpaddedConsensusAtUserDefinedIfNecessary() { if ( bSetStartNumberingUnpaddedConsensusAtUserDefined_ ) return; for( int nTag = 0; nTag < nGetNumberOfTags(); ++nTag ) { tag* pTag = pGetTag( nTag ); if ( pTag->soType_ != "startNumberingConsensus" ) continue; if ( !bIsNumericMaybeWithWhitespace( pTag->soMiscData_, nStartNumberingUnpaddedConsensusAtUserDefined_ ) ) { RWCString soError = "tag "; soError += pTag->soGetBasicInformation(); soError += " has misc data: "; soError += pTag->soMiscData_; soError += " which should be numeric"; popupErrorMessage( soError ); } } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pTag->soType_ != "startNumberingConsensus" ) continue; if ( !bIsNumericMaybeWithWhitespace( pTag->soMiscData_, nStartNumberingUnpaddedConsensusAtUserDefined_ ) ) { RWCString soError = "tag "; soError += pTag->soGetBasicInformation(); soError += " has misc data: "; soError += pTag->soMiscData_; soError += " which should be numeric"; popupErrorMessage( soError ); } } } bSetStartNumberingUnpaddedConsensusAtUserDefined_ = true; } void Contig :: fixReadLists() { apLocatedFragment2_.clear(); apVeryLongReads_.clear(); if ( apLocatedFragment2_.nMaxLength_ == 0 ) { apLocatedFragment2_.resize( apLocatedFragment_.length() ); } for( int nRead = 0; nRead < apLocatedFragment_.length(); ++nRead ) { LocatedFragment* pLocFrag = apLocatedFragment_[ nRead ]; if ( pLocFrag->nGetAlignEnd() - pLocFrag->nGetAlignStart() + 1 > pCP->nMaxLengthOfReadsInapLocatedFragment2_ ) { apVeryLongReads_.insert( pLocFrag ); } else { apLocatedFragment2_.insert( pLocFrag ); } } apLocatedFragment2_.resort(); bReadListsNeedFixing_ = false; } static char cBaseFromNumber[6]; static bool bBaseFromNumberSet = false; void Contig :: appendListOfQuestionableConsensusBases( gotoList* pGotoList ) { if ( !bBaseFromNumberSet ) { bBaseFromNumberSet = true; cBaseFromNumber[0] = 'a'; cBaseFromNumber[1] = 'c'; cBaseFromNumber[2] = 'g'; cBaseFromNumber[3] = 't'; cBaseFromNumber[4] = '?'; cBaseFromNumber[5] = '*'; } const int nNumberOfBaseTypesAtEachConsPos = 6; RWT2DValVector aACGTAtEachConsPos( nGetPaddedSeqLength(), nNumberOfBaseTypesAtEachConsPos, // A, C, G, T, other, pad 1, // nConsPosStart "appendListOfQuestionableConsensusBases" ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag == pReferenceSequence_ ) continue; int nConsPosStartOnConsensus; int nConsPosEndOnConsensus; if ( ! bIntersect( pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nGetStartConsensusIndex(), nGetEndConsensusIndex(), nConsPosStartOnConsensus, nConsPosEndOnConsensus ) ) continue; if ( nRead % 100000 == 0 ) { if ( pCP->nWhatIsRunning_ == nGraphicalConsedIsRunning ) { fprintf( stdout, "." ); fflush( stdout ); } } for( int nConsPos = nConsPosStartOnConsensus; nConsPos <= nConsPosEndOnConsensus; ++nConsPos ) { ++aACGTAtEachConsPos.valGet( nConsPos, nNumberFromBase3[ pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ] ); } } // for( nRead = 0; nRead < nGetNumberOfFragsInContig(); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { int nMostReads = -666; int nBaseOfMostReads; int nBase; for( nBase = 0; nBase < nNumberOfBaseTypesAtEachConsPos; ++nBase ) { int nReadsThisBase = aACGTAtEachConsPos.valGet( nConsPos, nBase ); if ( aACGTAtEachConsPos.valGet( nConsPos, nBase ) > nMostReads ) { nMostReads = aACGTAtEachConsPos.valGet( nConsPos, nBase ); nBaseOfMostReads = nBase; } } int nConsensusBase = nNumberFromBase3[ cGetConsensusBase( nConsPos )]; int nTotalNumberOfReads = 0; for( nBase = 0; nBase < nNumberOfBaseTypesAtEachConsPos; ++nBase ) { nTotalNumberOfReads += aACGTAtEachConsPos.valGet( nConsPos, nBase ); } if ( ( nConsensusBase != nBaseOfMostReads ) && ( nTotalNumberOfReads > 2 ) ) { RWCString soDescription( (size_t) 500 ); soDescription.nCurrentLength_ = sprintf( soDescription.data(), "%15s %10s reads: %d %c ", soGetName().data(), soAddCommas( nUnpaddedIndex( nConsPos ) ).data(), nTotalNumberOfReads, cGetConsensusBase( nConsPos ) ); if ( aACGTAtEachConsPos.valGet( nConsPos, nConsensusBase ) == nMostReads ) { // case in which there are at least 2 bases with the same // number of reads. The consensus is one of them. soDescription += " same # of reads: "; soDescription += cBaseFromNumber[ nBaseOfMostReads ]; } else { soDescription += cBaseFromNumber[ nBaseOfMostReads ]; } gotoItem* pGotoItem = new gotoItem( this, NULL, // LocatedFragment nConsPos, nConsPos, nUnpaddedIndex( nConsPos ), nUnpaddedIndex( nConsPos ), soDescription, false ); // bPrefixContigToDescription pGotoList->addToList( pGotoItem ); } } } void Contig :: fixRunsInConsensus() { bool bInRun = false; char cPreviousBase = ' '; int nStartOfRunConsPos; int nEndOfRunConsPos; int nConsPos; int nStartOfRunUnpadded; setUnpaddedPositionsArray(); int nUnpadded = 0; for( nConsPos = nGetStartIndex(); nConsPos <= nGetEndIndex(); ++nConsPos ) { char cCurrent = ntGetConsUnsafe( nConsPos ).cGetBase(); if ( cCurrent != '*' ) ++nUnpadded; if ( ( cCurrent == cPreviousBase ) && ( cCurrent != '*' ) ) { if ( !bInRun ) { bInRun = true; nStartOfRunConsPos = nConsPos - 1; nStartOfRunUnpadded = nUnpadded - 1; } } else { if ( bInRun ) { nEndOfRunConsPos = nConsPos - 1; maybeFixConsensusInRun( nStartOfRunConsPos, nEndOfRunConsPos, cPreviousBase, nStartOfRunUnpadded, nUnpadded ); bInRun = false; } cPreviousBase = cCurrent; } } if ( bInRun ) { nEndOfRunConsPos = nGetEndIndex(); maybeFixConsensusInRun( nStartOfRunConsPos, nEndOfRunConsPos, cPreviousBase, nStartOfRunUnpadded, nUnpadded ); } setUnpaddedPositionsArray(); } // void Contig :: fixRunsInConsensus() { void Contig :: maybeFixConsensusInRun( int nStartOfRunConsPos, int nEndOfRunConsPos, const char cBaseOfRun, const int nStartOfRunUnpadded, int& nAfterEndOfRunUnpadded ) { int nConsPos; bool bSomeConsensusBaseWasChanged = false; bool bModifiedSomeRead = false; RWCString soOldBases( (size_t) nEndOfRunConsPos - nStartOfRunConsPos + 1 ); ContigView* pContigView; if ( nEndOfRunConsPos < nGetEndIndex() ) { if ( ntGetCons( nEndOfRunConsPos + 1 ).bIsPad() ) { // we do not want this since the neighboring column // may actually be included in this run goto prepareToReturn; } } pContigView = pGetContigView( nStartOfRunConsPos, nEndOfRunConsPos ); for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); int nConsPosStartOnRead; int nConsPosEndOnRead; assert( bIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), nStartOfRunConsPos, nEndOfRunConsPos, nConsPosStartOnRead, nConsPosEndOnRead ) ); // I'm only interested in cases in which the read has at // least 1 pad and 1 cBaseOfRun and consists of only those 2 // types int nNumberOfPads = 0; int nNumberOfBasesOfRun = 0; bool bSomethingElse = false; int nConsPos; for( nConsPos = nConsPosStartOnRead; nConsPos <= nConsPosEndOnRead; ++nConsPos ) { char cBase = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); if ( cBase == '*' ) ++nNumberOfPads; else if ( cBase == cBaseOfRun ) ++nNumberOfBasesOfRun; else { bSomethingElse = true; break; } } if ( !bSomethingElse && ( nNumberOfPads > 0 ) && ( nNumberOfBasesOfRun > 0 ) ) { // see if pads need to be moved bool bPadsAllAtEnd = true; int nLeftMostPadIfAllAtEnd = nConsPosEndOnRead - nNumberOfPads + 1; for( nConsPos = nConsPosEndOnRead; nConsPos >= nLeftMostPadIfAllAtEnd; --nConsPos ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { bPadsAllAtEnd = false; break; } } if ( !bPadsAllAtEnd ) { // we have to move pads // first record the quality values of the bases (not pads) RWTValOrderedVector aQualityValues( nNumberOfBasesOfRun ); unsigned char ucLastBaseQualityNo98or99 = 0; for( nConsPos = nConsPosStartOnRead; nConsPos <= nConsPosEndOnRead; ++nConsPos ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { // transfer quality, even if 98 or 99 aQualityValues.insert( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ); if ( aQualityValues.length() == nNumberOfBasesOfRun ) { // this is the last non-pad base ucLastBaseQualityNo98or99 = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99(); } } } int nQualityArrayIndex = 0; for( nConsPos = nConsPosStartOnRead; nConsPos < nLeftMostPadIfAllAtEnd; ++nConsPos ) { int nReadIndex = pLocFrag->nGetFragIndexFromConsPos(nConsPos); pLocFrag->Sequence::setBaseAtPos( nReadIndex, cBaseOfRun ); pLocFrag->Sequence::setQualityAtSeqPos( nReadIndex, aQualityValues[ nQualityArrayIndex ] ); ++nQualityArrayIndex; } // figure out quality to set for pads for this read unsigned char ucNextBaseQuality; if ( nConsPosEndOnRead < pLocFrag->nGetAlignEnd() ) { ucNextBaseQuality = pLocFrag->ntGetFragFromConsPos( nConsPosEndOnRead + 1 ).qualGetQualityWithout98or99(); } else { ucNextBaseQuality = ucLastBaseQualityNo98or99; } unsigned char ucQualityForAllPadsThisRead = ( ucLastBaseQualityNo98or99 + ucNextBaseQuality ) / 2; for( nConsPos = nLeftMostPadIfAllAtEnd; nConsPos <= nConsPosEndOnRead; ++nConsPos ) { int nReadIndex = pLocFrag->nGetFragIndexFromConsPos(nConsPos); pLocFrag->Sequence::setBaseAtPos( nReadIndex, '*' ); pLocFrag->Sequence::setQualityAtSeqPos( nReadIndex, ucQualityForAllPadsThisRead ); } // add an edit tag tag* pTag = new tag( pLocFrag, NULL, // contig "edit", nConsPosStartOnRead, nConsPosEndOnRead, false, // bWriteToPhdFileNotAceFile "shifted pads", // soCommand "autoEdit", // soSource "", // current date false ); // bNoTrans pLocFrag->aTags_.insert( pTag ); // this cannot be set or else thousands of phd files will // be written. It need not be set because we are just // moving pads--not changing bases // pLocFrag->setChanged(); bModifiedSomeRead = true; } // if ( !bPadsAllAtEnd ) { } // if ( !bSomethingElse && } // for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); if ( bModifiedSomeRead ) { // keep a an old copy of the bases int nConsPos; for( nConsPos = nStartOfRunConsPos; nConsPos <= nEndOfRunConsPos; ++nConsPos ) { soOldBases += RWCString( ntGetCons( nConsPos ).cGetBase() ); } // now need to recalculate the consensus here. Phil's method would // be to do this in one pass, figuring out the combined quality of // each base and taking the maximum. We already know the RWTValVector aThereAreMultipleTopStrandReads( nNumberFromBase3MaxValue + 1, false, "aThereAreMultipleTopStrandReads" ); RWTValVector aThereAreMultipleBottomStrandReads( nNumberFromBase3MaxValue + 1, false, "aThereAreMultipleBottomStrandReads" ); RWTPtrVector aHighestQualityTopStrandRead( nNumberFromBase3MaxValue + 1, NULL, "aHighestQualityTopStrandRead" ); RWTPtrVector aHighestQualityBottomStrandRead( nNumberFromBase3MaxValue + 1, NULL, "aHighestQualityBottomStrandRead" ); RWTValVector aTopStrandReadQuality( nNumberFromBase3MaxValue + 1, 0, "aTopStrandReadQuality" ); RWTValVector aBottomStrandReadQuality( nNumberFromBase3MaxValue + 1, 0, "aBottomStrandReadQuality" ); bool bFirstTime = false; for( nConsPos = nStartOfRunConsPos; nConsPos <= nEndOfRunConsPos; ++nConsPos ) { if ( !bFirstTime ) { aThereAreMultipleTopStrandReads.setAllValues( false ); aThereAreMultipleBottomStrandReads.setAllValues( false ); aHighestQualityTopStrandRead.setAllValues( NULL ); aHighestQualityBottomStrandRead.setAllValues( NULL ); aTopStrandReadQuality.setAllValues( 0 ); aBottomStrandReadQuality.setAllValues( 0 ); } else { bFirstTime = false; } for( int nRead = 0; nRead < pContigView->nGetNumberOfFragments(); ++nRead ) { LocatedFragment* pLocFrag = pContigView->pLocatedFragmentGet( nRead ); if ( !pLocFrag->bIsInRead( nConsPos ) ) continue; Ntide nt = pLocFrag->ntGetFragFromConsPos( nConsPos ); int nBase = nNumberFromBase3[ nt.cGetBase() ]; // ignore reads with X, N, or ambiguity codes if ( nBase == nNOT_PAD_OR_BASE ) continue; unsigned char ucQuality = nt.qualGetQualityWithout98or99X0(); LocatedFragment* pReadWithHighestQuality = NULL; unsigned char ucHighestQuality = 0; int nStartPositionInDirectionOfSequencing = -666; if ( pLocFrag->bComp() ) { pReadWithHighestQuality = aHighestQualityBottomStrandRead[ nBase ]; ucHighestQuality = aBottomStrandReadQuality[ nBase ]; if ( pReadWithHighestQuality ) { if ( pReadWithHighestQuality->nGetAlignEnd() != pLocFrag->nGetAlignEnd() ) { aThereAreMultipleBottomStrandReads[ nBase ] = true; } } } else { pReadWithHighestQuality = aHighestQualityTopStrandRead[ nBase ]; ucHighestQuality = aTopStrandReadQuality[ nBase ]; if ( pReadWithHighestQuality ) { if ( pReadWithHighestQuality->nGetAlignStart() != pLocFrag->nGetAlignStart() ) { aThereAreMultipleTopStrandReads[ nBase ] = true; } } } if ( !pReadWithHighestQuality || ( ucQuality > ucHighestQuality ) ) { // set new highest quality read if ( pLocFrag->bComp() ) { aHighestQualityBottomStrandRead[ nBase ] = pLocFrag; aBottomStrandReadQuality[ nBase ] = ucQuality; } else { aHighestQualityTopStrandRead[ nBase ] = pLocFrag; aTopStrandReadQuality[ nBase ] = ucQuality; } } } // for( nRead = 0; nRead < pContigView->nGetNumberOfFragments(); unsigned char ucHighestCombinedQuality = 0; int nBaseOfHighestCombinedQuality = -666; for( int nBase = 0; nBase <= nNumberFromBase3MaxValue; ++nBase ) { unsigned char ucCombinedQuality = aTopStrandReadQuality[ nBase ] + aBottomStrandReadQuality[ nBase ]; if ( aThereAreMultipleTopStrandReads[ nBase ] || aThereAreMultipleBottomStrandReads[ nBase ] ) ucCombinedQuality += 5; if ( ( nBase == 0 ) || // first time ( ucCombinedQuality > ucHighestCombinedQuality ) ) { ucHighestCombinedQuality = ucCombinedQuality; nBaseOfHighestCombinedQuality = nBase; } } if ( ucHighestCombinedQuality > 90 ) { ucHighestCombinedQuality = 90; } assert( nBaseOfHighestCombinedQuality != -666 ); char cNewBase = cBaseFromNumber3[ nBaseOfHighestCombinedQuality ]; if ( ntGetCons( nConsPos ).cGetBase() == cNewBase ) continue; // if reach here, need to change the consensus base. // I'm not going to change the base segments since // bGetDataStructureOk doesn't check that the consensus matches // the read and base segments are going away. char cOldBase = ntGetCons( nConsPos ).cGetBase(); bool bNewNtideIsPad = ( cNewBase == '*' ? true : false ); bool bOldNtideIsPad = ( cOldBase == '*' ? true : false ); Sequence::setBaseAtPos( nConsPos, cNewBase ); Sequence::setQualityAtSeqPos( nConsPos, ucHighestCombinedQuality ); bSomeConsensusBaseWasChanged = true; // add edit tag tag* pTag = new tag( NULL, // LocatedFragment this, // Contig "edit", nConsPos, nConsPos, false, // bWriteToPhdFileNotAceFile "changed from " + RWCString( cOldBase ) + " to " + RWCString( cNewBase ), "autoEdit", soEmptyString, // current date/time false ); // bNoTrans addConsensusTag( pTag ); } // for( nConsPos = nStartOfRunConsPos; nConsPos <= nEndOfRunConsPos; } // bModifiedSomeRead prepareToReturn: int nEndOfRunUnpadded = nStartOfRunUnpadded - 1; // The reason for the -1 above is that we don't want to count // the first base yet--it might have been changed to a pad. We // will count it in the loop below for( nConsPos = nStartOfRunConsPos; nConsPos <= nEndOfRunConsPos; ++nConsPos ) { if ( !ntGetCons( nConsPos ).bIsPad() ) ++nEndOfRunUnpadded; } if ( bSomeConsensusBaseWasChanged ) { fprintf( pCUSTOM_NAVIGATION, "BEGIN_REGION\n" ); fprintf( pCUSTOM_NAVIGATION, "TYPE: CONSENSUS\n" ); fprintf( pCUSTOM_NAVIGATION, "CONTIG: %s\n", soGetName().data() ); fprintf( pCUSTOM_NAVIGATION, "UNPADDED_CONS_POS: %d %d\n", nStartOfRunUnpadded, nEndOfRunUnpadded ); fprintf( pCUSTOM_NAVIGATION, "COMMENT: padded %d %d old: %s\n", nStartOfRunConsPos, nEndOfRunConsPos, soOldBases.data() ); fprintf( pCUSTOM_NAVIGATION, "END_REGION\n\n" ); } // this is the case in which the end of the run is *not* the end // of the consensus. If it is the end of the consensus, don't even // bother to set nAfterEndOfRunUnpadded because it won't be used. if ( nEndOfRunConsPos < nGetEndIndex() ) { if ( ntGetCons( nEndOfRunConsPos + 1 ).bIsPad() ) nAfterEndOfRunUnpadded = nEndOfRunUnpadded; else nAfterEndOfRunUnpadded = nEndOfRunUnpadded + 1; } } // void Contig :: maybeFixConsensusInRun( const int nStartOfRunConsPos, static int nCompareIntegers( const int* pA, const int* pB ) { if ( *pA < *pB ) { return -1; } else if (*pA > *pB ) { return 1; } else { return 0; } } void Contig :: getDepthOfCoverage( int& nMinReadDepth, int& nMaxReadDepth, int& nMedianDepth, int& nMeanDepth ) { // too large, but remember I'm using 1-based coordinates RWTValVector aReadDepth( nGetPaddedConsLength() + 2, 0, "aReadDepth" + soGetName() ); for( int nRead = 0; nRead < nGetNumberOfReads(); ++nRead ) { LocatedFragment* pLocFrag = pGetLocatedFragment( nRead ); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedStart = pLocFrag->nGetAlignClipStart(); int nAlignedEnd = pLocFrag->nGetAlignClipEnd(); if ( !bIntersect( nAlignedStart, nAlignedEnd, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nAlignedStart, nAlignedEnd ) ) continue; bool bInHighQualityRegion = false; for( int nConsPos = nAlignedStart; nConsPos <= nAlignedEnd; ++nConsPos ) { bool bHighQuality = ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() >= pCP->nQualityThresholdForNavigateByDepthOfCoverage_ ) ? true : false; if ( bHighQuality && !bInHighQualityRegion ) { ++aReadDepth[ nConsPos ]; bInHighQualityRegion = true; } else if ( !bHighQuality && bInHighQualityRegion ) { --aReadDepth[ nConsPos ]; bInHighQualityRegion = false; } } if ( bInHighQualityRegion ) { // the read was high quality up to the end, so after the end, // terminate this section of high quality --aReadDepth[ nAlignedEnd + 1 ]; } } // for( int nRead = 0; ... // convert to read depth. At same time, calculate min, max, and mean // read depth long lSum = aReadDepth[ nGetStartIndex() ]; nMaxReadDepth = aReadDepth[ nGetStartIndex() ]; nMinReadDepth = aReadDepth[ nGetStartIndex() ]; int nConsPos; for( nConsPos = nGetStartIndex() + 1; nConsPos <= nGetEndIndex(); ++nConsPos ) { aReadDepth[ nConsPos ] += aReadDepth[ nConsPos - 1 ]; lSum += aReadDepth[ nConsPos ]; nMaxReadDepth = MAX( nMaxReadDepth, aReadDepth[ nConsPos ] ); nMinReadDepth = MIN( nMinReadDepth, aReadDepth[ nConsPos ] ); } nMeanDepth = lSum / ( nGetEndIndex() - nGetStartIndex() + 1 ); // currently we are sorting the whole array, including the 0th //element size_t nNumberOfElements = aReadDepth.nCurrentLength_; size_t nSizeOfAnElement = sizeof( int ); void* pArray = aReadDepth.pArray_; qsort( pArray, nNumberOfElements, sizeof(int), ( int(*) ( const void*, const void*) ) nCompareIntegers ); // check that it is sorted for( nConsPos = nGetStartIndex() + 1; nConsPos <= nGetEndIndex(); ++nConsPos ) { // printf( "%d\n", aReadDepth[ nConsPos ] ); if ( aReadDepth[ nConsPos - 1 ] > aReadDepth[ nConsPos ] ) { printf( "out of order: %d %d\n", aReadDepth[ nConsPos - 1 ], aReadDepth[ nConsPos ] ); assert( aReadDepth[ nConsPos - 1 ] <= aReadDepth[ nConsPos ] ); } } // now find median (fixed bug in calculation, Oct 2010, DG) int nMiddleElement = nNumberOfElements / 2; if ( nNumberOfElements > 0 ) { nMedianDepth = aReadDepth[ nMiddleElement ]; } else { nMedianDepth = 0; } }