/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // sequence.cpp // // implementation for Sequence object // // chrisa 23-mar-95 // #include using namespace std; #include #include #include "sequence.h" #include "mbt_exception.h" #include "guiapp.h" #include "assert.h" #include "consedParameters.h" #include #include "popupErrorMessage.h" #include "dErrorRateFromQuality.h" // set the entire sequence from passed char array // will clear any previous values void Sequence :: setSequence(RWCString& soSequence) { (*this).clear(); // clear the array (*this).resize( soSequence.length() ); // allocate only once soSequence.toLower(); for (int n = 0; n < soSequence.length(); n++) { // this fires an extra (small) constructor but has the // advantage of hiding the conversion from char inside // the Ntide class (*this).insert(Ntide(soSequence[n])); } } void Sequence :: setSequence2( char* szBases, const int nNumberOfBases ) { assert( nNumberOfBases == strlen( szBases ) ); (*this).clear(); (*this).resize( nNumberOfBases ); for( int n = 0; n < nNumberOfBases; ++n ) { (*this).insert( Ntide( tolower( szBases[ n ] ) ) ); } } void Sequence :: setSequence3( char* szBases, const int nNumberOfBases ) { // no need for this--look in parseAceFile at how this is used. // nNumberOfBases is *set* from how many bases are in szBases // assert( nNumberOfBases == strlen( szBases ) ); // this->clear(); resize( nNumberOfBases ); // I think that for the time being, leave garbage in the quality and // point position. These will be set later. nCurrentLength_ = nNumberOfBases; for( int n = 0; n < nNumberOfBases; ++n ) { // faster by eliminating check for subscript error // but when actually measured it, I found no difference this->RWTValOrderedVector::pArray_[n].cBase_ = tolower( szBases[n] ); // this->RWTValOrderedVector::operator[](n).cBase_ = // tolower( szBases[n] ); } } // returns a pointer to a newly allocated null terminated // char string copy of the sequence. client responsible // for deletion // // currently unused. test for bit decay! // char* Sequence::szGetCopyOfSeq() { int n; int nSeqLen = length(); char* szNewStr = new char [nSeqLen+1]; assert(szNewStr); for (n = 0; n < nSeqLen; n++) { szNewStr[n] = (*this)[n + nGetStartIndex()]; // casts to char } szNewStr[nSeqLen] = '\0'; return szNewStr; } // convert from padded to unpadded index int Sequence :: nUnpaddedIndex(const int nPaddedPos) const { if ( ! (nGetStartIndex() <= nPaddedPos ) ) { RWCString soError = "in Sequence::nUnpaddedIndex of sequence "; soError += soGetName(); soError += " nGetStartIndex = "; soError += RWCString( (long) nGetStartIndex() ); soError += " but nPaddedPos = "; soError += RWCString( (long) nPaddedPos ); THROW_ERROR( soError ); } assert(nPaddedPos <= nGetEndIndex() ); int nRealNtides = nGetStartIndex(); int nCurPos = nGetStartIndex(); while ( nCurPos < nPaddedPos ) { if (! ntideGet(nCurPos++).bIsPad()) { nRealNtides++; } } // return the number of encountered REAL ntides return nRealNtides; } // convert from unpadded to padded index int Sequence :: nPaddedIndex(const int nUnpaddedPos) const { assert(nGetStartIndex() <= nUnpaddedPos ); int nPadded = nGetStartIndex() - 1; int nCurUnpadded = nGetStartIndex() - 1; bool bFound = False; while ( !bFound ) { ++nPadded; assert( nPadded <= nGetEndIndex() ); if (! ntideGet(nPadded).bIsPad()) { nCurUnpadded++; if (nCurUnpadded == nUnpaddedPos ) bFound = True; } } // return the number of encountered REAL ntides return nPadded; } // write a file at passed path containing sequence, // stripped of pads, in fasta format with passed // comment (if any) appended to first line which // begins ">". throws SysRequestFailed exception // if cannot write file. void Sequence::writeUnpaddedSeqToFastaFile(const char* szFilePath, const char* szComment) const { // make sure we got a name assert (szFilePath); // open the output file ofstream ofsFastaFile(szFilePath, ios::out); if (! ofsFastaFile) { // throw an exception with a detailed explanation // no need to include this file name and line number... ostringstream ost; ost << "Unable to open file " << szFilePath << " for writing" << endl << ends; SysRequestFailed srf(ost.str().c_str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } // file open, write the comment and consensus ofsFastaFile << ">"; if (szComment) ofsFastaFile << szComment; ofsFastaFile << endl; // the max number of bases on 1 line of a fasta output file static const int nFastaMaxLineLen = 50; // loop through entire sequence int nCharsOnLine = 0; for (int nPos = nGetStartIndex(); nPos <= nGetEndIndex(); nPos++) { if ( (! (*this)[nPos].bIsPad()) && ((*this)[nPos] != ' ')) { // force char output instead of ostringstream overload format ofsFastaFile << (char )toupper((char )(*this)[nPos]); nCharsOnLine++; if (nCharsOnLine == nFastaMaxLineLen) { ofsFastaFile << endl; nCharsOnLine = 0; } } } if (nCharsOnLine) ofsFastaFile << endl; ofsFastaFile.close(); } // reverses the sequence, complementing the ntide as we go // // changed this to swap elements instead of creating new // dynamic array and copying. chrisa 27-jun-95 // void Sequence :: complementSequence() { complementBasesNotPeakPositions(); if ( cWhichPeakPositionsAreUsed_ == cBig ) { // e.g., 0, 1, 2 (length = 3) // switch pBigPeakPositions_->reverseElementOrder(); for (int nPeak = pBigPeakPositions_->nGetStartIndex(); nPeak <= pBigPeakPositions_->nGetEndIndex(); nPeak++) { (*pBigPeakPositions_)[ nPeak ] = nTraceArrayMaxIndex_ - (*pBigPeakPositions_)[ nPeak ]; } } else if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pLittlePeakPositions_->reverseElementOrder(); for( int nPeak = pLittlePeakPositions_->nGetStartIndex(); nPeak <= pLittlePeakPositions_->nGetEndIndex(); ++nPeak ) { (*pLittlePeakPositions_)[ nPeak ] = nTraceArrayMaxIndex_ - (*pLittlePeakPositions_)[ nPeak ]; } } } // complementSequence() void Sequence :: complementBasesNotPeakPositions() { reverseElementOrder(); // less efficient to do this separately, but perhaps a little clearer // does this sequence contain point positions? // use version of Ntide member fun that complements them as well for (int nIndex = nGetStartIndex(); nIndex <= nGetEndIndex(); nIndex++) { (*this)[nIndex].complementBase(); } } void Sequence :: complementSequenceAfterReadingPHDFile( const bool bFoundTraceArrayMax ) { if ( !bFoundTraceArrayMax ) nTraceArrayMaxIndex_ = 0; // will be set later, but this // causes the peak positions to be set negative // currently, I can just call complementSequence. Someday this may // change... complementSequence(); } // passed a trace point position range, returns the corresponding index // range of ntides in the sequence that are contained within. void Sequence::getIndexRangeFromPointPositions(const int nPointMin, const int nPointMax, int& nIndexMin, int& nIndexMax, bool& bError) const { int nIndex = nGetStartIndex(); // case in which all bases lie to the right of the teditor window if ( nPointMax < nGetPointPos( nIndex ) ) { bError = true; return; } bool bFound = false; while( ! bFound && (nIndex <= nGetEndIndex()) ) { if ( nPointMin <= nGetPointPos( nIndex ) ) bFound = true; else ++nIndex; } // case in which all bases lie to the left of the left edge of the // teditor window if (!bFound ) { bError = true; return; } // we need to check to see if part of a base might be on the left edge // of the screen // just being extra cautious to not get a subscript error if ( nIndex == nGetStartIndex() ) nIndexMin = nIndex; else { int nPeakOfBaseToLeftOfPoint = nGetPointPos( nIndex - 1 ); int nPeakOfBaseToRightOfPoint = nGetPointPos( nIndex ); int nDividingPoint = (nPeakOfBaseToLeftOfPoint + nPeakOfBaseToRightOfPoint) / 2; if ( nPointMin <= nDividingPoint ) nIndexMin = nIndex - 1; else nIndexMin = nIndex; } // now looking for the base at the right of the screen bFound = false; while( !bFound && (nIndex <= nGetEndIndex()) ) { if ( nPointMax <= nGetPointPos( nIndex ) ) bFound = true; else ++nIndex; } if (bFound) // we need to figure out if the base we've found is still at all // on the screen, or if it is off the screen and we should use the // base to the left of it. // just being extra cautious that we don't get a subscript error if ( nGetStartIndex() == nIndex ) { nIndexMax = nIndex; } else { int nPutativePoint = nGetPointPos( nIndex ); int nPenultimatePoint = nGetPointPos( nIndex - 1 ); int nDividingPoint = (nPutativePoint + nPenultimatePoint ) / 2; if ( nPointMax <= nDividingPoint ) nIndexMax = nIndex - 1; else nIndexMax = nIndex; } else // case in which the last base lies in the teditor window nIndexMax = nGetEndIndex(); bError = false; } void Sequence :: insertNtideAtPos( const Ntide& nt, const int nPos ) { // insertion can only occur within the sequence assert ((nPos >= nGetStartIndex()) && (nPos <= nGetEndIndex())); (*this).insertAt(nPos, nt ); } void Sequence :: insertNtideWithPeakAtPos(const Ntide& nt, const int nPos) { // the new Ntide will be put at nPos, moving everything else to the right insertNtideAtPos( nt, nPos ); // interpolate new point position if (nPos == nGetStartIndex()) { // inserted at head of array, so make the point position // halfway between 0 and the first ntide already there insertPointPos( nPos, nGetPointPos( nPos ) / 2 ); } else { // we can safely assume that there's more than one ntide // in the sequence int nLeftPos = nGetPointPos( nPos - 1 ); int nRightPos = nGetPointPos( nPos ); insertPointPos( nPos, nLeftPos + ((nRightPos - nLeftPos) / 2)); } } void Sequence :: appendNtide( const Ntide& nt ) { // uses the RWTValOrderedVector append function // I don't believe an additional copy of the ntide is needed. append( nt ); } void Sequence :: appendPointPos( const int nPointPos ) { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pLittlePeakPositions_->append( nPointPos ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { pBigPeakPositions_->append( nPointPos ); } else { RWCString soError = "tried to Sequence::appendPointPos when the cWhichPeakPositionsAreUsed_ = "; soError += RWCString( cWhichPeakPositionsAreUsed_ ); THROW_ERROR2( soError ); } } int Sequence :: nComplementUnpaddedIndex( const int nUnpadded ) { assert( nGetStartIndex() <= nUnpadded ); assert( nUnpadded <= nGetUnpaddedSeqLength() ); return( nGetStartIndex() + nGetUnpaddedSeqLength() - nUnpadded ); } int Sequence :: nComplementPaddedIndex( const int nPadded ) const { assert( nGetStartIndex() <= nPadded ); assert( nPadded <= nGetPaddedSeqLength() ); return( nGetStartIndex() + nGetPaddedSeqLength() - nPadded ); } void Sequence :: assignQualityAndPeakPositionsToPads( const bool bAndPeakPositions) { bool bPadsAtBeginningOfSequence = false; bool bPadsAtEndingOfSequence = false; // loop through all bases in LocatedFragment int nPadded = nGetStartIndex(); while( nPadded <= nGetEndIndex() ) { if ( ntideGet( nPadded ).bIsPad() ) { int nBeforePads = nPadded - 1; if (nBeforePads < nGetStartIndex() ) bPadsAtBeginningOfSequence = true; else bPadsAtBeginningOfSequence = false; // look past the pad to see where the next non-pad is. // the pads positon is between the surrounding non-pads. duh. int nAfterPads; int n = nPadded + 1; bool bFoundNotPad = False; while( !bFoundNotPad && n <= nGetEndIndex() ) { if ( ! ntideGet(n).bIsPad() ) { bFoundNotPad = True; nAfterPads = n; } else ++n; } if (!bFoundNotPad ) { bPadsAtEndingOfSequence = true; nAfterPads = nGetEndIndex() + 1; // just to terminate the loop } else bPadsAtEndingOfSequence = false; if (bPadsAtBeginningOfSequence && bPadsAtEndingOfSequence ) assignQualityAndPeakPositionsToSequenceOfPads( bAndPeakPositions ); else if ( bPadsAtBeginningOfSequence ) assignQualityAndPeakPositionsToPadsAtStartOrEndOfSequence( nAfterPads, true, // pads at start of sequence bAndPeakPositions ); else if ( bPadsAtEndingOfSequence ) assignQualityAndPeakPositionsToPadsAtStartOrEndOfSequence( nBeforePads, false, // pads at end of sequence bAndPeakPositions ); else assignQualityAndPeakPositionsToARangeOfPads( nBeforePads, nAfterPads, bAndPeakPositions ); nPadded = nAfterPads + 1; } else ++nPadded; } // while( nPadded ... } // // PURPOSE: assign quality to a pads (or several pads) that have a non-pad // on each side of it void Sequence :: assignQualityAndPeakPositionsToARangeOfPads( int nBeforePads, int nAfterPads, const bool bAndPeakPositions ) { int nPointDistancePerInterval; if ( bAndPeakPositions ) { int nIntervals = nAfterPads - nBeforePads; int nPointBeforePads = nGetPointPos( nBeforePads ); int nPointAfterPads = nGetPointPos( nAfterPads ); nPointDistancePerInterval = ( nPointAfterPads - nPointBeforePads ) / nIntervals; } unsigned char ucQualityBeforePads; // don't want pads to stick out as high quality if they are flanked by // x's, so consider an x to be low quality. Fixed per Pat, Feb 2000. if ( ntideGet( nBeforePads ).cGetBase() == 'x' ) ucQualityBeforePads = 0; else { ucQualityBeforePads = ntideGet( nBeforePads ).qualGetQuality(); convertFrom9899QualityToNormalQuality( ucQualityBeforePads ); } unsigned char ucQualityAfterPads; if ( ntideGet( nAfterPads ).cGetBase() == 'x' ) ucQualityAfterPads = 0; else { ucQualityAfterPads = ntideGet( nAfterPads ).qualGetQuality(); convertFrom9899QualityToNormalQuality( ucQualityAfterPads ); } unsigned char ucAverageQuality = ( ucQualityBeforePads + ucQualityAfterPads ) / 2; for(int nPadded = nBeforePads + 1; nPadded <= nAfterPads - 1; ++nPadded ) { setQualityAtSeqPos( nPadded, ucAverageQuality ); if ( bAndPeakPositions ) { int nPointPosOfPad = nGetPointPos( nBeforePads ) + nPointDistancePerInterval * (nPadded - nBeforePads ); setTracePointPosAtSeqPos( nPadded, nPointPosOfPad ); } } } int Sequence :: nGetUnpaddedSeqLength() { int nUnpaddedLength = 0; for (int n = nGetStartIndex(); n <= nGetEndIndex(); n++) { if (! (*this)[n].bIsPad() ) nUnpaddedLength++; } return( nUnpaddedLength ); } bool Sequence :: bReadLengthAndTracesLengthAreEqual() { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { return( this->nCurrentLength_ == pLittlePeakPositions_->length() ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { return( this->nCurrentLength_ == pBigPeakPositions_->length() ); } else return true; } Sequence* Sequence :: pSeqMakeCopyOfPhredCalls() { Sequence* pSeq = new Sequence( soSequenceName_ ); pSeq->nTraceArrayMinIndex_ = nTraceArrayMinIndex_; pSeq->nTraceArrayMaxIndex_ = nTraceArrayMaxIndex_; pSeq->cWhichPeakPositionsAreUsed_ = cWhichPeakPositionsAreUsed_; if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pSeq->pLittlePeakPositions_ = new MBTValOrderedOffsetVector( (size_t) pLittlePeakPositions_->length() ); pSeq->pLittlePeakPositions_->soName_ = "Sequence::pLittlePeakPositions_"; } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { pSeq->pBigPeakPositions_ = new MBTValOrderedOffsetVector( (size_t) pBigPeakPositions_->length() ); pSeq->pBigPeakPositions_->soName_ = "Sequence::pBigPeakPositions_"; } pSeq->resize( nGetUnpaddedSeqLength() ); if ( bHasPeakPositions() ) { for( int n = nGetStartIndex(); n <= nGetEndIndex(); ++n ) { Ntide nt = ntideGet( n ); if (! nt.bIsPad() ) { pSeq->append( nt ); pSeq->appendPointPos( nGetPointPos( n ) ); } } } else { for( int n = nGetStartIndex(); n <= nGetEndIndex(); ++n ) { Ntide nt = ntideGet( n ); if (! nt.bIsPad() ) { pSeq->append( nt ); } } } return( pSeq ); } void Sequence :: assignQualityAndPeakPositionsToPadsAtStartOrEndOfSequence( const int nPaddedSeqPosOfNonPad, const bool bPadsAtStartNotEnd, const bool bAndPeakPositions ) { int nStart; int nEnd; if ( bPadsAtStartNotEnd ) { // cout << "Warning: pads at beginning of sequence " << // soSequenceName_ << endl; nStart = nGetStartIndex(); nEnd = nPaddedSeqPosOfNonPad; } else { // cout << "Warning: pads at end of sequence " << // soSequenceName_ << endl; nStart = nPaddedSeqPosOfNonPad; nEnd = nGetEndIndex(); } unsigned char ucQualityOfANonPad = ntideGet( nPaddedSeqPosOfNonPad ).qualGetQuality(); int nPointPos; if ( bAndPeakPositions ) { nPointPos = nGetPointPos( nPaddedSeqPosOfNonPad ); } convertFrom9899QualityToNormalQuality( ucQualityOfANonPad ); for( int nConsPos = nStart; nConsPos <= nEnd; ++nConsPos ) { setQualityAtSeqPos( nConsPos, ucQualityOfANonPad ); if ( bAndPeakPositions ) { setTracePointPosAtSeqPos( nConsPos, nPointPos ); } } } // pathological case in which the entire sequence is pads void Sequence :: assignQualityAndPeakPositionsToSequenceOfPads( const bool bAndPeakPositions ) { cerr << "warning: sequence is all pads: " << soSequenceName_ << endl; for( int nConsPos = nGetStartIndex(); nConsPos <= nGetEndIndex(); ++nConsPos ) { setQualityAtSeqPos( nConsPos, 0 ); if ( bAndPeakPositions ) setTracePointPosAtSeqPos( nConsPos, 0 ); } } int Sequence :: nGetPointPos( const int nPaddedSeqPos ) const { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { return( (*pLittlePeakPositions_)[ nPaddedSeqPos ] ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { return( (*pBigPeakPositions_)[ nPaddedSeqPos ] ); } else { RWCString soError = "In sequence "; soError += soGetName(); soError += " Sequence::nGetPointPos was called for nPaddedSeqPos = "; soError += RWCString( (long) nPaddedSeqPos ); soError += " when cWhichPeakPositionsAreUsed_ = "; soError += cWhichPeakPositionsAreUsed_; THROW_ERROR( soError ); } } void Sequence :: insertPointPos( const int nPos, const int nPointPos ) { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pLittlePeakPositions_->insertAt( nPos, nPointPos ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { pBigPeakPositions_->insertAt( nPos, nPointPos ); } else { RWCString soError = "tried to Sequence::insertPointPos when the cWhichPeakPositionsAreUsed_ = "; soError += RWCString( cWhichPeakPositionsAreUsed_ ); THROW_ERROR2( soError ); } } void Sequence :: deletePointPos( const int nPos ) { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pLittlePeakPositions_->removeAt( nPos ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { pBigPeakPositions_->removeAt( nPos ); } else { RWCString soError = "tried to Sequence::deletePointPos when the cWhichPeakPositionsAreUsed_ = "; soError += cWhichPeakPositionsAreUsed_; THROW_ERROR( soError ); } } void Sequence :: createPointPosArray( const bool bFoundTraceArrayMaxIndex, const int nTraceArrayMaxIndex, const size_t nInitialSizeOfArray, const bool bFillUpArray ) { unsigned short int s = 0; s = ~s; int nMaxNumberInShort = (int) s / 2; // the -10 is just to be on the safe side. short's are good to // about 32,000 if ( bFoundTraceArrayMaxIndex && nTraceArrayMaxIndex < (nMaxNumberInShort - 10 ) ) { createPointPosArray2( cLittle, nInitialSizeOfArray, bFillUpArray ); } else { createPointPosArray2( cBig, nInitialSizeOfArray, bFillUpArray ); } } void Sequence :: createPointPosArray2( const char cWhichPeakPositionsAreUsed, const size_t nInitialSizeOfArray, const bool bFillUpArray ) { if ( cWhichPeakPositionsAreUsed == cLittle ) { pLittlePeakPositions_ = new MBTValOrderedOffsetVector( (size_t) nInitialSizeOfArray ); pLittlePeakPositions_->soName_ = "Sequence::pLittlePeakPositions_"; cWhichPeakPositionsAreUsed_ = cLittle; if ( bFillUpArray ) pLittlePeakPositions_->nCurrentLength_ = nInitialSizeOfArray; } else if ( cWhichPeakPositionsAreUsed == cBig ) { pBigPeakPositions_ = new MBTValOrderedOffsetVector( (size_t) nInitialSizeOfArray ); pBigPeakPositions_->soName_ = "Sequence::pBigPeakPositions_"; cWhichPeakPositionsAreUsed_ = cBig; if ( bFillUpArray ) pBigPeakPositions_->nCurrentLength_ = nInitialSizeOfArray; } else { RWCString soError = "Sequence::createPointPosArray2 and cWhichPeakPositionsAreUsed = "; soError += cWhichPeakPositionsAreUsed; THROW_ERROR2( soError ); } } // quality 13 has an error rate of 0.050118723 so // 0.05 - (error rate of quality 13 base) = neg number // Hence quality 13 bases on the end of a high quality segment will // not be included in it. static double dQuality13AmountToSubtractFrom = 0.05; void Sequence :: findHighQualitySegment( bool& bHighQualitySegmentExists, int& nUnpaddedHighQualitySegmentStart, int& nUnpaddedHighQualitySegmentEnd, int& nPaddedHighQualitySegmentStart, int& nPaddedHighQualitySegmentEnd ) { findHighQualitySegmentWithUserDefinedQuality( dQuality13AmountToSubtractFrom, bHighQualitySegmentExists, nUnpaddedHighQualitySegmentStart, nUnpaddedHighQualitySegmentEnd, nPaddedHighQualitySegmentStart, nPaddedHighQualitySegmentEnd ); } void Sequence :: findHighQualitySegmentWithUserDefinedQuality( const double dAmountToSubtractFrom, bool& bHighQualitySegmentExists, int& nUnpaddedHighQualitySegmentStart, int& nUnpaddedHighQualitySegmentEnd, int& nPaddedHighQualitySegmentStart, int& nPaddedHighQualitySegmentEnd ) { bHighQualitySegmentExists = false; // these are initialized to handle the case in which // the bases start out right away with high quality. // In that case, dLeastSumFromBeginning will never be // changed. and the high quality segment should start // at 1. These values allow this. int nUnpaddedLocationOfLeastSumFromBeginning = 0; int nPaddedLocationOfLeastSumFromBeginning = 0; double dLeastSumFromBeginning = 0.0; double dBestSumSegment = -1; int nUnpaddedStartOfBestSumSegment = -1; int nUnpaddedEndOfBestSumSegment = -1; int nPaddedStartOfBestSumSegment = -1; int nPaddedEndOfBestSumSegment = -1; double dSumFromBeginning = 0.0; // we will only concern ourselves with the high quality segment // on the unpadded sequence--forget about pads int nUnpadded = nGetUnpaddedStartIndex() - 1; // set up for first ++ for( int nPos = nGetStartIndex(); nPos <= nGetEndIndex(); ++nPos ) { if ( ntideGet( nPos ).bIsPad() ) continue; ++nUnpadded; Quality qual = ntideGet( nPos ).qualGetQualityWithout98or99(); double dAdjustedError = dAmountToSubtractFrom - dErrorRateFromQuality[ qual ]; dSumFromBeginning += dAdjustedError; if ( dSumFromBeginning < dLeastSumFromBeginning ) { dLeastSumFromBeginning = dSumFromBeginning; nUnpaddedLocationOfLeastSumFromBeginning = nUnpadded; nPaddedLocationOfLeastSumFromBeginning = nPos; } // the reason for the "else" is to prevent, in the case of // all adjusted errors being negative, each base would satisfy the // end of the least segment and the end of a zero-length best sum // segment. This would give the end of the best segment before // the beginning and the dBestSumSegment would be zero else if ( dSumFromBeginning - dLeastSumFromBeginning > dBestSumSegment ) { dBestSumSegment = dSumFromBeginning - dLeastSumFromBeginning; // the high quality segment starts the base *after* the base // that makes the least sum nUnpaddedStartOfBestSumSegment = nUnpaddedLocationOfLeastSumFromBeginning + 1; nUnpaddedEndOfBestSumSegment = nUnpadded; // this isn't quite right since it may be on a pad. Hence I really // should keep incrementing until find a non-pad. I do that // below. nPaddedStartOfBestSumSegment = nPaddedLocationOfLeastSumFromBeginning + 1; nPaddedEndOfBestSumSegment = nPos; bHighQualitySegmentExists = true; } } if ( bHighQualitySegmentExists ) { nUnpaddedHighQualitySegmentStart = nUnpaddedStartOfBestSumSegment; nUnpaddedHighQualitySegmentEnd = nUnpaddedEndOfBestSumSegment; nPaddedHighQualitySegmentStart = nPaddedStartOfBestSumSegment; // fix getting the padded end of the high quality segment while( ntideGet( nPaddedHighQualitySegmentStart ).bIsPad() && nPaddedHighQualitySegmentStart < nGetEndIndex() ) ++nPaddedHighQualitySegmentStart; nPaddedHighQualitySegmentEnd = nPaddedEndOfBestSumSegment; } } void Sequence :: getUnpaddedSequence( RWCString& soUnpaddedSequence ) { soUnpaddedSequence = ""; soUnpaddedSequence.increaseButNotDecreaseMaxLength( nGetPaddedSeqLength() ); for( int nPadded = nGetStartIndex(); nPadded <= nGetEndIndex(); ++nPadded ) { Ntide nt = ntideGet( nPadded ); if (!nt.bIsPad() ) { soUnpaddedSequence.append( nt.cGetBase() ); } } } void Sequence :: increaseMaxLengthIfNecessaryOfPeakPositions( const int nBasesToAdd ) { if ( cWhichPeakPositionsAreUsed_ == cLittle ) { pLittlePeakPositions_->increaseMaxLengthIfNecessary( nBasesToAdd ); } else if ( cWhichPeakPositionsAreUsed_ == cBig ) { pBigPeakPositions_->increaseMaxLengthIfNecessary( nBasesToAdd ); } }