/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // sequence.h // // The Sequence object is a currently inherited by Fragment, // LocatedFragment and Contig. It consists of a dynamic array of // Ntides with member functions to provide indexing, conversion from // padded to unpadded positions, insertions, deletions and edits. // It is itself a derived class of MBTValOrderedOffsetVector. // // The Sequence object assumes it will contain pads, inserted into // fragments and the consensus by phrap. indices come in 2 flavors: // the "padded" index is the index into the array of Ntide // that you see in the window. it contains "pads" i.e. // occurrances of '*' that are an artifact of the assembly. // member nUnpaddedIndex() will translate a base's position in the // padded string into it's position in the original array // that came from the basecalling software. There is no other // way to access a given ntide other than by it's padded index. // // as a general rule, the indexing of any sequence is 1 based. // this is the convention used by phrap. don't count on this. // Indexing is done by means of the overloaded operator [] or ntideGet(). // The MBTValOrderedOffsetVector class provides a 1 based array // (and allows the offset to be changed dynamically should that // ever be needed). Whenever the first and last indices are required // use the inherited members nGetStartIndex() and nGetEndIndex(). // these will always give you a valid range of indices, which, when // passed to operator [] or ntideGet() will give you all the ntides in // the sequence. // // it's always better to use a derived class' indexing function, and // wherever possible consensus based indices have been used as well. // thus for contig consensus use Contig::ntGetFromConsPos(), etc. // Example: LocatedFragments can be indexed via consensus relative // position. That is to say, you can access the ntide in a particular // fragment by indexing into the fragment with the position of that // ntide that it is currently aligned with in the consensus. // Makes everything alot easier (YMMV). // #ifndef SEQUENCE_INCLUDED #define SEQUENCE_INCLUDED #ifdef SCCS_ID static const char* sequenceSccsId = "@(#)sequence.h 1.24 01/08/96 11:20:46"; #endif #include "rwtvalorderedvector.h" #include "rwcstring.h" #include "assert.h" #include "mbt_val_ord_offset_vec.h" #include "ntide.h" #include "sysdepend.h" class Sequence : public MBTValOrderedOffsetVector { public: Sequence() : nTraceArrayMinIndex_( 0 ), nTraceArrayMaxIndex_( 0 ), cWhichPeakPositionsAreUsed_( cNone ) {} Sequence(const RWCString& soSequenceName ) : soSequenceName_( soSequenceName ), nTraceArrayMinIndex_( 0 ), nTraceArrayMaxIndex_( 0 ), cWhichPeakPositionsAreUsed_( cNone ) {} // almost an assignment operator, but it doesn't copy pads // Also, it will 'new' a Sequence, rather than requiring that both // already exist. Sequence* pSeqMakeCopyOfPhredCalls(); // may change soSequence case void setSequence( RWCString& soSequence ); void setSequence2( char* szBases, const int nNumberOfBases ); // attempt to be extremely efficient--not copying Ntide objects // around void setSequence3( char* szBases, const int nNumberOfBases ); // may be used either for reads or contigs RWCString soGetName() const { return soSequenceName_; } // how long are you? includes pads. int nGetPaddedSeqLength() const { return length(); } // how long are you? does not include pads. // virtual since contigs have a more efficient method and thus // if this is called by a contig object, the more efficient method // will be used. virtual int nGetUnpaddedSeqLength(); // returns a pointer to a newly allocated null terminated // char string copy of the sequence. client responsible // for deletion char* szGetCopyOfSeq(); // currently unused void getUnpaddedSequence( RWCString& soUnpaddedSequence ); // returns a copy of the referenced Ntide // operator [] can be used to get a non-const reference Ntide& ntideGet(const int nIndex) const { return( ((MBTValOrderedOffsetVector*)this)->operator[]( nIndex ) ); } int nGetUnpaddedStartIndex() { return( nGetStartIndex() ); } int nGetUnpaddedEndIndex() { return( nGetUnpaddedSeqLength() + nGetStartIndex() - 1 ); } bool bUnpaddedConsPosInRange( const int nUnpaddedConsPos ) { if ( ( nUnpaddedConsPos < nGetUnpaddedStartIndex() ) || ( nGetUnpaddedEndIndex() < nUnpaddedConsPos ) ) return( false ); else return( true ); } // Passed the padded index, returns equivalent unpadded index. // If the passed index points to a pad the position of the prior // "real" ntide is returned. int nUnpaddedIndex(const int nPadded) const; // passed the unpadded index, returns equivalent padded index int nPaddedIndex( const int ) const; // passed a trace point position range, returns the corresponding index // range of ntides in the sequence that are contained within. // throws exception if point positions have not been set void getIndexRangeFromPointPositions(const int nPointMin, const int nPointMax, int& nIndexMin, int& nIndexMax, bool& bError) const; //////////////////////////////////////////////////////////////////////// // // the following group of functions allow write access to // ntides in the sequence. they could equivalently be // written using the [] operator and calling the ntide's // member function or assigning the ntide directly, but // are generally used in this form for clarity. // //////////////////////////////////////////////////////////////////////// // change the value of the Ntide with one this explicit calls void setNtideAtPos(const int nIndex, const Ntide& nt) { (*this)[ nIndex ] = nt; } // change the value of the base void setBaseAtPos(const int nIndex, const char cBase) { (*this)[ nIndex ].setBase( cBase ); } // change the quality member of the Ntide void setQualityAtSeqPos(const int nIndex, const Quality& q) { (*this)[ nIndex ].setQuality(q); } // set the point position member of the Ntide void setTracePointPosAtSeqPos(const int nPadded, const int nPoint) { if ( cWhichPeakPositionsAreUsed_ == cLittle ) (*pLittlePeakPositions_)[ nPadded ] = nPoint; else if ( cWhichPeakPositionsAreUsed_ == cBig ) (*pBigPeakPositions_)[ nPadded ] = nPoint; } // the sequence object needs to know what the maximum point position // is in order to complement individual ntides' point positions void setMaxPointPos(const short int nMax) { nTraceArrayMaxIndex_ = nMax; } void setMinPointPos( const short int nMin) { nTraceArrayMinIndex_ = nMin; } int nGetMaxPointPos() { return nTraceArrayMaxIndex_; } //////////////////////////////////////////////////////////////////////// // // these implement editing at the sequence level // //////////////////////////////////////////////////////////////////////// // complement the entire sequence. A <- T, array reversed, // doing it twice returns it to original state. depends // on value of nTraceArrayMaxIndex_ having been set (will throw // exception if not). // // note that if the Sequence was inherited by a Fragment object, // this call will do nothing to correct the fragment's data // structures so the Fragment complement member fun should be // used instead (it will call this one). Same true of Contig. // note that complementation is an edit action, hence undoable. void complementSequence(); void complementBasesNotPeakPositions(); void complementSequenceAfterReadingPHDFile( const bool bFoundTraceArrayMax ); // takes an unpadded index and gives the unpadded index // of the base on the complementary strandX5F int nComplementUnpaddedIndex( const int nUnpadded ); // takes an padded index and gives the padded index // of the base on the complementary strandX5F // (used for displaying the read scale in teditor) int nComplementPaddedIndex( const int nPadded ) const; // deletes the Ntide at passed index // returns the Ntide that was deleted for use by undo() // member function of edit action // This removeAt is the one from MBTValOrderedOffsetVector--not // from RWTValOrderedVector Ntide deleteNtideAtPos(const int nPos, const bool bDeletePointPos ) { if ( bDeletePointPos ) { deletePointPos( nPos ); } return (*this).removeAt(nPos); } // useful for undoing extending consensus void removeLastBase() { removeLast(); } // inserts the Ntide at passed index. interpolates point positions // if they have already been set in this sequence. NOTE: this // procedure is meant for editing, not to build the Sequence object. void insertNtideAtPos(const Ntide& rNt, const int nPos); void insertNtideWithPeakAtPos(const Ntide& nt, const int nPos); // adds an ntide at end of the sequence--this is useful for // extending the consensus void appendNtide( const Ntide& nt ); void appendPointPos( const int nPointPos ); //////////////////////////////////////////////////////////////////////// // // anything that inherits Sequence can be written out to a fasta // file - stripped of pads, in fasta format with passed // comment (if any) appended to first line which // begins "> ". throws SysRequestFailed exception // if cannot write file. // //////////////////////////////////////////////////////////////////////// void writeUnpaddedSeqToFastaFile(const char* szFilePath, const char* szComment = 0) const; void assignZeroQualityToPads( int nFirstPad, int nLastPad ); void assignQualityAndPeakPositionsToARangeOfPads( int nBeforePads, int nAfterPads, const bool bAndPeakPositions ); void assignQualityAndPeakPositionsToPads( const bool bAndPeakPositions ); void assignQualityAndPeakPositionsToPadsAtStartOrEndOfSequence( const int nConsPosOfNonPad, const bool bPadsAtStartNotEnd, const bool bAndPeakPositions ); void assignQualityAndPeakPositionsToSequenceOfPads( const bool bAndPeakPositions ); int nGetPointPos(const int nPaddedSeqPos ) const; void insertPointPos( const int nPos, const int nPointPos ); void deletePointPos( const int nPaddedSeqPos ); void createPointPosArray( const bool bFoundTraceArrayMaxIndex, const int nTraceArrayMaxIndex, const size_t nInitialSizeOfArray, const bool bFillUpArray ); void createPointPosArray2( const char cWhichPeakPositionsAreUsed, const size_t nInitialSizeOfArray, const bool bFillUpArray ); void findHighQualitySegment( bool& bHighQualitySegmentExists, int& nUnpaddedHighQualitySegmentStart, int& nUnpaddedHighQualitySegmentEnd, int& nPaddedHighQualitySegmentStart, int& nPaddedHighQualitySegmentEnd ); void findHighQualitySegmentWithUserDefinedQuality( const double dAmountToSubtractFrom, bool& bHighQualitySegmentExists, int& nUnpaddedHighQualitySegmentStart, int& nUnpaddedHighQualitySegmentEnd, int& nPaddedHighQualitySegmentStart, int& nPaddedHighQualitySegmentEnd ); bool bHasPeakPositions() { return( cWhichPeakPositionsAreUsed_ != cNone ); } bool bReadLengthAndTracesLengthAreEqual(); void increaseMaxLengthIfNecessaryOfPeakPositions( const int nBasesToAdd ); public: // in order to complement the point positions of ntides // relative to the traces they came from, we need to know // the "length" i.e. number of points in the trace. // this gets initialized to nUninitializedPointPos (from ntide.h) // and set with setMaxPointPos() int nTraceArrayMinIndex_; int nTraceArrayMaxIndex_; enum { cNone = 'n', cBig = 'b', cLittle = 'l' }; char cWhichPeakPositionsAreUsed_; // 'n' none, 'b' big, 'l' little MBTValOrderedOffsetVector* pBigPeakPositions_; MBTValOrderedOffsetVector* pLittlePeakPositions_; RWCString soSequenceName_; }; #endif