/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #ifndef phaster2PhdBall_included #define phaster2PhdBall_included #include "rwtvalorderedvector.h" #include "filename.h" #include "mbtValOrderedVectorOfLong.h" #include "mbtValOrderedVectorOfRWCString.h" #include #include "mbtPtrOrderedVector.h" #include "rwtvalvector.h" class desiredLocation { public: long lGenomicLocationA_; long lGenomicLocationB_; RWCString soChromosome_; long l1ChrPosA_; long l1ChrPosB_; // additional site or end of window char cAllowedAllelesA_; char cAllowedAllelesB_; enum { cMatchOneSite = 'o', cMatchBothSites = 'b', cMatchWindow = 'w' }; char cTypeOfMatch_; bool bMateUnmapped_; FileName filPhdBall_; FILE* pPhdBall_; RWTValVector* paReadDepth_; public: desiredLocation( const bool bMateUnmapped, const RWCString& soChromosome, const long l1ChrPos, const char cTypeOfMatch, const FileName& filPhdBall ) : bMateUnmapped_( bMateUnmapped ), soChromosome_( soChromosome ), l1ChrPosA_( l1ChrPos ), l1ChrPosB_( l1ChrPos ), cAllowedAllelesA_( 'n' ), cAllowedAllelesB_( 'n' ), cTypeOfMatch_( cTypeOfMatch ), filPhdBall_( filPhdBall ), paReadDepth_(0) {} desiredLocation() {} bool operator==( const desiredLocation& myDesiredSite ) const { return( lGenomicLocationA_ == myDesiredSite.lGenomicLocationA_ ); } bool operator<( const desiredLocation& myDesiredSite ) const { return ( lGenomicLocationA_ < myDesiredSite.lGenomicLocationA_ ); } }; #define nMAXLINESIZEB 10000 class phaster2PhdBall { public: phaster2PhdBall( const FileName& filPhasterFOF, const FileName& filPhasterLocations, const FileName& filPhdBallFOF ) : filPhasterFOF_( filPhasterFOF ), filPhasterLocations_( filPhasterLocations ), filPhdBallFOF_( filPhdBallFOF ), bConversionTableSet_( false ), soLastChromosome_( "not-a-chromosome-name" ), nHits_( 0 ), soLine_( (size_t) nMAXLINESIZEB ) {} void doIt(); void searchOnePhasterFile( const FileName& filPhasterFile ); void saveReadPair( const int nIndexOfLocation ); void writeReadToPhdBall( const int nIndexOfLocation, const RWCString& soReadName, const RWCString& soBases, const RWTValOrderedVector& aQualities, const char cFirstOrSecondRead ); void convertQualities( const RWCString& soEncodedQualities, RWTValOrderedVector& aQualities ); long lConvertToGenomic( const RWCString& soChromosome, const long l1ChrPos ); void parseLocationsFile(); void parsePhasterLineCrudely(); void parsePhasterLineFinely(); void considerEachSnpForThisReadPair( bool& bSavedReadPair ); enum { cREAD1 = '1', cREAD2 = '2' }; enum { cSITEA = 'a', cSITEB = 'b' }; bool bAlleleOK( desiredLocation* pDL, const char cWhichRead, const char cWhichSite ); void setGenomicLocationsOfDesiredLocations(); void convertFromGenomic( const long lGenomic, RWCString& soChromosome, int& n1ChrPos ); void openPhdBalls(); void closePhdBalls(); void setReadDepthArrays(); void writeNewLocationsFile(); public: FileName filPhasterFOF_; FileName filPhasterLocations_; FileName filPhdBallFOF_; FILE* pPhdBallFOF_; FileName filCurrentPhasterFile_; // just the lines from the phaster file mbtPtrOrderedVector aDesiredLocations_; FILE* pNewLocations_; // for conversion of the desired locations in chromosome and chr pos // to phaster genomic coordinates bool bConversionTableSet_; mbtValOrderedVectorOfRWCString aConvertChromosomes_; mbtValOrderedVectorOfLong aConvertStarts_; mbtValOrderedVectorOfLong aConvertEnds_; // caches for lConvertToGenomic RWCString soLastChromosome_; int nLastChromosomeIndex_; RWCString soDateTimeForTimestamp_; RWCString soDateTimeForWRItems_; int nHits_; // a pair is counted as 1 // current read being parsed bool bCurrentPhasterLineIsFinelyParsed_; RWCString soReadName_; int nRead1Left_; int nRead2Left_; int nRead1Right_; int nRead2Right_; long lGenomicLeftRead1_; long lGenomicLeftRead2_; // these are safely further than // the right end of the read in the alignment long lGenomicFarRightRead1_; long lGenomicFarRightRead2_; // these are the precise right ends of the reads // in alignment to genomic. I only calculate these // if, using the above right ends, the read has a // chance of being selected. long lGenomicRightRead1_; long lGenomicRightRead2_; RWCString soEncodedBasesRead1_; RWCString soEncodedBasesRead2_; RWCString soEncodedQualitiesRead1_; RWCString soEncodedQualitiesRead2_; bool bTwoReads_; long lMaxRight_; RWCString soDecodedBasesRead1_; RWCString soDecodedBasesRead2_; RWCString soDecodedGenomicBasesRead1_; RWCString soDecodedGenomicBasesRead2_; RWCString soLine_; }; #endif