/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // tracefile.cpp // // Implementation for the TraceFile object. // // ABI and SCF format chromatograms are read using large portions // of slightly edited code from Ted. // // gordon 13-April-1995 // rev chrisa 3-Jul-95 // #include #include using namespace std; #include #include #include "sysdepend.h" #include "mbt_exception.h" #include "chromatData.h" #include "guiapp.h" #include "ntide.h" #include "sequence.h" #include "locatedFragment.h" #include "tracefile.h" #include "readPHD.h" #include "readData.h" #include "freeChromatData.h" #include "consed.h" #include "consedParameters.h" #include "filePopupAndGetFilename.h" #include "nNumberFromBase.h" // // macros format message, form and throw exception objects // #define PARSE_PANIC(file,line,message) \ { ostringstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <bUsingPhdFiles_ ) { ostringstream ost; ost << "You brought this up -nophd so you can't bring up traces" << ends; InputDataError ide( ost.str().c_str() ); throw ide; // throw exception } FileName filChromatPathname; bool bDeleteAfterReadingChromat; filChromatPathname = filFindChromat( bDeleteAfterReadingChromat ); cerr << "filFindChromat returned " << filChromatPathname << endl; if ( filChromatPathname == "fake solexa" ) { createFakeSolexaTraces(); return; } ChromatData* pChromatData; int nStatus; pChromatData = readData((char* )filChromatPathname.data(), &nStatus ); if ( !pChromatData ) { PANIC_OST( ost ) << "Unable to open chromatogram file " << filChromatPathname << ends; SysRequestFailed srf(ost.str().c_str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } if (pFrag_->bPhdFileHasTraceArrayMinAndMax_ ) { int nNumberOfTraceArrayPoints = pFrag_->nTraceArrayMaxIndex_ - pFrag_->nTraceArrayMinIndex_ + 1; if ( pChromatData->numPoint != nNumberOfTraceArrayPoints ) { ostringstream ost; ost << "Sorry--the chromatogram file " << filChromatPathname << " has " << pChromatData->numPoint << " trace array points while the phd file " << pFrag_->filGetPHDFilename() << " was made from a chromatogram with " << nNumberOfTraceArrayPoints << ". This means that someone overwrote the original chromatogram file. Check the file dates on the chromatogram file and the phd file. To correct this, I would suggest deleting the phd file and running the phredPhrap script again. To prevent this from happening again, find out who/why the chromatogram was switched. Sorry." << endl << ends; cout << ost.str().c_str() << endl; InputDataError ide( ost.str().c_str() ); throw ide; // throw exception } } // clean up--delete the uncompressed chromat if ( bDeleteAfterReadingChromat ) { unlink( (char*) filChromatPathname.data() ); } if (pChromatData->numPoint == 0) { ostringstream ost; ost << "Analysis file contains 0 points" << filChromatPathname << ends; InputDataError ide(ost.str().c_str()); throw ide; // throw exception } // // the file has been successfully read (by the Ted code) // // copy in the originally called bases and their point positions seqABICalledBases_.resize( pChromatData->numBase ); // assume that the max peak position is the same for the ABI called peaks // as for the phred called peaks. seqABICalledBases_.createPointPosArray( pFrag_->bPhdFileHasTraceArrayMinAndMax_, pFrag_->nTraceArrayMaxIndex_, pChromatData->numBase, false ); // bFillUpArray for( int nPos = 0; nPos < pChromatData->numBase; nPos++ ) { // ABI has slightly different notation convention for ntides char c = cConvertFromABI( pChromatData->base[ nPos ] ); // this currently ignores any per-base quality data // that might be available. later versions of phred // will make that available, and so will need to call // nt.setQuality() Ntide nt(c); // form Ntide from character // add the new ntide to the array seqABICalledBases_.append(nt); seqABICalledBases_.appendPointPos( pChromatData->baseLoc[nPos] ); } // save the number of points nPoints_ = pChromatData->numPoint; nMaxTraceValue_ = pChromatData->maxTraceValue; nMinTraceValue_ = pChromatData->minTraceValue; // and tell the Sequence object how many points there are // so it can complement point positions as well as called bases // note: since the point position is the same as the index // into the array of trace values, the max position is the // number of points - 1 seqABICalledBases_.setMaxPointPos(nPoints_ - 1); // new up appropriately sized arrays to hold the trace points pdaTraceA_ = new TracePointArray( pChromatData->numPoint ); pdaTraceC_ = new TracePointArray( pChromatData->numPoint ); pdaTraceG_ = new TracePointArray( pChromatData->numPoint ); pdaTraceT_ = new TracePointArray( pChromatData->numPoint ); TracePointArray* ppTracePointArray[4]; ppTracePointArray[0] = pdaTraceA_; ppTracePointArray[1] = pdaTraceC_; ppTracePointArray[2] = pdaTraceG_; ppTracePointArray[3] = pdaTraceT_; // scale traces if they are ESD data since ESD (Megabase) are // floating point numbers between -0.1 and 2.0 and if they are // just stored as an unsigned int, we will loose most of the precision if ( pChromatData->fileType == MD1Format || pChromatData->fileType == MD2Format ) { float fScaleFactor = 1000.0; nMaxTraceValue_ = 1000.0 * pChromatData->maxTraceValue; nMinTraceValue_ = 1000.0 * pChromatData->minTraceValue; for( int nBaseType = 0; nBaseType < 4; ++nBaseType ) { for( int nPoint = 0; nPoint < pChromatData->numPoint; ++nPoint) { pChromatData->trace[ nBaseType ][ nPoint ] *= fScaleFactor; } } } // copy traces into our data structure for( int nBaseType = 0; nBaseType < 4; ++nBaseType ) { for( int nPoint = 0; nPoint < pChromatData->numPoint; ++nPoint) { ppTracePointArray[ nBaseType ]->append( (unsigned int) pChromatData->trace[ nBaseType ][ nPoint ] ); } } freeChromatData( pChromatData ); // return storage for no longer needed ChromatData if ( pFrag_->bComp() ) complementTracesAndABICalledBases(); } // TraceFile :: TraceFile( const char* ) TraceFile::~TraceFile() { // free storage used by arrays of trace points. delete pdaTraceA_; delete pdaTraceC_; delete pdaTraceT_; delete pdaTraceG_; } // ABI has slightly different notation convention for uncallable // bases. use this to convert. char cConvertFromABI( const char c ) { char cNew = c; if (cNew == '-') cNew = 'N'; return( cNew ); } unsigned int TraceFile :: nGetTraceValue( const char c, const int nPoint ) { // ok, we'll cut you some slack about upper/lower case switch (toupper(c)) { case 'A': return (*pdaTraceA_)[nPoint]; break; case 'C': return (*pdaTraceC_)[nPoint]; break; case 'T': return (*pdaTraceT_)[nPoint]; break; case 'G': return (*pdaTraceG_)[nPoint]; break; default: assert( FALSE ); // programming error } } // this complements the trace point arrays as well as the // array of originally called bases and their point positions void TraceFile :: complementTracesAndABICalledBases() { // complement the array of originally called bases seqABICalledBases_.complementSequence(); // complement the arrays of traces TracePointArray* pdaTraceTemp; // complement the traces by (firstly) swapping the pointers // A <- T, T <- A pdaTraceTemp = pdaTraceA_; pdaTraceA_ = pdaTraceT_; pdaTraceT_ = pdaTraceTemp; // C <- G, G <- C pdaTraceTemp = pdaTraceC_; pdaTraceC_ = pdaTraceG_; pdaTraceG_ = pdaTraceTemp; // then use MBTValOrderedSwapVec member function to reverse the // sequence of the point values in the arrays. pdaTraceA_->reverseElementOrder(); pdaTraceT_->reverseElementOrder(); pdaTraceG_->reverseElementOrder(); pdaTraceC_->reverseElementOrder(); } FileName TraceFile :: filFindChromat( bool& bDeleteAfterReading ) { if ( pFrag_->filGetChromatFilename().bIsNull() ) cerr << "filGetChromatFilename is null" << endl; else cerr << "filGetChromatFilename: " << pFrag_->filGetChromatFilename() << endl; if ( pCP->soAlwaysRunProgramToGetChromats_ == "true" ) { RWCString soCommand = pCP->filProgramToRunToGetChromats_ + " " + pFrag_->filGetChromatFilename(); cerr << "to get chromat of read, about to run: " << soCommand << endl; int nRetStat = system( (char*)soCommand.data() ); if (nRetStat != 0 ) { ostringstream ost; ost << "Unable to get chromat file using command " << soCommand << ends; SysRequestFailed srf(ost.str().c_str()); throw srf; // throw exception } FileName filChromatPathname = pCP->soUncompressedChromatDirectory_ + "/" + pFrag_->filGetChromatFilename(); bDeleteAfterReading = true; return( filChromatPathname ); } FileName soLookHereForTraceFile = ConsEd::pGetConsEd()->filGetChromatDir() + "/" + pFrag_->filGetChromatFilename(); if ( ! pFrag_->filGetChromatFilename().bIsNull() ) { // check that the chromat exists. If it doesn't see if it has a // gzip'd file there instead. struct stat statBuf; int nRetStat = stat( (char*) soLookHereForTraceFile.data(), &statBuf ); if ( nRetStat == 0 ) { // this is the simple case--the file is uncompressed and in // ../chromat_dir // debugging cerr << "found file " << soLookHereForTraceFile << endl; // end debugging bDeleteAfterReading = false; return( soLookHereForTraceFile ); } else { // file not found. let's try for a .gz file FileName soGzipedTraceFile = soLookHereForTraceFile + consedParameters::pGetConsedParameters()->soCompressedChromatExtension_; nRetStat = stat( (char*) soGzipedTraceFile.data(), &statBuf ); if (nRetStat == 0) { // if reached here, there is a .gz file // So uncompress it to /tmp and then read it FileName soUncompressedFilePathname = pCP->soUncompressedChromatDirectory_ + "/" + soLookHereForTraceFile.soGetBasename(); RWCString soCommand = pCP->soGunzipFullPath_ + " -c " + soGzipedTraceFile + " >" + soUncompressedFilePathname; cerr << "to uncompress chromatogram, about to run: " << soCommand << endl; nRetStat = system( (char*)soCommand.data() ); if (nRetStat != 0 ) { ostringstream ost; ost << "Unable to uncompress analysis file using command " << soCommand << ends; SysRequestFailed srf(ost.str().c_str()); throw srf; // throw exception } bDeleteAfterReading = true; return( soUncompressedFilePathname ); } } } // if ( ! pFrag_->filGetChromatFilename().bIsNull() ) { // if reached here, couldn't find chromat // soChemistry_ == "454" is new way (after June 2008) // soCallMethod_ == "454" is old way (before June 2008) if ( pFrag_->soChemistry_ == "454" || pFrag_->soCallMethod_ == "454" ) { if ( !pCP->filProgramToRunToGetChromatsOf454Reads_.bIsNull() ) { bool bProblemWith454ChromatName = true; int nFirstColon = pFrag_->filGetChromatFilename().index( ":" ); if ( nFirstColon != RW_NPOS ) { if ( ( nFirstColon + 1 ) <= ( pFrag_->filGetChromatFilename().length() - 1 ) ) { int nSecondColon = pFrag_->filGetChromatFilename().soGetRestOfString( nFirstColon + 1 ).index(":" ); if ( nSecondColon != RW_NPOS ) { bProblemWith454ChromatName = false; } } } RWCString soCommand = pCP->filProgramToRunToGetChromatsOf454Reads_ + " " + pFrag_->filGetChromatFilename(); cerr << "to get chromat of 454 read, about to run: " << soCommand << endl; int nRetStat = system( soCommand.data() ); if ( nRetStat != 0 ) { RWCString soError = "Unable to make trace for 454 read using " + soCommand; if ( bProblemWith454ChromatName ) { soError += " For consed to be able to use sff2scf to create a trace, consed must be able to find the sff file. This is given in the CHROMAT_FILE: line and must look like this example: sff:reads.sff:EBE03TV04IHLTF where reads.sff is the name of the sff file and EBE03TV04IHLTF is the read name. Instead, the CHROMAT_FILE: field has "; soError += pFrag_->filGetChromatFilename(); } else { soError += " If you have put sff2scf somewhere else, use .consedrc parameter consed.programToRunToGetChromatsOf454Reads: (full path)"; } THROW_ERROR( soError ); } FileName filChromatPathname = pCP->fil454sff2scfDirectory_ + "/" + pFrag_->filGetChromatFilename(); bDeleteAfterReading = true; return( filChromatPathname ); } else { cout << "This is a 454 read but .consedrc parameter consed.programToRunToGetChromatsOf454Reads is not set so can't bring up traces" << endl; cout.flush(); } } // if ( soChemistry == "454" ) { // else if ( soChemistry == "solexa" ) { // // added July 2009 (DG) // createFakeChromatogramForSolexaReads(); // } if ( pFrag_->soChemistry_ == "solexa" && pCP->bCreateFakeChromatsForSolexaReads_ ) { return("fake solexa" ); } if ( pCP->soAlwaysRunProgramToGetChromats_ == "last" ) { RWCString soCommand = pCP->filProgramToRunToGetChromats_ + " " + pFrag_->filGetChromatFilename(); cerr << "to get chromat of read, about to run: " << soCommand << endl; int nRetStat = system( (char*)soCommand.data() ); if (nRetStat != 0 ) { ostringstream ost; ost << "Unable to get chromat file using command " << soCommand << ends; SysRequestFailed srf(ost.str().c_str()); throw srf; // throw exception } FileName filChromatPathname = pCP->soUncompressedChromatDirectory_ + "/" + pFrag_->filGetChromatFilename(); bDeleteAfterReading = true; return( filChromatPathname ); } RWCString soError; if ( pFrag_->filGetChromatFilename().bIsNull() ) { soError = "warning: DS line probably contains no CHROMAT_FILE: entry because filChromatFile_ is null"; } else { soError = "filChromatFile_ = " + pFrag_->filGetChromatFilename(); } soError += " Unable to find chromatogram file "; soError += soLookHereForTraceFile; ostringstream ost; ost << soError << ends; SysRequestFailed srf(ost.str().c_str()); throw srf; // throw exception } void TraceFile :: createFakeSolexaTraces() { const int nPointsPerBase = 12; const int nExtraPointsAtBeginning = 6; const int nExtraPointsAtEnd = 6; LocatedFragment* pLocFrag = (LocatedFragment*) pFrag_; nPoints_ = pLocFrag->nGetPaddedFragLength() * nPointsPerBase + nExtraPointsAtBeginning + nExtraPointsAtEnd; pLocFrag->nTraceArrayMinIndex_ = 0; pLocFrag->nTraceArrayMaxIndex_ = nPoints_ - 1; // the following is important for preventing fixFragmentPointPositions // from trying to change the peak positions in the case of complemented // reads pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true; pdaTraceA_ = new TracePointArray( nPoints_ ); pdaTraceC_ = new TracePointArray( nPoints_ ); pdaTraceG_ = new TracePointArray( nPoints_ ); pdaTraceT_ = new TracePointArray( nPoints_ ); pdaTraceA_->increaseCurrentLength( nPoints_, 0 ); pdaTraceC_->increaseCurrentLength( nPoints_, 0 ); pdaTraceG_->increaseCurrentLength( nPoints_, 0 ); pdaTraceT_->increaseCurrentLength( nPoints_, 0 ); int nPointPos; for( nPointPos = 0; nPointPos < nExtraPointsAtBeginning; ++nPointPos ) { (*pdaTraceA_)[ nPointPos ] = 0; (*pdaTraceC_)[ nPointPos ] = 0; (*pdaTraceG_)[ nPointPos ] = 0; (*pdaTraceT_)[ nPointPos ] = 0; } for( nPointPos = nPoints_ - nExtraPointsAtEnd; nPointPos < nPoints_; ++nPointPos ) { (*pdaTraceA_)[ nPointPos ] = 0; (*pdaTraceC_)[ nPointPos ] = 0; (*pdaTraceG_)[ nPointPos ] = 0; (*pdaTraceT_)[ nPointPos ] = 0; } float fLittleTriangle[ nPointsPerBase ]; const int nMaxLittleTrianglePeakHeight = 10; fLittleTriangle[0] = 0.0; fLittleTriangle[1] = 0.0; fLittleTriangle[2] = 0.0; fLittleTriangle[3] = 0.0; fLittleTriangle[4] = 0.25; fLittleTriangle[5] = 1.0; fLittleTriangle[6] = 1.0; fLittleTriangle[7] = 0.25; fLittleTriangle[8] = 0.0; fLittleTriangle[9] = 0.0; fLittleTriangle[10] =0.0; fLittleTriangle[11] = 0.0; for( int n0LittleTrianglePos = 0; n0LittleTrianglePos < nPointsPerBase; ++n0LittleTrianglePos ) { fLittleTriangle[ n0LittleTrianglePos ] *= nMaxLittleTrianglePeakHeight; } nMaxTraceValue_ = nMaxLittleTrianglePeakHeight * 100; // approx since // 97 is highest quality value nMinTraceValue_ = 0; // prepare to insert peak positions for bases unsigned short int s = 0; s = ~s; int nMaxNumberInShort = (int) s / 2; nMaxNumberInShort -= 10; // to be on the safe side char cSize; if ( pLocFrag->nGetPaddedFragLength() <= nMaxNumberInShort ) cSize = Sequence::cLittle; else cSize = Sequence::cBig; pLocFrag->createPointPosArray2( cSize, pLocFrag->nGetPaddedFragLength(), true ); // fill up array for( int n1Base = pLocFrag->nGetStartIndex(); n1Base <= pLocFrag->nGetEndIndex(); ++n1Base ) { Ntide nt = (*pLocFrag)[ n1Base ]; char cBase = nt.cGetBase(); int nEncodedBase = nNumberFromBase[ cBase ]; Quality qual = nt.qualGetQualityWithout98or99(); int n0StartPointPos = nExtraPointsAtBeginning + ( n1Base - 1 ) * nPointsPerBase; int n0EndPointPos = n0StartPointPos + nPointsPerBase - 1; int nPeakPos = n0StartPointPos + 5; pLocFrag->setTracePointPosAtSeqPos( n1Base, nPeakPos ); for( int nEncodedBaseType = 0; nEncodedBaseType < 4; ++nEncodedBaseType ) { TracePointArray* pTracePointArray; if ( nEncodedBaseType == 0 ) pTracePointArray = pdaTraceA_; else if ( nEncodedBaseType == 1 ) pTracePointArray = pdaTraceC_; else if ( nEncodedBaseType == 2 ) pTracePointArray = pdaTraceG_; else pTracePointArray = pdaTraceT_; if ( nEncodedBase == nEncodedBaseType ) { // make a little triangle for( int nPointPos = n0StartPointPos; nPointPos <= n0EndPointPos; ++nPointPos ) { int n0LittleTriangleIndex = nPointPos - n0StartPointPos; (*pTracePointArray)[nPointPos] = (int) ( fLittleTriangle[n0LittleTriangleIndex] * qual); } } else { // zero it out for( int nPointPos = n0StartPointPos; nPointPos <= n0EndPointPos; ++nPointPos ) { (*pTracePointArray)[nPointPos] = 0; } } } // for( int nEncodedBaseType = 0; } // for( int n1Base = pLocFrag->nGetStartIndex(); } // void TraceFile :: createFakeSolexaTraces() {