/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "readWholeReadItem.h" #include "rwcstring.h" #include "locatedFragment.h" #include "wholeReadItem.h" #include "mbt_exception.h" #include "nLine.h" #include "expidAndLocatedFragment.h" #include "consed.h" #include "bIsNumericMaybeWithWhitespace.h" #include "phdBall2Fasta.h" const int nMaxLineSize = 10000; static char szLine[ nMaxLineSize + 1]; static char szLine2[ nMaxLineSize + 1]; static char szTypeSaved[200]; static char szSourceSaved[200]; static char szDateSaved[200]; const int nMaxDataLines = 3; static RWCString soDataLine[ nMaxDataLines ]; #define PARSE_PANIC( message ) \ { ostrstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <nReadType_ = nUniversalForward; else if ( soDataLine[0] == "univ rev" ) pLocFrag->nReadType_ = nUniversalReverse; else if ( soDataLine[0] == "walk" ) pLocFrag->nReadType_ = nWalk; else if ( soDataLine[0] == "pcr end" ) pLocFrag->nReadType_ = nPCREnd; else { PARSE_PANIC_PRIMER_WR( "after type: there must be one of univ fwd, univ rev, pcr end, or walk" ) } pLocFrag->nReadTypeFromPhdFile_ = pLocFrag->nReadType_; if ( nDataLine > 1 ) { if ( soDataLine[1](0, 4 ) != "seq:" ) { PARSE_PANIC_PRIMER_WR( "If you have a second data line in this WR item, it must be seq: (primer sequence) but was unrecognized" ); } // chop off the final CR and the beginning seq: soDataLine[1] = soDataLine[1](5, soDataLine[1].length() - 5 ); soDataLine[1] = soDataLine[1].stripWhitespace( RWCString::BOTH ); pLocFrag->soPrimerSequence_ = soDataLine[1]; } } #define PARSE_PANIC_PRIMER_WR2( szMessage ) \ { \ ostrstream ost; \ ost << "error in phd file " << filPhdBall_ << " at line " << \ nLine_ << \ " detected from consed source file " << __FILE__ \ << " at " << __LINE__ << endl << \ " primer WR item must look like this:\nWR{\nprimer phredPhrap 990224:045110\ntype: (primer type)\nseq: (primer sequence)\nwhere the seq line is optional and (primer type) is one of univ fwd, univ rev, pcr end, or walk.\ndata:\n";\ for( int nTemp = 0; nTemp < nDataLine; ++nTemp ) {\ ost << soDataLine[nTemp];\ }\ ost << endl << szMessage << endl << ends; \ InputDataError ide( ost.str() ); \ throw ide; } // Example: // WR{ // primer determineReadTypes 990603:090231 // type: univ fwd // } void phdBall2Fasta :: parsePrimerWRItemForPhdBall2Fasta( const int nDataLine, RWCString& soFastaHeader ) { if ( nDataLine == 0 ) { PARSE_PANIC_PRIMER_WR2( "There must be a data line within the WR{ which must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" ) } if ( soDataLine[0](0, 5) != "type:" ) { PARSE_PANIC_PRIMER_WR2( "first data line must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" ) } if ( soDataLine[0].length() < 6 ) PARSE_PANIC_PRIMER_WR2( "after type: there must be one of univ fwd, univ rev, pcr end, or walk but instead ended" ) // chop off the initial type: and the final CR soDataLine[0] = soDataLine[0]( 5, soDataLine[0].length() - 6 ); // strip off leading and trailing whitespace soDataLine[0] = soDataLine[0].stripWhitespace( RWCString::BOTH ); if ( soDataLine[0] == "univ fwd" ) soFastaHeader += " DIRECTION: fwd"; else if ( soDataLine[0] == "univ rev" ) soFastaHeader += " DIRECTION: rev"; } // Example: // WR{ // template determineReadTypes 990603:090231 // name: djs366_101 // lib: library1 // } // filPHD is for PARSE_PANIC's static void parseTemplateWRItem( const int nDataLines, LocatedFragment* pLocFrag, const RWCString& filPHD ) { for( int nDataLine = 0; nDataLine < nDataLines; ++nDataLine ) { RWCString& soDataLinee = soDataLine[ nDataLine ]; if ( soDataLinee( 0, 5 ) == "name:" ) { if ( soDataLinee.length() < 6 ) { PARSE_PANIC_TEMPLATE_WR( "nothing follows name:" ) } soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 ); pLocFrag->soTemplate_ = soDataLinee.stripWhitespace( RWCString::BOTH ); } else if ( soDataLinee( 0, 5 ) == "type:" ) { if ( soDataLinee.length() < 6 ) { PARSE_PANIC_TEMPLATE_WR( "nothing follows type:" ) } soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 ); // strip off leading and trailing whitespace pLocFrag->soTemplateType_ = soDataLinee.stripWhitespace( RWCString::BOTH ); } else if ( soDataLinee( 0, 5 ) == "size:" ) { if ( soDataLinee.length() < 6 ) { PARSE_PANIC_TEMPLATE_WR( "nothing follows size:" ) } soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 ); if ( !bIsNumericMaybeWithWhitespace( soDataLinee, pLocFrag->nTemplateSize_ ) ) { PARSE_PANIC_TEMPLATE_WR_1_ARG( "the size must be numeric but is ", soDataLinee ) } } else if ( soDataLinee( 0, 4 ) == "lib:" ) { if ( soDataLinee.length() < 5 ) { PARSE_PANIC_TEMPLATE_WR( "nothing follows lib:" ); } soDataLinee = soDataLinee.soGetRestOfString( 4 ); soDataLinee = soDataLinee.stripWhitespace( RWCString::BOTH ); if ( soDataLinee.isNull() ) { PARSE_PANIC_TEMPLATE_WR( "nothing follows lib:" ); } pLocFrag->soLibrary_ = soDataLinee; } else { PARSE_PANIC_TEMPLATE_WR( "unrecognized line" ) } } // for( int nDataLine ... } // parseTemplateWRItem #define PARSE_PANIC_TEMPLATE_WR2( szMessage ) \ { \ ostrstream ost; \ ost << "error in phd file " << filPhdBall_ << " at line " << \ nLine_ << \ " detected from consed source file " << __FILE__ \ << " at " << __LINE__ << endl << \ "template WR item must look like this:\nWR{\ntemplate phredPhrap 990224:045110\ntemplate: (template name)\ntype: (bac, cosmid, puc, pbc, pcr, etc)\nsize: (template size)\n}\nwhere each of the above lines is optional\ndata:\n"; \ ost << soDataLinee;\ ost << endl << szMessage << endl << ends; \ InputDataError ide( ost.str() ); \ throw ide; } // Example: // WR{ // template determineReadTypes 990603:090231 // name: djs366_101 // lib: library1 // } void phdBall2Fasta :: parseTemplateWRItemForPhdBall2Fasta( const int nDataLines, RWCString& soFastaHeader ) { for( int nDataLine = 0; nDataLine < nDataLines; ++nDataLine ) { RWCString& soDataLinee = soDataLine[ nDataLine ]; if ( soDataLinee( 0, 5 ) == "name:" ) { if ( soDataLinee.length() < 6 ) { PARSE_PANIC_TEMPLATE_WR2( "nothing follows name:" ) } soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 ); soFastaHeader += " TEMPLATE: "; soFastaHeader += soDataLinee.stripWhitespace( RWCString::BOTH ); return; } } } // parseTemplateWRItemForPhdBall2Fasta #define PARSE_PANIC_EXPID_WR_1_ARG( szMessage, szMessage2 ) \ { ostrstream ost; \ ost << "error in phd file " << filPHD << " at line " << \ nLine << \ " detected from consed source file " << __FILE__ \ << " at " << __LINE__ << endl << \ "expid WR item must look like this:\nWR{\nexpid (program name) 990224:045110\n25\n}\nwhere the 25 should be replaced by the experiment ID from the autofinish output\n" << \ "Line:\n" << szLine2 << "\n" << \ szMessage << endl << szMessage2 << endl << ends; \ InputDataError ide( ost.str() ); \ throw ide; } static void parseExpidWRItem( const int nDataLines, LocatedFragment* pLocFrag, const RWCString& filPHD ) { if ( nDataLines < 1 ) { PARSE_PANIC_EXPID_WR_1_ARG( "there should be exactly 1 data lines in this WR item but instead there is", nDataLines ); } int nExpID; if ( !bIsNumericMaybeWithWhitespace( soDataLine[ 0 ], nExpID ) ) { PARSE_PANIC_EXPID_WR_1_ARG( "this line should contain a number but instead contains", soDataLine[ 0 ] ); } expidAndLocatedFragment* pExpID = new expidAndLocatedFragment( nExpID, pLocFrag ); pLocFrag->pContig_->pAssembly_->aExpidAndLocatedFragment_.insert( pExpID ); } void readWholeReadItem( FILE* pFil, const FileName& filPHD, LocatedFragment* pLocFrag, wholeReadItem*& pWR ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for body of WR{ block" ); } else ++nLine; // save from the strtok's so can show szLine2 in error messages strcpy( szLine2, szLine ); char* szType = strtok( szLine, " " ); char* szSource = strtok( NULL, " " ); char* szDate = strtok( NULL, "\n " ); if ( !szType ) PARSE_PANIC_WR_LINE( "could not find whole read info item type" ); if ( !szSource ) PARSE_PANIC_WR_LINE( "could not find source" ); if ( !szDate ) PARSE_PANIC_WR_LINE( "could not find date" ); // save the data in 3 different buffers since szLine will now be reused strcpy( szTypeSaved, szType ); strcpy( szSourceSaved, szSource ); strcpy( szDateSaved, szDate ); RWCString soData; int nDataLine = 0; bool bFoundEnd = false; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for end of WR{ block which should be terminated with a }" ); } else ++nLine; if ( szLine[0] == '}' && isspace( szLine[1] ) ) bFoundEnd = true; else { soData += szLine; if ( nDataLine < nMaxDataLines ) { soDataLine[ nDataLine ] = szLine; ++nDataLine; } } } while ( !bFoundEnd ); // some recognized WR items that have consed-defined formats // and whose data is parsed and used if ( strcmp( szTypeSaved, "primer" ) == 0 ) { parsePrimerWRItem( nDataLine, pLocFrag, filPHD ); } else if ( strcmp( szTypeSaved, "template" ) == 0 ) { parseTemplateWRItem( nDataLine, pLocFrag, filPHD ); } else if ( strcmp( szTypeSaved, "expid" ) == 0 ) { parseExpidWRItem( nDataLine, pLocFrag, filPHD ); } else if ( strcmp( szTypeSaved, "referenceSequence" ) == 0 ) { pLocFrag->bIsAFakeRead_ = true; } // cut off final carriage return from soData soData = soData.strip( RWCString::TRAILING, '\n' ); pWR = new wholeReadItem( pLocFrag, szTypeSaved, szSourceSaved, szDateSaved, soData ); if ( strcmp( szTypeSaved, "referenceSequence" ) == 0 ) { // this read is a reference sequence pLocFrag->pContig_->pReferenceSequence_ = pLocFrag; } } #define PARSE_PANIC_WR_LINE2( szAdditionalMessage ) \ { \ ostrstream ost; \ ost << "error in phdball " << filPhdBall_ << " at line " << \ nLine_ << \ " detected from consed source file " << __FILE__ \ << " at " << __LINE__ << \ ". Line:\n" << szLine2 << "\n" << \ "should be of form: (whole read info item type) (source) (date in form YYMMDD:HHMISS)\n " << \ szAdditionalMessage << endl << ends; \ InputDataError ide( ost.str() ); \ throw ide; } #define PARSE_PANIC2( message ) \ { ostrstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <