/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "locatedFragment.h" #include "contig.h" #include using namespace std; #include "basesegment.h" #include #include #include #include "mbt_errors.h" #include "consed.h" #include "numutil.h" #include #include "tagTypes.h" #include "oligoTag.h" #include "tag.h" #include "wholeAssemblyItem.h" #include "trimWhitespace.h" #include "autoFinishExpTag.h" #include "numutil.h" #include "bIsNumericDouble.h" #include "bIsNumericMaybeWithWhitespace.h" #include "bIsNumericLongMaybeWithWhitespace.h" #include "consedParameters.h" #include "rwctokenizer.h" #include "rwcstring.h" #include "userDefinedTagFieldType.h" #include "userDefinedTagField.h" #include "parseUserDefinedTagTypeLine.h" #include "popupErrorMessage.h" #include "bIsNumeric.h" #include "soAddCommas.h" #ifdef __SUNOS #define SEEK_SET 0 #endif #define PARSE_PANIC( szMessage ) \ { ostringstream ost; \ ost << "ace file error detected from source file " \ << __FILE__ << " at " << __LINE__ <= nCOMMENT ) { if (soComment( 0, nCOMMENT ) == szCOMMENT ) { soComment.remove( 0, nCOMMENT ); } // now remove any trailing C}\n if ( soComment.length() >= 3 ) { if ( soComment( soComment.length() - 3, 3 ) == "C}\n" ) { soComment.remove( soComment.length() - 3 ); } } } soComment = soComment.strip( RWCString::TRAILING, '\n' ); } static RWCString soGetComment() { RWCString soComment; bool bEndOfComment = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while in comment part of tag" ); if ( szLine[0] == 'C' && szLine[1] == '}' && isspace( szLine[2] ) ) bEndOfComment = true; else soComment += szLine; } while( !bEndOfComment ); soComment = soComment.stripWhitespace( RWCString::TRAILING ); return( soComment ); } static char szDateSaved[100]; static char szTagSourceSaved[100]; static char szWATypeSaved[100]; static char szTagTypeSaved[200]; #define GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( szAdditionalMessage ) \ ++nTagDataLine; \ if ( nTagDataLine >= nCurrentSizeOfTagDataLines ) { \ ostringstream ost; \ ost << "ace file error detected from source file " << \ __FILE__ << " at " << __LINE__ << endl << \ "Error in file " << szAceFilenameStatic << \ " before line " << nCurrentLine << \ " in autoFinishTag with only" << \ nCurrentSizeOfTagDataLines << \ " data lines but needed line " << nTagDataLine << endl << \ " contents: " << endl; \ ost << soTagData; \ ost << szAdditionalMessage << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; \ } \ szTagLine = (char*) (aTagDataLines[ nTagDataLine ]).data(); #define PUT_BACK_TAG_DATA_LINE \ --nTagDataLine; \ if ( nTagDataLine < 0 ) { \ ostringstream ost; \ ost << "ace file error detected from source file " << \ __FILE__ << " at " << __LINE__ << endl << \ " before line " << nCurrentLine << " in autoFinishTag with " << \ nCurrentSizeOfTagDataLines << \ " but program tried to back up to line " << nTagDataLine << \ endl << " contents: " << endl << soTagData << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; \ } #define PARSE_PANIC_AUTOFINISH_EXP_TAG( szAdditionalMessage, aarg ) \ { ostringstream ost;\ \ ost << "ace file error detected from source file " << \ __FILE__ << " at " << __LINE__ << endl << \ "Error in file " << szAceFilenameStatic << \ " before line " << nCurrentLine << " in autoFinishExpTag:" << \ endl << " with " << nCurrentSizeOfTagDataLines << " lines" << endl; \ ost << "Error in autofinishExpTag" << endl << \ \ "found: " << endl; \ ost << soTagData; \ char szTemp[2000]; \ sprintf( szTemp, szAdditionalMessage, aarg ); \ ost << szTemp << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; \ } // experiment tags will look like this: // // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // C // purpose: extend contig 43 bases // 50 350 // dyeTerm customPrimer // fix cons errors: 184.86 // primer: gatgatacaagcaaggga temp: 51 id: 37 // expID_and_template: 1 djs74_2594 2 djs74_2582 3 djs74_2750 4 djs74_465 5 djs74_2389 // } // or like this: // // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // U // purpose: extend contig 33 bases // 50 350 // dyeTerm univ fwd // fix cons errors: 5.24 // expID_and_template: 6 djs74_2052 // } // // or like this: // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // C // purpose: weak // 50 350 // dyeTerm customPrimer // fix cons errors: 184.86 // primer: gatgatacaagcaaggga temp: 51 id: 37 // expID_and_template: 7 djs74_2594 8 djs74_2582 9 djs74_2750 10 djs74_465 11 djs74_2389 // } // or like this: // // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // U // purpose: weak // 50 350 // dyeTerm univ fwd // fix cons errors: 5.24 // expID_and_template: 12 djs74_2052 // } // // or like this: // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // C // purpose: single subclone bases fixed: 11 // 50 350 // dyeTerm customPrimer // primer: gggtggatcatgaggt temp: 51 id: 38 // expID_and_template: 13 djs74_2392 14 djs74_1622 15 djs74_296 16 djs74_2003 17 djs74_192 // } // // or like this: // CT{ // Contig1 autoFinishExp autofinish 95 95 981214:144920 // U // purpose: flank gap right end of Contig1 // dyeTerm univ rev // template: 18 djs74_1304 // existing reads: djs74_1304.x1_21 (top strand in Contig25) from 1313 to 2808, // djs74_1304.s1 (top strand in Contig26) from 1489 to 2440 // djs74_1304.x2_21 (top strand in Contig25) from 1310 to 2469 // read starts 951 bases from right end of consensus // } const int nMaxNumberOfLinesInAutoFinishTag = 100; // these are made outside the procedures so they can be shared amongst them, // such as getPrimerOnNextLine and createAutoFinishTag static RWTValVector aTagDataLines( (size_t) nMaxNumberOfLinesInAutoFinishTag ); static int nCurrentSizeOfTagDataLines; static char* szTagLine; static char szSavedTagLine[1000]; static int nTagDataLine; static RWCString soTagData( (RWSize_2T) 2000 ); // parse: // primer: gggtggatcatgaggt temp: 51 id: 38 static void getPrimerOnNextLine( autoFinishExpTag* pAutoFinishExp ) { GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while looking for primer line of this form: primer: gatgatacaagcaaggga temp: 51 id: 37" ); strcpy( szSavedTagLine, szTagLine ); char* szPrimerKeyword = strtok( szSavedTagLine, " " ); if ( memcmp( szPrimerKeyword, "primer:", 7 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected primer line of this form:\nprimer: gatgatacaagcaaggga temp: 51 id: 37\n but found %s instead which had trouble with the primer: keyword at the beginning", szTagLine ); char* szPrimerBases = strtok( NULL, " "); pAutoFinishExp->soPrimerBases_ = szPrimerBases; // no need to parse the melting temperature of the oligo id because // we aren't going to use those for making sure we don't repeat this // experiment. } inline static void parseOneExpID_and_template_pair( char* szExpID_and_template, autoFinishExpTag* pAutoFinishExp ) { RWCTokenizer tok( szExpID_and_template ); RWCString soExpID = tok(); RWCString soTemplate = tok(); if ( soExpID.isNull() ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "2expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut found nothing after expID_and_template:\nLine was:\n%s", szTagLine ); } if ( soTemplate.isNull() ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut found only 1 word after the expID_and_template:\nLine was:\n%s", szTagLine ); } if ( soTemplate == "whole" ) { if ( tok() == "clone" ) { soTemplate = "whole clone"; } } int nExpID; if ( nAToIWithError( soExpID.data(), nExpID ) != NO_ERROR ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut 1st word %s was not numeric", soExpID.data() ); // on startup, look through all the experiments that have been done // or are planning to be done. Thus start numbering experiments // after this. Some or all of these experiment id's may be on // reads in this assembly. There may be reads that are still in the // pipeline and others that have been done but failed to go into the // assembly. if ( nExpID > consedParameters::pGetConsedParameters()->nLastUsedExpID_ ) consedParameters::pGetConsedParameters()->nLastUsedExpID_ = nExpID; pAutoFinishExp->aTemplates_.insert( soTemplate ); pAutoFinishExp->aExpID_.insert( nExpID ); } static void getTemplates( autoFinishExpTag* pAutoFinishExp ) { GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while looking for line of this form:\ntemplate: djs74_1304" ); strcpy( szSavedTagLine, szTagLine ); char* szTemplateKeyword = strtok( szSavedTagLine, " " ); if ( memcmp( szTemplateKeyword, "expID_and_template:", 19 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut couldn't find expID_and_template: at beginning but instead found %s", szTemplateKeyword ); char* szExpID_and_template = strtok( NULL, "\n,"); if ( szExpID_and_template == NULL ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut found nothing after expID_and_template:\nLine was:\n%s", szTagLine ); bool bComplainIfMoreThanOneTemplate = ( pAutoFinishExp->nReadType_ == nWalk ) ? false : true; if ( bComplainIfMoreThanOneTemplate ) { parseOneExpID_and_template_pair( szExpID_and_template, pAutoFinishExp ); // now check there are no more char* szDummy = strtok( NULL, "\n, " ); if ( szDummy != NULL ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "expected line of this form:\nexpID_and_template: 5 djs74_1304\nbut there was something following", szTagLine ); } else { while( szExpID_and_template != NULL ) { parseOneExpID_and_template_pair( szExpID_and_template, pAutoFinishExp ); szExpID_and_template = strtok( NULL, "\n," ); } } } static void createAutoFinishExpTag( FILE* pAceFile, Contig* pContig, const int nConsPosStart, const int nConsPosEnd, char* szTagSource, char* szTagDate ) { nCurrentSizeOfTagDataLines = 0; long lID = nUndefinedTagID; RWCString soComment; soTagData = ""; bool bContinue = true; do { FGETS_OR_PARSE_PANIC( "premature end of file while in autofinishExp tag" ); if ( szLine[0] == '}' ) { trimWhitespace( szLine ); if ( strcmp( szLine, "}" ) == 0 ) bContinue = false; } // let's not put the "}" into the data if ( bContinue ) { if ( memcmp( szLine, "COMMENT{", 8 ) == 0 && isspace( szLine[8] ) ) { soComment = soGetComment(); } else if ( szLine[0] == 'I' && szLine[1] == 'D' && szLine[2] == ':' ) { RWCString soRestOfLine( &szLine[3] ); if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) { RWCString soMessage = "ID: line in autoFinishExp tag not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC( soMessage ); } ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); } else { soTagData += szLine; aTagDataLines[ nCurrentSizeOfTagDataLines ] = szLine; ++nCurrentSizeOfTagDataLines; } } } while( bContinue ); autoFinishExpTag* pAutoFinishExp = new autoFinishExpTag( pContig, nConsPosStart, nConsPosEnd, szTagSource, szTagDate, soTagData ); pAutoFinishExp->lID_ = lID; pAutoFinishExp->soComment_ = soComment; bool bComplemented; nTagDataLine = -1; // set up for ++ GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while looking for U or C line" ); if ( szTagLine[0] == 'C' ) pAutoFinishExp->bComplementedFromWayPhrapCreatedContig_ = true; else if ( szTagLine[0] == 'U' ) pAutoFinishExp->bComplementedFromWayPhrapCreatedContig_ = false; else { PARSE_PANIC_AUTOFINISH_EXP_TAG( "an autofinishExp tag should have its first data line be just U or C indicating whether the tag is complemented or not with respect to the way that phrap created the contig but the first character not C or U but %s", szTagLine ); } GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while reading next line which should be purpose: (extend contig) or (weak) or (single subclone) or (flank gap)" ); if ( memcmp( szTagLine, "purpose:", 8 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "and line after autofinish id does not contain purpose: but instead %s", szTagLine ); char* szRestOfLine = &szTagLine[8]; char* szPurpose = strtok( szRestOfLine, " " ); if ( memcmp( szPurpose, "weak", 4 ) == 0 ) { pAutoFinishExp->nPurpose_ = nFixWeakRegion; } else if ( memcmp( szPurpose, "extend", 6 ) == 0 ) { pAutoFinishExp->nPurpose_ = nExtendContig; char* szContigKeyword = strtok( NULL, " " ); if ( memcmp( szContigKeyword, "contig", 6 ) != 0 ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "but extend was followed by %s instead of \"contig\"", szContigKeyword ); } char* szBases = strtok( NULL, " " ); if ( nAToIWithError( szBases, pAutoFinishExp->nExtendingIntoGapThisManyBases_ ) != NO_ERROR ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "but expected %s to be numeric but it wasn't", szBases ); } } else if ( memcmp( szPurpose, "single", 6 ) == 0 ) { pAutoFinishExp ->nPurpose_ = nCoverSingleSubcloneBases; char* szSubcloneKeyword = strtok( NULL, " " ); if ( memcmp( szSubcloneKeyword, "subclone", 8 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "but after \"single\" expected \"subclone\" but instead found %s", szSubcloneKeyword ); char* szBasesKeyword = strtok( NULL, " " ); if ( memcmp( szBasesKeyword, "bases", 5 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "but after \"single subclone\" expected \"bases\" but instead found %s", szBasesKeyword ); char* szFixedKeyword = strtok( NULL, " " ); if ( memcmp( szFixedKeyword, "fixed:", 6 ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "but after \"single subclone bases\" expected \"fixed:\" but instead found %s", szFixedKeyword ); char* szBasesFixed = strtok( NULL, "\n " ); if ( nAToIWithError( szBasesFixed, pAutoFinishExp->nSingleSubcloneBasesFixed_ ) != NO_ERROR ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "but after \"single subclone bases fixed:\" expected numeric but instead found %s", szBasesFixed ); } } else if ( memcmp( szPurpose, "flank", 5 ) == 0 ) { pAutoFinishExp->nPurpose_ = nFlankGap; } else { PARSE_PANIC_AUTOFINISH_EXP_TAG( "but unrecognized purpose %s", szPurpose ); } // next line looks like this: // 50 350 GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while reading offsets line--should look like 50 350" ); // there were some previous beta versions of consed that did not // have this line in the case of flanking reverses. So make that // possible. RWCTokenizer tok( aTagDataLines[ nTagDataLine ] ); RWCString soHighQualityPartOfReadStart = tok(); RWCString soHighQualityPartOfReadEnd = tok(); RWCString soOffsetOfBeginningOfReadFromTag = tok(); if ( nAToIWithError( (char*) soHighQualityPartOfReadStart.data(), pAutoFinishExp->nUnpaddedOffsetToStartOfHighQualityRegion_ ) != NO_ERROR ) { if ( pAutoFinishExp->nPurpose_ == nFlankGap ) goto oldFormatWithNoPositionsForFlankGapExperiments; else { PARSE_PANIC_AUTOFINISH_EXP_TAG( "an autofinishExp tag should have a data line U or C followed by a line with 1 number on it (the experiment id) followed by a line with just 3 numbers on it--the start and end of the high quality portion of the read to be made but high quality part of read start must be numeric but instead was %s", (char*) soHighQualityPartOfReadStart.data() ); } } if ( nAToIWithError( (char*) soHighQualityPartOfReadEnd.data(), pAutoFinishExp->nUnpaddedOffsetToEndOfHighQualityRegion_ ) != NO_ERROR ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "an autofinishExp tag should have a data line U or C followed by a line with 1 number on it (the experiment id) followed by a line with just 3 numbers on it--the start and end of the high quality portion of the read to be made but high quality part of read end must be numeric but instead was %s", (char*) soHighQualityPartOfReadEnd.data() ); if ( pAutoFinishExp->nUnpaddedOffsetToStartOfHighQualityRegion_ >= pAutoFinishExp->nUnpaddedOffsetToEndOfHighQualityRegion_ ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "an autofinishExp tag should have a data line U or C followed by a line with 1 number on it (the experiment id) followed by a line with just 3 numbers on it--the start and end of the high quality portion of the read to be made but high quality part of read start must be < high quality part of read end: %s", (char*) aTagDataLines[ nTagDataLine ].data() ); // handle case of one version of beta autofinish in which there was // no offset number pAutoFinishExp->nUnpaddedOffsetOfBeginningOfReadFromTag_ = 0; if ( !soOffsetOfBeginningOfReadFromTag.isNull() ) { if ( nAToIWithError( (char*) soOffsetOfBeginningOfReadFromTag.data(), pAutoFinishExp->nUnpaddedOffsetOfBeginningOfReadFromTag_ ) != NO_ERROR ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "an autofinishExp tag should have a data line U or C followed by a line with 1 number on it (the experiment id) followed by a line with just 3 numbers on it--the start and end of the high quality portion of the read to be made and an offset of the beginning of the read from the location of the tag but the this offset is not numeric: %s", (char*) soOffsetOfBeginningOfReadFromTag.data() ); } // now parsing line like this: // dyeTerm univ rev // dyeTerm univ fwd // dyeTerm customPrimer GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "expecting line: dyeTerm univ fwd or univ rev or customPrimer of autofinishExp tag" ); oldFormatWithNoPositionsForFlankGapExperiments: char* szChemistry = strtok( szTagLine, " " ); char* szPrimer1 = strtok( NULL, "\n " ); char* szPrimer2 = strtok( NULL, "\n " ); if ( strcmp( szChemistry, "dyeTerm" ) != 0 ) PARSE_PANIC_AUTOFINISH_EXP_TAG( "the chemistry line should be dyeTerm followed by univ fwd, univ rev, or customPrimer but instead started with %s", szChemistry ); if ( strcmp( szPrimer1, "customPrimer" ) == 0 ) pAutoFinishExp->nReadType_ = nWalk; else if ( strcmp( szPrimer1, "univ" ) == 0 ) { if ( strcmp( szPrimer2, "fwd" ) == 0 ) { pAutoFinishExp->nReadType_ = nUniversalForward; } else if ( strcmp( szPrimer2, "rev" ) == 0 ) { pAutoFinishExp->nReadType_ = nUniversalReverse; } else PARSE_PANIC_AUTOFINISH_EXP_TAG( "unrecognized word %s after univ but should be fwd or rev", szPrimer2 ); } else if ( strcmp( szPrimer1, "pcr" ) == 0 && strcmp( szPrimer2, "end" ) == 0 ) { pAutoFinishExp->nReadType_ = nPCREnd; } else { PARSE_PANIC_AUTOFINISH_EXP_TAG( "couldn't recognize words after dyeTerm: %s", szPrimer1 ); } // next line will look like this: // fix cons errors: 184.86 if ( pAutoFinishExp->nPurpose_ == nFixWeakRegion || pAutoFinishExp->nPurpose_ == nExtendContig ) { GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while looking for autofinishExp line with \"fix cons errors:\"" ); RWCTokenizer tok( szTagLine ); soTemp = tok(); soTemp2 = tok(); soTemp3 = tok(); if ( soTemp != "fix" || soTemp2 != "cons" || soTemp3 != "errors:" ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "expecting \"fix cons errors:\" line but found %s", szTagLine ); } soTemp = tok(); if ( !bIsNumericDouble( soTemp, pAutoFinishExp->dErrorsFixed_ ) ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "while trying to convert line which should be of form fix cons errors: 5.2 original cons errors: 6.2 but the fix cons errors number %s wasn't numeric", (char*) soTemp.data() ); } soTemp = tok(); soTemp2 = tok(); soTemp3 = tok(); if ( soTemp == "original" && soTemp2 == "cons" && soTemp3 == "errors:" ) { // so this is a line that has // fix cons errors: 5.2 original cons errors: 6.2 // not just: // fix cons errors: 5.2 // which was used for a couple of beta versions soTemp = tok(); if ( !bIsNumericDouble( soTemp, pAutoFinishExp->dOriginalNumberOfErrors_ ) ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "while trying to convert line which should be of form fix cons errors: 5.2 original cons errors: 6.2 but the original cons errors number %s wasn't numeric", (char*) soTemp.data() ); } } // if ( soTemp == "original" ... else pAutoFinishExp->dOriginalNumberOfErrors_ = -1; } // now look for: // original cons errors: 0 GET_NEXT_TAG_DATA_LINE_OR_PARSE_PANIC( "while looking for line with \"original single subclone bases:\"" ); const int nSSLength = 31; if ( memcmp( szTagLine, "original single subclone bases:", nSSLength ) == 0 ) { if ( !bIsNumericMaybeWithWhitespace( szTagLine + nSSLength, pAutoFinishExp->nOriginalNumberOfSingleSubcloneBases_ ) ) { PARSE_PANIC_AUTOFINISH_EXP_TAG( "while trying to convert line which should be of form \"original single subclone bases: 5\" but the number %s was not numeric", szTagLine + nSSLength ); } } else { pAutoFinishExp->nOriginalNumberOfSingleSubcloneBases_ = -1; PUT_BACK_TAG_DATA_LINE; } if ( pAutoFinishExp->nReadType_ == nWalk ) { getPrimerOnNextLine( pAutoFinishExp ); } // template: djs74_1304 getTemplates( pAutoFinishExp ); pContig->addConsensusTag( pAutoFinishExp ); } #define OLIGO_INFO_LINE_FORM( szMessage ) \ "Line should be of form " szMessage static void getOligoInfo( FILE* pAceFile, RWCString& soName, RWCString& soBases, int& nMeltingTemp, bool& bComplementedWithRespectToWayPhrapCreatedContig, RWCString& soTemplates ) { // Oligo tag example: //CT{ //Contig1 oligo consed 1710 1726 000317:091555 //standard.1 ccacagcagacaccca 55 U //djs74_1861 //} // szLine contains the line like: // standard.1 ccacagcagacaccca 55 U RWCTokenizer tok( szLine ); soName = tok(); soBases = tok(); RWCString soMeltingTemp = tok(); RWCString soComplementedWithRespectToWayPhrapCreatedContig = tok(); if ( soName.isNull() || soBases.isNull() || soMeltingTemp.isNull() || soComplementedWithRespectToWayPhrapCreatedContig.isNull() ) { PARSE_PANIC( OLIGO_INFO_LINE_FORM( "but one of the fields is missing" ) ); } if (soComplementedWithRespectToWayPhrapCreatedContig == "U" ) bComplementedWithRespectToWayPhrapCreatedContig = false; else if ( soComplementedWithRespectToWayPhrapCreatedContig == "C" ) bComplementedWithRespectToWayPhrapCreatedContig = true; else PARSE_PANIC( OLIGO_INFO_LINE_FORM( "but is not U or C" ) ); if (nAToIWithError( soMeltingTemp.data(), nMeltingTemp ) != NO_ERROR ) PARSE_PANIC( OLIGO_INFO_LINE_FORM( "but isn't numeric" ) ); FGETS_OR_PARSE_PANIC( "premature end of file while getting oligo tags's purpose" ); if (szLine[0] == '}' ) PARSE_PANIC( "oligo tag templatese line missing--instead found a line beginning with a \'}\'" ); soTemplates = szLine; // remove the trailing "\n" // Note: templates may be null soTemplates = soTemplates.stripWhitespace(); } static int nHighestOligoNumber = 0; static void adjustHighestOligoNumber( const RWCString& soName ) { // look at number on right end of name if ( soName.isNull() ) return; int nPos; for( nPos = soName.length() - 1; nPos > 0 && isdigit( soName[nPos]); --nPos ); // cases to consider: no digits, 0 length, all digits, some digits // if no digits, nPos == ( nLength -1 ) // if all digits, nPos == -1 // if some digits, nPos + 1 is the left-most digit int nLeftmostDigit = nPos + 1; if (nLeftmostDigit == soName.length() ) return; // no digits char* szDigits = soName.data() + nLeftmostDigit; int nNumber = atoi( szDigits ); if (nNumber > nHighestOligoNumber) nHighestOligoNumber = nNumber; } void Assembly :: getReadTagAfterFirstLine( FILE* pAceFile, LocatedFragment* pLocFrag ) { FGETS_OR_PARSE_PANIC( "premature end of file while in the middle of a read tag" ); char* szReadName = strtok( szLine, " " ); char* szTagType = strtok( NULL, " " ); char* szTagSource= strtok( NULL, " " ); char* szReadPosStart = strtok( NULL, " " ); char* szReadPosEnd = strtok( NULL, " " ); char* szDate = strtok( NULL, "\n " ); #define PARSE_PANIC_RT_LINE( szAdditionalMessage ) \ PARSE_PANIC( "RT tag line should be:\n \nwhere the padded read pos's are from the left end " #szAdditionalMessage ) #define PARSE_PANIC_RT_LINE_2_ARGS( szAdditionalMessage, arg1, arg2 ) \ PARSE_PANIC_2_ARGS( "RT tag line should be:\n \nwhere the padded read pos's are from the left end " #szAdditionalMessage, arg1, arg2 ) if (!szDate ) PARSE_PANIC_RT_LINE( but not all of these fields are present ); // if I put in quotes, the quotes are printed out if ( pLocFrag ) { if ( pLocFrag->soGetName() != (const char*) szReadName ) { PARSE_PANIC_2_ARGS( "misplaced read tag in ace file--under read %s but for read %s", (char*) pLocFrag->soGetName().data(), szReadName ); } } else { pLocFrag = pGetLocatedFragmentByName( szReadName ); if ( !pLocFrag ) { PARSE_PANIC_1_ARG( "RT read tag references read %s which cannot be found in the assembly", szReadName ); } } if (! tagTypes::pGetTagTypes()->bIsAValidTagType( szTagType ) ) { PARSE_PANIC_RT_LINE( and tag type must be recognized by consed but tag type is not recognized ); } int nReadPosStart; int nReadPosEnd; if ( nAToIWithError( szReadPosStart, nReadPosStart ) != NO_ERROR ) PARSE_PANIC_RT_LINE( read pos start must be numeric ); if ( nAToIWithError( szReadPosEnd, nReadPosEnd ) != NO_ERROR ) PARSE_PANIC_RT_LINE( read pos end must be numeric ); if ( ! ( ( pLocFrag->nGetStartIndex() <= nReadPosStart ) && ( nReadPosStart <= nReadPosEnd ) && ( nReadPosEnd <= pLocFrag->nGetEndIndex() ) ) ) PARSE_PANIC_RT_LINE_2_ARGS( "and <= (counting *'s) and must be within the reads whose ends are %d and %d where position 1 is always at the left end of the read", pLocFrag->nGetStartIndex(), pLocFrag->nGetEndIndex() ); int nConsPosStart = pLocFrag->nGetConsPosFromFragIndex( nReadPosStart ); int nConsPosEnd = pLocFrag->nGetConsPosFromFragIndex( nReadPosEnd ); strcpy( szTagSourceSaved, szTagSource ); strcpy( szDateSaved, szDate ); strcpy( szTagTypeSaved, szTagType ); FGETS_OR_PARSE_PANIC( "premature end of file while in the middle of a read tag" ); bool bCommentLineAlreadyRead; long lID = nUndefinedTagID; if ( szLine[0] == 'I' && szLine[1] == 'D' && szLine[2] == ':' ) { RWCString soRestOfLine( &szLine[3] ); if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine , lID ) ) { RWCString soMessage = "ID: line in tag not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC_RT_LINE( soMessage ); } ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); bCommentLineAlreadyRead = false; } else bCommentLineAlreadyRead = true; RWCString soComment; getCommentOfRTOrWA( bCommentLineAlreadyRead, pAceFile, soComment ); tag* pTag = new tag( pLocFrag, NULL, szTagTypeSaved, nConsPosStart, nConsPosEnd, false, // bWriteToPhdFileNotAceFile_ soComment, szTagSourceSaved, szDateSaved, false ); // no NoTrans pTag->lID_ = lID; pLocFrag->addTag( pTag ); } static void myParsePanic( const RWCString& soErrorMessage, const int nSourceFileLineNumber, char* szSourceFileName ) { ostringstream ost; ost << "ace file error detected from source file " << szSourceFileName << " at " << nSourceFileLineNumber < \n" #szAdditionalMessage ) #define PARSE_PANIC_TAG_LINE_2_ARGS( szAdditionalMessage, arg1, arg2 ) \ PARSE_PANIC_2_ARGS( "Tag line should be:\n \n" #szAdditionalMessage, arg1, arg2 ) if ( soDate.isNull() ) PARSE_PANIC_TAG_LINE( but not all of these fields are present ); bool bNoTrans = false; if ( !soNoTrans.isNull() ) { if ( soNoTrans == "NoTrans" ) bNoTrans = true; else { PARSE_PANIC_TAG_LINE_2_ARGS( "If tag line has a 7th parameter it should be NoTrans but it instead is %s", szNoTrans, "" ); } } Contig* pContig = NULL; if ( bIsReadNotConsensusTag ) { if ( pLocFrag->soGetName() != soReadNameOrContigName ) { PARSE_PANIC_2_ARGS( "misplaced read tag in ace file--under read %s but for read %s", (char*) pLocFrag->soGetName().data(), soReadNameOrContigName.data() ); } } else { pContig = pGetContigByName( soReadNameOrContigName ); if ( !pContig ) { PARSE_PANIC_1_ARG( "consensus tag for unknown contig: %s", soReadNameOrContigName.data() ); } } // get pTagType at the same time we check if it is a valid tag type // since we will use pTagType later tagType* pTagType = tagTypes::pGetTagTypes()->pGetTagType( soTagType ); if (! pTagType ) { PARSE_PANIC_TAG_LINE( and tag type must be recognized by consed ); } int nConsPosStart; int nConsPosEnd; if ( nAToIWithError( soConsPosStart.data(), nConsPosStart ) != NO_ERROR ) PARSE_PANIC_TAG_LINE( cons pos start must be numeric ); if ( nAToIWithError( soConsPosEnd.data(), nConsPosEnd ) != NO_ERROR ) PARSE_PANIC_TAG_LINE( cons pos end must be numeric ); if ( pContig ) { if ( ! ( pContig->nGetStartConsensusIndex() <= nConsPosStart && nConsPosStart <= nConsPosEnd && nConsPosEnd <= pContig->nGetEndConsensusIndex() ) ) { PARSE_PANIC_TAG_LINE_2_ARGS( "and <= (couting *'s) and both must be within the contig whose ends are %d and %d", pContig->nGetStartConsensusIndex(), pContig->nGetEndConsensusIndex() ); } } else { if ( ! (pLocFrag->nGetAlignStart() <= nConsPosStart && nConsPosStart <= nConsPosEnd && nConsPosEnd <= pLocFrag->nGetAlignEnd() ) ) { PARSE_PANIC_TAG_LINE_2_ARGS( "and <= (counting *'s) and must be within the reads whose ends are %d and %d", pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd() ); } } // RWCString soComment = soGetCommentIfAny(); // if ( soTagType == "comment" && soComment.isNull() ) // soComment = soGetOldStyleCommentForCommentTag(); tag* pTag; RWCString soComment; RWCString soMiscData; if ( soTagType == "oligo" ) { RWCString soName; RWCString soBases; int nMeltingTemp; RWCString soTemplates; bool bComplementedWithRespectToWayPhrapCreatedContig; long lID = nUndefinedTagID; bool bEndOfTag = false; do { FGETS_OR_PARSE_PANIC_3_ARGS( "premature end of file while in a %s tag from %d to %d", soTagType.data(), nConsPosStart, nConsPosEnd ); if ( memcmp( szLine, "COMMENT{", 8 ) == 0 && isspace( szLine[8] ) ) soComment = soGetComment(); else if ( szLine[0] == '}' && isspace( szLine[1] ) ) bEndOfTag = true; else if ( szLine[0] == 'I' && szLine[1] == 'D' && szLine[2] == ':' ) { RWCString soRestOfLine( &szLine[3] ); if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) { RWCString soMessage = "ID: line in tag not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC( soMessage ); } ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); } else getOligoInfo( pAceFile, soName, soBases, nMeltingTemp, bComplementedWithRespectToWayPhrapCreatedContig, soTemplates ); } while( !bEndOfTag ); if (soBases.isNull() ) PARSE_PANIC( "oligo tag did not contain any oligo information" ); adjustHighestOligoNumber( soName ); pTag = new oligoTag( pContig, nConsPosStart, nConsPosEnd, soTagSource, soDate, bNoTrans, soName, soBases, nMeltingTemp, bComplementedWithRespectToWayPhrapCreatedContig, soTemplates, soComment ); pTag->lID_ = lID; } else if ( soTagType == "autoFinishExp" ) { assert( !bIsReadNotConsensusTag ); createAutoFinishExpTag( pAceFile, pContig, nConsPosStart, nConsPosEnd, soTagSource, soDate ); return; } else { // any other kind of tag RWTPtrOrderedVector* pArrayOfUserDefinedTagFields = NULL; if ( pTagType->bIsAUserDefinedTagType() ) { pArrayOfUserDefinedTagFields = new RWTPtrOrderedVector; pArrayOfUserDefinedTagFields->soName_ = "pArrayOfUserDefinedTagFields"; } int nIndexOfNextUserDefinedField = 0; // used by parseUserDefinedTagTypeLine long lID = nUndefinedTagID; bool bEndOfTag = false; do { FGETS_OR_PARSE_PANIC_3_ARGS( "premature end of file while in a %s tag from %d to %d", soTagType.data(), nConsPosStart, nConsPosEnd ); if ( memcmp( szLine, "COMMENT{", 8 ) == 0 && isspace( szLine[8] ) ) soComment = soGetComment(); else if ( szLine[0] == '}' && isspace( szLine[1] ) ) bEndOfTag = true; else if ( szLine[0] == 'I' && szLine[1] == 'D' && szLine[2] == ':' ) { RWCString soRestOfLine( &szLine[3] ); if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) { RWCString soMessage = "ID: line not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC( soMessage ); } ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); } else { if ( pTagType->bIsAUserDefinedTagType() ) { userDefinedTagField* pUserDefinedTagField = NULL; parseUserDefinedTagTypeLine( szLine, (userDefinedTagType*) pTagType, pUserDefinedTagField, nIndexOfNextUserDefinedField, myParsePanic, myFgetsOrParsePanic ); pArrayOfUserDefinedTagFields->insert( pUserDefinedTagField ); } else { soMiscData += szLine; } } } while( !bEndOfTag ); if ( soTagType == "comment" && soComment.isNull() ) { soComment = soMiscData; soMiscData = ""; } if ( pTagType->bIsAUserDefinedTagType() ) { RWCString soErrorMessage; if ( !((userDefinedTagType*) pTagType)->bCorrectNumberOfEachField( pArrayOfUserDefinedTagFields, soErrorMessage ) ) { PARSE_PANIC( soErrorMessage ); } } pTag = new tag( pLocFrag, pContig, soTagType, nConsPosStart, nConsPosEnd, false, // bWriteToPhdFileNotAceFile_ soComment, soTagSource, soDate, bNoTrans ); soMiscData.stripTrailingWhitespaceFast(); pTag->soMiscData_ = soMiscData; pTag->lID_ = lID; if ( pTagType->bIsAUserDefinedTagType() ) { pTag->pUserDefinedTagType_ = (userDefinedTagType*) pTagType; pTag->pArrayOfUserDefinedTagFields_ = pArrayOfUserDefinedTagFields; pTag->pArrayOfUserDefinedTagFields_->soName_ = "tag::pArrayOfUserDefinedTagFields"; } } // if ( soTagType == "oligo" ) { else if (pLocFrag) pLocFrag->addTag( pTag ); else pContig->addConsensusTag( pTag ); } void Assembly :: getSinglets( const RWCString& soBody ) { // if this ace file has been run through mergeAceFiles.perl, // the first line may be // ORIG ACEFILE PREFIX: prefix // as an example. So through that line away. // line is of form: // SN 26 // so start at position 3 and get the number RWCTokenizer tokLines( soBody ); soTemp2 = tokLines('\n'); if ( soTemp2.bStartsWith( "ORIG ACEFILE PREFIX:" ) ) { // just get the next line. soTemp2 = tokLines('\n'); } if ( soTemp2( 0, 3 ) != "SN " ) { PARSE_PANIC_1_ARG( "singlets WA block should start with SN but instead has %s", (char*) soTemp2.data() ); } int nNumberOfSinglets; char* szNumber = (char*) soTemp2.data() + 3; if ( !bIsNumericMaybeWithWhitespace( szNumber, nNumberOfSinglets ) ) { PARSE_PANIC_2_ARGS( "singlets block should start with SN but %s was not numeric in line %s", szNumber, (char*) soTemp2.data() ); } if ( nNumberOfSinglets < 0 ) { PARSE_PANIC_1_ARG( "singlets block had negative number of singlets %d", nNumberOfSinglets ); } if ( nNumberOfSinglets == 0 ) return; aSingletPHDFilenames_.resize( (size_t) nNumberOfSinglets ); aSingletReadNames_.resize( (size_t) nNumberOfSinglets ); int nSinglets = 0; while( !( soTemp2 = tokLines( '\n' )).isNull() ) { ++nSinglets; RWCTokenizer tokWords( soTemp2 ); RWCString soReadName = tokWords(); int nIndex = soTemp2.index( "PHD_FILE: " ); if ( nIndex != RW_NPOS ) { // PHD_FILE: abc.phd.1 // then nAfterPHD_FILE points to the "a" in abc.phd.1 int nAfterPHD_FILE = nIndex + 10; // find the first non space while( nAfterPHD_FILE < soTemp2.length() && soTemp2[ nAfterPHD_FILE ] == ' ') ++nAfterPHD_FILE; int nEndPos = nAfterPHD_FILE + 1; while( nEndPos < soTemp2.length() && soTemp2[ nEndPos ] != ' ' ) ++nEndPos; // so nAfterPHD_FILE is before the end if ( nEndPos - 1 < soTemp2.length() ) { RWCString soPHDFile = soTemp2( nAfterPHD_FILE, nEndPos - nAfterPHD_FILE ); aSingletPHDFilenames_.insert( soPHDFile ); aSingletReadNames_.insert( soReadName ); } } } if ( nSinglets != nNumberOfSinglets ) { PARSE_PANIC_2_ARGS( "In SN block. There were supposed to be %d singlets but got to end of file after only %d singlets", nNumberOfSinglets, nSinglets ); } } // getSinglets void Assembly :: getWAAfterFirstLine( FILE* pAceFile ) { RWCString soTempp( (RWSize_2T) 10 ); FGETS_OR_PARSE_PANIC( "premature end of file in the middle of a WA{" ); char* szWAType = strtok( szLine, " " ); char* szSource = strtok( NULL, " " ); char* szDateTime = strtok( NULL, "\n " ); #define PARSE_PANIC_WA_LINE( szAdditionalMessage ) \ PARSE_PANIC( "WA line should be:\n \n" #szAdditionalMessage ) if (!szDateTime ) PARSE_PANIC_WA_LINE( "but not all these fields are present" ); // before anymore lines are read, we must save szWAType, szSource, // and szDateTime since they are just pointing to locations on // szLine strcpy( szWATypeSaved, szWAType ); strcpy( szTagSourceSaved, szSource ); strcpy( szDateSaved, szDateTime ); getCommentOfRTOrWA( false, // haven't already read a line into szLine pAceFile, soBig ); wholeAssemblyItem* pWA = new wholeAssemblyItem( szWATypeSaved, szTagSourceSaved, szDateSaved, soBig ); addWholeAssemblyItem( pWA ); // now read singlets part of ace file regardless whether autofinish // is running or not (Jan 2000, DG) // if ( consedParameters::pGetConsedParameters()->bAutoFinishRunning_ ) { if ( memcmp( szWATypeSaved, "singlets", 8 ) == 0 ) getSinglets( soBig ); } void Assembly :: getMiscTags( FILE* pAceFile ) { bool bEndOfAceFile = false; do { if ( fgets( szLine, SZLINE_SIZE, pAceFile ) == NULL ) bEndOfAceFile = true; else { ++nCurrentLine; strcpy( szLineSaved, szLine ); try { if (memcmp( szLine, "CT{", 3 ) == 0 ) { getConsensusTagAfterFirstLine( pAceFile, NULL, false ); } else if (memcmp( szLine, "WA{", 3 ) == 0 ) { getWAAfterFirstLine( pAceFile ); } else if (memcmp( szLine, "RT{", 3 ) == 0 ) { getReadTagAfterFirstLine( pAceFile, NULL ); // LocatedFragment } else if ( szLine[0] != '\n' ) { cerr << "unrecognized line: " << szLine << endl; } } catch( InputDataError ide ) { popupErrorMessage( ide.szGetDesc() ); } } } while( !bEndOfAceFile ); } void Assembly :: getReadTag( FILE* pAceFile, LocatedFragment* pLocFrag ) { bool bFoundTag = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while expecting RT line" ); if ( memcmp( szLine, szREAD_TAG, nREAD_TAG ) == 0 ) bFoundTag = true; } while( !bFoundTag ); getReadTagAfterFirstLine( pAceFile, pLocFrag ); } static void getWholeReadInfo( FILE* pAceFile, LocatedFragment* pLocFrag ) {} void Assembly :: getContigLine( FILE* pAceFile, int& nNumberOfBases, int& nNumberOfReads, int& nNumberOfBaseSegments, Contig*& pContig ) { bool bFoundContig = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while expecting CO line" ); if ( memcmp( szLine, szCONTIG, nCONTIG ) == 0 ) bFoundContig = true; } while( !bFoundContig ); char* szRestOfLine = szLine + nCONTIG; char* szContigName = strtok( szRestOfLine, " " ); char* szNumberOfBases = strtok( NULL, " " ); char* szNumberOfReads = strtok( NULL, " " ); char* szNumberOfBaseSegments = strtok( NULL, " " ); char* szComplementedFlag = strtok( NULL, "\n " ); #define PARSE_PANIC_CONTIG_LINE( szAdditionalMessage ) \ PARSE_PANIC( "Contig line should be:\nCO \n" #szAdditionalMessage ) if (! szContigName || !szNumberOfBases || !szNumberOfReads || !szNumberOfBaseSegments || !szComplementedFlag ) PARSE_PANIC_CONTIG_LINE( "but there are some missing fields" ); if ( nAToIWithError( szNumberOfBases, nNumberOfBases ) != NO_ERROR ) PARSE_PANIC_CONTIG_LINE( "number of bases is not numeric" ); if ( nAToIWithError( szNumberOfReads, nNumberOfReads ) != NO_ERROR ) PARSE_PANIC_CONTIG_LINE( "number of reads is not numeric" ); if ( nAToIWithError( szNumberOfBaseSegments, nNumberOfBaseSegments ) != NO_ERROR ) PARSE_PANIC_CONTIG_LINE( "number of base segments is not numeric" ); bool bComplementedFromWayPhrapCreated; if ( szComplementedFlag[0] == 'C' && szComplementedFlag[1] == '\0' ) bComplementedFromWayPhrapCreated = true; else if (szComplementedFlag[0] == 'U' && szComplementedFlag[1] == '\0' ) bComplementedFromWayPhrapCreated = false; else { PARSE_PANIC_CONTIG_LINE( "complemented flag must be C or U" ); } pContig = new Contig( szContigName, nNumberOfReads, nNumberOfBaseSegments, bComplementedFromWayPhrapCreated, this ); addContig( pContig ); } static void getContigBases( FILE* pAceFile, Contig* pContig, const int nNumberOfBases ) { if ( nNumberOfBases > nBasesCapacity ) { if (szBases ) free( szBases ); // +1 is to leave room for null terminator szBases = (char*) malloc( sizeof(char) * ( nNumberOfBases + 1 ) ); if (!szBases ) PARSE_PANIC_1_ARG( "could not get enough memory to store consensus bases for contig %s", (char*) pContig->soGetName().data() ); nBasesCapacity = nNumberOfBases; } szBases[0] = '\0'; nBasesCurrentLength = 0; bool bEndOfContigBases = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while reading Contig bases for contig %s", pContig->soGetName().data() ); if ( szLine[0] == '\n' ) bEndOfContigBases = true; else { int nLineLength = strlen( szLine ) - 1; // -1 since final carriage return if (nBasesCurrentLength + nLineLength > nNumberOfBases ) PARSE_PANIC_2_ARGS( "Contig line said %d bases but already finding %d bases", nNumberOfBases, nBasesCurrentLength + nLineLength ); memcpy( szBases + nBasesCurrentLength, szLine, nLineLength ); nBasesCurrentLength += nLineLength; } } while( !bEndOfContigBases ); szBases[ nBasesCurrentLength ] = '\0'; pContig->setSequence3( szBases, nBasesCurrentLength ); } // getContigBases // when this returns, nNextPaddedConsPos will be advanced to 1 after the // last non-pad whose quality was set. That is, 1 more than the // base corresponding to the last quality on this line. static void processBaseQualityLine( char* szLine, Contig* pContig, int& nNextPaddedConsPos, const int nMaxPaddedConsPos ) { // find 1st whitespace char* pCurrent; if (szLine[0] == ' ') pCurrent = szLine + 1; else pCurrent = szLine; // -1 to eliminate carriage return char* pPastEnd = szLine + strlen( szLine ) - 1; while( pCurrent < pPastEnd ) { int nQuality = 0; while( ( *pCurrent != ' ' ) && ( pCurrent < pPastEnd ) ) { if (! isdigit( *pCurrent ) ) PARSE_PANIC( "non-digit and non-space on a base quality line" ); nQuality = 10 * nQuality + (*pCurrent - '0' ); ++pCurrent; } // exited because hit a space or advanced past end // usually exited because hit a space so prepare for next base quality // on this line ++pCurrent; // Which base do we assign this quality to? Move past any pads. while( pContig->ntGetConsUnsafe( nNextPaddedConsPos).bIsPad() ) ++nNextPaddedConsPos; if ( nNextPaddedConsPos > nMaxPaddedConsPos ) PARSE_PANIC_1_ARG( "There must be exactly the same number of quality values as unpadded consensus bases for contig %s", (char*) pContig->soGetName().data() ); pContig->setQualityAtSeqPos( nNextPaddedConsPos, nQuality ); ++nNextPaddedConsPos; } } // processBaseQualityLine static void getContigConsensusBaseQualities( FILE* pAceFile, Contig* pContig ) { int nMaxPaddedConsPos = pContig->nGetEndConsensusIndex(); bool bFoundBaseQuality = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while expecting BaseQuality line" ); if ( memcmp( szLine, szBASEQUALITY, nBASEQUALITY ) == 0 ) bFoundBaseQuality = true; } while( !bFoundBaseQuality ); int nNextPaddedContigPos = 1; bool bEndBaseQuality = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while reading BaseQuality lines" ); if ( szLine[0] == '\n' ) bEndBaseQuality = true; else processBaseQualityLine( szLine, pContig, nNextPaddedContigPos, nMaxPaddedConsPos ); } while( !bEndBaseQuality ); // check that all unpadded bases have been assigned a quality if ( nNextPaddedContigPos - 1 < nMaxPaddedConsPos ) { // remaining bases then should all be pads // If not, there weren't enough base quality values. for( int nPadded = nNextPaddedContigPos; nPadded <= nMaxPaddedConsPos; ++nPadded ) { if ( ! pContig->ntGetConsUnsafe( nPadded ).bIsPad() ) PARSE_PANIC_1_ARG( "There are not enough BaseQuality values for all non-pad consensus bases for contig %s", (char*) pContig->soGetName().data() ); } } } static void getAssembledFromBlock( FILE* pAceFile, Contig* pContig, const int nNumberOfReads ) { for( int nRead = 0; nRead < nNumberOfReads; ++nRead ) { bool bFoundAssembledFromLine = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while looking for AF lines for contig %s", (char*) pContig->soGetName().data() ); if (memcmp( szLine, szASSEMBLED_FROM, nASSEMBLED_FROM ) == 0 ) bFoundAssembledFromLine = true; } while( !bFoundAssembledFromLine ); char* pEndOfLine = szLine + nASSEMBLED_FROM; char* szReadName = strtok( pEndOfLine, " " ); char* szComplemented = strtok( NULL, " " ); char* pAlignStart = strtok( NULL, "\n " ); bool bComplemented; if ( szComplemented[0] == 'C' ) bComplemented = true; else if (szComplemented[0] == 'U' ) bComplemented = false; else PARSE_PANIC( "line should be of form AF but C or U wasn't there" ); int nAlignStart; if ( nAToIWithError( pAlignStart, nAlignStart ) != NO_ERROR ) PARSE_PANIC( "line should be of form AF but padded align start wasn't numeric" ); LocatedFragment* pLocFrag = new LocatedFragment( szReadName, nAlignStart, bComplemented, pContig ); pContig->addLocatedFragment( pLocFrag ); } // for (int nRead... } static void processBaseSegmentLine( char* szLine, Contig* pContig ) { char* pCurrent = szLine + nBASE_SEGMENT; // char* pPastLastChar = pCurrent + strlen( pCurrent ); // skip past blanks while( *pCurrent == ' ' ) ++pCurrent; // parse base segment start position int nBaseSegStart = 0; do { if (! isdigit( *pCurrent ) ) PARSE_PANIC( "Should be BS but wasn't numeric" ); nBaseSegStart = 10*nBaseSegStart + (*pCurrent - '0' ); ++pCurrent; } while( *pCurrent != ' ' ); // skip over that blank ++pCurrent; // if there are any other blanks, skip over them also while( *pCurrent == ' ' ) ++pCurrent; // parse base segment end position int nBaseSegEnd = 0; do { if (! isdigit( *pCurrent ) ) PARSE_PANIC( "Should be BS but wasn't numeric" ); nBaseSegEnd = 10*nBaseSegEnd + (*pCurrent - '0' ); ++pCurrent; } while( *pCurrent != ' ' ); if ( nBaseSegStart > nBaseSegEnd ) PARSE_PANIC( "Should be BS and should be <= " ); // skip over that blank ++pCurrent; // skip over any other whitespace while( *pCurrent == ' ' ) ++pCurrent; // for efficiency, do my own tokenizing here for the read name char* szReadName = pCurrent; // find end of read name do { ++pCurrent; } while( *pCurrent != ' ' && *pCurrent != '\n' && *pCurrent != '\t' ); *pCurrent = '\0'; (void) bRemoveCompFromReadName( szReadName ); LocatedFragment* pLocFrag = pContig->pLocFragGetByName( szReadName ); if (!pLocFrag ) PARSE_PANIC( "BS line refers to a read that isn't in a AF line for this contig" ); BaseSegment* pBaseSegment = new BaseSegment( pLocFrag, nBaseSegStart, nBaseSegEnd ); pContig->baseSegArray_.addPtrToNewBaseSeg( pBaseSegment ); } // void processBaseSegmentLine( char* szLine ) { static void getBaseSegments( FILE* pAceFile, Contig* pContig, const int nNumberOfBaseSegments ) { pContig->baseSegArray_.resize( nNumberOfBaseSegments ); bool bFoundBaseSegment; for( int nBaseSegment = 0; nBaseSegment < nNumberOfBaseSegments; ++nBaseSegment ) { // This construction is to allow future additions of other fields bFoundBaseSegment = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while looking for %d BS lines", nNumberOfBaseSegments ); if ( memcmp( szLine, szBASE_SEGMENT, nBASE_SEGMENT ) == 0 ) { processBaseSegmentLine( szLine, pContig ); bFoundBaseSegment = true; } } while( !bFoundBaseSegment ); } int nBadPosition; if ( !pContig->baseSegArray_.bIsBaseSegArraySorted( nBadPosition ) ) { PARSE_PANIC_2_ARGS( "Base segments for contig %s must be sorted either in forward order or backwards order. See base seg %d from beginning", pContig->soGetName(), nBadPosition ); } } static void getReadBases( FILE* pAceFile, Contig* pContig, LocatedFragment*& pLocFrag, int& nNumberOfWholeReadInfoItems, int& nNumberOfReadTags ) { #define READ_LINE_FORM( szMessage ) \ "Line should be of form RD <# of bases> <# of whole read infos> <# of read tags> " # szMessage do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while looking for read for contig %s", (char*) pContig->soGetName().data() ); } while( memcmp( szLine, szREAD, nREAD ) != 0 ); char* szAfterReadKeyword = szLine + nREAD; char* szReadName = strtok( szAfterReadKeyword, " " ); char* szNumberOfBases = strtok( NULL, " " ); char* szNumberOfWholeReadInfoItems = strtok( NULL, " " ); char* szNumberOfReadTags = strtok( NULL, "\n " ); // bRemoveCompFromReadName( szReadName ); int nNumberOfBases; if ( nAToIWithError( szNumberOfBases, nNumberOfBases ) != NO_ERROR ) PARSE_PANIC( READ_LINE_FORM( "but number of bases isn't numeric" ) ); if (nAToIWithError( szNumberOfWholeReadInfoItems, nNumberOfWholeReadInfoItems ) != NO_ERROR ) PARSE_PANIC( READ_LINE_FORM( "but # of whole read infos is not numeric" ) ); if (nAToIWithError( szNumberOfReadTags, nNumberOfReadTags ) != NO_ERROR ) PARSE_PANIC( READ_LINE_FORM( "but # of read tags is not numeric" ) ); // find pLocFrag object for this read pLocFrag = pContig->pLocFragGetByName( szReadName ); if (!pLocFrag ) PARSE_PANIC_1_ARG( "RD line refers to a read that isn't in a AF line for contig %s", (char*) pContig->soGetName().data() ); if ( nNumberOfBases > nBasesCapacity ) { if ( szBases ) free( szBases ); szBases = (char*) malloc( nNumberOfBases + 1 ); if (!szBases) PARSE_PANIC_1_ARG( "Can't get enough memory for read %s", szReadName ); nBasesCapacity = nNumberOfBases; } szBases[0] = '\0'; nBasesCurrentLength = 0; bool bEndOfReadBases = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while read read base for read %s", (char*) pLocFrag->soGetName().data() ); if (szLine[0] == '\n' ) bEndOfReadBases = true; else { int nLineLength = strlen( szLine ) - 1; // -1 since final carriage return if (nBasesCurrentLength + nLineLength > nNumberOfBases ) PARSE_PANIC_3_ARGS( "Read line said %d bases but already finding %d bases for read %s", nNumberOfBases, nBasesCurrentLength + nLineLength, (char*) pLocFrag->soGetName().data() ); memcpy( szBases + nBasesCurrentLength, szLine, nLineLength ); nBasesCurrentLength += nLineLength; } } while( !bEndOfReadBases ); szBases[ nBasesCurrentLength ] = '\0'; pLocFrag->setSequence3( szBases, nBasesCurrentLength ); } // getReadBases static void getReadQualAndAlignClipping( FILE* pAceFile, LocatedFragment* pLocFrag ) { #define QA_LINE_FORM( szMessage ) \ "Line should be of form QA " # szMessage bool bFoundQualAndAlignClipping = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while expecting QA line for read", pLocFrag->soGetName().data() ); if ( memcmp( szLine, szQUAL_AND_ALIGN_CLIPPING, nQUAL_AND_ALIGN_CLIPPING ) == 0 ) bFoundQualAndAlignClipping = true; } while( !bFoundQualAndAlignClipping ); char* szCurrent = szLine + nQUAL_AND_ALIGN_CLIPPING; char* szQualClippingStart = strtok( szCurrent, " " ); char* szQualClippingEnd = strtok( NULL, " " ); char* szAlignClippingStart = strtok( NULL, " " ); char* szAlignClippingEnd = strtok( NULL, " \n" ); int nQualClippingStart; if (nAToIWithError( szQualClippingStart, nQualClippingStart ) != NO_ERROR ) PARSE_PANIC( QA_LINE_FORM( "but qual clipping start is not numeric" ) ); int nQualClippingEnd; if (nAToIWithError( szQualClippingEnd, nQualClippingEnd ) != NO_ERROR ) PARSE_PANIC( QA_LINE_FORM( "but qual clipping end is not numeric" ) ); int nAlignClippingStart; if (nAToIWithError( szAlignClippingStart, nAlignClippingStart ) != NO_ERROR ) PARSE_PANIC( QA_LINE_FORM( "but align clipping start is not numeric" ) ); int nAlignClippingEnd; if (nAToIWithError( szAlignClippingEnd, nAlignClippingEnd ) != NO_ERROR ) PARSE_PANIC( QA_LINE_FORM( "but align clipping end is not numeric" ) ); if ( nQualClippingStart == -1 && nQualClippingEnd == -1 ) pLocFrag->setWholeReadIsLowQuality(); else { if ( ! ( ( nQualClippingStart <= nQualClippingEnd ) && ( pLocFrag->nGetStartIndex() <= nQualClippingStart ) && ( nQualClippingEnd <= pLocFrag->nGetEndIndex() ) ) ) PARSE_PANIC_2_ARGS( "Line should be of form QA but pLocFrag->nGetStartIndex() = %d and pLocFrag->nGetEndIndex() = %d and qual clipping start and end are not within the bounds of the read and in the correct order", pLocFrag->nGetStartIndex(), pLocFrag->nGetEndIndex() ); } if ( nAlignClippingStart == -1 && nAlignClippingEnd == -1 ) pLocFrag->setWholeReadIsUnaligned(); else { if ( ! ( ( nAlignClippingStart <= nAlignClippingEnd ) && ( pLocFrag->nGetStartIndex() <= nAlignClippingStart ) && ( nAlignClippingEnd <= pLocFrag->nGetEndIndex() ) ) ) PARSE_PANIC( QA_LINE_FORM( "but the align clipping start and end are not within the bounds of the read and in the correct order" ) ); } int nQualClippingStartConsPos; int nQualClippingEndConsPos; if ( pLocFrag->bIsWholeReadLowQuality() ) { nQualClippingStartConsPos = -1; nQualClippingEndConsPos = -1; } else { nQualClippingStartConsPos = pLocFrag->nGetConsPosFromFragIndex( nQualClippingStart ); nQualClippingEndConsPos = pLocFrag->nGetConsPosFromFragIndex( nQualClippingEnd ); } int nAlignClippingStartConsPos; int nAlignClippingEndConsPos; if ( pLocFrag->bIsWholeReadUnaligned() ) { nAlignClippingStartConsPos = -1; nAlignClippingEndConsPos = -1; } else { nAlignClippingStartConsPos = pLocFrag->nGetConsPosFromFragIndex( nAlignClippingStart ); nAlignClippingEndConsPos = pLocFrag->nGetConsPosFromFragIndex( nAlignClippingEnd ); } pLocFrag->setQualAndAlignClipping( nQualClippingStartConsPos, nQualClippingEndConsPos, nAlignClippingStartConsPos, nAlignClippingEndConsPos ); } static void getReadDescription( FILE* pAceFile, LocatedFragment* pLocFrag ) { bool bFoundDescription = false; do { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while expecting DS line for read", pLocFrag->soGetName().data() ); if ( memcmp( szLine, szDESCRIPTION, nDESCRIPTION ) == 0 ) bFoundDescription = true; } while( !bFoundDescription ); char* szCurrent = szLine + nDESCRIPTION; char* szKeyword = strtok( szCurrent, " " ); bool bFoundChromatFilename = false; bool bFoundPHDFilename = false; bool bFoundPHDTimestamp = false; bool bFoundVersion = false; do { if ( memcmp( szKeyword, szCHROMAT_FILE, nCHROMAT_FILE ) == 0 ) { char* szChromatFilename = strtok( NULL, " " ); if (!szChromatFilename ) PARSE_PANIC( "CHROMAT_FILE: must be followed by the chromatogram filename" ); pLocFrag->setChromatFilename( szChromatFilename ); bFoundChromatFilename = true; } else if ( memcmp( szKeyword, szPHD_FILE, nPHD_FILE ) == 0 ) { char* szPHDFilename = strtok( NULL, " " ); if (! szPHDFilename ) PARSE_PANIC( "PHD_FILE: must be followed by the phd filename" ); pLocFrag->setPHDFilename( szPHDFilename ); bFoundPHDFilename = true; } else if ( memcmp( szKeyword, szTIME, nTIME ) == 0 ) { char* szDayOfWeek = strtok( NULL, " " ); char* szMonth = strtok( NULL, " " ); char* szDayOfMonth = strtok( NULL, " " ); char* szHHMMSS = strtok( NULL, " " ); char* szYYYY = strtok( NULL, "\n " ); if (! szYYYY ) PARSE_PANIC( "TIME: must be followed by something like this:\n Thu Mar 19 18:38:28 2008" ); // sometimes the timezone is put here like this: // Sat Nov 3 01:19:20 PDT 2007 // In such a case, ignore the PDT and put the 2007 into the year if ( !bIsNumeric( szYYYY ) ) { szYYYY = strtok( NULL, "\n " ); if ( ! szYYYY ) { PARSE_PANIC( "TIME: must be followed by something like this:\n Thu Mar 19 18:38:28 2008" ); } } char szPHDTimestamp[200]; char* szOneBlank = " "; strcpyEfficient( szPHDTimestamp, szDayOfWeek, szCurrent ); strcpyEfficient( szCurrent, szOneBlank, szCurrent ); strcpyEfficient( szCurrent, szMonth, szCurrent ); strcpyEfficient( szCurrent, szOneBlank, szCurrent ); strcpyEfficient( szCurrent, szDayOfMonth, szCurrent ); strcpyEfficient( szCurrent, szOneBlank, szCurrent ); strcpyEfficient( szCurrent, szHHMMSS, szCurrent ); strcpyEfficient( szCurrent, szOneBlank, szCurrent ); strcpyEfficient( szCurrent, szYYYY, szCurrent ); pLocFrag->setPHDTimestamp( szPHDTimestamp ); bFoundPHDTimestamp = true; } // we are not paying attention to chemistry in the ace file-- // we will use the phd file to set the chemistry. // else if ( memcmp( szKeyword, szCHEM, nCHEM ) == 0 ) { // char* szChemistry = strtok( NULL, " " ); // if ( memcmp( szChemistry, szSOLEXA, nSOLEXA ) == 0 ) { // pLocFrag->bIsSolexaRead_ = true; // } // } else if ( memcmp( szKeyword, szVERSION, nVERSION ) == 0 ) { char* szVersion = strtok( NULL, " " ); if ( !bIsNumeric( szVersion ) ) { PARSE_PANIC( "VERSION: must be followed by a number but isn't" ); } pLocFrag->nVersion_ = atoi( szVersion ); bFoundVersion = true; } szKeyword = strtok( NULL, " \n" ); } while( szKeyword ); // we just put the solexa file in the PHD_FILE field, but don't // keep it there in consed, since it will cause problems if we edit // this read and try to write it out. #define DESCRIPTION_ERROR( szMessage ) \ #szMessage " on this DS line. This may be missing because you didn't use phredPhrap and phd2fasta (which is called by phredPhrap). These programs were downloaded with consed so you have them" // we'll need to adjust these for 454 reads // I think even Sanger reads no longer need to have phd files since they // could be in the phd ball and the version could be in nVersion_. // if (!bFoundChromatFilename && ConsEd::pGetConsEd()->bUsingPhdFiles_ && // !pLocFrag->bIsSolexaRead_ ) // PARSE_PANIC( // DESCRIPTION_ERROR( "CHROMAT_FILE: not found" ) // ); // do not do these checks for assemblies that do not have phd files/balls if ( ConsEd::pGetConsEd()->bUsingPhdFiles_ ) { if (!bFoundPHDFilename && !bFoundVersion ) PARSE_PANIC( DESCRIPTION_ERROR( "neither VERSION: nor PHD_FILE: not found" ) ); if ( !bFoundVersion && bFoundPHDFilename ) { pLocFrag->nVersion_ = pLocFrag->filGetPHDFilename().nGetVersion(); } // the version should be set by now. assert( pLocFrag->nVersion_ != 0 ); if (!bFoundPHDTimestamp ) PARSE_PANIC( DESCRIPTION_ERROR( "TIME: not found" ) ); } } // getReadDescription bool Assembly :: bGetAssemblyLine( FILE* pAceFile, int& nNumberOfContigs, int& nNumberOfReads ) { FGETS_OR_PARSE_PANIC_1_ARG( "premature end of file while expecting AS line for assembly %s", szAceFilenameStatic ); if ( memcmp( szLine, szASSEMBLY, nASSEMBLY ) != 0 ) return( true ); char* szRestOfLine = szLine + nASSEMBLY; char* szNumberOfContigs = strtok( szRestOfLine, " " ); char* szNumberOfReads = strtok( NULL, "\n " ); #define ASSEMBLY_LINE_FORM( szMessage ) \ "Line should be of form AS " # szMessage if ( nAToIWithError( szNumberOfContigs, nNumberOfContigs ) != NO_ERROR ) PARSE_PANIC( ASSEMBLY_LINE_FORM( "but number of contigs was not numeric" ) ); if ( nAToIWithError( szNumberOfReads, nNumberOfReads ) != NO_ERROR ) PARSE_PANIC( ASSEMBLY_LINE_FORM( "but number of reads was not numeric" ) ); setNumberOfReadsInAssembly( nNumberOfReads ); return( false ); } static void setConstantStringLengths() { nASSEMBLY = strlen( szASSEMBLY ); nCONTIG = strlen( szCONTIG ); nASSEMBLED_FROM = strlen( szASSEMBLED_FROM ); nREAD = strlen( szREAD ); nBASEQUALITY = strlen( szBASEQUALITY ); nBASE_SEGMENT = strlen( szBASE_SEGMENT ); nDESCRIPTION = strlen( szDESCRIPTION ); nCOMP = strlen( szCOMP ); nCHROMAT_FILE = strlen( szCHROMAT_FILE ); nPHD_FILE = strlen( szPHD_FILE ); nVERSION = strlen( szVERSION ); nTIME = strlen( szTIME ); nREAD_TAG = strlen( szREAD_TAG ); nQUAL_AND_ALIGN_CLIPPING = strlen( szQUAL_AND_ALIGN_CLIPPING ); nCONSENSUS_TAG = strlen( szCONSENSUS_TAG ); nOLIGO_TAG = strlen( szOLIGO_TAG ); nCOMMENT = strlen( szCOMMENT ); nCHEM = strlen( szCHEM ); nSOLEXA = strlen( szSOLEXA ); } void Assembly :: parseAceFile( const char* szAceFilename, bool& bPerhapsAceFileFormat1 ) { nHighestOligoNumber = 0; setConstantStringLengths(); szAceFilenameStatic = (char*) szAceFilename; bAlreadyReadLine = false; pAceFile = fopen( szAceFilename, "r" ); if (! pAceFile ) { ostringstream ost; ost << "Unable to open file " << szAceFilename << endl << ends; SysRequestFailed srf(ost.str().c_str()); srf.includeErrnoDescription(); throw srf; } int nNumberOfContigs; int nNumberOfReadsInAssembly; bPerhapsAceFileFormat1 = bGetAssemblyLine( pAceFile, nNumberOfContigs, nNumberOfReadsInAssembly ); if (bPerhapsAceFileFormat1 ) { fclose( pAceFile ); return; } // already read the top line, which is line 1 nCurrentLine = 1; int nReadsProcessedSoFar = 0; for( int nContig = 0; nContig < nNumberOfContigs; ++nContig ) { int nNumberOfBases; int nNumberOfReads; int nNumberOfBaseSegments; Contig* pContig; getContigLine( pAceFile, nNumberOfBases, nNumberOfReads, nNumberOfBaseSegments, pContig ); if ( nNumberOfReads > 1000000 || nNumberOfBaseSegments > 1000000 || nNumberOfBases > 1000000 ) { printf( "big contig %s--please wait...\n", pContig->soGetName().data() ); } getContigBases( pAceFile, pContig, nNumberOfBases ); getContigConsensusBaseQualities( pAceFile, pContig ); getAssembledFromBlock( pAceFile, pContig, nNumberOfReads ); getBaseSegments( pAceFile, pContig, nNumberOfBaseSegments ); for( int nRead = 0; nRead < nNumberOfReads; ++nRead ) { LocatedFragment* pLocFrag; int nNumberOfReadTags; int nNumberOfWholeReadInfoItems; getReadBases( pAceFile, pContig, pLocFrag, nNumberOfWholeReadInfoItems, nNumberOfReadTags ); getReadQualAndAlignClipping( pAceFile, pLocFrag ); getReadDescription( pAceFile, pLocFrag ); for( int nReadTag = 0; nReadTag < nNumberOfReadTags; ++nReadTag ) getReadTag( pAceFile, pLocFrag ); for( int nWholeReadInfo = 0; nWholeReadInfo < nNumberOfWholeReadInfoItems; ++nWholeReadInfo ) getWholeReadInfo( pAceFile, pLocFrag ); const int nPrintFirstSoManyReads = 20; ++nReadsProcessedSoFar; if ( ( nReadsProcessedSoFar % 50 == 0 ) || ( nReadsProcessedSoFar < nPrintFirstSoManyReads ) ) { if ( ( nReadsProcessedSoFar < nPrintFirstSoManyReads ) || ( nReadsProcessedSoFar % 1000 == 0 && ( nReadsProcessedSoFar < 10000 ) ) || ( ( nReadsProcessedSoFar % 10000 == 0 ) && ( nReadsProcessedSoFar < 100000 ) ) || ( nReadsProcessedSoFar % 100000 == 0 ) ) { int nPerCentDone = ( nReadsProcessedSoFar * 100.0 ) / ( 2.0 * nNumberOfReadsInAssembly ); printf( "%d%% done. %s reads read so far...\n", nPerCentDone, soAddCommas( nReadsProcessedSoFar ).data() ); fflush( stdout ); } } } } getMiscTags( pAceFile ); fclose( pAceFile ); setLastUsedOligoNumber( nHighestOligoNumber ); } // void Assembly :: parseAceFile( const char* szAceFilename, bool& bPerhapsAceFileFormat1 ) { void Assembly :: parseConsensusTagsInAceFileFormat1( const char* szAceFilename, const long lPositionOfConsensusTags ) { setConstantStringLengths(); szAceFilenameStatic = (char*) szAceFilename; FILE* pAceFile = fopen( szAceFilename, "r" ); if (! pAceFile ) { ostringstream ost; ost << "Unable to open file " << szAceFilename << endl << ends; SysRequestFailed srf(ost.str().c_str()); srf.includeErrnoDescription(); throw srf; } fseek( pAceFile, lPositionOfConsensusTags, SEEK_SET ); getMiscTags( pAceFile ); fclose( pAceFile ); setLastUsedOligoNumber( nHighestOligoNumber ); }