/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "rwcstring.h" #include using namespace std; #include #include #include "rwcregexp.h" #include "rwctokenizer.h" #include #include #include "locatedFragment.h" #include "cComplementBase.h" #include "readPHD.h" #include "consed.h" #include "bIsNumeric.h" #include "tagTypes.h" #include "tag.h" #include "soGetRestOfLine.h" #include "bIsNumericMaybeWithWhitespace.h" #include "bIsNumericLongMaybeWithWhitespace.h" #include "wholeReadItem.h" #include "consedParameters.h" #include "nLine.h" #include "readWholeReadItem.h" #include "soAddCommas.h" #include "phdBall2Fasta.h" #define PARSE_PANIC(message) \ { ostringstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << nLine << ": \n" \ << szLine << endl \ << message << endl << ends; \ InputDataError ide(ost.str().c_str()); \ throw ide; } #define PARSE_PANIC2( file, nLine, szMessage, szLine ) \ { ostringstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << \ nLine << " detected from source file " << \ __FILE__ << " at line " << __LINE__ << endl << \ szTemp << endl << "strlen( szLine ) = " << strlen( szLine ) << "\n" << \ szLine << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; } #define PARSE_PANIC_1_ARG( szMessage, arg1 ) { \ char szTemp[1000]; \ sprintf( szTemp, szMessage, arg1 ); \ \ ostringstream ost; \ ost << "Problem reading file of phd files " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << \ nLine << " detected from source file " << \ __FILE__ << " at line " << __LINE__ << endl << \ szTemp << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; } #define FGETS_OR_PARSE_PANIC( szErrorMessage ) \ if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) \ PARSE_PANIC( szErrorMessage ); \ ++nLine; \ soLine = szLine; static bool bCalculatedLengths = false; static char* szBEGIN_SEQUENCE = "BEGIN_SEQUENCE"; static int nBEGIN_SEQUENCE; static char* szBEGIN_COMMENT = "BEGIN_COMMENT"; static int nBEGIN_COMMENT; static char* szABI_THUMBPRINT = "ABI_THUMBPRINT:"; static int nABI_THUMBPRINT; static char* szPHRED_VERSION = "PHRED_VERSION:"; static int nPHRED_VERSION; static char* szCALL_METHOD = "CALL_METHOD:"; static int nCALL_METHOD; static char* szQUALITY_LEVELS = "QUALITY_LEVELS:"; static int nQUALITY_LEVELS; static char* szTIME = "TIME:"; static int nTIME; static char* szTRACE_ARRAY_MIN_INDEX = "TRACE_ARRAY_MIN_INDEX:"; static int nTRACE_ARRAY_MIN_INDEX; static char* szTRACE_ARRAY_MAX_INDEX = "TRACE_ARRAY_MAX_INDEX:"; static int nTRACE_ARRAY_MAX_INDEX; static char* szEND_COMMENT = "END_COMMENT"; static int nEND_COMMENT; static char* szBEGIN_DNA = "BEGIN_DNA"; static int nBEGIN_DNA; static char* szEND_DNA = "END_DNA"; static int nEND_DNA; static bool bFoundNonBlank; static bool bFoundBlank; static int nPos; static int nQuality; static int nPeakPosition; const int nMaxLineSize = 300; static char szLine[ nMaxLineSize + 1]; static RWCString soLine; static int* pPaddedFromUnpaddedArray; static int nPaddedFromUnpaddedArraySize = 0; static int nNumberOfUnpaddedBasesStatic = 0; static bool bReadComplemented = false; static void setPaddedFromUnpaddedArray( const int nNumberOfUnpaddedBases, LocatedFragment* pLocFrag ) { nNumberOfUnpaddedBasesStatic = nNumberOfUnpaddedBases; bReadComplemented = pLocFrag->bComp(); if ( ( nNumberOfUnpaddedBases + 10 ) > nPaddedFromUnpaddedArraySize ) { if ( pPaddedFromUnpaddedArray ) free( pPaddedFromUnpaddedArray ); pPaddedFromUnpaddedArray = (int*) malloc( sizeof( int) * ( nNumberOfUnpaddedBases + 10 ) ); if ( !pPaddedFromUnpaddedArray ) { PARSE_PANIC_2_ARGS( "could not get enough memory %d int's for read %s\n", nNumberOfUnpaddedBases, (char*) pLocFrag->soGetName().data() ); } nPaddedFromUnpaddedArraySize = nNumberOfUnpaddedBases + 10; } int nUnpadded = 0; for( int nPaddedSeqPos = pLocFrag->nGetStartIndex(); nPaddedSeqPos <= pLocFrag->nGetEndIndex(); ++nPaddedSeqPos ) { if ( !pLocFrag->ntideGet( nPaddedSeqPos ).bIsPad() ) { ++nUnpadded; pPaddedFromUnpaddedArray[ nUnpadded ] = nPaddedSeqPos; } } } static inline int nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( LocatedFragment* pLocFrag, int nUnpaddedPos ) { if ( bReadComplemented ) nUnpaddedPos = nNumberOfUnpaddedBasesStatic + 1 - nUnpaddedPos; // nUnpaddedPos is now the unpadded sequence position in the // left-to-right direction // now we have the unpadded position from left to right // convert to padded coordinates assert( 1 <= nUnpaddedPos && nUnpaddedPos <= nNumberOfUnpaddedBasesStatic ); int nPaddedLeftToRightSequencePos = pPaddedFromUnpaddedArray[ nUnpaddedPos ] ; return( pLocFrag->nGetConsPosFromFragIndex( nPaddedLeftToRightSequencePos ) ); } static void readTag( FILE* pFil, const FileName& filPHD, LocatedFragment* pLocFrag, const int nUnpaddedBases, const bool bUnpaddedTagCoordinates ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for TYPE:" ); } ++nLine; RWCTokenizer tokTagType( szLine ); if ( tokTagType() != "TYPE:" ) { PARSE_PANIC2( filPHD, nLine, "no type after TYPE:", szLine ); } // current line contains something like // TYPE: phrap RWCString soTagType = tokTagType(); // The code below takes just as long (54 seconds) as the code above // if ( szLine[0] != 'T' || // szLine[1] != 'Y' || // szLine[2] != 'P' || // szLine[3] != 'E' || // szLine[4] != ':' ) // PARSE_PANIC2( filPHD, nLine, // "no type after TYPE:", // szLine ); // int n = 5; // while( szLine[n] == ' ') // ++n; // RWCString soTagType; // while( szLine[n] != '\n' && szLine[n] != '\0' ) { // soTagType.append( szLine[n] ); // ++n; // } // RWCTokenizer tokType( szLine ); // if ( tokType() != "TYPE:" ) // PARSE_PANIC2( filPHD, nLine, // "no type after TYPE:", // szLine ); // RWCString soTagType = tokType(); if (! tagTypes::pGetTagTypes()->bIsAValidTagType( soTagType ) ) PARSE_PANIC2( filPHD, nLine, "unrecognized tag type after TYPE:", szLine ); if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for SOURCE:" ); } ++nLine; // this takes 56 seconds for the same project as opposed to 53 or 54 for // the RWCTokenizer method below // if ( szLine[0] != 'S' || // szLine[1] != 'O' || // szLine[2] != 'U' || // szLine[3] != 'R' || // szLine[4] != 'C' || // szLine[5] != 'E' || // szLine[6] != ':' ) { // PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s", // szLine ); // } // // skip over spaces // n = 7; // while( szLine[n] == ' ' || szLine[n] == '\t' ) // ++n; // RWCString soSource; // while( szLine[n] != '\n' ) { // soSource.append( szLine[n] ); // ++n; // } RWCTokenizer tokSource( szLine ); if ( tokSource() != "SOURCE:" ) { PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s", szLine ); } RWCString soSource = tokSource(); // Note: the above is *faster* than below. Don't ask me why. // If you don't believe it, try it yourself. The difference // was 49 seconds for below and 42 seconds above reproducibly. // do { // if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { // PARSE_PANIC( filPHD, nLine, // "premature end of file while looking for SOURCE:" ); // } // ++nLine; // } while( ! ( szLine[0] == 'S' && // szLine[1] == 'O' && // szLine[2] == 'U' && // szLine[3] == 'R' && // szLine[4] == 'C' && // szLine[5] == 'E' && // szLine[6] == ':' && // szLine[7] == ' ' ) ); // char* szSource = strtok( szLine + 8, "\n " ); // RWCString soSource; // int n = 8; // while( szLine[n] != '\n' ) { // soSource.append( szLine[n] ); // ++n; // } // now allowing any source // // if ( // (soSource != "phrap") && // (soSource != "crossmatch" ) && // (soSource != "consed" ) && // (soSource != "polyPhred" ) && // (soSource != "other" ) // ) { // PARSE_PANIC2( filPHD, nLine, "SOURCE for tag must be phrap, crossmatch, consed, polyPhred, or other", // szLine ); // } if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for UNPADDED READ POS:" ); } ++nLine; RWCTokenizer tokPos( szLine ); if ( tokPos() != "UNPADDED_READ_POS:" ) { PARSE_PANIC_1_ARG( "tag should have UNPADDED_READ_POS: on line following SOURCE: line but instead has %s", szLine ); } RWCString soPosStart = tokPos(); RWCString soPosEnd = tokPos(); if (soPosStart.isNull() || soPosEnd.isNull() ) PARSE_PANIC2( filPHD, nLine, "missing start of end position for tag", szLine ); if ( !bIsNumeric( soPosStart ) || !bIsNumeric( soPosEnd ) ) PARSE_PANIC2( filPHD, nLine, "non-numeric start or end position for tag", szLine ); int nUnpaddedStartPos = atoi( (char*) soPosStart.data() ); int nUnpaddedEndPos = atoi( (char*) soPosEnd.data() ); if ( bUnpaddedTagCoordinates ) { // when I used this, I had kept the bases in the pSeqPhredCalledBases_ if ( ! ( ( pLocFrag->pSeqPhredCalledBases_->nGetStartIndex() <= nUnpaddedStartPos ) && ( nUnpaddedStartPos <= nUnpaddedEndPos ) && ( nUnpaddedEndPos <= pLocFrag->pSeqPhredCalledBases_->nGetEndIndex() ) ) ) { char szTemp[200]; sprintf( szTemp, "tag must have %d <= start position <= end position <= %d since the tag must be within the bounds of the read", pLocFrag->pSeqPhredCalledBases_->nGetStartIndex(), pLocFrag->pSeqPhredCalledBases_->nGetEndIndex() ); PARSE_PANIC( szTemp ); } } else { if ( ! ( ( pLocFrag->nGetUnpaddedStartIndex() <= nUnpaddedStartPos ) && ( nUnpaddedStartPos <= nUnpaddedEndPos ) && ( nUnpaddedEndPos <= nUnpaddedBases ) ) ) { char szTemp[200]; sprintf( szTemp, "tag must have %d <= start position <= end position <= %d since the tag must be within the bounds of the read", pLocFrag->nGetUnpaddedStartIndex(), nUnpaddedBases ); PARSE_PANIC( szTemp ); } } if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for DATE:" ); } ++nLine; RWCTokenizer tokDate( szLine ); if ( tokDate() != "DATE:" ) { PARSE_PANIC_1_ARG( "UNPADDED_READ_POS: line should be followed by DATE: line but instead has %s", szLine ); } RWCString soDatePart = tokDate(); RWCString soTimePart = tokDate(); if ( (soDatePart.length() != 8 ) || (soTimePart.length() != 8) ) PARSE_PANIC2( filPHD, nLine, "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", szLine ); if ( ! ( ( soDatePart[2] == '/' ) && ( soDatePart[5] == '/' ) && ( soTimePart[2] == ':' ) && ( soTimePart[5] == ':' ) ) ) PARSE_PANIC2( filPHD, nLine, "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", szLine ); #define bIsNumber( myChar ) ( ( '0' <= myChar && myChar <= '9' ) ? true : false ) if ( ! ( bIsNumber( soDatePart[0] ) && bIsNumber( soDatePart[1] ) && bIsNumber( soDatePart[3] ) && bIsNumber( soDatePart[4] ) && bIsNumber( soDatePart[6] ) && bIsNumber( soDatePart[7] ) && bIsNumber( soTimePart[0] ) && bIsNumber( soTimePart[1] ) && bIsNumber( soTimePart[3] ) && bIsNumber( soTimePart[4] ) && bIsNumber( soTimePart[6] ) && bIsNumber( soTimePart[7] ) ) ) { PARSE_PANIC_2_ARGS( "date and time must be of form YY/MM/DD HH:MI:SS but they were not numeric. Instead found %s %s", (char*) soDatePart.data(), (char*) soTimePart.data() ); } RWCString soDate = soDatePart + " " + soTimePart; // RWCString soTemp = // soDatePart(0, 2 ) + // soDatePart( 3, 2 ) + // soTimePart( 0, 2 ) + // soTimePart( 3, 2 ); // if (! bIsNumeric (soTemp ) ) // PARSE_PANIC2( filPHD, nLine, // "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", // szLine ); FGETS_OR_PARSE_PANIC( "premature end of file while in tag" ); long lID = nUndefinedTagID; if ( soLine.bStartsWith( "ID:" ) ) { RWCString soRestOfLine = soLine.soGetRestOfString( 3 ); // bIsNumericLongMaybeWithWhitespace will check that there // is really a number there if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) { RWCString soMessage = "ID: line not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC( soMessage ); } ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); // if reached here, we must have successfully read the ID. // So must read another line FGETS_OR_PARSE_PANIC( "premature end of file in a tag after reading a tag ID" ); } RWCString soComment; if ( soLine.length() >= 13 && soLine( 0, 13 ) == "BEGIN_COMMENT" ) { bool bFoundEndComment = false; while( !bFoundEndComment ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT in comment tag" ); } ++nLine; soLine = szLine; if (soLine == "END_COMMENT\n" ) bFoundEndComment = true; else soComment += soLine; } // now strip off the final \n off of the comment--this is a delimiter, // not part of the comment soComment = soComment.stripWhitespace( RWCString::TRAILING ); // set up so that soLine contains the next line FGETS_OR_PARSE_PANIC( "premature end of file while in tag and just completed reading comment" ); } // if soLine.length() >= 13 && soLine( 0, 13 ) ... RWCString soMiscData; while( !( soLine.length() > 7 && soLine( 0, 7 ) == "END_TAG" ) ) { FGETS_OR_PARSE_PANIC( "premature end of file while looking for END_TAG" ); soMiscData += soLine; } int nConsPosStart; int nConsPosEnd; if ( bUnpaddedTagCoordinates ) { nConsPosStart = nUnpaddedStartPos; nConsPosEnd = nUnpaddedEndPos; } else { int nPaddedConsPos1 = nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag, nUnpaddedStartPos ); int nPaddedConsPos2 = nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag, nUnpaddedEndPos ); nConsPosStart = MIN( nPaddedConsPos1, nPaddedConsPos2 ); nConsPosEnd = MAX( nPaddedConsPos1, nPaddedConsPos2 ); } tag* pTag = new tag( pLocFrag, NULL, // contig soTagType, nConsPosStart, nConsPosEnd, true, // bWriteToPhdFileNotAceFile_ soComment, soSource, soDate, false ); // no NoTrans pTag->bCoordinatesAreUnpaddedRead_ = bUnpaddedTagCoordinates; if ( !soMiscData.isNull() ) { pTag->soMiscData_ = soMiscData; } pTag->lID_ = lID; pLocFrag->aTags_.append( pTag ); } // readTag // This cannot be used for added bases since it assumes that the // LocatedFragment read is already populated with bases and we are // just using this to fill in pSeqPhredCalledBases_. Even then, // it doesn't set the size of pSeqPhredCalledBases_ static void processCommentAndDnaForOriginalBaseCalls( LocatedFragment* pLocFrag, FILE* pFil ) { pLocFrag->bAlreadyReadPhredCallsInPhdBall_ = true; pLocFrag->pSeqPhredCalledBases_ = new Sequence(); // looking for BEGIN_COMMENT bool bFoundComment = false; while( !bFoundComment ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" ); } else ++nLine; if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) bFoundComment = true; } // if reached here, found the BEGIN_COMMENT section // All we want out of the comment section is the TraceArrayMin and Max // All the rest will be read from the higher version phd file. bool bFoundTraceArrayMax = false; bool bFoundCommentEnd = false; while( ! bFoundCommentEnd ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } else ++nLine; if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, nTRACE_ARRAY_MIN_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMinIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX ); if ( !bIsNumeric( soTraceArrayMinIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" ); } pLocFrag->pSeqPhredCalledBases_->nTraceArrayMinIndex_ = atoi( soTraceArrayMinIndex.data() ); } else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, nTRACE_ARRAY_MAX_INDEX ) == 0 ) { RWCString soTraceArrayMaxIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX ); if ( !bIsNumeric( soTraceArrayMaxIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" ); } pLocFrag->pSeqPhredCalledBases_->nTraceArrayMaxIndex_ = atoi( soTraceArrayMaxIndex.data() ); bFoundTraceArrayMax = true; } } if ( pLocFrag->bHasChromat() ) { pLocFrag->pSeqPhredCalledBases_->createPointPosArray( bFoundTraceArrayMax, pLocFrag->pSeqPhredCalledBases_->nTraceArrayMaxIndex_, pLocFrag->length(), // this might be longer than we need, but // won't be shorter than we need false ); // false sets current length of array to zero } // looking for BEGIN_DNA bool bFound = false; while( !bFound ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" ); } else ++nLine; if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) bFound = true; } // prevent zillions of resizes of pSeqPhredCalledBases_ // DG Aug 2008 pLocFrag->pSeqPhredCalledBases_->resize( pLocFrag->length() ); char cBase; bool bHasChromat = pLocFrag->bHasChromat(); bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { ++nLine; } if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } else { // parse the line cBase = szLine[0]; // convert to lowercase if (! ( 'a' <= cBase && cBase <= 'z' ) ) { if ( 'A' <= cBase && cBase <= 'Z' ) { cBase = cBase + 'a' - 'A'; } else { // not a base--not capital and not lowercase PARSE_PANIC_1_ARG( "expected letter in position 0 of this line but instead found \"%c\"", szLine[0] ); } } // advance over 1st whitespace if ( szLine[1] != ' ' ) { PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" ); } // any more whitespace to skip over? nPos = 2; if ( szLine[2] == ' ' || szLine[2] == '\0' ) { while( szLine[nPos] == ' ' ) { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" ); } } // if reached here, szLine[nPos] is not a space and not a null // advance to next whitespace, accumlating quality on the way nQuality = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nQuality = nQuality*10 + ( szLine[nPos] - '0' ); ++nPos; } pLocFrag->pSeqPhredCalledBases_->append( Ntide( cBase, nQuality ) ); // no need to set peak position if there is no chromat if ( !bHasChromat ) continue; // when exit loop above (and have chromat), the following // character must be a space if ( szLine[nPos] != ' ' ) { PARSE_PANIC_2_ARGS( "something wrong after quality on this line. Should be a space but instead is \"%c\" at pos %d", szLine[nPos], nPos ); } // now advance to next non-whitespace to find the peak position ++nPos; if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) { while( szLine[nPos] == ' ') { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should have 3 tokens but has only 2" ); } } // at this point szLine[nPos] is not a space // It should be the peak position. nPeakPosition = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' ); ++nPos; } if ( szLine[nPos] != '\n' ) { PARSE_PANIC( "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position." ); } pLocFrag->pSeqPhredCalledBases_->appendPointPos( nPeakPosition ); } } // while( !bEndDNA ) // what if the read is complemented? Must reverse the order of the // bases and complement them. if ( pLocFrag->bComp() ) { pLocFrag->pSeqPhredCalledBases_->complementSequenceAfterReadingPHDFile( bFoundTraceArrayMax ); } } static void processCommentAndDna( LocatedFragment* pLocFrag, FILE* pFil, int& nNumberOfUnpaddedBases ) { pLocFrag->bAlreadyReadPhdFile_ = true; // looking for BEGIN_COMMENT bool bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" ); } else ++nLine; if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) bFound = true; } // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:, pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = false; bool bFoundTime = false; bool bFoundCommentEnd = false; while( ! bFoundCommentEnd ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } else ++nLine; if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) { RWCString soLine( szLine ); RWCTokenizer tokNext( soLine ); RWCString soNextToken = tokNext(); assert( soNextToken == "TIME:" ); RWCString soDayOfWeek = tokNext(); RWCString soMonth = tokNext(); RWCString soDayOfMonth = tokNext(); RWCString soTime = tokNext(); RWCString soYear = tokNext(); if ( !bIsNumeric( soYear ) ) { soYear = tokNext(); } RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth + " " + soDayOfMonth + " " + soTime + " " + soYear; if ( soTimestampFromPHDFile != pLocFrag->soGetTimestamp() && !pCP->bAllowTimestampMismatch_ ) { ostringstream ost; ost << "phdball timestamp mismatch: ace file says " << pLocFrag->soGetTimestamp() << " but phdball " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " says " << soTimestampFromPHDFile << " for read " << pLocFrag->soGetName() << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } bFoundTime = true; } else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) { pLocFrag->soABIThumbprint_ = soGetRestOfLine( szLine, nABI_THUMBPRINT ); } else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) { pLocFrag->soPhredVersion_ = soGetRestOfLine( szLine, nPHRED_VERSION ); } else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) { pLocFrag->soCallMethod_ = soGetRestOfLine( szLine, nCALL_METHOD ); } else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) { RWCString soQualityLevels = soGetRestOfLine( szLine, nQUALITY_LEVELS ); if (! bIsNumeric( soQualityLevels ) ) { PARSE_PANIC( "quality levels not numeric" ); } pLocFrag->nQualityLevels_ = atoi( (char*) soQualityLevels.data() ); } else if ( strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, nTRACE_ARRAY_MIN_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMinIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX ); if ( !bIsNumeric( soTraceArrayMinIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" ); } pLocFrag->nTraceArrayMinIndex_ = atoi( soTraceArrayMinIndex.data() ); } else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, nTRACE_ARRAY_MAX_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMaxIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX ); if ( !bIsNumeric( soTraceArrayMaxIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" ); } pLocFrag->nTraceArrayMaxIndex_ = atoi( soTraceArrayMaxIndex.data() ); pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true; } else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) { int nLength = strlen( szLine ); // make sure that there is at least 2 chars after CHEM: // which allows for a space and a non-space if ( nLength < 7 ) PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" ); // start with character after CHEM: which is at position 5 pLocFrag->soChemistry_ = soGetRestOfLine( szLine, 5 ); } else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) { int nLength = strlen( szLine ); if ( nLength < 6 ) PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" ); pLocFrag->soDye_ = soGetRestOfLine( szLine, 4 ); } else { // ALLOW other lines in the comment section of the phd file } } if ( !bFoundTime ) { PARSE_PANIC( "did not find TIME: in the BEGIN_COMMENT/END_COMMENT section of the phd block" ); } if ( pLocFrag->bHasChromat() ) { pLocFrag->createPointPosArray( pLocFrag->bPhdFileHasTraceArrayMinAndMax_, pLocFrag->nTraceArrayMaxIndex_, pLocFrag->length(), true ); // bFillUpArray } // now looking for BEGIN_DNA bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" ); } else ++nLine; if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) bFound = true; } int nPaddedSequencePos; // find the first non-pad base in the direction of sequencing. To // do this, start at the right end (for complemented reads) or left end // for uncomplemented reads. if ( pLocFrag->bComp() ) { nPaddedSequencePos = pLocFrag->nGetEndIndex(); if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() && nPaddedSequencePos > pLocFrag->nGetStartIndex() ) --nPaddedSequencePos; if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { fprintf( stderr, "warning: read %s is entirely pads\n", (char*) pLocFrag->soGetName().data() ); fclose( pFil ); return; } } // prepare for 1st advancing: ++nPaddedSequencePos; } else { nPaddedSequencePos = pLocFrag->nGetStartIndex(); if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() && nPaddedSequencePos < pLocFrag->nGetEndIndex() ) ++nPaddedSequencePos; if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { fprintf( stderr, "warning: read %s is entirely pads\n", (char*) pLocFrag->soGetName().data() ); fclose( pFil ); return; } } // prepare for 1st advancing --nPaddedSequencePos; } bool bHasChromat = pLocFrag->bHasChromat(); // now reading the data until get END_DNA char cTemp; int nBases = 0; bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { ++nLine; } if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } else { ++nBases; // all of the code below is just to move nPaddedSequencePos to // the next non-pad--the base that corresponds to the next base // in the phd file if ( pLocFrag->bComp() ) { --nPaddedSequencePos; if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd for complemented read. At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { do { --nPaddedSequencePos; if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for complemented read. (2) At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ); } } else { ++nPaddedSequencePos; if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read. At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad () ) { // must further change nPaddedSequencePos to next non-pad do { ++nPaddedSequencePos; if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read. (2) At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ); // the loop above will be exited only a non-pad (or an // exception is thrown) } } // all of the above was just to change nPaddedSequencePos cTemp = szLine[0]; if (! ( 'a' <= cTemp && cTemp <= 'z' ) ) { if ( 'A' <= cTemp && cTemp <= 'Z' ) { szLine[0] = szLine[0] + ( 'a' - 'A' ); } else { char szTemp[200]; sprintf( szTemp, "expected letter in position 0 of this line but instead found \"%s\"", szLine ); PARSE_PANIC( szTemp ); } } // sanity check that the phd file and the ace file agree char cAceBase = pLocFrag->ntideGet( nPaddedSequencePos ).cGetBase(); char cPHDBase; if ( pLocFrag->bComp() ) cPHDBase = cComplementBase( szLine[0] ); else cPHDBase = szLine[0]; if ( cPHDBase != cAceBase ) { // eliminate changes that crossmatch may have introduced // in to the acefile bases if ( cAceBase != 'x' && szLine[0] != 'x' && cAceBase != 'n' ) { ostringstream ost; ost << "ace file says base at nPaddedSequencePos " << nPaddedSequencePos << " or nBases " << nBases << " is " << cAceBase << " but PHD file says " << szLine[0] << " which on top strand is: " << cPHDBase << " for read " << pLocFrag->soGetName() << " at line " << nLine << " in file of phd files " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << endl << "How could this have happened? Somehow the ace file was created from a different phdball that was replaced by this more recent one." << endl << __FILE__ << ":" << __LINE__ << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } } // advance over 1st whitespace if ( szLine[1] != ' ' ) { PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" ); } nPos = 2; if ( szLine[2] == ' ' || szLine[2] == '\0' ) { while( szLine[nPos] == ' ' ) { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" ); } } // if reached here, szLine[nPos] is not a space and not a null // advance to next whitespace, accumlating quality on the way nQuality = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nQuality = nQuality*10 + ( szLine[nPos] - '0' ); ++nPos; } // if a base is cross_matched out, then make its quality 0 // since it should be dark--should not attract attention // changed Dec 1999 since we need quality values for autofinish // to get the error profile of existing reads // if ( cAceBase == 'x' ) // nQuality = 0; pLocFrag->setQualityAtSeqPos( nPaddedSequencePos, nQuality ); // if there is no chromat, no need to set peak position if ( !bHasChromat ) continue; // when exit loop above (and have chromat), the following // character must be a space if ( szLine[nPos] != ' ' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } // now advance to next non-whitespace to find the peak position ++nPos; if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) { while( szLine[nPos] == ' ') { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } } // advance to next whitespace (actually a carriage return), // accumlating peak position on the way nPeakPosition = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' ); ++nPos; } if ( szLine[nPos] != '\n' ) { char szTemp[1000]; sprintf( szTemp, "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position. Line is:\n\"%s\"", szLine ); PARSE_PANIC( szTemp ); } // if flipping read left and right, must do this also // for the peak position so it lines up with the flipped trace if ( pLocFrag->bComp() ) { if ( pLocFrag->bPhdFileHasTraceArrayMinAndMax_ ) nPeakPosition = pLocFrag->nTraceArrayMaxIndex_ - nPeakPosition; else nPeakPosition = -nPeakPosition; } pLocFrag->setTracePointPosAtSeqPos( nPaddedSequencePos, nPeakPosition ); } // if ( strncmp( szEND_DNA, szLine, nEND_DNA ) == 0 ) ... else } // while( !bEndDNA ) { // check that there are the same number of bases in the phd file // as the ace file // Why don't I just check if nBases == pLocFrag->nGetUnpaddedSeqLength() ? // It is because nGetUnpaddedSeqLength() takes a long time so I use // the more efficient method below. if ( pLocFrag->bComp() ) { if ( nPaddedSequencePos != pLocFrag->nGetStartIndex() ) { if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) { PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } } else { if ( nPaddedSequencePos != pLocFrag->nGetEndIndex() ) { if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) { PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } } // this was inserted to fix a bug with polyphred in which // tags applied to the .1 phd file may be out of range if the user then // deletes bases with subsequent versions of the phd files. Since // the .1 phd file is still read, the tags in it may be out of range. // Thus the solution is to not read the tags in the .1 phd file if it // is just being read for the phred calls. bool bAssignPeakPositions = bHasChromat; pLocFrag->assignQualityAndPeakPositionsToPads( bAssignPeakPositions ); nNumberOfUnpaddedBases = nBases; } // processCommentAndDna static void processMiscReadStuff( LocatedFragment* pLocFrag, FILE* pFil, const long lNumberOfUnpaddedBases, char* szLine, bool& bEndOfFile, bool& bFoundBEGIN_SEQUENCE, const bool bUnpaddedTagCoordinates ) { // doesn't return until one of these is set to true bEndOfFile = false; bFoundBEGIN_SEQUENCE = false; bool bSetPaddedFromUnpaddedArray = false; while( fgets( szLine, nMaxLineSize, pFil) != NULL ) { ++nLine; if ( strcmp( szLine, "BEGIN_TAG\n" ) == 0 ) { if ( !bUnpaddedTagCoordinates && !bSetPaddedFromUnpaddedArray ) { setPaddedFromUnpaddedArray( lNumberOfUnpaddedBases, pLocFrag ); bSetPaddedFromUnpaddedArray = true; } readTag( pFil, pCP->filFileOfPhdFilesCurrentlyBeingRead_, pLocFrag, lNumberOfUnpaddedBases, bUnpaddedTagCoordinates ); } else if ( strncmp( szLine, "WR{", 3 ) == 0 ) { wholeReadItem* pWR; readWholeReadItem( pFil, pCP->filFileOfPhdFilesCurrentlyBeingRead_, pLocFrag, pWR ); pWR->appendThyself(); } else if ( memcmp( szLine, "BEGIN_SEQUENCE", 14 ) == 0 ) { bFoundBEGIN_SEQUENCE = true; return; } } bEndOfFile = true; } // processMiscReadStuff void Assembly :: readFileOfPhdFiles( const FileName& filFileOfPhdFiles ) { pCP->filFileOfPhdFilesCurrentlyBeingRead_ = filFileOfPhdFiles; // moved this to readPHDFilesAndSetQualitiesAndPeakPositions // if ( !pCP->filFileOfPhdFiles_.bFileByThisNameExists() ) return; if (! bCalculatedLengths ) { bCalculatedLengths = true; nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE ); nBEGIN_COMMENT = strlen( szBEGIN_COMMENT ); nBEGIN_DNA = strlen( szBEGIN_DNA ); nTIME = strlen( szTIME ); nEND_DNA = strlen( szEND_DNA ); nABI_THUMBPRINT = strlen( szABI_THUMBPRINT ); nPHRED_VERSION = strlen( szPHRED_VERSION ); nCALL_METHOD = strlen( szCALL_METHOD ); nQUALITY_LEVELS = strlen( szQUALITY_LEVELS ); nEND_COMMENT = strlen( szEND_COMMENT ); nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX ); nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX ); } // this should leave any subdirectory of ../phdball_dir but should // remove ../phdball_dir/ RWCString soPhdBallWithoutDirectory = filFileOfPhdFiles; if ( soPhdBallWithoutDirectory.bStartsWithAndRemove( pCP->filPhdBallDirectory_ ) ) { soPhdBallWithoutDirectory.bStartsWithAndRemove( "/" ); } FILE *pFil; nLine = 0; cerr << "opening " << filFileOfPhdFiles << endl; pFil = fopen( (char*) filFileOfPhdFiles.data(), "r" ); // ifsPHDFile.open( filPHD, ios:: in ); if (pFil == NULL ) { ostringstream ost; ost << "could not open phd ball file " << filFileOfPhdFiles << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } int nNumberOfReadsInAssembly = nGetNumberOfReadsInAssembly(); // look for each read in turn, until the end of the file int nNumberOfPhdFilesInFile = 0; bool bEndOfFile = false; bool bFoundBEGIN_SEQUENCE = false; RWCString soReadName; RWCString soVersion; int nNumberOfUnpaddedBases; LocatedFragment* pLocFrag; RWCTokenizer tok(""); int nVersion; look_for_BEGIN_SEQUENCE: // looking for BEGIN_SEQUENCE if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) goto end_of_file; ++nLine; if ( memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) != 0 ) goto look_for_BEGIN_SEQUENCE; found_BEGIN_SEQUENCE: ++nNumberOfPhdFilesInFile; // moved this above ++nNumberOfPhdFiles so // that the pCP->nPhdFilesRead_ is equal to // nNumberOfReadsInAssembly in the case in which // all read are in phd.ball if ( ( nNumberOfPhdFilesInFile < 10 ) || ( ( nNumberOfPhdFilesInFile < 10000 ) && ( nNumberOfPhdFilesInFile % 1000 == 0 ) ) || ( ( nNumberOfPhdFilesInFile < 100000 ) && ( nNumberOfPhdFilesInFile % 10000 == 0 ) ) || ( nNumberOfPhdFilesInFile % 100000 == 0 ) ) { cerr << "read phd files in " << filFileOfPhdFiles << " found: " << soAddCommas( nNumberOfPhdFilesInFile ) << " totals: used: " << soAddCommas( pCP->nPhdFilesRead_ ) << " need: " << soAddCommas( nNumberOfReadsInAssembly ) << endl; cerr.flush(); } tok.soStringToBeTokenized_ = szLine; tok.nAfterLastToken_ = -1; assert( tok() == "BEGIN_SEQUENCE" ); soReadName = tok(); soVersion = tok(); pLocFrag = pGetLocatedFragmentByName( soReadName ); if ( !pLocFrag ) goto look_for_BEGIN_SEQUENCE; nVersion = -666; if ( soVersion.bIsNull() ) nVersion = 1; else if ( !bIsNumeric( soVersion ) ) { RWCString soError = filFileOfPhdFiles + " is corrupted since line " + szLine + " has bad version"; THROW_ERROR( soError ); } else { nVersion = atoi( soVersion.data() ); } if ( ( nVersion != pLocFrag->nVersion_ ) && ( nVersion != 1 ) ) { // for example, if the current version is .3 and this version is // .2, we aren't interested goto look_for_BEGIN_SEQUENCE; } // if reached here, nVersion == 1 or else nVersion == pLocFrg->nVersion_ // or both ++pCP->nPhdFilesRead_; if ( nVersion == pLocFrag->nVersion_ ) { try { processCommentAndDna( pLocFrag, pFil, nNumberOfUnpaddedBases ); } catch( ExceptionBase eb ) { // cerr << eb.szGetDesc() << " so will read phd file from ../phd_dir and see if it does not have this problem" << endl; // force reading of phd file pLocFrag->bAlreadyReadPhdFile_ = false; // since probably some indeterminate place in the phd.ball, find next // read goto look_for_BEGIN_SEQUENCE; } // now process tags and WR items and look for next BEGIN_SEQUENCE const bool bUnpaddedTagCoordinates = false; processMiscReadStuff( pLocFrag, pFil, nNumberOfUnpaddedBases, szLine, bEndOfFile, bFoundBEGIN_SEQUENCE, bUnpaddedTagCoordinates ); pLocFrag->filPhdBall_ = soPhdBallWithoutDirectory; if ( bEndOfFile ) goto end_of_file; if ( bFoundBEGIN_SEQUENCE ) goto found_BEGIN_SEQUENCE; assert( false ); // should never get here } else { assert( nVersion == 1 ); processCommentAndDnaForOriginalBaseCalls( pLocFrag, pFil ); // no need to look at WR or tags for original base calls // We will stop reading at END_DNA so there is no chance of // finding BEGIN_SEQUENCE or end-of-file goto look_for_BEGIN_SEQUENCE; } end_of_file: cerr << "Number of phd blocks used from " << filFileOfPhdFiles << ": " << soAddCommas( pCP->nPhdFilesRead_ ) << endl; cerr.flush(); fclose( pFil ); } // readFileOfPhdFiles static void processCommentAndDnaForAddedRead( LocatedFragment* pLocFrag, FILE* pFil, long& lNumberOfUnpaddedBases ) { pLocFrag->bAlreadyReadPhdFile_ = true; // looking for BEGIN_COMMENT bool bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" ); } else ++nLine; if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) bFound = true; } // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:, pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = false; bool bFoundTime = false; bool bFoundCommentEnd = false; while( ! bFoundCommentEnd ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } else ++nLine; if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) { RWCString soLine( szLine ); RWCTokenizer tokNext( soLine ); RWCString soNextToken = tokNext(); assert( soNextToken == "TIME:" ); RWCString soDayOfWeek = tokNext(); RWCString soMonth = tokNext(); RWCString soDayOfMonth = tokNext(); RWCString soTime = tokNext(); RWCString soYear = tokNext(); if ( !bIsNumeric( soYear ) ) { soYear = tokNext(); } RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth + " " + soDayOfMonth + " " + soTime + " " + soYear; // no timestamp checking since the read is not in the ace file pLocFrag->setPHDTimestamp( soTimestampFromPHDFile ); bFoundTime = true; } else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) { pLocFrag->soABIThumbprint_ = soGetRestOfLine( szLine, nABI_THUMBPRINT ); } else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) { pLocFrag->soPhredVersion_ = soGetRestOfLine( szLine, nPHRED_VERSION ); } else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) { pLocFrag->soCallMethod_ = soGetRestOfLine( szLine, nCALL_METHOD ); } else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) { RWCString soQualityLevels = soGetRestOfLine( szLine, nQUALITY_LEVELS ); if (! bIsNumeric( soQualityLevels ) ) { PARSE_PANIC( "quality levels not numeric" ); } pLocFrag->nQualityLevels_ = atoi( (char*) soQualityLevels.data() ); } else if ( strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, nTRACE_ARRAY_MIN_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMinIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX ); if ( !bIsNumeric( soTraceArrayMinIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" ); } pLocFrag->nTraceArrayMinIndex_ = atoi( soTraceArrayMinIndex.data() ); } else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, nTRACE_ARRAY_MAX_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMaxIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX ); if ( !bIsNumeric( soTraceArrayMaxIndex ) ) { PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" ); } pLocFrag->nTraceArrayMaxIndex_ = atoi( soTraceArrayMaxIndex.data() ); pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true; } else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) { int nLength = strlen( szLine ); // make sure that there is at least 2 chars after CHEM: // which allows for a space and a non-space if ( nLength < 7 ) PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" ); // start with character after CHEM: which is at position 5 pLocFrag->soChemistry_ = soGetRestOfLine( szLine, 5 ); } else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) { int nLength = strlen( szLine ); if ( nLength < 6 ) PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" ); pLocFrag->soDye_ = soGetRestOfLine( szLine, 4 ); } else if ( strncmp( "CHROMAT_FILE:", szLine, 13 ) == 0 ) { int nLength = strlen( szLine ); if ( nLength < 14 ) { PARSE_PANIC( "line should have been CHROMAT: (chromat_file)" ); } pLocFrag->filChromatFile_ = soGetRestOfLine( szLine, 14 ); } else { // ALLOW other lines in the comment section of the phd file } } if ( !bFoundTime ) { PARSE_PANIC( "did not find TIME: in the BEGIN_COMMENT/END_COMMENT section of the phd block" ); } // added debugging info April 2009 for Vancouver if ( pLocFrag->pSeqPhredCalledBases_ ) { cerr << "Fatal error. read " << pLocFrag->soGetName() << " has already been read once. Why did this occur? Possibilities include that there are 2 reads with the same name or that this read was already in the ace file" << endl; cerr.flush(); assert( !pLocFrag->pSeqPhredCalledBases_ ); } pLocFrag->pSeqPhredCalledBases_ = new Sequence(); // now looking for BEGIN_DNA bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" ); } else ++nLine; if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) bFound = true; } // how much room should we reserve for the bases? long lBytesPastFirstLine = 0; long lBasesThisRead = 0; bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { // ++nLine; we'll be fseeking back lBytesPastFirstLine += strlen( szLine ); } if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } else { ++lBasesThisRead; } } // now go back to where the file was at the beginning of this block if ( fseek( pFil, -lBytesPastFirstLine, SEEK_CUR ) != 0 ) { RWCString soError( (size_t) 1000 ); sprintf( soError.data(), "fseek error when seeking in phdball back %d bytes %s (%d)\n", (int) lBytesPastFirstLine, strerror( errno ), errno ); } // and allocate enough space lNumberOfUnpaddedBases = lBasesThisRead; assert( pLocFrag->pSeqPhredCalledBases_ ); pLocFrag->pSeqPhredCalledBases_->resize( lNumberOfUnpaddedBases ); bool bHasChromat = pLocFrag->bHasChromat(); if ( bHasChromat ) { pLocFrag->pSeqPhredCalledBases_->createPointPosArray( pLocFrag->bPhdFileHasTraceArrayMinAndMax_, pLocFrag->nTraceArrayMaxIndex_, lNumberOfUnpaddedBases, true ); // bFillUpArray } char cTemp; for( int n1Base = 1; n1Base <= lNumberOfUnpaddedBases; ++n1Base ) { assert( fgets( szLine, nMaxLineSize, pFil) != NULL ); ++nLine; // sometime try sscanf and see if it is faster cTemp = szLine[0]; if (! ( 'a' <= cTemp && cTemp <= 'z' ) ) { if ( 'A' <= cTemp && cTemp <= 'Z' ) { szLine[0] = szLine[0] + ( 'a' - 'A' ); } else { char szTemp[200]; sprintf( szTemp, "expected letter in position 0 of this line but instead found \"%s\"", szLine ); PARSE_PANIC( szTemp ); } } // advance over 1st whitespace if ( szLine[1] != ' ' ) { PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" ); } nPos = 2; if ( szLine[2] == ' ' || szLine[2] == '\0' ) { while( szLine[nPos] == ' ' ) { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" ); } } // if reached here, szLine[nPos] is not a space and not a null // advance to next whitespace, accumlating quality on the way nQuality = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nQuality = nQuality*10 + ( szLine[nPos] - '0' ); ++nPos; } // if a base is cross_matched out, then make its quality 0 // since it should be dark--should not attract attention // changed Dec 1999 since we need quality values for autofinish // to get the error profile of existing reads // if ( cAceBase == 'x' ) // nQuality = 0; pLocFrag->pSeqPhredCalledBases_->insert( Ntide( cTemp, nQuality ) ); // if there is no chromat, no need to set peak position if ( ! bHasChromat ) continue; // when exit loop above (and have chromat), the following // character must be a space if ( szLine[nPos] != ' ' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } // now advance to next non-whitespace to find the peak position ++nPos; if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) { while( szLine[nPos] == ' ') { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } } // advance to next whitespace (actually a carriage return), // accumlating peak position on the way nPeakPosition = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' ); ++nPos; } if ( szLine[nPos] != '\n' ) { char szTemp[1000]; sprintf( szTemp, "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position. Line is:\n\"%s\"", szLine ); PARSE_PANIC( szTemp ); } pLocFrag->pSeqPhredCalledBases_->setTracePointPosAtSeqPos( n1Base, nPeakPosition ); } // for( int n1Base = 1; n1Base <= lNumberOfUnpaddedBases; ++n1Base ) { // if flipping read left and right, must do this also // for the peak position so it lines up with the flipped trace if ( pLocFrag->bComp() ) { pLocFrag->pSeqPhredCalledBases_->complementSequence(); } } // static void processCommentAndDnaForAddedRead( LocatedFragment* pLocFrag, static void backUpAndWritePhdBlock( FILE* pFil, const long lFilePosStartOfDesiredPhdBlock, char* szLine, FILE* pNewPhdBall ) { assert( fseek( pFil, lFilePosStartOfDesiredPhdBlock, SEEK_SET ) == 0 ); bool bFirstLine = true; while( 1 ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { return; } if ( !bFirstLine && memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) == 0 ) return; bFirstLine = false; // if get here, have a line to write fputs( szLine, pNewPhdBall ); } } // no particular reason to be in Assembly object except to make it // consistent with readFileOfPhdFiles (above) and also to give it a // declaration void Assembly :: readFileOfPhdFilesForAddedReads( readsSortedByReadName& aDesiredReads, const FileName& filPhdBall, const bool bCreateNewPhdBall, FILE* pNewPhdBall ) { pCP->filFileOfPhdFilesCurrentlyBeingRead_ = filPhdBall; if ( !bCalculatedLengths ) { bCalculatedLengths = true; nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE ); nBEGIN_COMMENT = strlen( szBEGIN_COMMENT ); nBEGIN_DNA = strlen( szBEGIN_DNA ); nTIME = strlen( szTIME ); nEND_DNA = strlen( szEND_DNA ); nABI_THUMBPRINT = strlen( szABI_THUMBPRINT ); nPHRED_VERSION = strlen( szPHRED_VERSION ); nCALL_METHOD = strlen( szCALL_METHOD ); nQUALITY_LEVELS = strlen( szQUALITY_LEVELS ); nEND_COMMENT = strlen( szEND_COMMENT ); nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX ); nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX ); } RWCString soPhdBallWithoutDirectory; if ( bCreateNewPhdBall ) { soPhdBallWithoutDirectory = pCP->filNewFileOfPhdFilesFullPath_; } else { soPhdBallWithoutDirectory = filPhdBall; // this is the full path } if ( soPhdBallWithoutDirectory.bStartsWithAndRemove( pCP->filPhdBallDirectory_ ) ) { soPhdBallWithoutDirectory.bStartsWithAndRemove( "/" ); } FILE *pFil; nLine = 0; cerr << "opening " << filPhdBall << endl; pFil = fopen( filPhdBall.data(), "r" ); if ( pFil == NULL ) { THROW_FILE_ERROR( filPhdBall ); } // look for each read in turn, until the end of the file int nNumberOfPhdFilesInFile = 0; bool bEndOfFile = false; bool bFoundBEGIN_SEQUENCE = false; RWCString soReadName; RWCString soVersion; long lNumberOfUnpaddedBases; LocatedFragment* pLocFrag; RWCTokenizer tok(""); int nVersion; int nNumberOfReadsRead = 0; int nNumberOfDesiredReadsRead = 0; const bool bUnpaddedTagCoordinates = true; long lFilePosStartOfDesiredPhdBlock; look_for_BEGIN_SEQUENCE: // looking for BEGIN_SEQUENCE if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) goto end_of_file; ++nLine; if ( memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) != 0 ) goto look_for_BEGIN_SEQUENCE; found_BEGIN_SEQUENCE: ++nNumberOfReadsRead; if ( ( nNumberOfReadsRead < 10 ) || ( ( nNumberOfReadsRead < 10000 ) && ( nNumberOfReadsRead % 1000 == 0 ) ) || ( ( nNumberOfReadsRead < 100000 ) && ( nNumberOfReadsRead % 10000 == 0 ) ) || ( nNumberOfReadsRead % 100000 == 0 ) ) { cerr << "read phds in " << filPhdBall << " found: " << soAddCommas( nNumberOfReadsRead ) << " totals: used: " << soAddCommas( nNumberOfDesiredReadsRead ) << endl; } tok.soStringToBeTokenized_ = szLine; tok.nAfterLastToken_ = -1; assert( tok() == "BEGIN_SEQUENCE" ); soReadName = tok(); soVersion = tok(); pLocFrag = aDesiredReads.pFindReadByName( soReadName ); if ( !pLocFrag ) { // we don't want this read. Look for next read in phd ball goto look_for_BEGIN_SEQUENCE; } // if reached here, we want the read if ( bCreateNewPhdBall ) { lFilePosStartOfDesiredPhdBlock = ftell( pFil ); if ( lFilePosStartOfDesiredPhdBlock == -1 ) { RWCString soError = filPhdBall + " ftell error"; THROW_FILE_ERROR( soError ); } // At this point lFilePosStartOfDesiredPhdBlock is after // the line we are interested in. How much do we back up? // The entire strlen( szLine )? Yes, because if we position // the file at that location, the next byte that we read will be the // start of the line we want. lFilePosStartOfDesiredPhdBlock -= strlen( szLine ); } nVersion = -666; if ( soVersion.bIsNull() ) nVersion = 1; else if ( !bIsNumeric( soVersion ) ) { RWCString soError = filPhdBall + " is corrupted since line " + szLine + " has bad version" + soVersion; THROW_ERROR( soError ); } else { nVersion = atoi( soVersion.data() ); } // I don't think we can check for version since the alignment file // has no information about which version was used. In 99.9% of // uses, these reads will be new reads, since they are being aligned. // Thus they can't be edited, so they will be version 1. ++nNumberOfDesiredReadsRead; processCommentAndDnaForAddedRead( pLocFrag, pFil, lNumberOfUnpaddedBases ); processMiscReadStuff( pLocFrag, pFil, lNumberOfUnpaddedBases, szLine, bEndOfFile, bFoundBEGIN_SEQUENCE, bUnpaddedTagCoordinates ); pLocFrag->filPhdBall_ = soPhdBallWithoutDirectory; if ( bCreateNewPhdBall ) { backUpAndWritePhdBlock( pFil, lFilePosStartOfDesiredPhdBlock, szLine, pNewPhdBall ); } if ( bEndOfFile ) goto end_of_file; if ( bFoundBEGIN_SEQUENCE) goto found_BEGIN_SEQUENCE; assert( false ); // should never get here end_of_file: cerr << "Number of reads read: " << nNumberOfReadsRead << " Number of these that are desired: " << nNumberOfDesiredReadsRead << " Number of desired reads: " << aDesiredReads.length() << endl; cerr.flush(); fclose( pFil ); } // readFileOfPhdFilesForAddedReads #define PARSE_ERROR_PHDBALL2FASTA(message) \ { ostringstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <& aQualities, bool& bEndOfFile ) { soBases = ""; aQualities.clear(); soFastaHeader = ""; soFastaQualHeader = ""; char szReadName[1000]; int nVersion; int nTokens = sscanf( szLine_, "BEGIN_SEQUENCE %s %d", szReadName, &nVersion ); if ( nTokens == 1 ) nVersion = 1; else if ( nTokens != 2 ) { PARSE_ERROR_PHDBALL2FASTA( "line should have been of the form BEGIN_SEQUENCE (seq_name) (version)" ); } soFastaHeader += szReadName; soFastaHeader += " VERSION: "; soFastaHeader += RWCString( (long) nVersion ); soFastaQualHeader += szReadName; bool bFoundBeginComment = false; while( bFoundBeginComment ) { if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) { PARSE_ERROR_PHDBALL2FASTA( "premature end of file after BEGIN_SEQUENCE and expecting BEGIN_COMMENT" ); } ++nLine_; if ( memcmp( szBEGIN_COMMENT, szLine_, nBEGIN_COMMENT ) == 0 ) { bFoundBeginComment = true; } } bool bFoundCommentEnd = false; while( !bFoundCommentEnd ) { if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) { PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for END_COMMENT" ); } ++nLine_; if ( strncmp( szEND_COMMENT, szLine_, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTIME, szLine_, nTIME ) == 0 ) { RWCString soLine( szLine_ ); RWCTokenizer tokNext( soLine ); RWCString soNextToken = tokNext(); assert( soNextToken == "TIME:" ); RWCString soDayOfWeek = tokNext(); RWCString soMonth = tokNext(); RWCString soDayOfMonth = tokNext(); RWCString soTime = tokNext(); RWCString soYear = tokNext(); if ( !bIsNumeric( soYear ) ) { soYear = tokNext(); } RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth + " " + soDayOfMonth + " " + soTime + " " + soYear; soFastaHeader += " TIME: "; soFastaHeader += soTimestampFromPHDFile; } else if ( strncmp( "CHEM:", szLine_, 5 ) == 0 ) { soFastaHeader += " "; soFastaHeader += szLine_; soFastaHeader.stripTrailingWhitespaceFast(); } else if ( strncmp( "DYE:", szLine_, 4 ) == 0 ) { soFastaHeader += " "; soFastaHeader += szLine_; soFastaHeader.stripTrailingWhitespaceFast(); } else if ( strncmp( "CHROMAT_FILE:", szLine_, 13 ) == 0 ) { soFastaHeader += " "; soFastaHeader += szLine_; soFastaHeader.stripTrailingWhitespaceFast(); } else { // ALLOW other lines in the comment section of the phd file } } // while( !bFoundCommentEnd ) { // skip over BEGIN_DNA bool bFoundBeginDNA = false; while( !bFoundBeginDNA ) { if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) { PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for BEGIN_DNA" ); } ++nLine_; if ( memcmp( szBEGIN_DNA, szLine_, nBEGIN_DNA ) == 0 ) { bFoundBeginDNA = true; } } int nQuality; char cBase; bool bFoundEndDNA = false; while( !bFoundEndDNA ) { if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) { PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for END_DNA" ); } ++nLine_; if ( szLine_[0] == 'E' && szLine_[1] == 'N' && szLine_[2] == 'D' && szLine_[3] == '_' && szLine_[4] == 'D' && szLine_[5] == 'N' && szLine_[6] == 'A' ) { bFoundEndDNA = true; } else { nTokens = sscanf( szLine_, "%c %d", &cBase, &nQuality ); if ( nTokens != 2 ) { PARSE_ERROR_PHDBALL2FASTA( "line should be of form (base) (quality) but it isn't" ); } if (! isalpha( cBase ) ) { PARSE_ERROR_PHDBALL2FASTA( "line should begin with a base but doesn't" ); } // the 200 is wild, but just to check that we aren't reading // a corrupted file if ( ! ( 0 <= nQuality && nQuality <= 200 ) ) { PARSE_ERROR_PHDBALL2FASTA( "line should be of form (base) (quality) where quality is in the range of 0 to 99 but it isn't" ); } soBases.append( cBase ); aQualities.append( nQuality ); } } bool bContinue = true; while( bContinue ) { // this can end in 2 ways: 1) end of file 2) next read if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) { bEndOfFile = true; return; } ++nLine_; if ( memcmp( szBEGIN_SEQUENCE, szLine_, nBEGIN_SEQUENCE ) == 0 ) { bContinue = false; } else { // look for whole read items if ( memcmp( szLine_, "WR{", 3 ) == 0 ) { // this is in readWholeReadItem.cpp readWholeReadItemForPhdBall2Fasta( pPhdBall, soFastaHeader ); } } } // while( !bContinue ) { // notice that at this point szLine_ contains the BEGIN_SEQUENCE // for the next read }