/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // readPHD.cpp // // gordon sep-95 // #include "rwcstring.h" #include using namespace std; #include #include #include "rwcregexp.h" #include "rwctokenizer.h" #include #include #include "locatedFragment.h" #include "cComplementBase.h" #include "readPHD.h" #include "consed.h" #include "bIsNumeric.h" #include "tagTypes.h" #include "tag.h" #include "soGetRestOfLine.h" #include "bIsNumericMaybeWithWhitespace.h" #include "bIsNumericLongMaybeWithWhitespace.h" #include "wholeReadItem.h" #include "readWholeReadItem.h" #include #include "parseUserDefinedTagTypeLine.h" #define DEFINE_nLine #include "nLine.h" #define PARSE_PANIC( message ) \ { ostringstream ost; \ ost << "Error detected from source file " \ << __FILE__ << " at " << __LINE__ <filGetPHDFilename() << " at line " << \ nLine << " detected from source file " << \ __FILE__ << " at line " << __LINE__ << endl << \ szTemp << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; } #define PARSE_PANIC_1_ARG( szMessage, arg1 ) { \ char szTemp[1000]; \ sprintf( szTemp, szMessage, arg1 ); \ \ ostringstream ost; \ ost << "Problem reading phd file " << filPHD << " at line " << \ nLine << " detected from source file " << \ __FILE__ << " at line " << __LINE__ << endl << \ szTemp << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; } #define FGETS_OR_PARSE_PANIC( szErrorMessage ) \ if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) \ PARSE_PANIC( szErrorMessage ); \ ++nLine; \ soLine = szLine; static bool bCalculatedLengths = false; static char* szBEGIN_SEQUENCE = "BEGIN_SEQUENCE"; static int nBEGIN_SEQUENCE; static char* szBEGIN_COMMENT = "BEGIN_COMMENT"; static int nBEGIN_COMMENT; static char* szABI_THUMBPRINT = "ABI_THUMBPRINT:"; static int nABI_THUMBPRINT; static char* szPHRED_VERSION = "PHRED_VERSION:"; static int nPHRED_VERSION; static char* szCALL_METHOD = "CALL_METHOD:"; static int nCALL_METHOD; static char* szQUALITY_LEVELS = "QUALITY_LEVELS:"; static int nQUALITY_LEVELS; static char* szTIME = "TIME:"; static int nTIME; static char* szTRACE_ARRAY_MIN_INDEX = "TRACE_ARRAY_MIN_INDEX:"; static int nTRACE_ARRAY_MIN_INDEX; static char* szTRACE_ARRAY_MAX_INDEX = "TRACE_ARRAY_MAX_INDEX:"; static int nTRACE_ARRAY_MAX_INDEX; static char* szEND_COMMENT = "END_COMMENT"; static int nEND_COMMENT; static char* szBEGIN_DNA = "BEGIN_DNA"; static int nBEGIN_DNA; static char* szEND_DNA = "END_DNA"; static int nEND_DNA; static bool bFoundNonBlank; static bool bFoundBlank; static int nPos; static int nQuality; static int nPeakPosition; const int nMaxLineSize = 300; static char szLine[ nMaxLineSize + 1]; static RWCString soLine; static FileName filPHDStatic; static FILE* pFilStatic; static int* pPaddedFromUnpaddedArray; static int nPaddedFromUnpaddedArraySize = 0; static int nNumberOfUnpaddedBasesStatic = 0; static bool bReadComplemented = false; static void setPaddedFromUnpaddedArray( const int nNumberOfUnpaddedBases, LocatedFragment* pLocFrag ) { nNumberOfUnpaddedBasesStatic = nNumberOfUnpaddedBases; bReadComplemented = pLocFrag->bComp(); if ( ( nNumberOfUnpaddedBases + 10 ) > nPaddedFromUnpaddedArraySize ) { if ( pPaddedFromUnpaddedArray ) free( pPaddedFromUnpaddedArray ); pPaddedFromUnpaddedArray = (int*) malloc( sizeof( int) * ( nNumberOfUnpaddedBases + 10 ) ); if ( !pPaddedFromUnpaddedArray ) { PARSE_PANIC_2_ARGS_NO_filPHD( "could not get enough memory %d int's for read %s\n", nNumberOfUnpaddedBases, (char*) pLocFrag->soGetName().data() ); } nPaddedFromUnpaddedArraySize = nNumberOfUnpaddedBases + 10; } int nUnpadded = 0; for( int nPaddedSeqPos = pLocFrag->nGetStartIndex(); nPaddedSeqPos <= pLocFrag->nGetEndIndex(); ++nPaddedSeqPos ) { if ( !pLocFrag->ntideGet( nPaddedSeqPos ).bIsPad() ) { ++nUnpadded; pPaddedFromUnpaddedArray[ nUnpadded ] = nPaddedSeqPos; } } } static inline int nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( LocatedFragment* pLocFrag, int nUnpaddedPos ) { if ( bReadComplemented ) nUnpaddedPos = nNumberOfUnpaddedBasesStatic + 1 - nUnpaddedPos; // nUnpaddedPos is now the unpadded sequence position in the // left-to-right direction // now we have the unpadded position from left to right // convert to padded coordinates assert( 1 <= nUnpaddedPos && nUnpaddedPos <= nNumberOfUnpaddedBasesStatic ); int nPaddedLeftToRightSequencePos = pPaddedFromUnpaddedArray[ nUnpaddedPos ] ; return( pLocFrag->nGetConsPosFromFragIndex( nPaddedLeftToRightSequencePos ) ); } static void myParsePanic( const RWCString& soErrorMessage , const int nSourceFileLineNumber, char* szSourceFileName ) { ostringstream ost; ost << "Error detected from source file " << szSourceFileName << " at " << nSourceFileLineNumber << endl; ost << "error in phd file " << filPHDStatic << " at line " << nLine << ": \n" \ << szLine << endl << soErrorMessage << endl << ends; InputDataError ide(ost.str().c_str()); throw ide; } static void myFgetsOrParsePanic( const RWCString& soErrorMessage, int nSourceFileLineNumber, char* szSourceFileName, char*& szLineRead ) { if ( fgets( szLine, nMaxLineSize, pFilStatic ) == NULL ) { ostringstream ost; ost << "End of file from fgets on source file " << szSourceFileName << " at line " << nSourceFileLineNumber << endl; ost << "error in phd file " << filPHDStatic << " at line " << nLine << ": \n" << szLine << endl << soErrorMessage << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } ++nLine; szLineRead = szLine; } static void readTag( FILE* pFil, const FileName& filPHD, LocatedFragment* pLocFrag, const int nUnpaddedBases ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for TYPE:" ); } ++nLine; RWCTokenizer tokTagType( szLine ); if ( tokTagType() != "TYPE:" ) { PARSE_PANIC2( filPHD, nLine, "no type after TYPE:", szLine ); } // current line contains something like // TYPE: phrap RWCString soTagType = tokTagType(); // The code below takes just as long (54 seconds) as the code above // if ( szLine[0] != 'T' || // szLine[1] != 'Y' || // szLine[2] != 'P' || // szLine[3] != 'E' || // szLine[4] != ':' ) // PARSE_PANIC2( filPHD, nLine, // "no type after TYPE:", // szLine ); // int n = 5; // while( szLine[n] == ' ') // ++n; // RWCString soTagType; // while( szLine[n] != '\n' && szLine[n] != '\0' ) { // soTagType.append( szLine[n] ); // ++n; // } // RWCTokenizer tokType( szLine ); // if ( tokType() != "TYPE:" ) // PARSE_PANIC2( filPHD, nLine, // "no type after TYPE:", // szLine ); // RWCString soTagType = tokType(); if (! tagTypes::pGetTagTypes()->bIsAValidTagType( soTagType ) ) PARSE_PANIC2( filPHD, nLine, "unrecognized tag type after TYPE:", szLine ); if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for SOURCE:" ); } ++nLine; // this takes 56 seconds for the same project as opposed to 53 or 54 for // the RWCTokenizer method below // if ( szLine[0] != 'S' || // szLine[1] != 'O' || // szLine[2] != 'U' || // szLine[3] != 'R' || // szLine[4] != 'C' || // szLine[5] != 'E' || // szLine[6] != ':' ) { // PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s", // szLine ); // } // // skip over spaces // n = 7; // while( szLine[n] == ' ' || szLine[n] == '\t' ) // ++n; // RWCString soSource; // while( szLine[n] != '\n' ) { // soSource.append( szLine[n] ); // ++n; // } RWCTokenizer tokSource( szLine ); if ( tokSource() != "SOURCE:" ) { PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s", szLine ); } RWCString soSource = tokSource(); // Note: the above is *faster* than below. Don't ask me why. // If you don't believe it, try it yourself. The difference // was 49 seconds for below and 42 seconds above reproducibly. // do { // if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { // PARSE_PANIC( filPHD, nLine, // "premature end of file while looking for SOURCE:" ); // } // ++nLine; // } while( ! ( szLine[0] == 'S' && // szLine[1] == 'O' && // szLine[2] == 'U' && // szLine[3] == 'R' && // szLine[4] == 'C' && // szLine[5] == 'E' && // szLine[6] == ':' && // szLine[7] == ' ' ) ); // char* szSource = strtok( szLine + 8, "\n " ); // RWCString soSource; // int n = 8; // while( szLine[n] != '\n' ) { // soSource.append( szLine[n] ); // ++n; // } // now allowing any source // // if ( // (soSource != "phrap") && // (soSource != "crossmatch" ) && // (soSource != "consed" ) && // (soSource != "polyPhred" ) && // (soSource != "other" ) // ) { // PARSE_PANIC2( filPHD, nLine, "SOURCE for tag must be phrap, crossmatch, consed, polyPhred, or other", // szLine ); // } if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for UNPADDED READ POS:" ); } ++nLine; RWCTokenizer tokPos( szLine ); if ( tokPos() != "UNPADDED_READ_POS:" ) { PARSE_PANIC_1_ARG( "tag should have UNPADDED_READ_POS: on line following SOURCE: line but instead has %s", szLine ); } RWCString soPosStart = tokPos(); RWCString soPosEnd = tokPos(); if (soPosStart.isNull() || soPosEnd.isNull() ) PARSE_PANIC2( filPHD, nLine, "missing start of end position for tag", szLine ); if ( !bIsNumeric( soPosStart ) || !bIsNumeric( soPosEnd ) ) PARSE_PANIC2( filPHD, nLine, "non-numeric start or end position for tag", szLine ); int nUnpaddedStartPos = atoi( (char*) soPosStart.data() ); int nUnpaddedEndPos = atoi( (char*) soPosEnd.data() ); if ( ! ( ( pLocFrag->nGetUnpaddedStartIndex() <= nUnpaddedStartPos ) && ( nUnpaddedStartPos <= nUnpaddedEndPos ) && ( nUnpaddedEndPos <= nUnpaddedBases ) ) ) { char szTemp[200]; sprintf( szTemp, "tag must have %d <= start position <= end position <= %d since the tag must be within the bounds of the read", pLocFrag->nGetUnpaddedStartIndex(), nUnpaddedBases ); PARSE_PANIC( szTemp ); } if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for DATE:" ); } ++nLine; RWCTokenizer tokDate( szLine ); if ( tokDate() != "DATE:" ) { PARSE_PANIC_1_ARG( "UNPADDED_READ_POS: line should be followed by DATE: line but instead has %s", szLine ); } RWCString soDatePart = tokDate(); RWCString soTimePart = tokDate(); if ( (soDatePart.length() != 8 ) || (soTimePart.length() != 8) ) PARSE_PANIC2( filPHD, nLine, "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", szLine ); if ( ! ( ( soDatePart[2] == '/' ) && ( soDatePart[5] == '/' ) && ( soTimePart[2] == ':' ) && ( soTimePart[5] == ':' ) ) ) PARSE_PANIC2( filPHD, nLine, "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", szLine ); #define bIsNumber( myChar ) ( ( '0' <= myChar && myChar <= '9' ) ? true : false ) if ( ! ( bIsNumber( soDatePart[0] ) && bIsNumber( soDatePart[1] ) && bIsNumber( soDatePart[3] ) && bIsNumber( soDatePart[4] ) && bIsNumber( soDatePart[6] ) && bIsNumber( soDatePart[7] ) && bIsNumber( soTimePart[0] ) && bIsNumber( soTimePart[1] ) && bIsNumber( soTimePart[3] ) && bIsNumber( soTimePart[4] ) && bIsNumber( soTimePart[6] ) && bIsNumber( soTimePart[7] ) ) ) { PARSE_PANIC_2_ARGS( "date and time must be of form YY/MM/DD HH:MI:SS but they were not numeric. Instead found %s %s", (char*) soDatePart.data(), (char*) soTimePart.data() ); } RWCString soDate = soDatePart + " " + soTimePart; // RWCString soTemp = // soDatePart(0, 2 ) + // soDatePart( 3, 2 ) + // soTimePart( 0, 2 ) + // soTimePart( 3, 2 ); // if (! bIsNumeric (soTemp ) ) // PARSE_PANIC2( filPHD, nLine, // "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS", // szLine ); FGETS_OR_PARSE_PANIC( "premature end of file while in tag" ); long lID = nUndefinedTagID; if ( soLine.bStartsWith( "ID:" ) ) { RWCString soRestOfLine = soLine.soGetRestOfString( 3 ); // bIsNumericLongMaybeWithWhitespace will check that there // is really a number there if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) { RWCString soMessage = "ID: line not followed by a number but rather by: "; soMessage += soRestOfLine; PARSE_PANIC( soMessage ); } // if reached here, we must have successfully read the ID. // So must read another line ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID ); FGETS_OR_PARSE_PANIC( "premature end of file in a tag after reading a tag ID" ); } RWCString soComment; if ( soLine.length() >= 13 && soLine( 0, 13 ) == "BEGIN_COMMENT" ) { bool bFoundEndComment = false; while( !bFoundEndComment ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT in comment tag" ); } ++nLine; soLine = szLine; if (soLine == "END_COMMENT\n" ) bFoundEndComment = true; else soComment += soLine; } // now strip off the final \n off of the comment--this is a delimiter, // not part of the comment soComment = soComment.stripWhitespace( RWCString::TRAILING ); // set up so that soLine contains the next line FGETS_OR_PARSE_PANIC( "premature end of file while in tag and just completed reading comment" ); } // if soLine.length() >= 13 && soLine( 0, 13 ) ... tagType* pTagType = tagTypes::pGetTagTypes()->pGetTagType( soTagType ); RWCString soMiscData; RWTPtrOrderedVector* pArrayOfUserDefinedTagFields = NULL; if ( pTagType->bIsAUserDefinedTagType() ) { // used for myParsePanic and myFgetsOrParsePanic which are called // from parseUserDefinedTagTypeLine filPHDStatic = filPHD; pFilStatic = pFil; pArrayOfUserDefinedTagFields = new RWTPtrOrderedVector; pArrayOfUserDefinedTagFields->soName_ = "pArrayOfUserDefinedTagFields"; int nIndexOfNextUserDefinedField = 0; while( !( soLine.bStartsWith( "END_TAG" ) ) ) { userDefinedTagField* pUserDefinedTagField = NULL; parseUserDefinedTagTypeLine( soLine.data(), (userDefinedTagType*) pTagType, pUserDefinedTagField, nIndexOfNextUserDefinedField, myParsePanic, myFgetsOrParsePanic ); pArrayOfUserDefinedTagFields->insert( pUserDefinedTagField ); FGETS_OR_PARSE_PANIC( "premature end of file while looking for END_TAG" ); } RWCString soErrorMessage; if ( !((userDefinedTagType*) pTagType)->bCorrectNumberOfEachField( pArrayOfUserDefinedTagFields, soErrorMessage ) ) { PARSE_PANIC( soErrorMessage ); } } else { while( !( soLine.length() > 7 && soLine( 0, 7 ) == "END_TAG" ) ) { soMiscData += soLine; FGETS_OR_PARSE_PANIC( "premature end of file while looking for END_TAG" ); } } int nPaddedConsPos1 = nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag, nUnpaddedStartPos ); int nPaddedConsPos2 = nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag, nUnpaddedEndPos ); int nConsPosStart = MIN( nPaddedConsPos1, nPaddedConsPos2 ); int nConsPosEnd = MAX( nPaddedConsPos1, nPaddedConsPos2 ); tag* pTag = new tag( pLocFrag, NULL, // contig soTagType, nConsPosStart, nConsPosEnd, true, // bWriteToPhdFileNotAceFile_ soComment, soSource, soDate, false ); // no NoTrans if ( !soMiscData.isNull() ) { pTag->soMiscData_ = soMiscData; } if ( pArrayOfUserDefinedTagFields ) { pTag->pArrayOfUserDefinedTagFields_ = pArrayOfUserDefinedTagFields; } pTag->lID_ = lID; pLocFrag->aTags_.append( pTag ); } void readPHDFile( const FileName& filPHD, LocatedFragment* pLocFrag, int nArraySize, char* pBaseArray, int* pQualityArray, int* pPointPositionArray, RWCString& soTimestampFromPHDFile, RWCString& soABIThumbprint, RWCString& soPhredVersion, RWCString& soCallMethod, RWCString& soChemistry, RWCString& soDye, int& nTraceArrayMinIndex, int& nTraceArrayMaxIndex, bool& bFoundTraceArrayMaxIndex, int& nQualityLevels, int& nNumberOfUnpaddedBases, const bool bIgnoreTags ) { if (! bCalculatedLengths ) { bCalculatedLengths = true; nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE ); nBEGIN_COMMENT = strlen( szBEGIN_COMMENT ); nBEGIN_DNA = strlen( szBEGIN_DNA ); nTIME = strlen( szTIME ); nEND_DNA = strlen( szEND_DNA ); nABI_THUMBPRINT = strlen( szABI_THUMBPRINT ); nPHRED_VERSION = strlen( szPHRED_VERSION ); nCALL_METHOD = strlen( szCALL_METHOD ); nQUALITY_LEVELS = strlen( szQUALITY_LEVELS ); nEND_COMMENT = strlen( szEND_COMMENT ); nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX ); nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX ); } bFoundTraceArrayMaxIndex = false; // changed if find it FILE *pFil; nLine = 0; pFil = fopen( (char*) filPHD.data(), "r" ); // ifsPHDFile.open( filPHD, ios:: in ); if (pFil == NULL ) { ostringstream ost; ost << "could not open PHD file " << filPHD << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } // looking for BEGIN_SEQUENCE bool bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_SEQUENCE" ); } else ++nLine; if ( strncmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) == 0 ) bFound = true; } // looking for BEGIN_COMMENT bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" ); } else ++nLine; if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) bFound = true; } // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:, bool bFoundCommentEnd = false; while( ! bFoundCommentEnd ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } else ++nLine; if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) { RWCString soLine( szLine ); RWCTokenizer tokNext( soLine ); RWCString soNextToken = tokNext(); assert( soNextToken == "TIME:" ); RWCString soDayOfWeek = tokNext(); RWCString soMonth = tokNext(); RWCString soDayOfMonth = tokNext(); RWCString soTime = tokNext(); RWCString soTimeZone = tokNext(); RWCString soYear = tokNext(); soTimestampFromPHDFile = soDayOfWeek + " " + soMonth + " " + soDayOfMonth + " " + soTime + " " + soTimeZone + soYear; } else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) { soABIThumbprint = soGetRestOfLine( szLine, nABI_THUMBPRINT ); } else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) { soPhredVersion = soGetRestOfLine( szLine, nPHRED_VERSION ); } else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) { soCallMethod = soGetRestOfLine( szLine, nCALL_METHOD ); } else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) { RWCString soQualityLevels = soGetRestOfLine( szLine, nQUALITY_LEVELS ); if (! bIsNumeric( soQualityLevels ) ) { PARSE_PANIC2( filPHD, nLine, "quality levels not numeric: ", szLine ); } nQualityLevels = atoi( (char*) soQualityLevels.data() ); } else if ( strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, nTRACE_ARRAY_MIN_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMinIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX ); if ( !bIsNumericMaybeWithWhitespace( soTraceArrayMinIndex, nTraceArrayMinIndex ) ) PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" ); } else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, nTRACE_ARRAY_MAX_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMaxIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX ); if ( !bIsNumericMaybeWithWhitespace( soTraceArrayMaxIndex, nTraceArrayMaxIndex ) ) PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" ); bFoundTraceArrayMaxIndex = true; } else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) { int nLength = strlen( szLine ); // make sure that there is at least 2 chars after CHEM: // which allows for a space and a non-space if ( nLength < 7 ) PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" ); // start with character after CHEM: which is at position 5 soChemistry = soGetRestOfLine( szLine, 5 ); } else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) { int nLength = strlen( szLine ); if ( nLength < 6 ) PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" ); soDye = soGetRestOfLine( szLine, 4 ); } else { // ALLOW other lines in the comment section of the phd file } } // now looking for BEGIN_DNA bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" ); } else ++nLine; if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) bFound = true; } // now reading the data until get END_DNA char cTemp; int nBases = 0; bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { ++nLine; } if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } else { ++nBases; if (nBases > nArraySize ) { char szTemp[200]; sprintf( szTemp, "ace file says %d unpadded bases but phd file has at least %d bases", nArraySize, nBases ); PARSE_PANIC( szTemp ); } cTemp = szLine[0]; if (! ( 'a' <= cTemp && cTemp <= 'z' ) ) { if ( 'A' <= cTemp && cTemp <= 'Z' ) { szLine[0] = szLine[0] + ( 'a' - 'A' ); } else { char szTemp[200]; sprintf( szTemp, "expected letter in position 0 of this line but instead found \"%s\"", szLine ); PARSE_PANIC( szTemp ); } } // if there is 1 base, put it at pBaseArray *(pBaseArray + nBases - 1 ) = szLine[0]; // advance over 1st whitespace if ( szLine[1] != ' ' ) { PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" ); } nPos = 2; if ( szLine[2] == ' ' || szLine[2] == '\0' ) { while( szLine[nPos] == ' ' ) { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" ); } } // if reached here, szLine[nPos] is not a space and not a null // advance to next whitespace, accumlating quality on the way nQuality = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nQuality = nQuality*10 + ( szLine[nPos] - '0' ); ++nPos; } // when exit this loop, the following character must be a space if ( szLine[nPos] != ' ' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } // if there is 1 base, put it at pQualityArray *(pQualityArray + nBases - 1 ) = nQuality; // now advance to next non-whitespace to find the peak position ++nPos; if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) { while( szLine[nPos] == ' ') { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } } // advance to next whitespace (actually a carriage return), // accumlating peak position on the way nPeakPosition = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' ); ++nPos; } if ( szLine[nPos] != '\n' ) { char szTemp[1000]; sprintf( szTemp, "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position. Line is:\n\"%s\"", szLine ); PARSE_PANIC( szTemp ); } // if there is 1 base, put it at pQualityArray *(pPointPositionArray + nBases - 1 ) = nPeakPosition; } // if ( strncmp( szEND_DNA, szLine, nEND_DNA ) == 0 ) ... else } // while( !bEndDNA ) { nNumberOfUnpaddedBases = nBases; // this was inserted to fix a bug with polyphred in which // tags applied to the .1 phd file may be out of range if the user then // deletes bases with subsequent versions of the phd files. Since // the .1 phd file is still read, the tags in it may be out of range. // Thus the solution is to not read the tags in the .1 phd file if it // is just being read for the phred calls. if (!bIgnoreTags ) { bool bSetPaddedFromUnpaddedArray = false; while( fgets( szLine, nMaxLineSize, pFil) != NULL ) { ++nLine; if ( strcmp( szLine, "BEGIN_TAG\n" ) == 0 ) { if ( !bSetPaddedFromUnpaddedArray ) { setPaddedFromUnpaddedArray( nNumberOfUnpaddedBases, pLocFrag ); bSetPaddedFromUnpaddedArray = true; } readTag( pFil, filPHD, pLocFrag, nNumberOfUnpaddedBases ); } else if ( strncmp( szLine, "WR{", 3 ) == 0 ) { wholeReadItem* pWR; readWholeReadItem( pFil, filPHD, pLocFrag, pWR ); pWR->appendThyself(); } } } fclose( pFil ); } void readPHDFileToSetQualitiesAndPeakPositions2( LocatedFragment* pLocFrag, const bool bIgnoreTags ) { if (! bCalculatedLengths ) { bCalculatedLengths = true; nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE ); nBEGIN_COMMENT = strlen( szBEGIN_COMMENT ); nBEGIN_DNA = strlen( szBEGIN_DNA ); nTIME = strlen( szTIME ); nEND_DNA = strlen( szEND_DNA ); nABI_THUMBPRINT = strlen( szABI_THUMBPRINT ); nPHRED_VERSION = strlen( szPHRED_VERSION ); nCALL_METHOD = strlen( szCALL_METHOD ); nQUALITY_LEVELS = strlen( szQUALITY_LEVELS ); nEND_COMMENT = strlen( szEND_COMMENT ); nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX ); nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX ); } if ( pLocFrag->filGetPHDFilename().bIsNull() ) { THROW_ERROR( pLocFrag->soGetName() + " has no phd file" ); } FileName filPHD = ConsEd::pGetConsEd()->filGetPHDDir() + "/" + pLocFrag->filGetPHDFilename(); FILE *pFil; nLine = 0; pFil = fopen( (char*) filPHD.data(), "r" ); // ifsPHDFile.open( filPHD, ios:: in ); if (pFil == NULL ) { ostringstream ost; ost << "could not open PHD file " << filPHD << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } // looking for BEGIN_SEQUENCE bool bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_SEQUENCE" ); } else ++nLine; if ( strncmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) == 0 ) bFound = true; } // looking for BEGIN_COMMENT bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" ); } else ++nLine; if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) bFound = true; } // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:, pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = false; bool bFoundTime = false; bool bFoundCommentEnd = false; while( ! bFoundCommentEnd ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } else ++nLine; if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) bFoundCommentEnd = true; else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) { RWCString soLine( szLine ); RWCTokenizer tokNext( soLine ); RWCString soNextToken = tokNext(); assert( soNextToken == "TIME:" ); RWCString soDayOfWeek = tokNext(); RWCString soMonth = tokNext(); RWCString soDayOfMonth = tokNext(); RWCString soTime = tokNext(); RWCString soYear = tokNext(); // sometimes the timezone is put here like this: // Sat Nov 3 01:19:20 PDT 2007 // In such a case, ignore the PDT and put the 2007 into the year if ( !bIsNumeric( soYear ) ) { soYear = tokNext(); } RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth + " " + soDayOfMonth + " " + soTime + " " + soYear; if ( soTimestampFromPHDFile != pLocFrag->soGetTimestamp() && !pCP->bAllowTimestampMismatch_ ) { ostringstream ost; ost << "PHD file timestamp mismatch: ace file says " << pLocFrag->soGetTimestamp() << " but PHD file says " << soTimestampFromPHDFile << " for read " << pLocFrag->soGetName() << endl << "Did you use the phredPhrap perl script? You should. Click on help in the aligned reads window for documentation" << ends; InputDataError ide( ost.str().c_str() ); fclose( pFil ); throw ide; } bFoundTime = true; } else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) { pLocFrag->soABIThumbprint_ = soGetRestOfLine( szLine, nABI_THUMBPRINT ); } else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) { pLocFrag->soPhredVersion_ = soGetRestOfLine( szLine, nPHRED_VERSION ); } else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) { pLocFrag->soCallMethod_ = soGetRestOfLine( szLine, nCALL_METHOD ); } else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) { RWCString soQualityLevels = soGetRestOfLine( szLine, nQUALITY_LEVELS ); if (! bIsNumeric( soQualityLevels ) ) { PARSE_PANIC2( filPHD, nLine, "quality levels not numeric: ", szLine ); } pLocFrag->nQualityLevels_ = atoi( (char*) soQualityLevels.data() ); } else if ( strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, nTRACE_ARRAY_MIN_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMinIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX ); if ( !bIsNumericMaybeWithWhitespace( soTraceArrayMinIndex, pLocFrag->nTraceArrayMinIndex_ ) ) PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" ); } else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, nTRACE_ARRAY_MAX_INDEX ) == 0 ) { // soGetRestOfLine strips off final CR RWCString soTraceArrayMaxIndex = soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX ); if ( !bIsNumericMaybeWithWhitespace( soTraceArrayMaxIndex, pLocFrag->nTraceArrayMaxIndex_ ) ) PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" ); pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true; } else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) { int nLength = strlen( szLine ); // make sure that there is at least 2 chars after CHEM: // which allows for a space and a non-space if ( nLength < 7 ) PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" ); // start with character after CHEM: which is at position 5 pLocFrag->soChemistry_ = soGetRestOfLine( szLine, 5 ); } else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) { int nLength = strlen( szLine ); if ( nLength < 6 ) PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" ); pLocFrag->soDye_ = soGetRestOfLine( szLine, 4 ); } else { // ALLOW other lines in the comment section of the phd file } } if ( !bFoundTime ) { PARSE_PANIC( "did not find TIME: in the BEGIN_COMMENT/END_COMMENT section of the phd file--did you use the phredPhrap script?" ); } if ( pLocFrag->bHasChromat() ) { pLocFrag->createPointPosArray( pLocFrag->bPhdFileHasTraceArrayMinAndMax_, pLocFrag->nTraceArrayMaxIndex_, pLocFrag->length(), true ); // bFillUpArray } // now looking for BEGIN_DNA bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_SEQUENCE" ); } else ++nLine; if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) bFound = true; } int nPaddedSequencePos; // find the first non-pad base in the direction of sequencing. To // do this, start at the right end (for complemented reads) or left end // for uncomplemented reads. if ( pLocFrag->bComp() ) { nPaddedSequencePos = pLocFrag->nGetEndIndex(); if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() && nPaddedSequencePos > pLocFrag->nGetStartIndex() ) --nPaddedSequencePos; if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { fprintf( stderr, "warning: read %s is entirely pads\n", (char*) pLocFrag->soGetName().data() ); fclose( pFil ); return; } } // prepare for 1st advancing: ++nPaddedSequencePos; } else { nPaddedSequencePos = pLocFrag->nGetStartIndex(); if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() && nPaddedSequencePos < pLocFrag->nGetEndIndex() ) ++nPaddedSequencePos; if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { fprintf( stderr, "warning: read %s is entirely pads\n", (char*) pLocFrag->soGetName().data() ); fclose( pFil ); return; } } // prepare for 1st advancing --nPaddedSequencePos; } // now reading the data until get END_DNA char cTemp; int nBases = 0; bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { ++nLine; } if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } else { ++nBases; // all of the code below is just to move nPaddedSequencePos to // the next non-pad--the base that corresponds to the next base // in the phd file if ( pLocFrag->bComp() ) { --nPaddedSequencePos; if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd for complemented read. At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) { do { --nPaddedSequencePos; if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for complemented read. (2) At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ); } } else { ++nPaddedSequencePos; if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read. At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad () ) { // must further change nPaddedSequencePos to next non-pad do { ++nPaddedSequencePos; if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) { PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read. (2) At least %d bases in phd file but only %d unpadded bases in ace file", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ); // the loop above will be exited only a non-pad (or an // exception is thrown) } } // all of the above was just to change nPaddedSequencePos cTemp = szLine[0]; if (! ( 'a' <= cTemp && cTemp <= 'z' ) ) { if ( 'A' <= cTemp && cTemp <= 'Z' ) { szLine[0] = szLine[0] + ( 'a' - 'A' ); } else { char szTemp[200]; sprintf( szTemp, "expected letter in position 0 of this line but instead found \"%s\"", szLine ); PARSE_PANIC( szTemp ); } } // sanity check that the phd file and the ace file agree char cAceBase = pLocFrag->ntideGet( nPaddedSequencePos ).cGetBase(); char cPHDBase; if ( pLocFrag->bComp() ) cPHDBase = cComplementBase( szLine[0] ); else cPHDBase = szLine[0]; if ( cPHDBase != cAceBase ) { // eliminate changes that crossmatch may have introduced // in to the acefile bases // I don't remember why the check for n's, but now at least // one lab (Molecular Dynamics and possibly Ron Davis' lab // at Stanford where John Fernandez set thing up ) depends // on the ability to change bases to n's in the ace file // DG, 3/01 if ( cAceBase != 'x' && szLine[0] != 'x' && cAceBase != 'n' ) { ostringstream ost; ost << "ace file says base at nPaddedSequencePos " << nPaddedSequencePos << " or nBases " << nBases << " is " << cAceBase << " but PHD file says " << szLine[0] << " which on top strand is " << cPHDBase << " for read " << pLocFrag->soGetName() << " at line " << nLine << " in phd file " << filPHD << endl << ends; InputDataError ide( ost.str().c_str() ); fclose( pFil ); throw ide; } } // advance over 1st whitespace if ( szLine[1] != ' ' ) { PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" ); } nPos = 2; if ( szLine[2] == ' ' || szLine[2] == '\0' ) { while( szLine[nPos] == ' ' ) { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" ); } } // if reached here, szLine[nPos] is not a space and not a null // advance to next whitespace, accumlating quality on the way nQuality = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nQuality = nQuality*10 + ( szLine[nPos] - '0' ); ++nPos; } // when exit this loop, the following character must be a space if ( szLine[nPos] != ' ' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } // if a base is cross_matched out, then make its quality 0 // since it should be dark--should not attract attention // changed Dec 1999 since we need quality values for autofinish // to get the error profile of existing reads // if ( cAceBase == 'x' ) // nQuality = 0; pLocFrag->setQualityAtSeqPos( nPaddedSequencePos, nQuality ); // no need to set peak position if there is no chromat if ( !pLocFrag->bHasChromat() ) continue; // now advance to next non-whitespace to find the peak position ++nPos; if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) { while( szLine[nPos] == ' ') { ++nPos; } if ( szLine[nPos] == '\0' ) { PARSE_PANIC( "this line should have 3 tokens but doesn't" ); } } // advance to next whitespace (actually a carriage return), // accumlating peak position on the way nPeakPosition = 0; while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) { nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' ); ++nPos; } if ( szLine[nPos] != '\n' ) { char szTemp[1000]; sprintf( szTemp, "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position. Line is:\n\"%s\"", szLine ); PARSE_PANIC( szTemp ); } // if flipping read left and right, must do this also // for the peak position so it lines up with the flipped trace if ( pLocFrag->bComp() ) { if ( pLocFrag->bPhdFileHasTraceArrayMinAndMax_ ) nPeakPosition = pLocFrag->nTraceArrayMaxIndex_ - nPeakPosition; else nPeakPosition = -nPeakPosition; } pLocFrag->setTracePointPosAtSeqPos( nPaddedSequencePos, nPeakPosition ); } // if ( strncmp( szEND_DNA, szLine, nEND_DNA ) == 0 ) ... else } // while( !bEndDNA ) { // check that there are the same number of bases in the phd file // as the ace file // Why don't I just check if nBases == pLocFrag->nGetUnpaddedSeqLength() ? // It is because nGetUnpaddedSeqLength() takes a long time so I use // the more efficient method below. if ( pLocFrag->bComp() ) { if ( nPaddedSequencePos != pLocFrag->nGetStartIndex() ) { if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) { PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } } else { if ( nPaddedSequencePos != pLocFrag->nGetEndIndex() ) { if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) { PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases", nBases, pLocFrag->nGetUnpaddedSeqLength() ); } } } // this was inserted to fix a bug with polyphred in which // tags applied to the .1 phd file may be out of range if the user then // deletes bases with subsequent versions of the phd files. Since // the .1 phd file is still read, the tags in it may be out of range. // Thus the solution is to not read the tags in the .1 phd file if it // is just being read for the phred calls. if (!bIgnoreTags ) { int nNumberOfUnpaddedBases = nBases; bool bSetPaddedFromUnpaddedArray = false; while( fgets( szLine, nMaxLineSize, pFil) != NULL ) { ++nLine; if ( strcmp( szLine, "BEGIN_TAG\n" ) == 0 ) { if ( !bSetPaddedFromUnpaddedArray ) { setPaddedFromUnpaddedArray( nNumberOfUnpaddedBases, pLocFrag ); bSetPaddedFromUnpaddedArray = true; } readTag( pFil, filPHD, pLocFrag, nNumberOfUnpaddedBases ); } else if ( strncmp( szLine, "WR{", 3 ) == 0 ) { wholeReadItem* pWR; readWholeReadItem( pFil, filPHD, pLocFrag, pWR ); pWR->appendThyself(); } } } fclose( pFil ); pLocFrag->bAlreadyReadPhdFile_ = true; } // used for LocatedFragment::readPhredCalls void readPHDFileAndMaybeComplement( FileName filPHD, bool bComplement, LocatedFragment* pLocFrag, int nArraySize, char* pBaseArray, int* pQualityArray, int* pPointPositionArray, RWCString& soTimestampFromPHDFile, RWCString& soABIThumbprint, RWCString& soPhredVersion, RWCString& soCallMethod, RWCString& soChemistry, RWCString& soDye, int& nTraceArrayMinIndex, int& nTraceArrayMaxIndex, bool& bFoundTraceArrayMaxIndex, int& nQualityLevels, int& nNumberOfUnpaddedBases, const bool bIgnoreTags ) { readPHDFile( filPHD, pLocFrag, nArraySize, pBaseArray, pQualityArray, pPointPositionArray, soTimestampFromPHDFile, soABIThumbprint, soPhredVersion, soCallMethod, soChemistry, soDye, nTraceArrayMinIndex, nTraceArrayMaxIndex, bFoundTraceArrayMaxIndex, nQualityLevels, nNumberOfUnpaddedBases, bIgnoreTags ); if (bComplement) { int nTemp; char cTemp; // This is designed so that, if there are an odd number of bases, // the middle base will still be complemented // e.g., If nNumberOfUnpaddedBases = 3 (i.e. bases at positions // 0, 1 and 2, nUnpaddedComp will go from 0 to 1 for( int nUnpaddedComp = 0; nUnpaddedComp <= (nNumberOfUnpaddedBases-1)/2; ++nUnpaddedComp ) { // nUnpadded will go from nNumberOfUnpaddedBases - 1 to 0 // as nUnpaddedComp goes from 0 to nNumberOfUnpaddedBases - 1 int nUnpadded = nNumberOfUnpaddedBases - nUnpaddedComp - 1; // complement quality values nTemp = *(pQualityArray + nUnpaddedComp ); *(pQualityArray + nUnpaddedComp ) = *(pQualityArray + nUnpadded ); *(pQualityArray + nUnpadded) = nTemp; // complement peak positions // (For old style phd files that don't have the // TRACE_ARRAY_MAX_INDEX, this can't be completed here, // since we don't know the max index of the traces. We complete // it when we read in the traces, by adding this maximum value.) nTemp = *(pPointPositionArray + nUnpaddedComp ); *(pPointPositionArray + nUnpaddedComp ) = nTraceArrayMaxIndex - *(pPointPositionArray + nUnpadded ); // notice that nTraceArrayMaxIndex is used here, even if // bFoundTraceArrayMaxIndex is false. I think this is a bug-- // DG, 5/30/03 *(pPointPositionArray + nUnpadded) = nTraceArrayMaxIndex - nTemp; // complement the bases cTemp = *(pBaseArray + nUnpaddedComp ); *(pBaseArray + nUnpaddedComp) = cComplementBase( *(pBaseArray + nUnpadded ) ); *(pBaseArray + nUnpadded ) = cComplementBase( cTemp ); } } // if (bComplement) { } // for LocatedFragment :: readPhredCalls int nReadPHDFileForSize( const FileName& filPHD ) { FILE *pFil; int nLine = 0; pFil = fopen( (char*) filPHD.data(), "r" ); // ifsPHDFile.open( filPHD, ios:: in ); if (pFil == NULL ) { ostringstream ost; ost << "could not open PHD file " << filPHD << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } // now looking for BEGIN_DNA bool bFound = false; while( !bFound ) { if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_SEQUENCE" ); } else ++nLine; if ( strcmp( "BEGIN_DNA\n", szLine ) == 0 ) bFound = true; } // now reading the data until get END_DNA int nBases = 0; bool bEndDNA = false; while( !bEndDNA ) { if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } else { ++nLine; ++nBases; } if ( strcmp( "END_DNA\n", szLine ) == 0 ) { bEndDNA = true; } } // while( !bEndDNA ) { fclose( pFil ); return( nBases ); } // This is used for addNewReads function which first reads just the phred // calls, then does the alignment to put in pads, and then reads the tags // in order to determine their padded positions. // the pLocFrag array must have the bases already set. void readPHDFileJustForTagsAndWholeReadItems( LocatedFragment* pLocFrag ) { FileName filPHD = ConsEd::pGetConsEd()->filGetPHDDir() + "/" + pLocFrag->filGetPHDFilename(); FILE *pFil; nLine = 0; pFil = fopen( (char*) filPHD.data(), "r" ); if ( pFil == NULL ) { ostringstream ost; ost << "could not open PHD file " << filPHD << endl << ends; InputDataError ide( ost.str().c_str() ); throw ide; } // look for END_DNA bool bEndDNA = false; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } ++nLine; if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) { bEndDNA = true; } } while( !bEndDNA ); bool bSetPaddedFromUnpaddedArray = false; int nNumberOfUnpaddedBases = -1; // will be set below if necessary while( fgets( szLine, nMaxLineSize, pFil) != NULL ) { ++nLine; if ( strcmp( szLine, "BEGIN_TAG\n" ) == 0 ) { if ( !bSetPaddedFromUnpaddedArray ) { nNumberOfUnpaddedBases = pLocFrag->nGetUnpaddedSeqLength(); setPaddedFromUnpaddedArray( nNumberOfUnpaddedBases, pLocFrag ); bSetPaddedFromUnpaddedArray = true; } readTag( pFil, filPHD, pLocFrag, nNumberOfUnpaddedBases ); } else if ( strncmp( szLine, "WR{", 3 ) == 0 ) { wholeReadItem* pWR; readWholeReadItem( pFil, filPHD, pLocFrag, pWR ); pWR->appendThyself(); } } fclose( pFil ); }