/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "rwcstring.h"
#include    <stdio.h>
#include    <iostream.h>
#include    <fstream.h>
#include    "rwcregexp.h"
#include    "rwctokenizer.h"
#include    <stdlib.h>
#include    <string.h>
#include    "locatedFragment.h"
#include    "cComplementBase.h"
#include    "readPHD.h"
#include    "consed.h"
#include    "bIsNumeric.h"
#include    "tagTypes.h"
#include    "tag.h"
#include    "soGetRestOfLine.h"
#include    "bIsNumericMaybeWithWhitespace.h"
#include    "bIsNumericLongMaybeWithWhitespace.h"
#include    "wholeReadItem.h"
#include    "consedParameters.h"
#include    "nLine.h"
#include    "readWholeReadItem.h"
#include    "soAddCommas.h"
#include    "phdBall2Fasta.h"



#define PARSE_PANIC(message) \
{  ostrstream ost; \
   ost << "Error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in file " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << nLine << ": \n" \
       << szLine << endl \
      << message << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }


#define PARSE_PANIC2( file, nLine, szMessage, szLine ) \
{  ostrstream ost; \
   ost << "Error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in file " << file << " at line " << nLine << ": \n" \
       << szLine << endl \
      << szMessage << endl << szLine << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }

#define PARSE_PANIC_2_ARGS( szMessage, arg1, arg2 ) { \
   char szTemp[1000]; \
   sprintf( szTemp, szMessage, arg1, arg2 ); \
    \
   ostrstream ost; \
   ost << "Problem reading file of phd files " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << \
      nLine << " detected from source file " << \
      __FILE__ << " at line " << __LINE__ << endl << \
      szTemp << endl << "strlen( szLine ) = " << strlen( szLine ) << "\n" << \
    szLine <<  ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }

#define PARSE_PANIC_1_ARG( szMessage, arg1 ) { \
   char szTemp[1000]; \
   sprintf( szTemp, szMessage, arg1 ); \
    \
   ostrstream ost; \
   ost << "Problem reading file of phd files " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " at line " << \
      nLine << " detected from source file " << \
      __FILE__ << " at line " << __LINE__ << endl << \
      szTemp << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }

#define     FGETS_OR_PARSE_PANIC( szErrorMessage ) \
if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) \
   PARSE_PANIC( szErrorMessage ); \
++nLine; \
soLine = szLine;



static bool bCalculatedLengths = false;
static char* szBEGIN_SEQUENCE = "BEGIN_SEQUENCE";
static int    nBEGIN_SEQUENCE;
static char* szBEGIN_COMMENT  = "BEGIN_COMMENT";
static int    nBEGIN_COMMENT;
static char* szABI_THUMBPRINT = "ABI_THUMBPRINT:";
static int    nABI_THUMBPRINT;
static char* szPHRED_VERSION  = "PHRED_VERSION:";
static int    nPHRED_VERSION;
static char* szCALL_METHOD    = "CALL_METHOD:";
static int    nCALL_METHOD;
static char* szQUALITY_LEVELS = "QUALITY_LEVELS:";
static int    nQUALITY_LEVELS;
static char* szTIME           = "TIME:";
static int    nTIME;
static char* szTRACE_ARRAY_MIN_INDEX = "TRACE_ARRAY_MIN_INDEX:";
static int    nTRACE_ARRAY_MIN_INDEX;
static char* szTRACE_ARRAY_MAX_INDEX = "TRACE_ARRAY_MAX_INDEX:";
static int    nTRACE_ARRAY_MAX_INDEX;
static char* szEND_COMMENT    = "END_COMMENT";
static int    nEND_COMMENT;
static char* szBEGIN_DNA      = "BEGIN_DNA";
static int    nBEGIN_DNA;
static char* szEND_DNA        = "END_DNA";
static int    nEND_DNA;




static bool bFoundNonBlank;
static bool bFoundBlank;
static int nPos;
static int nQuality;
static int nPeakPosition;

const int nMaxLineSize = 300;
static char  szLine[ nMaxLineSize + 1];

static RWCString soLine;


static int* pPaddedFromUnpaddedArray;
static int nPaddedFromUnpaddedArraySize = 0;
static int nNumberOfUnpaddedBasesStatic = 0;
static bool bReadComplemented = false;


static void setPaddedFromUnpaddedArray( const int nNumberOfUnpaddedBases,
                                      LocatedFragment* pLocFrag ) {

   nNumberOfUnpaddedBasesStatic = nNumberOfUnpaddedBases;
   bReadComplemented = pLocFrag->bComp();

   if ( ( nNumberOfUnpaddedBases + 10 ) > nPaddedFromUnpaddedArraySize ) {

      if ( pPaddedFromUnpaddedArray ) 
         free( pPaddedFromUnpaddedArray );

      pPaddedFromUnpaddedArray = (int*) malloc( 
                            sizeof( int) * ( nNumberOfUnpaddedBases + 10 ) );
      if ( !pPaddedFromUnpaddedArray ) {
         PARSE_PANIC_2_ARGS( "could not get enough memory %d int's for read %s\n",
                            nNumberOfUnpaddedBases,
                            (char*) pLocFrag->soGetName().data() );
      }

      nPaddedFromUnpaddedArraySize = nNumberOfUnpaddedBases + 10;
   }

   int nUnpadded = 0;
   for( int nPaddedSeqPos = pLocFrag->nGetStartIndex(); 
        nPaddedSeqPos <= pLocFrag->nGetEndIndex();
        ++nPaddedSeqPos ) {
      
      if ( !pLocFrag->ntideGet( nPaddedSeqPos ).bIsPad() ) {
         ++nUnpadded;
         pPaddedFromUnpaddedArray[ nUnpadded ] = nPaddedSeqPos;
      }
   }
}


static inline int nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( 
                                  LocatedFragment* pLocFrag,
                                  int nUnpaddedPos ) {

   
   if ( bReadComplemented ) 
      nUnpaddedPos = nNumberOfUnpaddedBasesStatic + 1 - nUnpaddedPos;
   
   // nUnpaddedPos is now the unpadded sequence position in the
   // left-to-right direction
      
   // now we have the unpadded position from left to right

   // convert to padded coordinates
   assert( 1 <= nUnpaddedPos && nUnpaddedPos <= nNumberOfUnpaddedBasesStatic );

   int nPaddedLeftToRightSequencePos = pPaddedFromUnpaddedArray[ nUnpaddedPos ] ;
   
   return( pLocFrag->nGetConsPosFromFragIndex( nPaddedLeftToRightSequencePos ) );
}
   
   



static void readTag( FILE* pFil, const FileName& filPHD, 
                     LocatedFragment* pLocFrag, 
                     const int nUnpaddedBases,
                     const bool bUnpaddedTagCoordinates ) {


   if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
      PARSE_PANIC( "premature end of file while looking for TYPE:" );
   }

   ++nLine;
   
   RWCTokenizer tokTagType( szLine );
   if ( tokTagType() != "TYPE:" ) {
      PARSE_PANIC2( filPHD, nLine, 
                    "no type after TYPE:",
                    szLine );
   }

   // current line contains something like 
   // TYPE: phrap
   RWCString soTagType = tokTagType();



   // The code below takes just as long (54 seconds) as the code above
//    if ( szLine[0] != 'T' ||
//         szLine[1] != 'Y' ||
//         szLine[2] != 'P' ||
//         szLine[3] != 'E' ||
//         szLine[4] != ':' ) 
//       PARSE_PANIC2( filPHD, nLine, 
//                     "no type after TYPE:",
//                     szLine );
   
//    int n = 5;
//    while( szLine[n] == ' ')
//       ++n;
   
//    RWCString soTagType;
   
//    while( szLine[n] != '\n' && szLine[n] != '\0' ) {
//       soTagType.append( szLine[n] );
//       ++n;
//    }
   

//    RWCTokenizer tokType( szLine );
//    if ( tokType() != "TYPE:" ) 
//       PARSE_PANIC2( filPHD, nLine, 
//                     "no type after TYPE:",
//                     szLine );

//    RWCString soTagType = tokType();



   if (! tagTypes::pGetTagTypes()->bIsAValidTagType( soTagType ) ) 
          PARSE_PANIC2( filPHD, nLine, 
                      "unrecognized tag type after TYPE:",
                        szLine );



   if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
      PARSE_PANIC( "premature end of file while looking for SOURCE:" );
   }

   ++nLine;


   // this takes 56 seconds for the same project as opposed to 53 or 54 for
   // the RWCTokenizer method below
//    if ( szLine[0] != 'S' ||
//         szLine[1] != 'O' ||
//         szLine[2] != 'U' ||
//         szLine[3] != 'R' ||
//         szLine[4] != 'C' ||
//         szLine[5] != 'E' ||
//         szLine[6] != ':' ) {
//       PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s",
//                          szLine );
//    }

//    // skip over spaces

//    n = 7;
//    while( szLine[n] == ' ' || szLine[n] == '\t' )
//       ++n;

//    RWCString soSource;

//    while( szLine[n] != '\n' ) {
//       soSource.append( szLine[n] );
//       ++n;
//    }
   
      
   
   RWCTokenizer tokSource( szLine );
   
   if ( tokSource() != "SOURCE:" ) {
       PARSE_PANIC_1_ARG( "in tag, expecting SOURCE: line after TYPE: line but found %s",
                          szLine );
   }

   RWCString soSource = tokSource();
   

   // Note:  the above is *faster* than below.  Don't ask me why.
   // If you don't believe it, try it yourself.  The difference 
   // was 49 seconds for below and 42 seconds above reproducibly.

//    do {
//       if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
//          PARSE_PANIC( filPHD, nLine, 
//                      "premature end of file while looking for SOURCE:" );
//       }

//       ++nLine;

//    } while( ! ( szLine[0] == 'S' &&
//                 szLine[1] == 'O' &&
//                 szLine[2] == 'U' &&
//                 szLine[3] == 'R' &&
//                 szLine[4] == 'C' &&
//                 szLine[5] == 'E' &&
//                 szLine[6] == ':' &&
//                 szLine[7] == ' ' ) );

//    char* szSource = strtok( szLine + 8, "\n " );
   
//    RWCString soSource;
//    int n = 8;
//    while( szLine[n] != '\n' ) {
//       soSource.append( szLine[n] );
//       ++n;
//    }

// now allowing any source
//
//    if ( 
//         (soSource != "phrap") &&
//         (soSource != "crossmatch" ) &&
//         (soSource != "consed" ) &&
//         (soSource != "polyPhred" ) &&
//         (soSource != "other" )
//         ) {

//       PARSE_PANIC2( filPHD, nLine, "SOURCE for tag must be phrap, crossmatch, consed, polyPhred, or other",
//                     szLine );
//    }
   

   if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
      PARSE_PANIC( "premature end of file while looking for UNPADDED READ POS:" );
   }

   ++nLine;

   RWCTokenizer tokPos( szLine );
      
   if ( tokPos() != "UNPADDED_READ_POS:" ) {
      PARSE_PANIC_1_ARG( "tag should have UNPADDED_READ_POS: on line following SOURCE: line but instead has %s", 
                         szLine );
   }

   RWCString soPosStart = tokPos();
   RWCString soPosEnd = tokPos();
   
      
   if (soPosStart.isNull() || soPosEnd.isNull() )
      PARSE_PANIC2( filPHD, nLine, "missing start of end position for tag",
                    szLine );

   if ( !bIsNumeric( soPosStart ) || !bIsNumeric( soPosEnd ) )
      PARSE_PANIC2( filPHD, nLine, "non-numeric start or end position for tag",
                    szLine );


   int nUnpaddedStartPos = atoi( (char*) soPosStart.data() );
   int nUnpaddedEndPos   = atoi( (char*) soPosEnd.data() );


   if ( bUnpaddedTagCoordinates ) {
      // when I used this, I had kept the bases in the pSeqPhredCalledBases_

      if ( ! (
              ( pLocFrag->pSeqPhredCalledBases_->nGetStartIndex() <= nUnpaddedStartPos ) &&
              ( nUnpaddedStartPos <= nUnpaddedEndPos ) &&
              ( nUnpaddedEndPos <= pLocFrag->pSeqPhredCalledBases_->nGetEndIndex() )
              ) ) {
         char szTemp[200];
         sprintf( szTemp, "tag must have %d <= start position <= end position <= %d since the tag must be within the bounds of the read",
                  pLocFrag->pSeqPhredCalledBases_->nGetStartIndex(),
                  pLocFrag->pSeqPhredCalledBases_->nGetEndIndex() );

         PARSE_PANIC( szTemp );
      }

   }
   else {
      if ( ! (
              ( pLocFrag->nGetUnpaddedStartIndex() <= nUnpaddedStartPos ) &&
              ( nUnpaddedStartPos <= nUnpaddedEndPos ) &&
              ( nUnpaddedEndPos <= nUnpaddedBases )
              ) ) {
         char szTemp[200];
         sprintf( szTemp, "tag must have %d <= start position <= end position <= %d since the tag must be within the bounds of the read",
                  pLocFrag->nGetUnpaddedStartIndex(),
                  nUnpaddedBases );

         PARSE_PANIC( szTemp );
      }
   }


   if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
      PARSE_PANIC( "premature end of file while looking for DATE:" );
   }

   ++nLine;
      
   RWCTokenizer tokDate( szLine );
      
   if ( tokDate() != "DATE:" ) {
      PARSE_PANIC_1_ARG( "UNPADDED_READ_POS: line should be followed by DATE: line but instead has %s",
                         szLine );
   }

   RWCString soDatePart = tokDate();
   RWCString soTimePart = tokDate();
   
   if ( (soDatePart.length() != 8 ) ||
        (soTimePart.length() != 8) )
      PARSE_PANIC2( filPHD, nLine, 
                    "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS",
                    szLine );

   if ( ! (
        ( soDatePart[2] == '/' ) &&
        ( soDatePart[5] == '/' ) &&
        ( soTimePart[2] == ':' ) &&
        ( soTimePart[5] == ':' ) ) )
      PARSE_PANIC2( filPHD, nLine, 
                    "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS",
                    szLine );

   #define bIsNumber( myChar ) ( ( '0' <= myChar && myChar <= '9' ) ? true : false )

   if ( ! ( bIsNumber( soDatePart[0] ) &&
            bIsNumber( soDatePart[1] ) &&
            bIsNumber( soDatePart[3] ) &&
            bIsNumber( soDatePart[4] ) &&
            bIsNumber( soDatePart[6] ) &&
            bIsNumber( soDatePart[7] ) &&

            bIsNumber( soTimePart[0] ) &&
            bIsNumber( soTimePart[1] ) &&
            bIsNumber( soTimePart[3] ) &&
            bIsNumber( soTimePart[4] ) &&
            bIsNumber( soTimePart[6] ) &&
            bIsNumber( soTimePart[7] )  ) ) {

      PARSE_PANIC_2_ARGS( "date and time must be of form  YY/MM/DD HH:MI:SS but they were not numeric.  Instead found %s %s",
                         (char*) soDatePart.data(),
                         (char*) soTimePart.data() );
   }

   RWCString soDate = soDatePart + " " + soTimePart;


//    RWCString soTemp = 
//       soDatePart(0, 2 ) +
//       soDatePart( 3, 2 ) + 
//       soTimePart( 0, 2 ) +
//       soTimePart( 3, 2 );

//    if (! bIsNumeric (soTemp ) )
//       PARSE_PANIC2( filPHD, nLine, 
//                     "date and time must each be exactly 8 characters YY/MM/DD HH:MI:SS",
//                     szLine );


   FGETS_OR_PARSE_PANIC( "premature end of file while in tag" );

   long lID = nUndefinedTagID;
   if ( soLine.bStartsWith( "ID:" ) ) {

      RWCString soRestOfLine = soLine.soGetRestOfString( 3 );

      // bIsNumericLongMaybeWithWhitespace will check that there
      // is really a number there

      if ( !bIsNumericLongMaybeWithWhitespace( soRestOfLine, lID ) ) {
         RWCString soMessage = "ID: line not followed by a number but rather by: ";
         soMessage += soRestOfLine;
         PARSE_PANIC( soMessage );
      }


      ConsEd::pGetAssembly()->findLargestTagIDSoFar( lID );

      // if reached here, we must have successfully read the ID.
      // So must read another line

      FGETS_OR_PARSE_PANIC( "premature end of file in a tag after reading a tag ID" );
   }

      
   RWCString soComment;

   if ( soLine.length() >= 13 && soLine( 0, 13 ) == "BEGIN_COMMENT" ) {

      bool bFoundEndComment = false;
      while( !bFoundEndComment ) {
         if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
            PARSE_PANIC( "premature end of file while looking for END_COMMENT in comment tag" );
         }

         ++nLine;
      
         soLine = szLine;
      
         if (soLine == "END_COMMENT\n" )
            bFoundEndComment = true;
         else 
            soComment += soLine;
      }

      // now strip off the final \n off of the comment--this is a delimiter,
      // not part of the comment

      soComment = soComment.stripWhitespace( RWCString::TRAILING );

      // set up so that soLine contains the next line
      FGETS_OR_PARSE_PANIC( "premature end of file while in tag and just completed reading comment" );

   } // if soLine.length() >= 13 && soLine( 0, 13 ) ...

   RWCString soMiscData;
   while( !( soLine.length() > 7 && soLine( 0, 7 )  == "END_TAG" ) ) {
      FGETS_OR_PARSE_PANIC( "premature end of file while looking for END_TAG" );

      soMiscData += soLine;
   }

   int nConsPosStart;
   int nConsPosEnd;

   if ( bUnpaddedTagCoordinates ) {
      nConsPosStart = nUnpaddedStartPos;
      nConsPosEnd = nUnpaddedEndPos;
   }
   else {
      
      int nPaddedConsPos1 = 
         nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag, 
                                                                  nUnpaddedStartPos );

      int nPaddedConsPos2 = 
         nGetPaddedConsPosFromUnpaddedPosInDirectionOfSequencing( pLocFrag,
                                                                  nUnpaddedEndPos );

      nConsPosStart = MIN( nPaddedConsPos1, nPaddedConsPos2 );
      nConsPosEnd   = MAX( nPaddedConsPos1, nPaddedConsPos2 );
   }


   tag* pTag = new tag( pLocFrag,
                      NULL, // contig
                      soTagType, 
                      nConsPosStart,
                      nConsPosEnd,
                      true, // bWriteToPhdFileNotAceFile_
                      soComment,
                      soSource,
                      soDate,
                      false ); // no NoTrans

   pTag->bCoordinatesAreUnpaddedRead_ = bUnpaddedTagCoordinates;

   if ( !soMiscData.isNull() ) {
      pTag->soMiscData_ = soMiscData;
   }

   pTag->lID_ = lID;


   pLocFrag->aTags_.append( pTag );
} // readTag
   


// This cannot be used for added bases since it assumes that the
// LocatedFragment read is already populated with bases and we are
// just using this to fill in pSeqPhredCalledBases_.  Even then,
// it doesn't set the size of pSeqPhredCalledBases_



static void processCommentAndDnaForOriginalBaseCalls( LocatedFragment* pLocFrag, 
                                                      FILE* pFil ) {

   
   pLocFrag->bAlreadyReadPhredCallsInPhdBall_ = true;

   pLocFrag->pSeqPhredCalledBases_ = new Sequence();



   // looking for BEGIN_COMMENT

   bool bFoundComment = false;
   while( !bFoundComment ) {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT for phd file comment section" );
      }
      else
         ++nLine;

      if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 )
         bFoundComment = true;
   }

   // if reached here, found the BEGIN_COMMENT section
   // All we want out of the comment section is the TraceArrayMin and Max
   // All the rest will be read from the higher version phd file.

   bool bFoundTraceArrayMax = false;

   bool bFoundCommentEnd = false;
   while( ! bFoundCommentEnd ) {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_COMMENT" );
      }
      else
         ++nLine;

      if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 )
         bFoundCommentEnd = true;
      else if (  strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, 
                          nTRACE_ARRAY_MIN_INDEX ) == 0 ) {
         
         // soGetRestOfLine strips off final CR

         RWCString soTraceArrayMinIndex =
            soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX );

         if ( !bIsNumeric( soTraceArrayMinIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" );
         }

         pLocFrag->pSeqPhredCalledBases_->nTraceArrayMinIndex_ = 
            atoi( soTraceArrayMinIndex.data() );

      }
      else if ( strncmp( szTRACE_ARRAY_MAX_INDEX, szLine,
                         nTRACE_ARRAY_MAX_INDEX ) == 0 ) {

         RWCString soTraceArrayMaxIndex =
            soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX );
         
         if ( !bIsNumeric( soTraceArrayMaxIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ### but ### wasn't numeric" );
         }

         pLocFrag->pSeqPhredCalledBases_->nTraceArrayMaxIndex_ =
            atoi( soTraceArrayMaxIndex.data() );

         bFoundTraceArrayMax = true;
      }
   }


   if ( pLocFrag->bHasChromat() ) {

      pLocFrag->pSeqPhredCalledBases_->createPointPosArray( 
       bFoundTraceArrayMax,
       pLocFrag->pSeqPhredCalledBases_->nTraceArrayMaxIndex_,
       pLocFrag->length(), // this might be longer than we need, but 
       // won't be shorter than we need
       false ); // false sets current length of array to zero
   }


   // looking for BEGIN_DNA
   
   bool bFound = false;
   while( !bFound ) {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" );
      }
      else
         ++nLine;

      if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) 
         bFound = true;
   }


   // prevent zillions of resizes of pSeqPhredCalledBases_
   // DG Aug 2008

   pLocFrag->pSeqPhredCalledBases_->resize( pLocFrag->length() );



   char cBase;

   bool bHasChromat = pLocFrag->bHasChromat();

   bool bEndDNA = false;
   while( !bEndDNA ) {
      if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_DNA" );
      }
      else {
         ++nLine;
      }
      if ( szLine[0] == 'E' &&
           szLine[1] == 'N' &&
           szLine[2] == 'D' &&
           szLine[3] == '_' &&
           szLine[4] == 'D' &&
           szLine[5] == 'N' &&
           szLine[6] == 'A' ) {
         bEndDNA = true;
      }
      else {

         // parse the line

         cBase = szLine[0];

         // convert to lowercase

         if (! 
             ( 'a' <= cBase && cBase <= 'z' ) 
             ) {
            if ( 'A' <= cBase && cBase <= 'Z' ) {
               cBase = cBase + 'a' - 'A';
            }
            else {
               // not a base--not capital and not lowercase

               PARSE_PANIC_1_ARG( "expected letter in position 0 of this line but instead found \"%c\"", szLine[0] );
            }
         }

         // advance over 1st whitespace

         if ( szLine[1] != ' ' ) {
            PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" );
         }

         // any more whitespace to skip over?
         nPos = 2;
         if ( szLine[2] == ' ' || szLine[2] == '\0' ) {
            while( szLine[nPos] == ' ' ) {
               ++nPos;
            }
            if ( szLine[nPos] == '\0' ) {
               PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" );
            }
         }

         // if reached here, szLine[nPos] is not a space and not a null

         // advance to next whitespace, accumlating quality on the way

         nQuality = 0;
         while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
            nQuality = nQuality*10 + ( szLine[nPos] - '0' );
            ++nPos;
         }
         
         pLocFrag->pSeqPhredCalledBases_->append( Ntide( cBase, nQuality ) );


         // no need to set peak position if there is no chromat
         if ( !bHasChromat ) continue;
         

         // when exit loop above (and have chromat), the following
         // character must be a space
         
         if ( szLine[nPos] != ' ' ) {
            PARSE_PANIC_2_ARGS( "something wrong after quality on this line.  Should be a space but instead is \"%c\" at pos %d", 
                               szLine[nPos],
                               nPos );
         }


         // now advance to next non-whitespace to find the peak position
         ++nPos;
        
         if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) {
            while( szLine[nPos] == ' ') {
               ++nPos;
            }

            if ( szLine[nPos] == '\0' ) {
               PARSE_PANIC( "this line should have 3 tokens but has only 2" );
            }
         }

         // at this point szLine[nPos] is not a space
         // It should be the peak position.

         nPeakPosition = 0;
         while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
            nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' );
            ++nPos;
         }

         if ( szLine[nPos] != '\n' ) {
            PARSE_PANIC( "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position." );
         }
         
         
         pLocFrag->pSeqPhredCalledBases_->appendPointPos( nPeakPosition );
      }
   } // while( !bEndDNA )

   // what if the read is complemented? Must reverse the order of the
   // bases and complement them.

   if ( pLocFrag->bComp() ) {
      pLocFrag->pSeqPhredCalledBases_->complementSequenceAfterReadingPHDFile( 
                                 bFoundTraceArrayMax );
   }
         
}



static void processCommentAndDna( LocatedFragment* pLocFrag, 
                                  FILE* pFil,
                                  int& nNumberOfUnpaddedBases ) {


   pLocFrag->bAlreadyReadPhdFile_ = true;

   // looking for BEGIN_COMMENT

   bool bFound = false;
   while( !bFound ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( 
             "premature end of file while looking for BEGIN_COMMENT for phd file comment section" );
      }
      else 
         ++nLine;

      if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) 
         bFound = true;
   }
      
   // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:,

   pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = false;

   bool bFoundTime = false;

   bool bFoundCommentEnd = false;
   while( ! bFoundCommentEnd ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_COMMENT" );
      }
      else 
         ++nLine;

      if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) 
         bFoundCommentEnd = true;
      else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) {

         RWCString soLine( szLine );
         RWCTokenizer tokNext( soLine );
         RWCString soNextToken = tokNext();

         assert( soNextToken == "TIME:" );
         RWCString soDayOfWeek = tokNext();
         RWCString soMonth = tokNext();
         RWCString soDayOfMonth = tokNext();
         RWCString soTime = tokNext();
         RWCString soYear = tokNext();

         if ( !bIsNumeric( soYear ) ) {
            soYear = tokNext();
         }
         
         RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth +
            " " + soDayOfMonth +
            " " + soTime +
            " " + soYear;


         if ( soTimestampFromPHDFile != pLocFrag->soGetTimestamp() &&
              !pCP->bAllowTimestampMismatch_ ) {
            ostrstream ost;
            ost << "phdball timestamp mismatch:  ace file says " << 
               pLocFrag->soGetTimestamp() << " but phdball " <<
               pCP->filFileOfPhdFilesCurrentlyBeingRead_ << " says " <<
               soTimestampFromPHDFile << " for read " << 
               pLocFrag->soGetName() << endl << ends;

            InputDataError ide( ost.str() );
            throw ide;
         }
         bFoundTime = true;

      }
      else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) {
         pLocFrag->soABIThumbprint_ = soGetRestOfLine( szLine, nABI_THUMBPRINT );
      }
      else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) {
         pLocFrag->soPhredVersion_ = soGetRestOfLine( szLine, nPHRED_VERSION );
      }
      else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) {
         pLocFrag->soCallMethod_ = soGetRestOfLine( szLine, nCALL_METHOD );
      }
      else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) {
         RWCString soQualityLevels = 
            soGetRestOfLine( szLine, nQUALITY_LEVELS );

         if (! bIsNumeric( soQualityLevels ) ) {
            PARSE_PANIC( "quality levels not numeric" );
         }

         pLocFrag->nQualityLevels_ = atoi( (char*) soQualityLevels.data() );
      }
      else if ( 
               strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, 
                        nTRACE_ARRAY_MIN_INDEX ) == 0 
               ) {

         // soGetRestOfLine strips off final CR
         RWCString soTraceArrayMinIndex =
            soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX );


         if ( !bIsNumeric( soTraceArrayMinIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" );
         }

         pLocFrag->nTraceArrayMinIndex_ = atoi( soTraceArrayMinIndex.data() );
         
      }
      else if ( 
               strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, 
                        nTRACE_ARRAY_MAX_INDEX ) == 0 
               ) {

         // soGetRestOfLine strips off final CR
         RWCString soTraceArrayMaxIndex = 
            soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX );
         

         if ( !bIsNumeric( soTraceArrayMaxIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ###  but ### wasn't numeric" );
         }
         
         pLocFrag->nTraceArrayMaxIndex_ = atoi( soTraceArrayMaxIndex.data() );

         pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true;

      }
      else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) {
         int nLength = strlen( szLine );
         
         // make sure that there is at least 2 chars after CHEM:
         // which allows for a space and a non-space
         if ( nLength < 7 )
            PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" );
            
         // start with character after CHEM: which is at position 5
         pLocFrag->soChemistry_ = soGetRestOfLine( szLine, 5 );

      }
      else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) {
         
         int nLength = strlen( szLine );

         if ( nLength < 6 )
            PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" );
         
         pLocFrag->soDye_ = soGetRestOfLine( szLine, 4 );
      }
      else {
         // ALLOW other lines in the comment section of the phd file
      }
   }


   if ( !bFoundTime ) {
      PARSE_PANIC( "did not find TIME: in the BEGIN_COMMENT/END_COMMENT section of the phd block" );
   }


   if ( pLocFrag->bHasChromat() ) {

      pLocFrag->createPointPosArray( pLocFrag->bPhdFileHasTraceArrayMinAndMax_,
                                  pLocFrag->nTraceArrayMaxIndex_,
                                  pLocFrag->length(),
                                  true ); // bFillUpArray
   }
      

   // now looking for BEGIN_DNA

   bFound = false;
   while( !bFound ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( 
             "premature end of file while looking for BEGIN_DNA" );
      }
      else 
         ++nLine;

      if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) 
         bFound = true;
   }



   int nPaddedSequencePos;

   // find the first non-pad base in the direction of sequencing.  To 
   // do this, start at the right end (for complemented reads) or left end
   // for uncomplemented reads.

   if ( pLocFrag->bComp() ) {
      nPaddedSequencePos = pLocFrag->nGetEndIndex();

      if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) {
         while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() &&
                nPaddedSequencePos > pLocFrag->nGetStartIndex() )
            --nPaddedSequencePos;

         if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) {
            fprintf( stderr, "warning: read %s is entirely pads\n",
                     (char*) pLocFrag->soGetName().data() );
            fclose( pFil );
            return;
         }
      }
      // prepare for 1st advancing:
      ++nPaddedSequencePos;
         
   }
   else {
      nPaddedSequencePos = pLocFrag->nGetStartIndex();

      if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) {
         while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() &&
                nPaddedSequencePos < pLocFrag->nGetEndIndex() )
            ++nPaddedSequencePos;


         if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) {
            fprintf( stderr, "warning: read %s is entirely pads\n",
                     (char*) pLocFrag->soGetName().data() );
            fclose( pFil );
            return;
         }
      }
      // prepare for 1st advancing
      --nPaddedSequencePos;
   }



   bool bHasChromat = pLocFrag->bHasChromat();

   // now reading the data until get END_DNA
   char cTemp;

   int   nBases = 0;

   bool bEndDNA = false;
   while( !bEndDNA ) {
      if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_DNA" );
      }
      else {
         ++nLine;
      }

      if ( szLine[0] == 'E' &&
           szLine[1] == 'N' &&
           szLine[2] == 'D' &&
           szLine[3] == '_' &&
           szLine[4] == 'D' &&
           szLine[5] == 'N' &&
           szLine[6] == 'A' ) {
         bEndDNA = true;
      }
      else {
         ++nBases;

         // all of the code below is just to move nPaddedSequencePos to
         // the next non-pad--the base that corresponds to the next base
         // in the phd file

         if ( pLocFrag->bComp() ) {
            --nPaddedSequencePos;

            if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) {
               PARSE_PANIC_2_ARGS( "too many bases in phd for complemented read.  At least %d bases in phd file but only %d unpadded bases in ace file",
                                   nBases,
                                   pLocFrag->nGetUnpaddedSeqLength() );
            }

            if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() ) {
               do {
                  --nPaddedSequencePos;
                  if ( nPaddedSequencePos < pLocFrag->nGetStartIndex() ) {
                     PARSE_PANIC_2_ARGS( "too many bases in phd file for complemented read.  (2) At least %d bases in phd file but only %d unpadded bases in ace file",
                                         nBases,
                                         pLocFrag->nGetUnpaddedSeqLength() );

                  }
                  
               } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() );
            }
         }
         else {
            ++nPaddedSequencePos;

            if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) {
               PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read.  At least %d bases in phd file but only %d unpadded bases in ace file",
                                   nBases,
                                   pLocFrag->nGetUnpaddedSeqLength() );
            }

            if ( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad () ) {
               //   must further change nPaddedSequencePos to next non-pad
               do {
                  ++nPaddedSequencePos;
                  if ( nPaddedSequencePos > pLocFrag->nGetEndIndex() ) {
                     PARSE_PANIC_2_ARGS( "too many bases in phd file for uncomplemented read.  (2) At least %d bases in phd file but only %d unpadded bases in ace file",
                                         nBases,
                                         pLocFrag->nGetUnpaddedSeqLength() );

                  }
                  
               } while( pLocFrag->ntideGet( nPaddedSequencePos ).bIsPad() );
               // the loop above will be exited only a non-pad (or an 
               // exception is thrown)
            }
         }

            
         // all of the above was just to change nPaddedSequencePos


         cTemp = szLine[0];

         if (! 
             ( 'a' <= cTemp && cTemp <= 'z' )  
             ) {
            if ( 'A' <= cTemp && cTemp <= 'Z' ) {
               szLine[0] = szLine[0] +
                  ( 'a' - 'A' );
            }
            else {
               char szTemp[200];
               sprintf( szTemp, 
                     "expected letter in position 0 of this line but instead found \"%s\"", 
                        szLine );

               PARSE_PANIC( szTemp );
            }
         }
             

         // sanity check that the phd file and the ace file agree

         char cAceBase = pLocFrag->ntideGet( nPaddedSequencePos ).cGetBase();
         char cPHDBase;
         if ( pLocFrag->bComp() ) 
            cPHDBase = cComplementBase( szLine[0] );
         else
            cPHDBase = szLine[0];
            
         if ( cPHDBase != cAceBase ) {
            // eliminate changes that crossmatch may have introduced
            // in to the acefile bases

            if ( cAceBase != 'x' && szLine[0] != 'x' && cAceBase != 'n' ) {
               ostrstream ost;
               ost << "ace file says base at nPaddedSequencePos " << 
                  nPaddedSequencePos << " or nBases " << nBases << 
                  " is " << cAceBase << 
                  " but PHD file says " << szLine[0] << 
                  " which on top strand is: " << cPHDBase << 
                  " for read " << 
                  pLocFrag->soGetName() << " at line " << nLine << 
                  " in file of phd files " << pCP->filFileOfPhdFilesCurrentlyBeingRead_ << endl << "How could this have happened? Somehow the ace file was created from a different phdball that was replaced by this more recent one." <<
                  endl << __FILE__ << ":" << __LINE__ << endl << ends;
               
               InputDataError ide( ost.str() );
               throw ide;
            }
         }


         // advance over 1st whitespace

         if ( szLine[1] != ' ' ) {
            PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" );
         }

         nPos = 2;
         if ( szLine[2] == ' ' || szLine[2] == '\0' ) {
            while( szLine[nPos] == ' ' ) {
               ++nPos;
            }
            if ( szLine[nPos] == '\0' ) {
               PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" );
            }
         }

         // if reached here, szLine[nPos] is not a space and not a null

         // advance to next whitespace, accumlating quality on the way

         nQuality = 0;
         while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
            nQuality = nQuality*10 + ( szLine[nPos] - '0' );
            ++nPos;
         }

         // if a base is cross_matched out, then make its quality 0
         // since it should be dark--should not attract attention

         // changed Dec 1999 since we need quality values for autofinish
         // to get the error profile of existing reads

         //         if ( cAceBase == 'x' )
         //            nQuality = 0;


         pLocFrag->setQualityAtSeqPos( nPaddedSequencePos, nQuality );


         // if there is no chromat, no need to set peak position
         if ( !bHasChromat ) continue;

         // when exit loop above (and have chromat), the following
         // character must be a space
         
         if ( szLine[nPos] != ' ' ) {
            PARSE_PANIC( "this line should have 3 tokens but doesn't" );
         }

         // now advance to next non-whitespace to find the peak position
         ++nPos;

         if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) {
            while( szLine[nPos] == ' ') {
               ++nPos;
            }

            if ( szLine[nPos] == '\0' ) {
               PARSE_PANIC( "this line should have 3 tokens but doesn't" );
            }
         }            


         // advance to next whitespace (actually a carriage return), 
         // accumlating peak position on the way 


         nPeakPosition = 0;
         while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
            nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' );
            ++nPos;
         }

         if ( szLine[nPos] != '\n' ) {
            char szTemp[1000];
            sprintf( szTemp,
                     "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position.  Line is:\n\"%s\"",
                     szLine );
            PARSE_PANIC( szTemp );
         }

         // if flipping read left and right, must do this also 
         // for the peak position so it lines up with the flipped trace

         if ( pLocFrag->bComp() ) {
            if ( pLocFrag->bPhdFileHasTraceArrayMinAndMax_ )
               nPeakPosition = pLocFrag->nTraceArrayMaxIndex_ - nPeakPosition;
            else
               nPeakPosition = -nPeakPosition;
         }

         pLocFrag->setTracePointPosAtSeqPos( nPaddedSequencePos, nPeakPosition );

      } //  if ( strncmp( szEND_DNA, szLine, nEND_DNA ) == 0 )  ... else 


   } //       while( !bEndDNA ) {


   // check that there are the same number of bases in the phd file
   // as the ace file

   // Why don't I just check if nBases == pLocFrag->nGetUnpaddedSeqLength() ?
   // It is because nGetUnpaddedSeqLength() takes a long time so I use
   // the more efficient method below.
   
   if ( pLocFrag->bComp() ) {
      if ( nPaddedSequencePos != pLocFrag->nGetStartIndex() ) {
         if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) {
            PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases",
                                nBases,
                                pLocFrag->nGetUnpaddedSeqLength() );
         }
      }
   }
   else {
      if ( nPaddedSequencePos != pLocFrag->nGetEndIndex() ) {
         if ( pLocFrag->nGetUnpaddedSeqLength() != nBases ) {
            PARSE_PANIC_2_ARGS( "The phd file has only %d bases but the ace file has %d bases",
                                nBases,
                                pLocFrag->nGetUnpaddedSeqLength() );
         }
      }
   }

   // this was inserted to fix a bug with polyphred in which
   // tags applied to the .1 phd file may be out of range if the user then
   // deletes bases with subsequent versions of the phd files.  Since
   // the .1 phd file is still read, the tags in it may be out of range.
   // Thus the solution is to not read the tags in the .1 phd file if it
   // is just being read for the phred calls.

   bool bAssignPeakPositions = bHasChromat;

   pLocFrag->assignQualityAndPeakPositionsToPads( bAssignPeakPositions );

   nNumberOfUnpaddedBases = nBases;

} // processCommentAndDna


      

static void processMiscReadStuff( LocatedFragment* pLocFrag, 
                                  FILE* pFil, 
                                  const long lNumberOfUnpaddedBases,
                                  char* szLine, 
                                  bool& bEndOfFile,
                                  bool& bFoundBEGIN_SEQUENCE,
                                  const bool bUnpaddedTagCoordinates ) {

   // doesn't return until one of these is set to true

   bEndOfFile = false;
   bFoundBEGIN_SEQUENCE = false;

   bool bSetPaddedFromUnpaddedArray = false;

   while( fgets( szLine, nMaxLineSize, pFil) != NULL ) {
      ++nLine;

      if ( strcmp( szLine, "BEGIN_TAG\n" ) == 0 ) {
         
         if ( !bUnpaddedTagCoordinates && !bSetPaddedFromUnpaddedArray ) {
            setPaddedFromUnpaddedArray( lNumberOfUnpaddedBases, pLocFrag );
            bSetPaddedFromUnpaddedArray = true;
         }

         readTag( pFil, 
                  pCP->filFileOfPhdFilesCurrentlyBeingRead_,
                  pLocFrag, 
                  lNumberOfUnpaddedBases, 
                  bUnpaddedTagCoordinates );
      }
      else if ( strncmp( szLine, "WR{", 3 ) == 0 ) {
         wholeReadItem* pWR;
         readWholeReadItem( pFil, pCP->filFileOfPhdFilesCurrentlyBeingRead_, pLocFrag, pWR );
         pWR->appendThyself();
      }
      else if ( memcmp( szLine, "BEGIN_SEQUENCE", 14 ) == 0 ) {
         bFoundBEGIN_SEQUENCE = true;
         return;
      }
   }
   
   bEndOfFile = true;
} // processMiscReadStuff




   
void  Assembly :: readFileOfPhdFiles( const FileName& filFileOfPhdFiles ) 
{

   pCP->filFileOfPhdFilesCurrentlyBeingRead_ = filFileOfPhdFiles;

   // moved this to readPHDFilesAndSetQualitiesAndPeakPositions
   //   if ( !pCP->filFileOfPhdFiles_.bFileByThisNameExists() ) return;

   if (! bCalculatedLengths ) {
      bCalculatedLengths = true;
      
      nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE );
      nBEGIN_COMMENT  = strlen( szBEGIN_COMMENT );
      nBEGIN_DNA      = strlen( szBEGIN_DNA );
      nTIME           = strlen( szTIME );
      nEND_DNA        = strlen( szEND_DNA );
      nABI_THUMBPRINT = strlen( szABI_THUMBPRINT );
      nPHRED_VERSION  = strlen( szPHRED_VERSION );
      nCALL_METHOD    = strlen( szCALL_METHOD );
      nQUALITY_LEVELS = strlen( szQUALITY_LEVELS );
      nEND_COMMENT    = strlen( szEND_COMMENT );
      nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX );
      nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX );

   }


   // this should leave any subdirectory of ../phdball_dir but should
   // remove ../phdball_dir/

   RWCString soPhdBallWithoutDirectory = filFileOfPhdFiles;
   if ( soPhdBallWithoutDirectory.bStartsWithAndRemove( pCP->filPhdBallDirectory_ ) ) {
      soPhdBallWithoutDirectory.bStartsWithAndRemove( "/" );
   }


   FILE *pFil;

   nLine = 0;

   cerr << "opening " << filFileOfPhdFiles << endl;

   pFil = fopen( (char*) filFileOfPhdFiles.data(), "r" );
   //   ifsPHDFile.open( filPHD, ios:: in );

   if (pFil == NULL ) {
      ostrstream ost;
      ost << "could not open phd ball file " << filFileOfPhdFiles << endl << ends;
      InputDataError ide( ost.str() );
      throw ide;
   }


   int nNumberOfReadsInAssembly = 
      nGetNumberOfReadsInAssembly();

   // look for each read in turn, until the end of the file

   int nNumberOfPhdFilesInFile = 0;

   bool bEndOfFile = false;
   bool bFoundBEGIN_SEQUENCE = false;
   RWCString soReadName;
   RWCString soVersion;
   int nNumberOfUnpaddedBases;
   LocatedFragment* pLocFrag;
   RWCTokenizer tok("");
   int nVersion;

 look_for_BEGIN_SEQUENCE:   

   // looking for BEGIN_SEQUENCE

   if (fgets( szLine, nMaxLineSize, pFil ) == NULL )
      goto end_of_file;

   ++nLine;

   if ( memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) != 0 )
      goto look_for_BEGIN_SEQUENCE;

 found_BEGIN_SEQUENCE:
   ++nNumberOfPhdFilesInFile;

   // moved this above ++nNumberOfPhdFiles so
   // that the pCP->nPhdFilesRead_ is equal to
   // nNumberOfReadsInAssembly in the case in which
   // all read are in phd.ball

   if ( 
       ( nNumberOfPhdFilesInFile < 10 ) ||
       ( ( nNumberOfPhdFilesInFile < 10000 ) && ( nNumberOfPhdFilesInFile % 1000 == 0 ) ) ||
       ( ( nNumberOfPhdFilesInFile < 100000 ) && ( nNumberOfPhdFilesInFile % 10000 == 0 ) ) ||
       ( nNumberOfPhdFilesInFile % 100000 == 0 ) 
       ) {
      cerr << "read phd files in " << filFileOfPhdFiles << "  found: " << 
         soAddCommas( nNumberOfPhdFilesInFile ) <<
         " totals: used: " <<  soAddCommas( pCP->nPhdFilesRead_ ) << 
         " need: " << soAddCommas( nNumberOfReadsInAssembly ) << endl;
      cerr.flush();
   }

   tok.soStringToBeTokenized_ = szLine;
   tok.nAfterLastToken_ = -1;

   assert( tok() == "BEGIN_SEQUENCE" );

   soReadName = tok();
   soVersion = tok();

   pLocFrag = pGetLocatedFragmentByName( soReadName );

   if ( !pLocFrag ) goto look_for_BEGIN_SEQUENCE;

   nVersion = -666;
   if ( soVersion.bIsNull() )
      nVersion = 1;
   else if ( !bIsNumeric( soVersion ) ) {
      RWCString soError = filFileOfPhdFiles + " is corrupted since line " +
         szLine + " has bad version";
      THROW_ERROR( soError );
   }
   else {
      nVersion = atoi( soVersion.data() );

   }

   if ( ( nVersion != pLocFrag->nVersion_  ) &&  ( nVersion != 1  ) ) {
         // for example, if the current version is .3 and this version is 
         // .2, we aren't interested
      goto look_for_BEGIN_SEQUENCE;
   }

   // if reached here, nVersion == 1 or else nVersion == pLocFrg->nVersion_
   // or both


   ++pCP->nPhdFilesRead_;


   if ( nVersion == pLocFrag->nVersion_ ) {

      try {
         processCommentAndDna( pLocFrag, pFil, nNumberOfUnpaddedBases );
      }
      catch( ExceptionBase eb ) {
         //         cerr << eb.szGetDesc() << " so will read phd file from ../phd_dir and see if it does not have this problem" << endl;

         // force reading of phd file
         pLocFrag->bAlreadyReadPhdFile_ = false;

         // since probably some indeterminate place in the phd.ball, find next
         // read
         goto look_for_BEGIN_SEQUENCE;
      }

      // now process tags and WR items and look for next BEGIN_SEQUENCE

      const bool bUnpaddedTagCoordinates = false;

      processMiscReadStuff( pLocFrag, pFil, nNumberOfUnpaddedBases, 
                            szLine, bEndOfFile, bFoundBEGIN_SEQUENCE,
                            bUnpaddedTagCoordinates );

      pLocFrag->filPhdBall_ = soPhdBallWithoutDirectory;

      if ( bEndOfFile ) goto end_of_file;
      if ( bFoundBEGIN_SEQUENCE ) goto found_BEGIN_SEQUENCE;

      assert( false );  // should never get here
   }
   else {
      assert( nVersion == 1 );

      processCommentAndDnaForOriginalBaseCalls( pLocFrag, pFil );

      // no need to look at WR or tags for original base calls
      // We will stop reading at END_DNA so there is no chance of 
      // finding BEGIN_SEQUENCE or end-of-file

      goto look_for_BEGIN_SEQUENCE;
      
   }

 end_of_file:
   cerr << "Number of phd blocks used from " << filFileOfPhdFiles << ": " <<
      soAddCommas( pCP->nPhdFilesRead_ ) << endl;
   cerr.flush();

   fclose( pFil );
} //  readFileOfPhdFiles





static void processCommentAndDnaForAddedRead( LocatedFragment* pLocFrag, 
                                  FILE* pFil,
                                  long& lNumberOfUnpaddedBases ) {


   pLocFrag->bAlreadyReadPhdFile_ = true;

   // looking for BEGIN_COMMENT

   bool bFound = false;
   while( !bFound ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( 
             "premature end of file while looking for BEGIN_COMMENT for phd file comment section" );
      }
      else 
         ++nLine;

      if ( strncmp( szBEGIN_COMMENT, szLine, nBEGIN_COMMENT ) == 0 ) 
         bFound = true;
   }
      
   // looking for TIME:, CHEM:, DYE:, ABI_THUMBPRINT:,

   pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = false;

   bool bFoundTime = false;

   bool bFoundCommentEnd = false;
   while( ! bFoundCommentEnd ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_COMMENT" );
      }
      else 
         ++nLine;

      if ( strncmp( szEND_COMMENT, szLine, nEND_COMMENT ) == 0 ) 
         bFoundCommentEnd = true;
      else if ( strncmp( szTIME, szLine, nTIME ) == 0 ) {

         RWCString soLine( szLine );
         RWCTokenizer tokNext( soLine );
         RWCString soNextToken = tokNext();

         assert( soNextToken == "TIME:" );
         RWCString soDayOfWeek = tokNext();
         RWCString soMonth = tokNext();
         RWCString soDayOfMonth = tokNext();
         RWCString soTime = tokNext();
         RWCString soYear = tokNext();

         if ( !bIsNumeric( soYear ) ) {
            soYear = tokNext();
         }
         
         RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth +
            " " + soDayOfMonth +
            " " + soTime +
            " " + soYear;

         // no timestamp checking since the read is not in the ace file

         pLocFrag->setPHDTimestamp( soTimestampFromPHDFile );

         bFoundTime = true;

      }
      else if ( strncmp( szABI_THUMBPRINT, szLine, nABI_THUMBPRINT ) == 0 ) {
         pLocFrag->soABIThumbprint_ = soGetRestOfLine( szLine, nABI_THUMBPRINT );
      }
      else if ( strncmp( szPHRED_VERSION, szLine, nPHRED_VERSION ) == 0 ) {
         pLocFrag->soPhredVersion_ = soGetRestOfLine( szLine, nPHRED_VERSION );
      }
      else if ( strncmp( szCALL_METHOD, szLine, nCALL_METHOD ) == 0 ) {
         pLocFrag->soCallMethod_ = soGetRestOfLine( szLine, nCALL_METHOD );
      }
      else if ( strncmp( szQUALITY_LEVELS, szLine, nQUALITY_LEVELS ) == 0 ) {
         RWCString soQualityLevels = 
            soGetRestOfLine( szLine, nQUALITY_LEVELS );

         if (! bIsNumeric( soQualityLevels ) ) {
            PARSE_PANIC( "quality levels not numeric" );
         }

         pLocFrag->nQualityLevels_ = atoi( (char*) soQualityLevels.data() );
      }
      else if ( 
               strncmp( szTRACE_ARRAY_MIN_INDEX, szLine, 
                        nTRACE_ARRAY_MIN_INDEX ) == 0 
               ) {

         // soGetRestOfLine strips off final CR
         RWCString soTraceArrayMinIndex =
            soGetRestOfLine( szLine, nTRACE_ARRAY_MIN_INDEX );


         if ( !bIsNumeric( soTraceArrayMinIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MIN_INDEX: ### but ### wasn't numeric" );
         }

         pLocFrag->nTraceArrayMinIndex_ = atoi( soTraceArrayMinIndex.data() );
         
      }
      else if ( 
               strncmp( szTRACE_ARRAY_MAX_INDEX, szLine, 
                        nTRACE_ARRAY_MAX_INDEX ) == 0 
               ) {

         // soGetRestOfLine strips off final CR
         RWCString soTraceArrayMaxIndex = 
            soGetRestOfLine( szLine, nTRACE_ARRAY_MAX_INDEX );
         

         if ( !bIsNumeric( soTraceArrayMaxIndex ) ) {
            PARSE_PANIC( "line should have been TRACE_ARRAY_MAX_INDEX: ###  but ### wasn't numeric" );
         }
         
         pLocFrag->nTraceArrayMaxIndex_ = atoi( soTraceArrayMaxIndex.data() );

         pLocFrag->bPhdFileHasTraceArrayMinAndMax_ = true;

      }
      else if ( strncmp( "CHEM:", szLine, 5 ) == 0 ) {
         int nLength = strlen( szLine );
         
         // make sure that there is at least 2 chars after CHEM:
         // which allows for a space and a non-space
         if ( nLength < 7 )
            PARSE_PANIC( "line should have been CHEM: (prim, term, or unknown)" );
            
         // start with character after CHEM: which is at position 5
         pLocFrag->soChemistry_ = soGetRestOfLine( szLine, 5 );

      }
      else if ( strncmp( "DYE:", szLine, 4 ) == 0 ) {
         
         int nLength = strlen( szLine );

         if ( nLength < 6 )
            PARSE_PANIC( "line should have been DYE: ( rhod, big, ET, d-rhod, or unknown )" );
         
         pLocFrag->soDye_ = soGetRestOfLine( szLine, 4 );
      }
      else if ( strncmp( "CHROMAT_FILE:", szLine, 13 ) == 0 ) {

         int nLength = strlen( szLine );

         if ( nLength < 14 ) {
            PARSE_PANIC( "line should have been CHROMAT: (chromat_file)" );
         }

         pLocFrag->filChromatFile_ = soGetRestOfLine( szLine, 14 );
      }
      else {
         // ALLOW other lines in the comment section of the phd file
      }
   }


   if ( !bFoundTime ) {
      PARSE_PANIC( "did not find TIME: in the BEGIN_COMMENT/END_COMMENT section of the phd block" );
   }


   // added debugging info April 2009 for Vancouver
   if ( pLocFrag->pSeqPhredCalledBases_ ) {
      cerr << "Fatal error.  read " << pLocFrag->soGetName() << " has already been read once.  Why did this occur?  Possibilities include that there are 2 reads with the same name or that this read was already in the ace file" << endl;
      cerr.flush();
      assert( !pLocFrag->pSeqPhredCalledBases_ );
   }



   pLocFrag->pSeqPhredCalledBases_ = new Sequence();

      

   // now looking for BEGIN_DNA

   bFound = false;
   while( !bFound ) {
      if (fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( 
             "premature end of file while looking for BEGIN_DNA" );
      }
      else 
         ++nLine;

      if ( strncmp( szBEGIN_DNA, szLine, nBEGIN_DNA ) == 0 ) 
         bFound = true;
   }



   // how much room should we reserve for the bases?


   long lBytesPastFirstLine = 0;
   long lBasesThisRead = 0;

   bool bEndDNA = false;
   while( !bEndDNA ) {
      if ( fgets( szLine, nMaxLineSize, pFil) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_DNA" );
      }
      else {
         //         ++nLine;  we'll be fseeking back
         lBytesPastFirstLine += strlen( szLine );
      }

      if ( szLine[0] == 'E' &&
           szLine[1] == 'N' &&
           szLine[2] == 'D' &&
           szLine[3] == '_' &&
           szLine[4] == 'D' &&
           szLine[5] == 'N' &&
           szLine[6] == 'A' ) {
         bEndDNA = true;
      }
      else {
         ++lBasesThisRead;
      }
   }
   

   // now go back to where the file was at the beginning of this block

   if ( fseek( pFil, -lBytesPastFirstLine, SEEK_CUR ) != 0 ) {
      RWCString soError( (size_t) 1000 );
      sprintf( soError.data(), 
               "fseek error when seeking in phdball back %d bytes %s (%d)\n", 
               (int) lBytesPastFirstLine,
               strerror( errno ),
               errno );
               
   }


   // and allocate enough space

   lNumberOfUnpaddedBases = lBasesThisRead;

   assert( pLocFrag->pSeqPhredCalledBases_ );

   pLocFrag->pSeqPhredCalledBases_->resize( lNumberOfUnpaddedBases );

   bool bHasChromat = pLocFrag->bHasChromat();

   if ( bHasChromat ) {

      pLocFrag->pSeqPhredCalledBases_->createPointPosArray( 
          pLocFrag->bPhdFileHasTraceArrayMinAndMax_,
          pLocFrag->nTraceArrayMaxIndex_,
          lNumberOfUnpaddedBases,
          true ); // bFillUpArray
   }

   char cTemp;
   for( int n1Base = 1; n1Base <= lNumberOfUnpaddedBases; ++n1Base ) {
      assert( fgets( szLine, nMaxLineSize, pFil) != NULL );

      ++nLine;

      // sometime try sscanf and see if it is faster

      cTemp = szLine[0];

      if (! 
          ( 'a' <= cTemp && cTemp <= 'z' )  
          ) {
         if ( 'A' <= cTemp && cTemp <= 'Z' ) {
            szLine[0] = szLine[0] +
               ( 'a' - 'A' );
         }
         else {
            char szTemp[200];
            sprintf( szTemp, 
                     "expected letter in position 0 of this line but instead found \"%s\"", 
                     szLine );

            PARSE_PANIC( szTemp );
         }
      }
             
      
      // advance over 1st whitespace

      if ( szLine[1] != ' ' ) {
         PARSE_PANIC( "expected line of the form (base)(space)(quality)(space)(peak position) but the first space was not in column 2" );
      }

      nPos = 2;
      if ( szLine[2] == ' ' || szLine[2] == '\0' ) {
         while( szLine[nPos] == ' ' ) {
            ++nPos;
         }
         if ( szLine[nPos] == '\0' ) {
            PARSE_PANIC( "this line should be of the form (base)(space)(quality)(space)(peak position) but the line ended after the base" );
         }
      }

      // if reached here, szLine[nPos] is not a space and not a null

      // advance to next whitespace, accumlating quality on the way

      nQuality = 0;
      while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
         nQuality = nQuality*10 + ( szLine[nPos] - '0' );
         ++nPos;
      }


      // if a base is cross_matched out, then make its quality 0
      // since it should be dark--should not attract attention

      // changed Dec 1999 since we need quality values for autofinish
      // to get the error profile of existing reads

      //         if ( cAceBase == 'x' )
      //            nQuality = 0;

      pLocFrag->pSeqPhredCalledBases_->insert( Ntide( cTemp, nQuality ) );

      // if there is no chromat, no need to set peak position
      if ( ! bHasChromat ) continue;

      // when exit loop above (and have chromat), the following
      // character must be a space
         
      if ( szLine[nPos] != ' ' ) {
         PARSE_PANIC( "this line should have 3 tokens but doesn't" );
      }

      // now advance to next non-whitespace to find the peak position
      ++nPos;

      if ( szLine[nPos] == ' ' || szLine[nPos] == '\0' ) {
         while( szLine[nPos] == ' ') {
            ++nPos;
         }

         if ( szLine[nPos] == '\0' ) {
            PARSE_PANIC( "this line should have 3 tokens but doesn't" );
         }
      }            


      // advance to next whitespace (actually a carriage return), 
      // accumlating peak position on the way 


      nPeakPosition = 0;
      while( '0' <= szLine[nPos] && szLine[nPos] <= '9' ) {
         nPeakPosition = nPeakPosition*10 + ( szLine[nPos] - '0' );
         ++nPos;
      }

      if ( szLine[nPos] != '\n' ) {
         char szTemp[1000];
         sprintf( szTemp,
                  "this line should be of form (base)(space)(quality)(space)(peak position) where quality and peak position are numeric but something non-numeric follows the peak position.  Line is:\n\"%s\"",
                  szLine );
         PARSE_PANIC( szTemp );
      }


      pLocFrag->pSeqPhredCalledBases_->setTracePointPosAtSeqPos( n1Base, nPeakPosition );
   } //    for( int n1Base = 1; n1Base <= lNumberOfUnpaddedBases; ++n1Base ) {



   // if flipping read left and right, must do this also 
   // for the peak position so it lines up with the flipped trace

   if ( pLocFrag->bComp() ) {

      pLocFrag->pSeqPhredCalledBases_->complementSequence();
         
   }


} // static void processCommentAndDnaForAddedRead( LocatedFragment* pLocFrag, 




static void backUpAndWritePhdBlock( FILE* pFil,
                                    const long lFilePosStartOfDesiredPhdBlock,
                                    char* szLine,
                                    FILE* pNewPhdBall ) {
   
   assert( fseek( pFil, lFilePosStartOfDesiredPhdBlock, SEEK_SET ) == 0 );

   bool bFirstLine = true;
   while( 1 ) {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         return;
      }

      if ( !bFirstLine &&
           memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) == 0 ) 
         return;

      bFirstLine = false;

      // if get here, have a line to write

      fputs( szLine, pNewPhdBall );
   }
}





// no particular reason to be in Assembly object except to make it 
// consistent with readFileOfPhdFiles (above) and also to give it a
// declaration
void Assembly :: readFileOfPhdFilesForAddedReads( readsSortedByReadName& aDesiredReads, 
                                                  const FileName& filPhdBall,
                                                  const bool bCreateNewPhdBall,
                                                  FILE* pNewPhdBall ) {


   pCP->filFileOfPhdFilesCurrentlyBeingRead_ = filPhdBall;



   if ( !bCalculatedLengths ) {
      bCalculatedLengths = true;

      nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE );
      nBEGIN_COMMENT  = strlen( szBEGIN_COMMENT );
      nBEGIN_DNA      = strlen( szBEGIN_DNA );
      nTIME           = strlen( szTIME );
      nEND_DNA        = strlen( szEND_DNA );
      nABI_THUMBPRINT = strlen( szABI_THUMBPRINT );
      nPHRED_VERSION  = strlen( szPHRED_VERSION );
      nCALL_METHOD    = strlen( szCALL_METHOD );
      nQUALITY_LEVELS = strlen( szQUALITY_LEVELS );
      nEND_COMMENT    = strlen( szEND_COMMENT );
      nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX );
      nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX );

   }


   RWCString soPhdBallWithoutDirectory;
   if ( bCreateNewPhdBall ) {
      soPhdBallWithoutDirectory = pCP->filNewFileOfPhdFilesFullPath_; 
   }
   else {
      soPhdBallWithoutDirectory = filPhdBall;  // this is the full path
   }
   
   if ( soPhdBallWithoutDirectory.bStartsWithAndRemove( pCP->filPhdBallDirectory_ ) ) {
      soPhdBallWithoutDirectory.bStartsWithAndRemove( "/" );
   }



   FILE *pFil;
   nLine = 0;
   cerr << "opening " << filPhdBall << endl;

   pFil = fopen( filPhdBall.data(), "r" );
   if ( pFil == NULL ) {
      THROW_FILE_ERROR( filPhdBall );
   }

   // look for each read in turn, until the end of the file

   int nNumberOfPhdFilesInFile = 0;

   bool bEndOfFile = false;
   bool bFoundBEGIN_SEQUENCE = false;
   RWCString soReadName;
   RWCString soVersion;
   long lNumberOfUnpaddedBases;
   LocatedFragment* pLocFrag;
   RWCTokenizer tok("");
   int nVersion;
   int nNumberOfReadsRead = 0;
   int nNumberOfDesiredReadsRead = 0;
   const bool bUnpaddedTagCoordinates = true;
   long lFilePosStartOfDesiredPhdBlock;

 look_for_BEGIN_SEQUENCE:   

   // looking for BEGIN_SEQUENCE

   if (fgets( szLine, nMaxLineSize, pFil ) == NULL )
      goto end_of_file;

   ++nLine;


   if ( memcmp( szBEGIN_SEQUENCE, szLine, nBEGIN_SEQUENCE ) != 0 )
      goto look_for_BEGIN_SEQUENCE;

 found_BEGIN_SEQUENCE:
   ++nNumberOfReadsRead;

   if ( 
       ( nNumberOfReadsRead < 10 ) ||
       ( ( nNumberOfReadsRead < 10000 ) && ( nNumberOfReadsRead % 1000 == 0 ) ) ||
       ( ( nNumberOfReadsRead < 100000 ) && ( nNumberOfReadsRead % 10000 == 0 ) ) ||
       ( nNumberOfReadsRead % 100000 == 0 )
       ) {
      cerr << "read phds in " << filPhdBall << " found: " <<
         soAddCommas( nNumberOfReadsRead ) <<
         " totals: used: " << soAddCommas( nNumberOfDesiredReadsRead ) << endl;
   }

   tok.soStringToBeTokenized_ = szLine;
   tok.nAfterLastToken_ = -1;

   assert( tok() == "BEGIN_SEQUENCE" );

   soReadName = tok();
   soVersion = tok();

   pLocFrag = aDesiredReads.pFindReadByName( soReadName );
   if ( !pLocFrag ) {
      // we don't want this read.  Look for next read in phd ball

      goto look_for_BEGIN_SEQUENCE;
   }


   // if reached here, we want the read

   if ( bCreateNewPhdBall ) {
      lFilePosStartOfDesiredPhdBlock = ftell( pFil );
      if ( lFilePosStartOfDesiredPhdBlock == -1 ) {
         RWCString soError = filPhdBall + " ftell error";
         THROW_FILE_ERROR( soError );
      }

      // At this point lFilePosStartOfDesiredPhdBlock is after
      // the line we are interested in.  How much do we back up?
      // The entire strlen( szLine )?  Yes, because if we position
      // the file at that location, the next byte that we read will be the
      // start of the line we want.
      
      lFilePosStartOfDesiredPhdBlock -= strlen( szLine );
   }
      

   nVersion = -666;

   if ( soVersion.bIsNull() )
      nVersion = 1;
   else if ( !bIsNumeric( soVersion ) ) {
      RWCString soError = filPhdBall + " is corrupted since line " +
         szLine + " has bad version" + soVersion;
      THROW_ERROR( soError );
   }
   else {
      nVersion = atoi( soVersion.data() );
   }

   // I don't think we can check for version since the alignment file
   // has no information about which version was used.  In 99.9% of
   // uses, these reads will be new reads, since they are being aligned.
   // Thus they can't be edited, so they will be version 1.

   ++nNumberOfDesiredReadsRead;

   processCommentAndDnaForAddedRead( pLocFrag, pFil, lNumberOfUnpaddedBases );



   processMiscReadStuff( pLocFrag, pFil, lNumberOfUnpaddedBases, 
                         szLine, bEndOfFile, bFoundBEGIN_SEQUENCE,
                         bUnpaddedTagCoordinates );

   pLocFrag->filPhdBall_ = soPhdBallWithoutDirectory;


   if ( bCreateNewPhdBall ) {
      backUpAndWritePhdBlock( pFil, 
                              lFilePosStartOfDesiredPhdBlock,
                              szLine,
                              pNewPhdBall );
   }

   if ( bEndOfFile ) goto end_of_file;
   if ( bFoundBEGIN_SEQUENCE) goto found_BEGIN_SEQUENCE;

   assert( false ); // should never get here

 end_of_file:
   cerr << "Number of reads read: " << nNumberOfReadsRead << 
      " Number of these that are desired: " << nNumberOfDesiredReadsRead << 
      " Number of desired reads: " << aDesiredReads.length() << endl;

   cerr.flush();

   fclose( pFil );
} // readFileOfPhdFilesForAddedReads




#define PARSE_ERROR_PHDBALL2FASTA(message) \
{  ostrstream ost; \
   ost << "Error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in file " << filPhdBall_ << " at line " << nLine_ << ": \n" \
       << szLine_ << endl \
      << message << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }




void phdBall2Fasta :: readFirstBeginSequenceLine( FILE* pPhdBall,
                                                  bool& bEndOfFile ) {

   if ( !bCalculatedLengths ) {
      bCalculatedLengths = true;

      nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE );
      nBEGIN_COMMENT  = strlen( szBEGIN_COMMENT );
      nBEGIN_DNA      = strlen( szBEGIN_DNA );
      nTIME           = strlen( szTIME );
      nEND_DNA        = strlen( szEND_DNA );
      nABI_THUMBPRINT = strlen( szABI_THUMBPRINT );
      nPHRED_VERSION  = strlen( szPHRED_VERSION );
      nCALL_METHOD    = strlen( szCALL_METHOD );
      nQUALITY_LEVELS = strlen( szQUALITY_LEVELS );
      nEND_COMMENT    = strlen( szEND_COMMENT );
      nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX );
      nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX );

   }

   bool bFoundBeginSequence = false;
   while( !bFoundBeginSequence ) {
      if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         bEndOfFile = true;
         return;
      }
      ++nLine_;

      if ( memcmp( szBEGIN_SEQUENCE, szLine_, nBEGIN_SEQUENCE ) == 0 ) {
         bFoundBeginSequence = true;
      }
   }
}
      
// void phdBall2Fasta :: findNextDesiredBeginSequenceLine( FILE* pPhdBall,
//                                                          bool& bEndOfFile,
//                                                          readsSortedByReadName aDesiredReads ) {

//    if ( !bCalculatedLengths ) {
//       bCalculatedLengths = true;

//       nBEGIN_SEQUENCE = strlen( szBEGIN_SEQUENCE );
//       nBEGIN_COMMENT  = strlen( szBEGIN_COMMENT );
//       nBEGIN_DNA      = strlen( szBEGIN_DNA );
//       nTIME           = strlen( szTIME );
//       nEND_DNA        = strlen( szEND_DNA );
//       nABI_THUMBPRINT = strlen( szABI_THUMBPRINT );
//       nPHRED_VERSION  = strlen( szPHRED_VERSION );
//       nCALL_METHOD    = strlen( szCALL_METHOD );
//       nQUALITY_LEVELS = strlen( szQUALITY_LEVELS );
//       nEND_COMMENT    = strlen( szEND_COMMENT );
//       nTRACE_ARRAY_MIN_INDEX = strlen( szTRACE_ARRAY_MIN_INDEX );
//       nTRACE_ARRAY_MAX_INDEX = strlen( szTRACE_ARRAY_MAX_INDEX );

//    }

//    RWCString soReadName( (size_t) 200 );

//    bool bFoundDesiredBeginSequence = false;
//    while( !bFoundDesiredBeginSequence ) {
//       if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
//          bEndOfFile = true;
//          return;
//       }
//       ++nLine_;

//       if ( memcmp( szBEGIN_SEQUENCE, szLine_, nBEGIN_SEQUENCE ) == 0 ) {

//          int nTokenStart;
//          int nTokenEnd;
//          bool bError;

//          getNextToken( szLine_, nBEGIN_SEQUENCE, nTokenStart, nTokenEnd, bError );
//          assert( !bError );

//          RWCString soReadName( szLine_ + nTokenStart, 
//                                nTokenEnd - nTokenStart + 1 );

//          if ( aDesiredReads.bContains( soReadName ) ) {
//             bFoundDesiredBeginSequence = true;
//          }
//       }
//    }
// }
      





// this is put into phdBall2Fasta in order to use some object variables
// such as the filPhdBall_ and nLine_ and not have to pass these as arguments

void phdBall2Fasta :: readFileOfPhdFilesForPhdBall2Fasta( 
                                         FILE* pPhdBall,
                                         RWCString& soFastaHeader,
                                         RWCString& soFastaQualHeader,
                                         RWCString& soBases,
                                         RWTValOrderedVector<int>& aQualities,
                                         bool& bEndOfFile ) {
   

   soBases = "";
   aQualities.clear();
   soFastaHeader = "";
   soFastaQualHeader = "";


   char szReadName[1000];
   int nVersion;
   int nTokens = sscanf( szLine_, "BEGIN_SEQUENCE %s %d", szReadName, &nVersion );

   if ( nTokens == 1 )
      nVersion = 1;
   else if ( nTokens != 2 ) {
      PARSE_ERROR_PHDBALL2FASTA( "line should have been of the form BEGIN_SEQUENCE (seq_name) (version)" );
   }

   soFastaHeader += szReadName;
   soFastaHeader += " VERSION: ";
   soFastaHeader += RWCString( (long) nVersion );
   soFastaQualHeader += szReadName;

   bool bFoundBeginComment = false;
   while( bFoundBeginComment ) {
      if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         PARSE_ERROR_PHDBALL2FASTA( "premature end of file after BEGIN_SEQUENCE and expecting BEGIN_COMMENT" );
      }
      ++nLine_;


      if ( memcmp( szBEGIN_COMMENT, szLine_, nBEGIN_COMMENT ) == 0 ) {
         bFoundBeginComment = true;
      }
   }


   
   bool bFoundCommentEnd = false;
   while( !bFoundCommentEnd ) {

      if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for END_COMMENT" );
      }
      ++nLine_;

      if ( strncmp( szEND_COMMENT, szLine_, nEND_COMMENT ) == 0 ) 
         bFoundCommentEnd = true;
      else if ( strncmp( szTIME, szLine_, nTIME ) == 0 ) {

         RWCString soLine( szLine_ );
         RWCTokenizer tokNext( soLine );
         RWCString soNextToken = tokNext();

         assert( soNextToken == "TIME:" );
         RWCString soDayOfWeek = tokNext();
         RWCString soMonth = tokNext();
         RWCString soDayOfMonth = tokNext();
         RWCString soTime = tokNext();
         RWCString soYear = tokNext();

         if ( !bIsNumeric( soYear ) ) {
            soYear = tokNext();
         }
         
         RWCString soTimestampFromPHDFile = soDayOfWeek + " " + soMonth +
            " " + soDayOfMonth +
            " " + soTime +
            " " + soYear;

         soFastaHeader += " TIME: ";
         soFastaHeader += soTimestampFromPHDFile;
      }
      else if ( strncmp( "CHEM:", szLine_, 5 ) == 0 ) {
         soFastaHeader += " ";
         soFastaHeader += szLine_;
         soFastaHeader.stripTrailingWhitespaceFast();
      }
      else if ( strncmp( "DYE:", szLine_, 4 ) == 0 ) {

         soFastaHeader += " ";
         soFastaHeader += szLine_;
         soFastaHeader.stripTrailingWhitespaceFast();
      }
      else if ( strncmp( "CHROMAT_FILE:", szLine_, 13 ) == 0 ) {
         soFastaHeader += " ";
         soFastaHeader += szLine_;
         soFastaHeader.stripTrailingWhitespaceFast();
      }
      else {
         // ALLOW other lines in the comment section of the phd file
      }
   } //    while( !bFoundCommentEnd ) {


   // skip over BEGIN_DNA

   bool bFoundBeginDNA = false;
   while( !bFoundBeginDNA ) {
      if ( fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for BEGIN_DNA" );
      }
      ++nLine_;
      
      if ( memcmp( szBEGIN_DNA, szLine_, nBEGIN_DNA ) == 0 ) {
         bFoundBeginDNA = true;
      }
   }




   int nQuality;
   char cBase;



   bool bFoundEndDNA = false;
   while( !bFoundEndDNA ) {

      if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         PARSE_ERROR_PHDBALL2FASTA( "premature end of file while looking for END_DNA" );
      }
      ++nLine_;
   
      if ( szLine_[0] == 'E' &&
           szLine_[1] == 'N' &&
           szLine_[2] == 'D' &&
           szLine_[3] == '_' &&
           szLine_[4] == 'D' &&
           szLine_[5] == 'N' &&
           szLine_[6] == 'A' ) {
         bFoundEndDNA = true;
      }
      else {

         nTokens = sscanf( szLine_, "%c %d", &cBase, &nQuality );

         if ( nTokens != 2 ) {
            PARSE_ERROR_PHDBALL2FASTA( "line should be of form (base) (quality) but it isn't" );
         }

         if (! isalpha( cBase ) ) {
            PARSE_ERROR_PHDBALL2FASTA( "line should begin with a base but doesn't" );
         } 
         
         // the 200 is wild, but just to check that we aren't reading 
         // a corrupted file
         if ( ! ( 0 <= nQuality && nQuality <= 200 ) ) {
            PARSE_ERROR_PHDBALL2FASTA( "line should be of form (base) (quality) where quality is in the range of 0 to 99 but it isn't" );
         }

         soBases.append( cBase );
         aQualities.append( nQuality );
      }
   }


   bool bContinue = true;
   while( bContinue ) {
      // this can end in 2 ways:  1) end of file 2) next read

      if (fgets( szLine_, nMaxLineSize2, pPhdBall ) == NULL ) {
         bEndOfFile = true;
         return;
      }
      ++nLine_;

      if ( memcmp( szBEGIN_SEQUENCE, szLine_, nBEGIN_SEQUENCE ) == 0 ) {
         bContinue = false;
      }
      else {
         // look for whole read items

         if ( memcmp( szLine_, "WR{", 3 ) == 0 ) {

            // this is in readWholeReadItem.cpp
            readWholeReadItemForPhdBall2Fasta( pPhdBall, soFastaHeader );
         }
      }
   } //  while( !bContinue ) {

   // notice that at this point szLine_ contains the BEGIN_SEQUENCE
   // for the next read
}