/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
//
// assembly.cpp
//
// implementation for assembly.h
//
// Currently based on parsing the output
// file from phrap, using the PhrapFile object.
// this is a temporary approach, used for the 
// first cut.
//
// chrisa 21-Dec-94
//



#include <stdio.h>
#include <iostream.h>
#include <fstream.h>
#include "rwcstring.h"
#include "rwctokenizer.h"

#include "sysdepend.h"
#include "locatedFragment.h"
#include "contig.h"
#include "assembly.h"
#include "basesegment.h"
#include "mbt_exception.h"
#include "filename.h"
#include "listOfFragments.h"
#include "listOfFragmentsAndEndPositions.h"
#include "guiapp.h"
#include "consed.h"
#include    "deleteFileIfItExists.h"
#include    "baseSegmentsToSort.h"
#include    "tag.h"



extern bool bNoBaseSeg;
extern bool bNoSetSequence;
extern bool bNoFragments;
extern bool bNoListOfFragments;



// the _minimum_ storage allocated for string objects that
// will hold DNA sequence - avoids needless alloc'ing
static const RWSize_2T sizeMinDnaStringObjectSize = 10240;

// max chars on input line
static const int nLineBufLen = 2040;
static const RWSize_2T nMaxLineChars = nLineBufLen; 



//
// macros format message, form and throw exception objects
//
#define PARSE_PANIC(file,line,message) \
{  ostrstream ost; \
   ost << "Corrupted ace file detected by consed source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in ace file " << file << " at line " << line << ": " \
      << message << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }

#define PARSE_PANIC_TOKEN(file,line,token) \
{  ostrstream ost; \
   ost << "Corrupted ace file detected by consed source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in ace file " << file << " at line " << line << ": " \
      << "Unrecognized or out-of-place token '" << token << "' " << endl \
        << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }

#define PARSE_PANIC_NUMERIC_TOKEN(file,line,token) \
{  ostrstream ost; \
   ost << "Corrupted ace file detected by consed source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in ace file " << file << " at line " << line << ": " \
      << "Bad numeric token '" << token << "' " << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }

// just used internally for checking that phrap's end align positions
// agrees with the consed calculated positions

static ListOfFragmentsAndEndPositions* pListOfFragmentsAndEndPositions;

//
// implemented as a miniature state machine.  this typedef
// used to indicate what we type of object we're eating
// at the moment.
//
// Base_segment lines are accepted in the ExpectAsm state.
// chrisa 17-mar-95
//

typedef enum 
{ 
   ExpectContig,  // initial state, and between sections
   ExpectContDna, // reading lines of contig consensus sequence
   ExpectContSeq, // waiting for contig "Sequence" lines
   ExpectFragSeq, // waiting for frag "Sequence" (clipping) lines
   ExpectFragName,// waiting for frag "DNA <read name>" line
   ExpectFragDna, // reading lines of fragment DNA sequence
   ExpectAsm,     // reading Assembled_from* lines
   ExpectBase,    // reading Base_segment* lnes
   ExpectClip,     // reading Clipping lines
   ExpectAnything, // allow unrecognized bases
   ExpectBaseQualityTitleLine, // for phrap quality
   ExpectBaseQualityNumberLine,
   ExpectDescription
}  
InputState;

//
// if true we have a buffered line that was "put back"
// while parsing.  look, Dave, globals!  See, I'm not 
// a slave to religious dogma after all.  I can write
// totally crufty code when I want to.  %-)
//
bool bHavePutBackLine;
// initial storage for max line len
RWCString soPutBackLine(nMaxLineChars);

//
// this procedure returns buffered line if exists, otherwise
// reads from file.  returns true if has a line (from either source).
//
static bool bGetNextLine(ifstream& ifsInFile,
                   int& nCurLine,
                   RWCString& soNextLine) {
   if (bHavePutBackLine) {
      soNextLine = soPutBackLine;
      bHavePutBackLine = false;
      return true;  // return with buffered data
   }

   char szLineBuf[nLineBufLen];  // buffer for fstream getline()
   if (! ifsInFile.getline(szLineBuf, (nLineBufLen-1)) ) {
      soNextLine = "";  // clear passed arg
      return false;
   }

   // bump line counter - actually read one
   nCurLine++;

   // convert to string object
   soNextLine = szLineBuf;

   // return with new data
   return true;
}


//
// utility to remove ".comp" from filename
// returns true if extension was present
//
static bool removeCompFromFilename(RWCString& so) {
   size_t nFindIndex = so.index(".comp");
   if (nFindIndex != RW_NPOS) {
      so.remove(nFindIndex);  // clear to end of name
      return true;
   }   

   return false;
}


//
// buffer the passed line
//
static void putBackLine(RWCString& soLine) {
   bHavePutBackLine = true;
   soPutBackLine = soLine;
}


void Assembly :: tryParsingUsingAceFileFormat1(const char* szPhrapAceFileName,
                                               bool& bHasConsensusTags,
                                               long& lPositionOfConsensusTags) 
{

   bHasConsensusTags = false;  // will be changed, if there are any

   baseSegmentsToSortType* pBaseSegmentsToSort;


   ifstream ifsInFile(szPhrapAceFileName, ios::in);

   // file open?
   if (!ifsInFile) { 
      ostrstream ost;
      ost << "Unable to open file " << szPhrapAceFileName << endl << ends;
      SysRequestFailed srf(ost.str());
      srf.includeErrnoDescription();
      throw srf;
   }

   int nLinesInAceFile;
   int nNumberOfBaseSegments = 0;

   cout << "first pass through ace file" << endl;
   listOfFragments listOfFragmentss;

   if (!bNoListOfFragments ) {

   firstPassThroughAceFile( ifsInFile, listOfFragmentss, nLinesInAceFile,
                        nNumberOfBaseSegments );

   ifsInFile.close();

   nNumberOfReadsInAssembly_ = listOfFragmentss.daFragmentName_.length();

   // grab memory that we will use to store all base segments

   pBaseSegmentsToSort = 
     (baseSegmentsToSortType*) malloc( sizeof( baseSegmentsToSortType ) *
                                       nNumberOfBaseSegments );

   // rewind the file and parse again


   ifsInFile.open(szPhrapAceFileName, ios::in);

   // file open?
   if (!ifsInFile) { 
      ostrstream ost;
      ost << "Unable to open file " << szPhrapAceFileName << endl << ends;
      SysRequestFailed srf(ost.str());
      srf.includeErrnoDescription();
      throw srf;
   }


   // create this array at the beginning of parsing and delete it when
   // parsing is done
   pListOfFragmentsAndEndPositions = new ListOfFragmentsAndEndPositions();

   }

   cout << "second pass through ace file" << endl;
   // this string object will hold DNA seqs as they're read from 
   // multiple lines
   RWCString soDna(sizeMinDnaStringObjectSize);

   // clear flags for line buffering
   bHavePutBackLine = false;
   soPutBackLine = "";

   // hold name of fragment while reading related lines
   RWCString soCurFragName;
   // pointer to current located fragment
   LocatedFragment* pNewLocFrag = 0;
   LocatedFragment* pCurLocFrag = 0;

   // hold name of current contig while reading related lines
   RWCString soCurContigName;
   // hold pointer to current contig
   Contig* pNewContig = 0;


   // will be used to store phrap quality values
   int nLastNonPadBasePosition;

   // flag indicates that "Clipping*" line found after 
   // "Sequence* <read name>"
   bool bHaveClippingLineForThisFrag = false;


   // used for sorting the base segments
   int  nNumberOfBaseSegmentsInContig = 0;



   // read all lines in the file, keeping count
   int nCurLine = 0;
   int nAsmLines = 0;
   int nBaseLines = 0;

   // holds the current input file line
   RWCString soLine;

   // the first thing we expect is the contig consensus seq
   InputState inputState = ExpectContig;

   bool bAtBeginningOfConsensusTags = false;

   // read the entire file within this while loop
   while ( bGetNextLine(ifsInFile, nCurLine, soLine) ) {

      if ( ( nCurLine % 5000 ) == 0  ) {
         int nPerCentDone = (( nCurLine * 100 ) / nLinesInAceFile) / 2;
         cout << nPerCentDone << "% done.  nCurLine = " << nCurLine << endl;
      }


      //      cout << "Current state = "<< inputState
      //      << " line = '" << soLine << "' " << endl; cout.flush();

      // tokenize input line
      RWCTokenizer tokNext(soLine);

      // pull off the first token
      RWCString soFirstToken = tokNext();

      switch (inputState) {

       case ExpectContig:
         if (soFirstToken.isNull()) break;  // blank line ignored

         if (soFirstToken != "DNA") {
            PARSE_PANIC_TOKEN(soFileName_, nCurLine, soFirstToken);
         }

         // get contig name, check for format error
         soCurContigName = tokNext();
         if (soCurContigName.isNull()) {
            PARSE_PANIC(soFileName_,nCurLine,"'DNA' not followed by contig name");
         }  

         if (!bNoFragments ) {
         // create a new contig object
         pNewContig = new Contig(soCurContigName,
                                 ConsEd::pGetAssembly() );

         // add it to assembly
         dapContigs_.insert(pNewContig);

         }

         // prepare for adding base segments
         nNumberOfBaseSegmentsInContig = 0;
         
         // what follows should be the consensus sequence of the contig
         inputState = ExpectContDna;

         // clear out the dna holder
         soDna = "";



         break;

       case ExpectContDna:
         if (soFirstToken.isNull()) {
            // there should be some DNA in the buffer by now
            if (soDna.length() == 0) {
               PARSE_PANIC(soFileName_, nCurLine,
                           "0 length DNA sequence");
            }

            // end of dna.  add accumulated sequence to array
            if (! bNoSetSequence) pNewContig->setSequence(soDna);

            // end of DNA.  "Sequence" line should follow
            inputState = ExpectAnything;
         }
         else {
            // not null, should be DNA sequence (ACTG's, etc.)
            soDna += soFirstToken;
         }

         break;  // case ExpectContDna

       case ExpectAnything:
         if (soFirstToken == "Sequence" ) {
           putBackLine( soLine );
           inputState = ExpectContSeq;
         }
         else if (soFirstToken == "BaseQuality" ) {
           putBackLine( soLine );
           inputState = ExpectBaseQualityTitleLine;
         }
         else {
           // skip over unrecognized lines
         }
         break;  

       case ExpectBaseQualityTitleLine:
          {
             RWCString soContigName = tokNext();
             assert( pNewContig->soGetName() == soContigName );
          }
          nLastNonPadBasePosition = pNewContig->nGetStartIndex() - 1;
          inputState = ExpectBaseQualityNumberLine;
          break;

       case ExpectBaseQualityNumberLine:
          if (soFirstToken.isNull() ) {

             if ( pNewContig->ntGetConsUnsafe( pNewContig->nGetEndConsensusIndex() ).bIsPad() ) {

                if ( nLastNonPadBasePosition > pNewContig->nGetEndConsensusIndex() ) {
                   PARSE_PANIC( soFileName_, nCurLine, 
                                "There should be the same number of quality values as unpadded bases" );
                }
             }
             else {
                if ( nLastNonPadBasePosition != pNewContig->nGetEndConsensusIndex() ) {
                   PARSE_PANIC( soFileName_, nCurLine,
                                "There should be the same number of quality values as unpadded bases" );
                }
             }


             // we need to do this since the ace file only 
             // contains quality for non-pads

             pNewContig->assignQualityAndPeakPositionsToPads( false ); 
                 // false for bAndPeakPositions
             inputState = ExpectContSeq;
             break;
          }
          processBaseQualityLine( soLine, pNewContig, nLastNonPadBasePosition );
          break;

       case ExpectContSeq:
         if (soFirstToken.isNull()) break;  // blank line ignored

         if (soFirstToken != "Sequence") {
            PARSE_PANIC(soFileName_, nCurLine,
                        "Expected 'Sequence' lines for contig");
         }

         // next token should be the contig name
         {  
            // local variable in switch statement requires braces
            RWCString soTempContigName = tokNext();
            // check for format error
            if (soTempContigName.isNull()) {
               PARSE_PANIC(soFileName_,nCurLine,
                           "'Sequence' not followed by read name");
            }
         }

         // next up are the assembled from lines
         inputState = ExpectAsm;
         nAsmLines = 0;
         break;

       case ExpectAsm:
         // blank line means no base segment lines - error
         if (soFirstToken.isNull()) {
            PARSE_PANIC(soFileName_, nCurLine,
                        "Missing 'Base_segment' lines for contig");
         }

         // end of asm lines should mean start of base lines
         if ((soFirstToken == "Base_segment") ||
             (soFirstToken == "Base_segment*")) {
            inputState = ExpectBase;
            putBackLine(soLine);
            break;
         }

         // discard these "unpadded" (i.e. no "*") versions 
         if (soFirstToken == "Assembled_from") break;

         // only one thing left it can legally be
         if (soFirstToken != "Assembled_from*") {
            PARSE_PANIC_TOKEN(soFileName_, nCurLine, soFirstToken);
         }

         nAsmLines++;  // got one

         {
            RWCString soReadName = tokNext();
            if (soReadName.isNull()) {
               PARSE_PANIC(soFileName_, nCurLine,
                           "Missing filename on Assembled_from* line");
            }
            bool bComp = removeCompFromFilename(soReadName);

            // get alignment of this fragment
            RWCString soStartPos = tokNext();
            RWCString soEndPos = tokNext();
            int nStart, nEnd, nResult;
            nResult = sscanf(soStartPos,"%d",&nStart);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soStartPos);
            }
            nResult = sscanf(soEndPos,"%d",&nEnd);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soEndPos);
            }

            // has a fragment of this name already been added to
            // the assembly?
            if (!bNoFragments ) {

            LocatedFragment* pOldLocF =
              pNewContig->pLocFragGetByName(soReadName);
            if (pOldLocF) {
               ostrstream ost;
               ost <<   "Error in file " << soFileName_ 
                 << " at line " << nCurLine << ": " << endl
                   << "Fragment " << soReadName << 
                     " has already been included in this contig" << endl
                       << ends;
               InputDataError ide(ost.str());
               throw ide; 
            }
            }

            if (!bNoFragments ) {

            // new up a located fragment
            // note that the dna sequence and clipping come later
            LocatedFragment* pLf =
              new LocatedFragment(soReadName,
                                  nStart, // start aligned at this pos in cons
                                  bComp,
                                  pNewContig);
            // add it to current contig
            pNewContig->addLocatedFragment(pLf); 

            FragmentAndEndPosition *pFragAndEnd = 
              new FragmentAndEndPosition( soReadName, nEnd );

            pListOfFragmentsAndEndPositions->daFragmentsAndEndPositions_.insert( pFragAndEnd );
            }

         }

         break;

       case ExpectBase:
         // blank line means expect DNA sequence of read
         if (soFirstToken.isNull()) {
           // sort all of the base segments for this contig

           qsort( pBaseSegmentsToSort, 
                  nNumberOfBaseSegmentsInContig, 
                  sizeof( baseSegmentsToSortType ), 
                  ( int(*)(const void*, const void*) )&nCompareBaseSegmentsToSort );

           pNewContig->baseSegArray_.resize( 
                            (size_t) nNumberOfBaseSegmentsInContig 
                                            );

           for( int n = 0; n < nNumberOfBaseSegmentsInContig; ++n ) {
             baseSegmentsToSortType* pOneBaseSegmentToSort = 
               pBaseSegmentsToSort + n;
             pNewContig->baseSegArray_.addPtrToNewBaseSeg(
                            pOneBaseSegmentToSort->pBaseSegment
                                                          );
           }

           inputState = ExpectFragName;  // after blank lines, 
                                          // will be DNA 
           break;
         }

         if (soFirstToken == "Base_segment") break;  // ignore these

         if (soFirstToken != "Base_segment*") {
            PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soFirstToken);
         }

         // create a BaseSegment object
         nBaseLines++;

         // braces around local vars in switch statement case
         {
            RWCString soStartInCon = tokNext();
            RWCString soEndInCon = tokNext();
            RWCString soReadName = tokNext();

            // all of these must be present, rest of line discarded
            if (soStartInCon.isNull() ||
                soEndInCon.isNull() ||
                soReadName.isNull()) {
               PARSE_PANIC(soFileName_,nCurLine,
                           "Missing token on Base_segment* line");
            }

            // convert strings to integers
            int nStartInCon, nEndInCon, nResult;
            nResult = sscanf(soStartInCon,"%d",&nStartInCon);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soStartInCon);
            }
            nResult = sscanf(soEndInCon,"%d",&nEndInCon);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soEndInCon);
            }
            
            // reality check alignments
            if (nStartInCon > nEndInCon) {
               PARSE_PANIC(soFileName_, nCurLine, 
                           "Bad alignment in Base_segment* line");
            }

            // remove ".comp" from readname
            (void )removeCompFromFilename(soReadName);
            
            // find the corresponding LocatedFragment, which
            // _must_ already have been added to the contig
            //
            // the input file has Base_segment* and Assembled_from*
            // lines that refer to the same fragment, identified
            // only by file name, and separated by an indeterminate
            // number of lines relating to other fragments.  hence
            // we have to search the contig's located fragments
            // by name to get the located fragment's pointer.

            LocatedFragment* pLocF;
            if (!bNoFragments ) {
            pLocF =
              pNewContig->pLocFragGetByName(soReadName);
            if (! pLocF) {
               // this means we couldn't find a fragment of that name
               // since the base segments are assumed to follow 
               // the assembled from lines, this is an error
               PARSE_PANIC(soFileName_, nCurLine,
                           "Unrecognized fragment name in Base_segment* line");
            }
            }

            // add a new base segment to the contig's array of them
            if (!bNoBaseSeg ) {
                        BaseSegment* pNewBaseSeg = 
                        new BaseSegment(pLocF, nStartInCon, nEndInCon);

                        // pBaseSegmentsToSort starts pointing to the
                        // position of the first base segment to add
                        // Hence don't increment nNumberOfBaseSegmentsInContig
                        // until after you have loaded the base segment
                        // into the array 
                        

                        baseSegmentsToSortType* pOneBaseSegmentToSort = 
                          pBaseSegmentsToSort + nNumberOfBaseSegmentsInContig;
                        ++nNumberOfBaseSegmentsInContig;
                        
                        pOneBaseSegmentToSort->nStartPosition = nStartInCon;
                        pOneBaseSegmentToSort->pBaseSegment =
                           pNewBaseSeg;

            }
         }

         break;

       case ExpectFragName:
         if (soFirstToken.isNull()) break;  // blank line ignored

         if (soFirstToken == "CT{" ) {
            bHasConsensusTags = true;
            bAtBeginningOfConsensusTags = true;
            lPositionOfConsensusTags = ifsInFile.tellg();
            lPositionOfConsensusTags -= 1;  // trial and error
            lPositionOfConsensusTags -= soLine.length();
            break;
         }

         if (soFirstToken != "DNA") {
            PARSE_PANIC(soFileName_, nCurLine,
                        "Expected DNA lines for fragment");
         }

         // second token is fragment name--braces make soFragName disappear
         // when } is encountered
         {
            // pick up the fragment name
            RWCString soFragName = tokNext();
            if (soFragName.isNull()) {
               PARSE_PANIC(soFileName_, nCurLine,
                           "Missing read name on DNA line");
            }

            //
            // check if the 'fragment' is really a contig
            //

            if (! bNoListOfFragments) {

            if (listOfFragmentss.daFragmentName_.index( &soFragName ) 
                == RW_NPOS ) {
               putBackLine(soLine);
               inputState = ExpectContig;  // start fresh
               break;
            }
            }
  

            // it's a fragment.  fix the name.
            (void )removeCompFromFilename(soFragName);
            

            // pick up the corresponding fragment name and save
            // because clip data actually comes on later lines

            if (!bNoFragments ) {

            pCurLocFrag = 
              pNewContig->pLocFragGetByName(soFragName);
            if (! pCurLocFrag) {
               // this means we couldn't find a fragment of that name
               PARSE_PANIC(soFileName_, nCurLine,
                           "Unrecognized fragment name in Sequence line");
            }
            }
         }  

         // and set state to expect Fragment dna lines
         soDna = "";         // clear out the dna holder
         inputState = ExpectFragDna;
         break;

       case ExpectFragDna:
         if (soFirstToken.isNull() ) {
            // there should be some DNA in the buffer by now
            if (soDna.length() == 0) {
               PARSE_PANIC(soFileName_, nCurLine,
                           "Zero length DNA sequence");
            }

            // end of dna.  add accumulated sequence to array
            if (! bNoSetSequence) pCurLocFrag->setSequence(soDna);

            // end of DNA.  "Sequence" line should follow
            inputState = ExpectFragSeq;
         }
         else {
            // not null, should be DNA sequence (ACTG's, etc.)
            soDna += soFirstToken;
         }

         break;  // case ExpectFragDna


       case ExpectFragSeq:
         if (soFirstToken.isNull()) break;  // blank line ignored

         if (soFirstToken != "Sequence") {
            PARSE_PANIC_TOKEN(soFileName_, nCurLine, soFirstToken);
         }

         {
            // pick up the fragment name
            RWCString soFragName = tokNext();
            (void )removeCompFromFilename(soFragName);

            // pick up the corresponding fragment name and save
            // because clip data actually comes on later lines

            if (!bNoFragments ) {

            pCurLocFrag = 
              pNewContig->pLocFragGetByName(soFragName);
            if (! pCurLocFrag) {
               // this means we couldn't find a fragment of that name
               PARSE_PANIC(soFileName_, nCurLine,
                           "Unrecognized fragment name in Sequence line");
            }
            }

            // set flag indicating that we don't yet have a clipping line
            // for the current located fragment
            bHaveClippingLineForThisFrag = false;

            // set the state to expect the Clipping* lines
            inputState = ExpectClip;
         }
         
         break;
            
       case ExpectClip:
         if (soFirstToken.isNull() ) break;

         // ignore unpadded clipping lines
         if (soFirstToken == "Clipping") break;

         // at this point there is only one legal alternative
         if (soFirstToken != "Clipping*") {
            PARSE_PANIC(soFileName_, nCurLine,
                           "Missing Clipping* line for fragment");
         }

         // bracket local variables in switch statement
         {
            // pick up the two expected numeric tokens
            RWCString soStartClip = tokNext();
            RWCString soEndClip = tokNext();
            if (soStartClip.isNull() || soEndClip.isNull()) {
               PARSE_PANIC(soFileName_, nCurLine,
                           "Expected start and end fragment clip positions");
            }

            // convert strings to integers
            int nStartClip, nEndClip, nResult;
            nResult = sscanf(soStartClip,"%d",&nStartClip);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soStartClip);
            }
            nResult = sscanf(soEndClip,"%d",&nEndClip);
            if (nResult != 1) {
               PARSE_PANIC_NUMERIC_TOKEN(soFileName_, nCurLine, soEndClip);
            }
         
            // set flag indicating that we have a clipping line
            // for the current located fragment
            bHaveClippingLineForThisFrag = true;

            // now set the located fragments members.
            // assumes pCurLocFrag was set to pointer to relevant
            // LocatedFragment object when we encountered
            // a Sequence <frag name> line 2 (?) lines ago

            if (!bNoFragments ) {
               int nStartConsPos = pCurLocFrag->nGetConsPosFromFragIndex( nStartClip );
               int nEndConsPos = pCurLocFrag->nGetConsPosFromFragIndex( nEndClip );

               // this is a kludge just to allow people to read from
               // the old ace file format
               //  This values really aren't the qual and align clipping
               // values, but they are close.  To get the true values,
               // rephrap with the new phrap

               pCurLocFrag->setQualAndAlignClipping( nStartConsPos, 
                                                     nEndConsPos,
                                                     nStartConsPos,
                                                     nEndConsPos );

            }
         }
         inputState = ExpectDescription;
         break;

       case ExpectDescription:  
          // allow blank lines
          if (soFirstToken.isNull() ) break;


          // Description lines are not required if consed is being
          // run -nophd (wanted by Sequana)

          if (soFirstToken != "Description" ) {
             if ( ConsEd::pGetConsEd()->bUsingPhdFiles_ ) {
                PARSE_PANIC( soFileName_, nCurLine, "There must be a Description line following the Clipping* line in the ace file.  This may be missing because you didn't use phredPhrap and phd2fasta.  These programs came with consed" );
             }
             else {
                putBackLine( soLine );
                inputState = ExpectFragName;
                break;
             }
          }

          if (!bNoFragments )
             parseDescriptionLine( soLine, pCurLocFrag, nCurLine  );
          inputState = ExpectFragName;
          break;

       default:
         assert(false);  // program error
         break;

      } // switch

      // all break's end up here, and then loop around
      // in while statement

      if ( bAtBeginningOfConsensusTags ) break;
   
   }  // while not end of file

   // handle end-of-file condition
   if (inputState == ExpectContDna) {
      if (soDna.length() == 0) {
         PARSE_PANIC(soFileName_, nCurLine, "Zero length DNA sequence");
      }
      else {
         // end of dna.  add accumulated sequence to contig
         if (! bNoSetSequence) pNewContig->setSequence(soDna);
      }
   }  // handle end of file with dna seq still unstored

   if (inputState == ExpectFragDna) {
      if (soDna.length() == 0) {
         PARSE_PANIC(soFileName_, nCurLine, "Zero length DNA sequence");
      }
      else {
         // end of dna.  add accumulated sequence to contig
         if (! bNoSetSequence) pCurLocFrag->setSequence(soDna);
      }
   }  // handle end of file with dna seq still unstored

   
   free( (char*) pBaseSegmentsToSort );
   cleanUpAfterParsingAceFileFormat1();




}  // ctor for PhrapAceFile



void    Assembly :: cleanUpAfterParsingAceFileFormat1() {

   // clean up any discontiguous base segment arrays
   for (int nCtg = 0; nCtg < dapContigs_.length(); nCtg++) {
      dapContigs_[nCtg]->baseSegArray_.forceSegsToBeContiguous();
   }

   int nContig;

   // loop through all fragments in assembly

   for (nContig = 0; nContig < nNumContigs(); nContig++) {
      Contig*   pContig = pGetContig( nContig );

      pContig->updateLastDisplayableContigPos();
   }



   // check that the phrap end align positions (in the Assembled_from* lines)
   // match those calculated by using the start align position and the
   // sequence length

   for (nContig = 0; nContig < nNumContigs(); nContig++) {
      Contig*   pContig = pGetContig( nContig );

      int nNumFrags = pContig->nGetNumberOfFragsInContig();

      for (int nFrag = 0; nFrag < nNumFrags; nFrag++) {

         // get pointer to this located frag from contig
         LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet(nFrag);

         RWCString soFragmentName = pLocFrag->soGetFragmentName();
         
         FragmentAndEndPosition fragAndEndPosition( soFragmentName );

         FragmentAndEndPosition* pFragAndEnd = 
           pListOfFragmentsAndEndPositions->daFragmentsAndEndPositions_.find( 
                                             &fragAndEndPosition 
                                                                             );

         assert( pLocFrag->nGetAlignEnd() == ( pFragAndEnd->nEndPosition_ ) );

      }

   }

   // Now that the check is completed, free up the memory.
   delete pListOfFragmentsAndEndPositions;

}



#define PARSE_PANIC2( file, nLine, soMessage, soLine ) \
{  ostrstream ost; \
   ost << "Corrupted ace file detected by consed source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in ace file " << file << " at line " << nLine << ": " \
      << soMessage << endl << soLine << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }





void Assembly :: parseDescriptionLine( const RWCString& soLine, 
                                      LocatedFragment* pCurLocFrag,
                                       const int nCurLine ) {

      // tokenize input line
      RWCTokenizer tokNext(soLine);

      // pull off the first token
      RWCString soNextToken = tokNext();
      
      assert( soNextToken == "Description" );


      bool bFoundPhdFile = false;
      bool bFoundTime = false;

      // how do we know there are no more tokens?  The tokens returned 
      // are null

      while( !( soNextToken = tokNext() ).isNull()  ) {
         if (soNextToken == "CHROMAT_FILE:" ) {
            RWCString soTemp = tokNext();
            FileName filChromatFile = soTemp;
            pCurLocFrag->setChromatFilename( filChromatFile );
         }
         else if (soNextToken == "PHD_FILE:" ) {
            RWCString soTemp = tokNext();
            FileName filPHDFile = soTemp;
            pCurLocFrag->setPHDFilename( filPHDFile );
            bFoundPhdFile = true;
         }
         else if (soNextToken == "TIME:" ) {
            RWCString soDayOfWeek = tokNext();
            RWCString soMonth = tokNext();
            RWCString soDayOfMonth = tokNext();
            RWCString soTime = tokNext();
            RWCString soTimeZone = tokNext();
            RWCString soYear = tokNext();
            pCurLocFrag->setPHDTimestamp( soDayOfWeek + " " + soMonth +
                                         " " + soDayOfMonth + 
                                         " " + soTime +
                                         " " + soTimeZone +
                                         soYear );

            bFoundTime = true;
         }
      }

      if (!bFoundPhdFile && ConsEd::pGetConsEd()->bUsingPhdFiles_ ) {
         PARSE_PANIC2( soFileName_, nCurLine, "There must be a PHD_FILE: (filename) on this description line.  This may be missing because you didn't use phredPhrap and phd2fasta.  These programs came with consed",
                      soLine );
      }

      if (! bFoundTime && ConsEd::pGetConsEd()->bUsingPhdFiles_ ) {
         PARSE_PANIC2( soFileName_, nCurLine, "There must be a TIME: (timestamp) on this description line.  This may be missing because you didn't use phredPhrap and phd2fasta.  These programs came with consed", 
                      soLine );

      }

}
   



void Assembly :: firstPassThroughAceFile( ifstream& ifs, 
                                     listOfFragments& listOfFragmentss,
                                     int& nLinesInAceFile,
                                     int& nBaseSegments ) {

   const RWSize_2T rwsizeInitialLineSize = 200;

   RWCString soLine( rwsizeInitialLineSize );

   nLinesInAceFile = 0;

   while( soLine.readLine( ifs, false ) ) {

      ++nLinesInAceFile;

      // tokenize input line
      RWCTokenizer tokNext(soLine);

      // pull off the first token
      RWCString soFirstToken = tokNext();
        if (soFirstToken == "Assembled_from*" ) {
           RWCString* psoFragmentName = new RWCString( tokNext() );
           listOfFragmentss.daFragmentName_.insert( psoFragmentName );
        }
        else if (soFirstToken == "Base_segment*" ) 
          ++nBaseSegments;
   }

}




void Assembly :: processBaseQualityLine( const RWCString& soLine, 
                                  Contig* pCurrentContig,
                                  int& nLastBasePosition ) {


   // tokenize input line
   RWCTokenizer tokNext(soLine);
   RWCString soQuality;

   while( ! (soQuality = tokNext()).isNull() ) {
      unsigned char ucQuality = atoi( soQuality.data() );
      ++nLastBasePosition;

      // move past any pads
      while( pCurrentContig->ntGetConsUnsafe( nLastBasePosition ).bIsPad() )
         ++nLastBasePosition;

      pCurrentContig->setQualityAtSeqPos( nLastBasePosition, ucQuality );
   }
}