/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "readWholeReadItem.h"
#include    "rwcstring.h"
#include    "locatedFragment.h"
#include    "wholeReadItem.h"
#include    "mbt_exception.h"
#include    "nLine.h"
#include    "expidAndLocatedFragment.h"
#include    "consed.h"
#include    "bIsNumericMaybeWithWhitespace.h"
#include    "phdBall2Fasta.h"


// Largest number of characters handed to fgets for one line of the phd
// file in readWholeReadItem; the buffers below have one extra byte.
const int nMaxLineSize = 10000;
// szLine: the line most recently read by readWholeReadItem.  The WR header
// line is split in place with strtok and the buffer is then reused for
// every data line.  The PARSE_PANIC macro prints it.
static char  szLine[ nMaxLineSize + 1];
// szLine2: copy of the WR header line taken before strtok modifies it, so
// that the error macros can still show the complete line.
static char  szLine2[ nMaxLineSize + 1];

// Copies of the three header tokens (type, source, date).  They are needed
// because the strtok results point into szLine, which is overwritten when
// the data lines are read.
static char szTypeSaved[200];
static char szSourceSaved[200];
static char szDateSaved[200];

// Only the first nMaxDataLines data lines of a WR{ } block are kept one by
// one in soDataLine for the item-specific parsers in this file.
const int nMaxDataLines = 3;
static RWCString soDataLine[ nMaxDataLines ];


// PARSE_PANIC( message )
// Builds an error text containing this source file and line, the phd file
// name, the current phd line number, the current contents of szLine and
// `message`, then throws it wrapped in an InputDataError.
// The expansion refers to filPHD, nLine and szLine, so all three must be
// visible where the macro is used.
#define PARSE_PANIC( message ) \
{  ostrstream ost; \
   ost << "Error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   ": \n" \
       << szLine << endl \
      << message << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }


// PARSE_PANIC_TEMPLATE_WR( szMessage )
// Throws an InputDataError describing a malformed "template" WR item.  The
// text shows the phd file and line number, the expected layout of the
// item, the offending data line and szMessage.
// The expansion refers to filPHD, nLine and a variable named soDataLinee
// (the data line being parsed); all must be visible at the point of use.
// The layout shown to the user lists the keys the parser in this file
// actually accepts: name:, type:, size: and lib:.
#define PARSE_PANIC_TEMPLATE_WR( szMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   "template WR item must look like this:\nWR{\ntemplate phredPhrap 990224:045110\nname: (template name)\ntype: (bac, cosmid, puc, pbc, pcr, etc)\nsize: (template size)\nlib: (library name)\n}\nwhere each of the above lines is optional\ndata:\n"; \
   ost << soDataLinee;\
   ost << endl << szMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


// PARSE_PANIC_TEMPLATE_WR_1_ARG( szMessage, szMessage2 )
// Same purpose as PARSE_PANIC_TEMPLATE_WR, but prints szLine2 under
// "Line:" and appends a second value (szMessage2) after the message.
// The expansion refers to filPHD, nLine and szLine2.
// The layout shown to the user lists the keys the parser in this file
// actually accepts: name:, type:, size: and lib:.
#define PARSE_PANIC_TEMPLATE_WR_1_ARG( szMessage, szMessage2 ) \
{   ostrstream ost; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   "template WR item must look like this:\nWR{\ntemplate phredPhrap 990224:045110\nname: (template name)\ntype: (bac, cosmid, puc, pbc, pcr, etc)\nsize: (template size)\nlib: (library name)\n}\nwhere each of the above lines is optional\n" << \
   "Line:\n" << szLine2 << "\n" << \
   szMessage << endl << szMessage2 << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


// PARSE_PANIC_WR_LINE( szAdditionalMessage )
// Throws an InputDataError for a malformed WR header line (the line that
// should read "type source date").  The text shows the phd file, the line
// number, the saved header line szLine2, the expected form and
// szAdditionalMessage.
// The expansion refers to filPHD, nLine and szLine2.
#define PARSE_PANIC_WR_LINE( szAdditionalMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   " detected from consed source file " << __FILE__ \
       << " at " << __LINE__ << \
   ".  Line:\n" << szLine2 << "\n" << \
   "should be of form: (whole read info item type) (source) (date in form YYMMDD:HHMISS)\n " << \
   szAdditionalMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


// PARSE_PANIC_PRIMER_WR( szMessage )
// Throws an InputDataError describing a malformed "primer" WR item.  The
// text shows the phd file and line number, the expected layout, the first
// nDataLine entries of soDataLine and szMessage.
// The expansion refers to filPHD, nLine, soDataLine and a variable named
// nDataLine (the number of stored data lines), so the using function must
// provide them under exactly these names.
#define PARSE_PANIC_PRIMER_WR( szMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   " primer WR item must look like this:\nWR{\nprimer phredPhrap 990224:045110\ntype: (primer type)\nseq: (primer sequence)\nwhere the seq line is optional and (primer type) is one of univ fwd, univ rev, pcr end, or walk.\ndata:\n";\
   for( int nTemp = 0; nTemp < nDataLine; ++nTemp ) {\
       ost << soDataLine[nTemp];\
   }\
   ost << endl << szMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }

// Example:

// WR{
// primer determineReadTypes 990603:090231
// type: univ fwd
// }

// Parses the data lines of a "primer" WR item, which the caller has
// already stored in the file-level soDataLine array, and records the
// result in *pLocFrag.
//
//   nDataLine - number of valid entries in soDataLine (the name is fixed:
//               PARSE_PANIC_PRIMER_WR refers to it)
//   pLocFrag  - read whose nReadType_ and nReadTypeFromPhdFile_ are set
//               from the type: line and whose soPrimerSequence_ is set
//               when a seq: line is present
//   filPHD    - phd file name; only used by PARSE_PANIC_PRIMER_WR
//
// Throws InputDataError (through PARSE_PANIC_PRIMER_WR) when there is no
// data line, when the first line is not a valid type: line, or when a
// second line is present that does not start with seq:.
// Note that soDataLine[0] and soDataLine[1] are overwritten with the
// stripped values.
static void parsePrimerWRItem( const int nDataLine, LocatedFragment* pLocFrag,
                               const RWCString& filPHD ) {

   if ( nDataLine == 0 ) {
      PARSE_PANIC_PRIMER_WR( "There must be a data line within the WR{ which must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" )
   }
      
   
   if ( soDataLine[0](0, 5) != "type:" ) {
      PARSE_PANIC_PRIMER_WR( "first data line must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" )
   }
   
   if ( soDataLine[0].length() < 6 ) 
      PARSE_PANIC_PRIMER_WR( "after type: there must be one of univ fwd, univ rev, pcr end, or walk but instead ended" )
   
   // chop off the initial type: (5 characters) and the final CR
   soDataLine[0] = soDataLine[0]( 5, soDataLine[0].length() - 6 );

   // strip off leading and trailing whitespace

   soDataLine[0] = soDataLine[0].stripWhitespace( RWCString::BOTH );
   
   if ( soDataLine[0] == "univ fwd" ) 
      pLocFrag->nReadType_ = nUniversalForward;
   else if ( soDataLine[0] == "univ rev" )
      pLocFrag->nReadType_ = nUniversalReverse;
   else if ( soDataLine[0] == "walk" )
      pLocFrag->nReadType_ = nWalk;
   else if ( soDataLine[0] == "pcr end" )
      pLocFrag->nReadType_ = nPCREnd;
   else {
      PARSE_PANIC_PRIMER_WR( "after type: there must be one of univ fwd, univ rev, pcr end, or walk" )
   }
   
   pLocFrag->nReadTypeFromPhdFile_ = pLocFrag->nReadType_;

   
   if ( nDataLine > 1 ) {
      if ( soDataLine[1](0, 4 ) != "seq:" ) {
         PARSE_PANIC_PRIMER_WR( "If you have a second data line in this WR item, it must be seq: (primer sequence) but was unrecognized" );
      }
   
      // Skip exactly the 4 characters of "seq:".  Starting one character
      // further would drop the first base whenever no blank follows the
      // colon (e.g. "seq:ACGT").  The rest of the line, including the
      // final CR, is kept here and cleaned up by stripWhitespace below.
      soDataLine[1] = soDataLine[1]( 4, soDataLine[1].length() - 4 );
   
      soDataLine[1] = soDataLine[1].stripWhitespace( RWCString::BOTH );
      
      pLocFrag->soPrimerSequence_ = soDataLine[1];
   }
}

// PARSE_PANIC_PRIMER_WR2( szMessage )
// Variant of PARSE_PANIC_PRIMER_WR for use inside phdBall2Fasta member
// functions: the file name and line number come from filPhdBall_ and
// nLine_ instead of filPHD and nLine.
// The expansion also refers to soDataLine and to a variable named
// nDataLine (the number of stored data lines).
#define PARSE_PANIC_PRIMER_WR2( szMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phd file " << filPhdBall_ << " at line " << \
   nLine_ << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   " primer WR item must look like this:\nWR{\nprimer phredPhrap 990224:045110\ntype: (primer type)\nseq: (primer sequence)\nwhere the seq line is optional and (primer type) is one of univ fwd, univ rev, pcr end, or walk.\ndata:\n";\
   for( int nTemp = 0; nTemp < nDataLine; ++nTemp ) {\
       ost << soDataLine[nTemp];\
   }\
   ost << endl << szMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


// Example:

// WR{
// primer determineReadTypes 990603:090231
// type: univ fwd
// }

void phdBall2Fasta :: parsePrimerWRItemForPhdBall2Fasta( const int nDataLine, 
                                                         RWCString& soFastaHeader ) {

   if ( nDataLine == 0 ) {
      PARSE_PANIC_PRIMER_WR2( "There must be a data line within the WR{ which must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" )
   }
      
   
   if ( soDataLine[0](0, 5) != "type:" ) {
      PARSE_PANIC_PRIMER_WR2( "first data line must start with type: and be followed with univ fwd, univ rev, pcr end, or walk" )
   }
   
   if ( soDataLine[0].length() < 6 ) 
      PARSE_PANIC_PRIMER_WR2( "after type: there must be one of univ fwd, univ rev, pcr end, or walk but instead ended" )
   
   // chop off the initial type: and the final CR
   soDataLine[0] = soDataLine[0]( 5, soDataLine[0].length() - 6 );

   // strip off leading and trailing whitespace

   soDataLine[0] = soDataLine[0].stripWhitespace( RWCString::BOTH );
   
   if ( soDataLine[0] == "univ fwd" ) 
      soFastaHeader += " DIRECTION: fwd";
   else if ( soDataLine[0] == "univ rev" )
      soFastaHeader += " DIRECTION: rev";
}


// Example:

// WR{
// template determineReadTypes 990603:090231
// name: djs366_101
// lib: library1
// }


// filPHD is for PARSE_PANIC's
// Walks over the data lines of a "template" WR item (stored by the caller
// in the file-level soDataLine array) and copies what it finds into
// *pLocFrag:
//    name: -> soTemplate_
//    type: -> soTemplateType_
//    size: -> nTemplateSize_ (passed to bIsNumericMaybeWithWhitespace,
//             which presumably fills it in)
//    lib:  -> soLibrary_
//
//   nDataLines - number of valid entries in soDataLine
//   pLocFrag   - read that receives the values
//   filPHD     - phd file name; only used by the PARSE_PANIC macros
//
// Throws InputDataError (through the PARSE_PANIC_TEMPLATE_WR* macros) for
// a name:/type:/size: line shorter than 6 characters, a non-numeric size,
// a lib: line with nothing but whitespace after the key, or any line that
// starts with none of the four keys.
// Each processed entry of soDataLine is overwritten with its value part.
static void parseTemplateWRItem( const int nDataLines, LocatedFragment* pLocFrag,
                                 const RWCString& filPHD  ) {

   for( int nDataLine = 0; nDataLine < nDataLines; ++nDataLine ) {

      // this name is fixed: PARSE_PANIC_TEMPLATE_WR prints soDataLinee
      RWCString& soDataLinee = soDataLine[ nDataLine ];

      // ---- name: (template name) ----
      if ( soDataLinee( 0, 5 ) == "name:" ) {
         if ( soDataLinee.length() < 6 ) {
            PARSE_PANIC_TEMPLATE_WR( "nothing follows name:" )
         }

         // drop the 5-character key and the final CR
         soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 );
         pLocFrag->soTemplate_  = 
            soDataLinee.stripWhitespace( RWCString::BOTH );
         continue;
      }

      // ---- type: (template type) ----
      if ( soDataLinee( 0, 5 ) == "type:" ) {
         if ( soDataLinee.length() < 6 ) {
            PARSE_PANIC_TEMPLATE_WR( "nothing follows type:" )
         }

         soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 );
         pLocFrag->soTemplateType_ = 
            soDataLinee.stripWhitespace( RWCString::BOTH );
         continue;
      }

      // ---- size: (template size) ----
      if ( soDataLinee( 0, 5 ) == "size:" ) {
         if ( soDataLinee.length() < 6 ) {
            PARSE_PANIC_TEMPLATE_WR( "nothing follows size:" )
         }

         soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 );

         if ( !bIsNumericMaybeWithWhitespace( soDataLinee, pLocFrag->nTemplateSize_ ) ) {
            PARSE_PANIC_TEMPLATE_WR_1_ARG( "the size must be numeric but is ", 
                                           soDataLinee )
         }
         continue;
      }

      // ---- lib: (library name) ----
      if ( soDataLinee( 0, 4 ) == "lib:" ) {
         if ( soDataLinee.length() < 5 ) {
            PARSE_PANIC_TEMPLATE_WR( "nothing follows lib:" );
         }

         // everything after the 4-character key, without surrounding
         // whitespace
         soDataLinee = soDataLinee.soGetRestOfString( 4 );
         soDataLinee = soDataLinee.stripWhitespace( RWCString::BOTH );

         if ( soDataLinee.isNull() ) {
            PARSE_PANIC_TEMPLATE_WR( "nothing follows lib:" );
         }

         pLocFrag->soLibrary_ = soDataLinee;
         continue;
      }

      // none of the known keys matched
      PARSE_PANIC_TEMPLATE_WR( "unrecognized line" )

   } // for( int nDataLine ...
} // parseTemplateWRItem


// PARSE_PANIC_TEMPLATE_WR2( szMessage )
// Variant of PARSE_PANIC_TEMPLATE_WR for use inside phdBall2Fasta member
// functions: the file name and line number come from filPhdBall_ and
// nLine_.  The expansion also refers to a variable named soDataLinee (the
// data line being parsed).
// The layout shown to the user uses the key the parser actually looks
// for (name:) and lists the other keys of a template WR item as well.
#define PARSE_PANIC_TEMPLATE_WR2( szMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phd file " << filPhdBall_ << " at line " << \
   nLine_ << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   "template WR item must look like this:\nWR{\ntemplate phredPhrap 990224:045110\nname: (template name)\ntype: (bac, cosmid, puc, pbc, pcr, etc)\nsize: (template size)\nlib: (library name)\n}\nwhere each of the above lines is optional\ndata:\n"; \
   ost << soDataLinee;\
   ost << endl << szMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


// Example:

// WR{
// template determineReadTypes 990603:090231
// name: djs366_101
// lib: library1
// }


// Searches the stored data lines of a "template" WR item for the first
// name: line and appends " TEMPLATE: <name>" to soFastaHeader.  Lines with
// other keys are ignored; if no name: line exists the header is left
// unchanged.
//
//   nDataLines    - number of valid entries in the file-level soDataLine
//   soFastaHeader - header text that is extended in place
//
// Throws InputDataError (through PARSE_PANIC_TEMPLATE_WR2) when the name:
// line is shorter than 6 characters.  The matching soDataLine entry is
// overwritten with its value part.
void phdBall2Fasta :: parseTemplateWRItemForPhdBall2Fasta( const int nDataLines, 
                                           RWCString& soFastaHeader ) {

   for( int nDataLine = 0; nDataLine < nDataLines; ++nDataLine ) {

      // this name is fixed: PARSE_PANIC_TEMPLATE_WR2 prints soDataLinee
      RWCString& soDataLinee = soDataLine[ nDataLine ];

      // only the name: line matters here
      if ( soDataLinee( 0, 5 ) != "name:" ) {
         continue;
      }

      if ( soDataLinee.length() < 6 ) {
         PARSE_PANIC_TEMPLATE_WR2( "nothing follows name:" )
      }

      // drop the 5-character key and the final CR
      soDataLinee = soDataLinee( 5, soDataLinee.length() - 6 );

      soFastaHeader += " TEMPLATE: ";
      soFastaHeader += 
         soDataLinee.stripWhitespace( RWCString::BOTH );

      // the first name: line wins; nothing else is looked at
      break;
   }
} // parseTemplateWRItemForPhdBall2Fasta


// PARSE_PANIC_EXPID_WR_1_ARG( szMessage, szMessage2 )
// Throws an InputDataError describing a malformed "expid" WR item.  The
// text shows the phd file and line number, the expected layout, the saved
// header line szLine2, szMessage and one extra value (szMessage2).
// The expansion refers to filPHD, nLine and szLine2.
#define PARSE_PANIC_EXPID_WR_1_ARG( szMessage, szMessage2 ) \
{   ostrstream ost; \
   ost << "error in phd file " << filPHD << " at line " << \
   nLine << \
   " detected from consed source file " << __FILE__ \
   << " at " << __LINE__ << endl << \
   "expid WR item must look like this:\nWR{\nexpid (program name) 990224:045110\n25\n}\nwhere the 25 should be replaced by the experiment ID from the autofinish output\n" << \
   "Line:\n" << szLine2 << "\n" << \
   szMessage << endl << szMessage2 << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }


static void parseExpidWRItem( const int nDataLines,
                              LocatedFragment* pLocFrag,
                              const RWCString& filPHD ) {

   if ( nDataLines < 1 ) {
      PARSE_PANIC_EXPID_WR_1_ARG( "there should be exactly 1 data lines in this WR item but instead there is", nDataLines );
   }

   int nExpID;
   if ( !bIsNumericMaybeWithWhitespace( soDataLine[ 0 ], nExpID ) ) {
      PARSE_PANIC_EXPID_WR_1_ARG( "this line should contain a number but instead contains", soDataLine[ 0 ] );
   }
   
   expidAndLocatedFragment* pExpID = 
      new expidAndLocatedFragment( nExpID, 
                                   pLocFrag );

   pLocFrag->pContig_->pAssembly_->aExpidAndLocatedFragment_.insert( pExpID );
}


// Reads one WR{ } (whole read) item from a phd file.  The first line read
// here is the item header "type source date", so the caller is expected to
// have consumed the "WR{" line already.  All following lines up to a line
// starting with "}" followed by whitespace are the item's data.
//
//   pFil     - open phd file, positioned just after the "WR{" line
//   filPHD   - name of the phd file; only used in error messages
//   pLocFrag - read the item belongs to; updated for the item types
//              primer, template, expid and referenceSequence
//   pWR      - receives a newly allocated wholeReadItem holding type,
//              source, date and the complete data text
//
// The global line counter nLine is incremented for every line read.
// Throws InputDataError (through the PARSE_PANIC macros) on premature end
// of file, on a header line without three tokens, on a header token that
// does not fit the 200-byte save buffers, and on malformed data of the
// recognized item types.
void readWholeReadItem( FILE* pFil, 
                        const FileName& filPHD,
                        LocatedFragment* pLocFrag,
                        wholeReadItem*& pWR ) {


   if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
      PARSE_PANIC( "premature end of file while looking for body of WR{ block" );
   }
   else
      ++nLine;
      

   // save from the strtok's so can show szLine2 in error messages
   strcpy( szLine2, szLine );

   char* szType = strtok( szLine, " " );
   char* szSource = strtok( NULL, " " );
   char* szDate = strtok( NULL, "\n " );


   if ( !szType ) PARSE_PANIC_WR_LINE( "could not find whole read info item type" );

   if ( !szSource ) PARSE_PANIC_WR_LINE( "could not find source" );
   if ( !szDate ) PARSE_PANIC_WR_LINE( "could not find date" );

   // The tokens come straight from the file and the line may hold up to
   // nMaxLineSize characters, while the save buffers hold 200 bytes.
   // Reject anything that would not fit instead of writing past the end.
   if ( strlen( szType ) >= sizeof( szTypeSaved ) )
      PARSE_PANIC_WR_LINE( "whole read info item type is too long" );
   if ( strlen( szSource ) >= sizeof( szSourceSaved ) )
      PARSE_PANIC_WR_LINE( "source is too long" );
   if ( strlen( szDate ) >= sizeof( szDateSaved ) )
      PARSE_PANIC_WR_LINE( "date is too long" );

   // save the data in 3 different buffers since szLine will now be reused

   strcpy( szTypeSaved, szType );
   strcpy( szSourceSaved, szSource );
   strcpy( szDateSaved, szDate );


   // soData collects every data line; soDataLine keeps only the first
   // nMaxDataLines of them for the item-specific parsers
   RWCString soData;

   int nDataLine = 0;

   bool bFoundEnd = false;
   do { 

      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for end of WR{ block which should be terminated with a }" );
      }
      else
         ++nLine;

      // isspace requires a value representable as unsigned char; a plain
      // char can be negative for bytes above 0x7f
      if ( szLine[0] == '}' && 
          isspace( static_cast<unsigned char>( szLine[1] ) ) )
          bFoundEnd = true;
      else {
        soData += szLine;
        if ( nDataLine < nMaxDataLines ) {
           soDataLine[ nDataLine ] = szLine;
           ++nDataLine;
        }
      }

   } while ( !bFoundEnd );

   // some recognized WR items that have consed-defined formats
   // and whose data is parsed and used

   if ( strcmp( szTypeSaved, "primer" ) == 0 ) {
      parsePrimerWRItem( nDataLine, pLocFrag, filPHD );
   }
   else if ( strcmp( szTypeSaved, "template" ) == 0 ) {
      parseTemplateWRItem( nDataLine, pLocFrag, filPHD );
   }
   else if ( strcmp( szTypeSaved, "expid" ) == 0 ) {
      parseExpidWRItem( nDataLine, pLocFrag, filPHD );
   }
   else if ( strcmp( szTypeSaved, "referenceSequence" ) == 0 ) {
      pLocFrag->bIsAFakeRead_ = true;
   }


   // cut off final carriage return from soData

   soData = soData.strip( RWCString::TRAILING, '\n' );


   pWR = new wholeReadItem( pLocFrag,
                                szTypeSaved,
                                szSourceSaved,
                                szDateSaved,
                                soData );


   if ( strcmp( szTypeSaved, "referenceSequence" ) == 0 ) {
      // this read is a reference sequence

      pLocFrag->pContig_->pReferenceSequence_ = pLocFrag;
   }


}


// PARSE_PANIC_WR_LINE2( szAdditionalMessage )
// Variant of PARSE_PANIC_WR_LINE for use inside phdBall2Fasta member
// functions: the file name and line number come from filPhdBall_ and
// nLine_.  The saved header line is still taken from the file-level
// szLine2.
#define PARSE_PANIC_WR_LINE2( szAdditionalMessage ) \
{ \
   ostrstream ost; \
   ost << "error in phdball " << filPhdBall_ << " at line " << \
   nLine_ << \
   " detected from consed source file " << __FILE__ \
       << " at " << __LINE__ << \
   ".  Line:\n" << szLine2 << "\n" << \
   "should be of form: (whole read info item type) (source) (date in form YYMMDD:HHMISS)\n " << \
   szAdditionalMessage << endl << ends; \
   InputDataError ide( ost.str() ); \
   throw ide; }

// PARSE_PANIC2( message )
// Variant of PARSE_PANIC for use inside phdBall2Fasta member functions:
// the file name, line number and current line come from filPhdBall_,
// nLine_ and szLine_.  The text is thrown wrapped in an InputDataError.
#define PARSE_PANIC2( message ) \
{  ostrstream ost; \
   ost << "Error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "error in phd file " << filPhdBall_ << " at line " << \
   nLine_ << \
   ": \n" \
       << szLine_ << endl \
      << message << endl << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }


// Reads one WR{ } (whole read) item from a phd ball and, for primer and
// template items, lets the item-specific parsers extend soFastaHeader.
// Items of any other type are read and skipped.
//
//   pFil          - open phd ball, positioned just after the "WR{" line
//   soFastaHeader - fasta header that may be extended in place
//
// nLine_ is incremented for every line read.  Throws InputDataError
// (through the PARSE_PANIC*2 macros) on premature end of file, on a header
// line without three tokens, on a header token that does not fit the
// 200-byte save buffers, and on malformed primer / template data.
void phdBall2Fasta :: readWholeReadItemForPhdBall2Fasta( FILE* pFil, 
                                                         RWCString& soFastaHeader ) {

   // this assumes that we've already read the WR{ line

   if ( fgets( szLine_, nMaxLineSize2, pFil ) == NULL ) {
      PARSE_PANIC2( "premature end of file while looking for body of WR{ block" );
   }
   else
      ++nLine_;

   // Keep an unmodified copy of the header line for error messages.
   // szLine_ is sized by nMaxLineSize2, which is declared elsewhere and
   // need not match the size of szLine2, so the copy is bounded.
   size_t nHeaderLength = strlen( szLine_ );
   if ( nHeaderLength > sizeof( szLine2 ) - 1 )
      nHeaderLength = sizeof( szLine2 ) - 1;
   memcpy( szLine2, szLine_, nHeaderLength );
   szLine2[ nHeaderLength ] = '\0';

   char* szType = strtok( szLine_, " " );
   char* szSource = strtok( NULL, " " );
   char* szDate = strtok( NULL, "\n " );

   if ( !szType ) PARSE_PANIC_WR_LINE2( "could not find whole read info item type" );
   if ( !szSource ) PARSE_PANIC_WR_LINE2( "could not find whole read info item source" );
   if ( !szDate ) PARSE_PANIC_WR_LINE2( "could not find date" );

   // The tokens come straight from the file while the save buffers hold
   // 200 bytes; reject anything that would not fit instead of writing
   // past the end.
   if ( strlen( szType ) >= sizeof( szTypeSaved ) )
      PARSE_PANIC_WR_LINE2( "whole read info item type is too long" );
   if ( strlen( szSource ) >= sizeof( szSourceSaved ) )
      PARSE_PANIC_WR_LINE2( "whole read info item source is too long" );
   if ( strlen( szDate ) >= sizeof( szDateSaved ) )
      PARSE_PANIC_WR_LINE2( "date is too long" );

   // save the data in 3 different buffers since szLine_ will now be reused

   strcpy( szTypeSaved, szType );
   strcpy( szSourceSaved, szSource );
   strcpy( szDateSaved, szDate );

   // Only the first nMaxDataLines data lines are kept; the complete data
   // text is not needed for the fasta header, so it is not accumulated.
   int nDataLine = 0;
   bool bFoundEnd = false;
   do {
      if ( fgets( szLine_, nMaxLineSize2, pFil ) == NULL ) {
         PARSE_PANIC2( "premature end of file while looking for end of WR{ block which should be terminated with a }" );
      }
      else 
         ++nLine_;

      // isspace requires a value representable as unsigned char; a plain
      // char can be negative for bytes above 0x7f
      if ( szLine_[0] == '}' &&
           isspace( static_cast<unsigned char>( szLine_[1] ) ) )
         bFoundEnd = true;
      else {
         if ( nDataLine < nMaxDataLines ) {
            soDataLine[ nDataLine ] = szLine_;
            ++nDataLine;
         }
      }
   } while( !bFoundEnd );

   // some recognized Wr items that have consed-defined formats
   // and whose data is parsed and used

   if ( strcmp( szTypeSaved, "primer" ) == 0 ) {
      parsePrimerWRItemForPhdBall2Fasta( nDataLine, soFastaHeader );
   }
   else if ( strcmp( szTypeSaved, "template" ) == 0 ) {
      parseTemplateWRItemForPhdBall2Fasta( nDataLine, soFastaHeader );
   }

}