/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    <stdlib.h>
#include    <stdio.h>
#include    "singletInfo.h"
#include    "readPHDAgainForHighQualitySegment.h"
#include    <math.h>
#include    "filename.h"
#include    "mbt_exception.h"
#include    "assert.h"

// This differs from readPHD in that it doesn't bother to read the
// bases or point positions--just the qualities.  It doesn't even save
// the qualities, but just uses them to calculate the high quality
// segment--the only thing it returns.  It is only used to get the
// high quality segment for singlets.


// quality 13 has an error rate of 0.050118723 so
// 0.05 - (error rate of quality 13 base) = neg number
// Hence quality 13 bases on the end of a high quality segment will
// not be included in it.

static double dAmountToSubtractFrom = 0.05;


#define PARSE_PANIC( szMessage ) \
{  ostrstream ost; \
   ost << "phd file error detected from source file " \
     << __FILE__ << " at " << __LINE__ <<endl; \
   ost << "Error in file " << pSingletFromAutoFinish->filPHD_ << " at line " <<  \
       nLine << "  Line:\n" \
      << szLine << "\n" << szMessage << endl \
        << ends; \
   InputDataError ide(ost.str()); \
   throw ide; }
      

#define PARSE_PANIC_1_ARG( szMessage, arg1 ) \
{ \
     char szTemp[500]; \
     sprintf( szTemp, szMessage, arg1 ); \
     PARSE_PANIC( szTemp ); \
}

const int nMaxLineSize = 300;
static char  szLine[ nMaxLineSize + 1];
static int nLine;


void readPHDAgainForHighQualitySegment( 
                   singletInfo* pSingletFromAutoFinish,
                   FILE* pFil ) {

   rewind( pFil );

   // look for BEGIN_DNA

   bool bFoundBEGIN_DNA = false;
   while( !bFoundBEGIN_DNA ) {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" );
      }
      else {
         ++nLine;
      }

      if ( memcmp( szLine, "BEGIN_DNA", 9 ) == 0 )
         bFoundBEGIN_DNA = true;
   }


   double dSumFromBeginning = 0.0;

   // these values are set so that, if the first base is quality 14 or more,
   // the high quality segment can start from base 1

   // if there is a negative scoring segment starting at the beginning,
   // that segment will replace this least sum of 0.  This is usually the
   // case.

   double dLeastSumFromBeginning = 0.0;
   int nLocationOfLeastSumFromBeginning = 0;
   double dBestSumSegment = 0.0;
   int nBeginningOfBestSumSegment = -1;
   int nEndOfBestSumSegment = -1;

   // anticipating ++nLine being the first base line
   int nLineOfFirstBaseLine = nLine + 1;
   
   // look for END_DNA

   bool bContinueLookingForEndDNA = true;
   do {
      if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) {
         PARSE_PANIC( "premature end of file while looking for END_DNA" );
      }
      ++nLine;

      if ( szLine[0] == 'E' &&
           szLine[1] == 'N' &&
           szLine[2] == 'D' &&
           szLine[3] == '_' &&
           szLine[4] == 'D' &&
           szLine[5] == 'N' &&
           szLine[6] == 'A' )
         bContinueLookingForEndDNA = false;
      else {
         double dQuality;
         assert( szLine[1] == ' ' &&
              '0' <= szLine[2] && szLine[2] <= '9' );
         int nQuality = ( szLine[2] - '0' );
         int n = 3;
         while( szLine[n] != ' ' ) {
            assert( '0' <= szLine[n] && szLine[n] <= '9' );
            nQuality = 10*nQuality + ( szLine[n] - '0' );
            ++n;
         }

         double dAdjustedError = dAmountToSubtractFrom -
            pow( 10.0, ( (double) -nQuality) / 10.0 );
         
         dSumFromBeginning += dAdjustedError;

         if ( dSumFromBeginning < dLeastSumFromBeginning ) {
            dLeastSumFromBeginning = dSumFromBeginning;
            nLocationOfLeastSumFromBeginning = 
               // nLine - nLineOfFirstBaseLine + 1
               // the reason for the +1 is that we need to make the number
               // be 1-based rather than 0-based
               nLine - nLineOfFirstBaseLine + 1;
         }
         
         if ( dSumFromBeginning - dLeastSumFromBeginning > dBestSumSegment ) {
            dBestSumSegment = dSumFromBeginning - dLeastSumFromBeginning;
            // the high quality segment starts the base *after* the
            // base that makes the least sum base
            nBeginningOfBestSumSegment = nLocationOfLeastSumFromBeginning + 1;
            nEndOfBestSumSegment = 
               nLine - nLineOfFirstBaseLine + 1;

         }
      }

   } while ( bContinueLookingForEndDNA );
   
   // nLine is END_DNA line 
   // e.g.,
   // BEGIN_DNA
   // a 2 12   <-nLineOfFirstBaseLine
   // g 2 15
   // END_DNA  <-nLine
   
   pSingletFromAutoFinish->nNumberOfBases_ = nLine - nLineOfFirstBaseLine;

   pSingletFromAutoFinish->nHighQualitySegmentStart_ = 
      nBeginningOfBestSumSegment;
   pSingletFromAutoFinish->nHighQualitySegmentEnd_ = 
      nEndOfBestSumSegment;

}