/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "addNewReadsWithExistingAlignments.h"
#include    "consedParameters.h"
#include    "filename.h"
#include    "mbt_exception.h"
#include    "consed.h"
#include    <stdio.h>
#include    <iostream.h>
#include    <math.h>
#include    "maybeTerminateIfAnotherReadWriteConsed.h"
#include    "addNewReads.h"
#include    "filGetUniqueFilename.h"
#include    "fileDefines.h"
#include    "soLine.h"
#include    "printTime.h"
#include    "rwctokenizer.h"
#include    "filFindSolexaPrbFileFromSeqFile.h"
#include    "szGetTime.h"
#include    "bIsNumericFloat.h"
#include    "nNumberFromBase2.h"
#include    "soGetDateTime.h"
#include    "soLine.h"
#include    "rwcregexp.h"
#include    "crossMatchInfoForRead.h"
#include    "soGetChemistry.h"


// non-solexa call chain:
// doIt calls
// parseAlignmentsAndAddNewReads which calls
// readBasesAndQualitiesForAddedReads() which calls
// readFileOfPhdFilesForAddedReads



addNewReadsWithExistingAlignments :: addNewReadsWithExistingAlignments( 
                                      const FileName& filAceFileToOpen,
                                      const FileName& filNewAceFile,
                                      const FileName& filFOFOfAlignmentFiles,
                                      const RWCString& soChemistry )
 :
      filAceFileToOpen_( filAceFileToOpen ),
      filFOFOfAlignmentFiles_( filFOFOfAlignmentFiles ),
      filNewAceFile_( filNewAceFile ) {

      if ( soChemistry == "solexa" ) {
         nChemistry_ = nSolexa;
      }
      else if ( soChemistry == "454" ) {
         nChemistry_ = n454;
      }
      else if ( soChemistry == "sanger" ) {
         nChemistry_ = nSanger;
      }
      else if ( soChemistry == "unknown" ) {
         // added July 2010 for aligning fake reads (from fasta files)
         // against existing contigs

         nChemistry_ = nUnknown;
      }
      else {
         THROW_ERROR( "unrecognized chemistry: " + soChemistry );
      }


}





void addNewReadsWithExistingAlignments :: doIt() {

   pCP->bOKToUseGui_ = false;

   // I have not used this file and it is just a nuisance to have to delete
   // this zero-length file with every run.
//    pCP->filAutoFinishFullOutput_ =
//       filGetUniqueFilename( filAceFileToOpen_.soGetProjectFromAceFilename() )
//       + ".out";

//    pAO = fopen( pCP->filAutoFinishFullOutput_.data(), "w" );
//    if ( !pAO ) {
//       RWCString soError = "could not open log file ";
//       soError += pCP->filAutoFinishFullOutput_;
//       soError += " for writing.";
//       THROW_ERROR( soError );
//    }

//    pFILE = pAO;

//    FILE* pFOF = fopen( "addNewReads.fof", "w" );
//    if ( pFOF ) {
//       // just ignore the problem if we can't write to this file

//       fprintf( pFOF, "%s\n", pCP->filAutoFinishFullOutput_.data() );
//       fclose( pFOF );
//    }


   try {

      checkThatCanReadAllAlignmentFiles();

      // create the ConsEd object, which owns and manages the Assembly
      // data structures as well as the lists of currently existing
      // ContigWin and Teditor objects (which there will be none in
      // this case).  There is a global point to this sole instance,
      // set in the ctor.

      ConsEd* pConsed = new ConsEd();

      // This is perhaps archaic--it might not be used at all.  So why
      // force user to create it?
      // terminateIfNoPhdDir();

      maybeTerminateIfAnotherReadWriteConsed();

      // this does "new Assembly"
      ConsEd::pGetConsEd()->openAssemblyFile( filAceFileToOpen_ );

      parseAlignmentsAndAddNewReads();

      printTime( "now saving assembly..." );
      saveAssembly();
      printTime( "done" );
   }
   catch( ExceptionBase eb ) {
      if ( !eb.bUserNotified() ) {
         fprintf( pAO,
                  "Fatal:  exception thrown: %s\n",
                  eb.szGetDesc() );

      }

      fclose( pAO );

      cerr << "See log file: " << pCP->filAutoFinishFullOutput_ << endl;

      exit( 1 );
   }

   fclose( pAO );
   cerr << "See log file: " << pCP->filAutoFinishFullOutput_ << endl;

   // ring the bell
   for( int n = 0; n < 100; ++n ) {
      fprintf( stderr, "\a\a\a\a\a\a" );
      fflush( stderr );
   }

}





void addNewReadsWithExistingAlignments :: parseAlignmentsAndAddNewReads() {

   FILE* pFOFOfAlignmentFiles = fopen( filFOFOfAlignmentFiles_.data(), "r" );
   if ( !pFOFOfAlignmentFiles ) {

      THROW_FILE_ERROR( filFOFOfAlignmentFiles_ );
   }

   // count alignment files

   int nTotalAlignmentFiles = 0;
   char c;
   while( ( c = getc( pFOFOfAlignmentFiles ) ) != EOF ) {
      if ( c == '\n' ) {
         ++nTotalAlignmentFiles;
      }
   }


   rewind( pFOFOfAlignmentFiles );


   const bool bUsingAddReads2ConsedScript = false;
   pAddNewReads_ = new addNewReads( bUsingAddReads2ConsedScript );
   pAddNewReads_->pAddNewReadsWithExistingAlignments_ = this;



   //         pAddNewReads_->zeroAddNewReadsContigCounts();

   int nPreallocatedSize = 
      pCP->nSolexaAlignmentFilesPerInsertingPadsCycle_ * 
      pCP->nSolexaAlignmentsPerAlignmentFile_;
         
   // add 20% more room

   nPreallocatedSize += nPreallocatedSize / 5;

   pAddNewReads_->aCrossMatchInfoForReads_.resize( nPreallocatedSize );


   
   int nAlignmentFiles = 0;
   int nAlignmentFilesReadSoFarBeforeInsertingPads = 0;


   while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles ) ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );
      soLine.stripTrailingWhitespaceFast();

      ++nAlignmentFiles;


      FileName filAlignmentFile = soLine;

      // use some of the code from addNewReads--the interactive method
      // of adding new reads

      ++nAlignmentFilesReadSoFarBeforeInsertingPads;

      if ( nAlignmentFilesReadSoFarBeforeInsertingPads == 1 ) {

      }

      assert( pAddNewReads_ );

      pAddNewReads_->readAlignmentFile( filAlignmentFile );

      fprintf( stdout, "read %d of %d alignment files with %d total alignments\n",
                  nAlignmentFiles,
                  nTotalAlignmentFiles,
                  pAddNewReads_->aCrossMatchInfoForReads_.length() );


      pAddNewReads_->readBasesAndQualitiesForAddedReads();

   } //    while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles )...

   fclose( pFOFOfAlignmentFiles );


   // no longer supported--superceded by consed -fixContigEnds (Oct 2010)
//    if ( pCP->bAddNewReadsExtendConsensusUsingProtrudingNewReads_  ) {
//       pAddNewReads_->runMiniAssembliesOnProtrudingReads();
//    }


   pAddNewReads_->insertPadsInContigs();

   // this adds the read to the contig
   pAddNewReads_->insertPadsInNewReadsAndSetReadBases();

   pAddNewReads_->convertVectorTagsToXsInNewReads();


//          RWCString soMessage;
//          pAddNewReads_->tellUserAllAboutIt2( soMessage );
//          fprintf( pAO, "%s", soMessage.data() );
      
         // there are too many of these, too many reads, no one will use it,
         // and it takes too much disk space

         //      pAddNewReads_->createNavigationFile();

   if ( pCP->bAddNewReadsRecalculateConsensusQuality_ ) {
      pAddNewReads_->recalculateConsensusQualityValues();
   }


} // parseAlignmentsAndAddNewReads


   
void addNewReadsWithExistingAlignments :: checkThatCanReadAllAlignmentFiles() {

   FILE* pFOFOfAlignmentFiles = fopen( filFOFOfAlignmentFiles_.data(), "r" );
   if ( !pFOFOfAlignmentFiles ) {

      THROW_FILE_ERROR( filFOFOfAlignmentFiles_ );
   }

   while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles ) ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      FileName filAlignmentFile = soLine;

      FILE* pAlignmentFile = fopen( filAlignmentFile.data(), "r" );
      if ( !pAlignmentFile ) {
         THROW_FILE_ERROR( filAlignmentFile );
      }

      fclose( pAlignmentFile );
   }

   fclose( pFOFOfAlignmentFiles );

}



void addNewReadsWithExistingAlignments :: saveAssembly() {

   if ( filNewAceFile_.isNull() ) {
      filNewAceFile_ = filAceFileToOpen_.filGetNextVersion();

      // find next version that doesn't already exist

      while( filNewAceFile_.bFileByThisNameExists() ) {
         filNewAceFile_ = filNewAceFile_.filGetNextVersion();
      }
   }

   ConsEd::pGetAssembly()->saveToFile( filNewAceFile_,
                                          false ); // ace file format2

   RWCString soAceFileToPrint = filNewAceFile_;

   // this mismash means the path starts with ./(dkdkdkd) where the
   // (dkdkdkd) part should not have any "/" in it

   RWCRegexp regPattern( "^\\./[^/]+$" );
   if ( soAceFileToPrint.bContains( regPattern ) ) {
      soAceFileToPrint.bStartsWithAndRemove( "./" );
   }

   cerr << "See new ace file " << soAceFileToPrint << endl;
}