/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "sff2PhdBall.h"
#include    "rwcstring.h"
#include    "soLine.h"
#include    "seqJustBases.h"
#include    "basesAndQualitiesAndPeaks.h"
#include    "matchLinker.h"
#include    "rwtptrorderedvector.h"
#include    "rwtvalorderedvector.h"
#include    "readOneSffFile.h"
#include    "consedParameters.h"
#include    "findQueryWithinSubject.h"
#include    "complement_so.h"
#include    "getNextToken.h"
#include    "rwtvalvector.h"
#include    "soAddCommas.h"
#include    "bIntersect.h"
#include    "szGetTime.h"
#include    "soGetDateTime.h"
#include    "soLine.h"
#include    "rwctokenizer.h"
#include    <unistd.h>
#include    <ctype.h>
#include    "rwtvalorderedvector.h"
#include    "mbtValOrderedVectorOfRWCString.h"


// Jim Knight's rules allow at worse case indels + mismatches*2 <= 5
// so there can be no more than 5 indels.  However, the score can be 
// worse than L (linker length) - 5 since you can also have 2 mismatches and 
// 1 insertion in the subject giving a L-3 for the # of matches so score of
// L-3 - 2*2 -1*1 = L-6  If we are allowing this to be the worst score, then
// this worst score can also be found by 6 insertions in the subject.  Thus
// we need to allow 6 indels.

// I have found that Jim Knight's rules have a problem:  since mismatches
// count 2 and indels as 1, you get alignments like this:
//gttggaaccgaaagggtttgaattca*aaccctttcggttccaac
//                          x x                
//gttggaaccgaaagggtttgaattcaga*ccctttcggttccaac
//
// in which there clearly is a single base error in the read
//gttggaaccgaaagggtttgaattcaaaccctttcggttccaac
//                          x   
//gttggaaccgaaagggtttgaattcagaccctttcggttccaac
// but the scoring system is allowing 2 indels to be used instead.

// Thus I'm going to use the following:
// match: 1
// mismatch: 3
// indel: 2

// In this way, the algorithm will prefer a mismatch over 2 indels.
// I will still use Jim Knight's rule of allowing 5 indels or 2 mismatches and 1 indel:
// penalty of 5*2 = 10
// penalty of 2*3+1*2 = 8

// Thus the worst score for a linker will be L - 10 or 
// L - 2 - 8 (since if 2 bases are mismatching they can't be matching
// so L-2 is the number of matches and -8 is the penalty for the 2 mismatches and 1 indel   


const int nMaxIndels = 5;

const int nMinimumReadLength = 15;


void sff2PhdBall :: doIt() {

   soPrefixForTemporaryFiles_ = filSffFullPath_.soGetBasename() + "." +
      soGetDateTime( nDotInMiddle );


   pPhdBall_ = fopen( filNewPhdBallFullPath_.data(), "w" );
   if ( !pPhdBall_ ) {
      THROW_FILE_ERROR( filNewPhdBallFullPath_ );
   }


   if ( bUseOnlyCertainReads_ ) {
      FILE* pFOF = fopen( filFOFOfReadsToUse_.data(), "r" );
      if ( !pFOF ) {
         THROW_FILE_ERROR( filFOFOfReadsToUse_ );
      }

      while( fgets( soLine.data(), nMaxLineSize, pFOF ) != NULL ) {
         soLine.nCurrentLength_ = strlen( soLine.data() );

         soLine.stripTrailingWhitespaceFast();

         aReadsToUse_.insert( soLine );
      }

      fclose( pFOF );
   }


   // get linker sequences
   
   RWTPtrOrderedVector<seqJustBases> aArrayOfLinkerSequences;

   readLinkerSequencesAndComplement( aArrayOfLinkerSequences );


   RWTValOrderedVector<int> aMinScoreForLinker( 
         (size_t) aArrayOfLinkerSequences.length() );

   setMinScoreForLinkers( aArrayOfLinkerSequences, aMinScoreForLinker );

   aRawSffReads_.soName_ = "aRawSffReads_";

   if ( bUseOnlyCertainReads_ ) {
      cerr << "extracting reads from sff file " << filSffFullPath_ << 
         " that are listed in file " << filFOFOfReadsToUse_ << endl;
   }
   else {
      cerr << "extracting all reads from sff file " <<
         filSffFullPath_ << endl;
   }

   readOneSffFile( filSffFullPath_.data(),
                   aRawSffReads_,
                   bUseOnlyCertainReads_,
                   aReadsToUse_
                   );

   cerr << "found " << aRawSffReads_.length() << " reads" << endl;

   cerr << "now looking for matches with linkers..." << endl;


   RWTValOrderedVector<matchLinker> aMatches( aRawSffReads_.length(),
                                          "aMatches" );

   // they are already initialized so that nNumberOfMatches_ = 0 for
   // each match object
   aMatches.nCurrentLength_ = aRawSffReads_.length();
   

   // now it has become clear that the Ntide method of holding 
   // the sequence is not a good one.  Better is fastq with
   // 2 RWCString arrays


   int nMatchesSoFar = 0;

   for( int nRead = 0; nRead < aRawSffReads_.length(); ++nRead ) {

      if ( nRead % 10000 == 0 ) {
         int nPerCentDone = ( nRead + 1 ) * 100 / 
            ( aRawSffReads_.length() + 1 );
         cerr << nPerCentDone << "% done.  Matches so far: " << 
            soAddCommas( nMatchesSoFar ) << " out of " << 
            soAddCommas( nRead + 1 ) << " or " <<
            nMatchesSoFar * 100 / ( nRead + 1 ) << "%" << endl;
      }


      basesAndQualitiesAndPeaks* pSeqRead = aRawSffReads_[ nRead ];

      for( int nLinker = 0; nLinker < aArrayOfLinkerSequences.length(); 
           ++nLinker ) {

         seqJustBases* pSeqLinker = aArrayOfLinkerSequences[ nLinker ];

         bool bFoundMatch = false;

         int n0SubjectStart;
         int n0SubjectEnd;
         int nScore;


         findQueryWithinSubject( pSeqLinker->soBases_,
                                 pSeqRead->soBases_,
                                 pCP->n454LinkerAlignmentIndelScore_,
                                 pCP->n454LinkerAlignmentMatchScore_,
                                 pCP->n454LinkerAlignmentMismatchScore_,
                                 aMinScoreForLinker[ nLinker ],
                                 nMaxIndels,
                                 bFoundMatch,
                                 n0SubjectStart,
                                 n0SubjectEnd,
                                 nScore );

         if ( bFoundMatch ) {

            ++nMatchesSoFar;

            matchLinker& myMatch = aMatches[ nRead ];

            if ( myMatch.nNumberOfMatches_ == 0 ||
                 nScore > myMatch.nScore_ ) {
               ++myMatch.nNumberOfMatches_;
               myMatch.nScore_ = nScore;
               myMatch.nLinkerOfMatch_ = nLinker;
               myMatch.n0SubjectStart_ = n0SubjectStart;
               myMatch.n0SubjectEnd_ = n0SubjectEnd;
            }
            else if ( myMatch.nNumberOfMatches_ != 0 &&
                      nScore <= myMatch.nScore_ ) {
               ++myMatch.nNumberOfMatches_;
            }


            // make matching region in read x's and try again.  If 
            // the sequence matches in 2 different places, then 
            // we will reject the read.

            RWCString soCopyOfBases = pSeqRead->soBases_;

            for( int n0 = n0SubjectStart; n0 <= n0SubjectEnd; ++n0 ) {
               soCopyOfBases[n0] = 'x';
            }
            
            bFoundMatch = false;
            findQueryWithinSubject( pSeqLinker->soBases_,
                                    soCopyOfBases,
                                    pCP->n454LinkerAlignmentIndelScore_,
                                    pCP->n454LinkerAlignmentMatchScore_,
                                    pCP->n454LinkerAlignmentMismatchScore_,
                                    aMinScoreForLinker[ nLinker ],
                                    nMaxIndels,
                                    bFoundMatch,
                                    n0SubjectStart,
                                    n0SubjectEnd,
                                    nScore );

            if ( bFoundMatch ) {

               // bad--found twice

               ++myMatch.nNumberOfMatches_;

            }
         } //  if ( bFoundMatch ) {
      }
   }


   int nReadsWithMatchesToLinkers = 0;

   int nMatch;
   for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) {
      if ( aMatches[ nMatch ].nNumberOfMatches_ > 0 )
         ++nReadsWithMatchesToLinkers;
   }

   bool bPairedReadDataset = false;
   if ( nReadsWithMatchesToLinkers > aRawSffReads_.length() / 4 ) {
      bPairedReadDataset = true;
   }


   // this section below is just for information

   int nNumberWithNoMatches = 0;

   RWTValOrderedVector<int> aLinkerHistogram( 4, "aLinkerHistogram" );

   aLinkerHistogram.increaseCurrentLength( 4, 0 );

   RWTValVector<int> aScoreHistogram( 100, 0, "aScoreHistogram" );


   for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) {
      matchLinker& myMatch = aMatches[ nMatch ];

      if ( myMatch.nNumberOfMatches_ == 0 ) {
         ++nNumberWithNoMatches;
      }
      else {

         // there might be more than 4 values of myMatch.nLinkerOfMatch_
         // if the users added any

         if ( myMatch.nLinkerOfMatch_ >= aLinkerHistogram.length() ) {
            aLinkerHistogram.increaseCurrentLength( 
               myMatch.nLinkerOfMatch_ + 1, 0 );
         }

         ++aLinkerHistogram[ myMatch.nLinkerOfMatch_ ];
         ++aScoreHistogram[ myMatch.nScore_ ];


         /*         printf( "nNumberOfMatches: %d score: %d linker: %d %d - %d\n",
                 myMatch.nNumberOfMatches_,
                 myMatch.nScore_,
                 myMatch.nLinkerOfMatch_,
                 myMatch.n0SubjectStart_,
                 myMatch.n0SubjectEnd_ ); */
      }
   }

   printf( "number with matches: %d number without matches: %d\n",
           aMatches.length() - nNumberWithNoMatches,
           nNumberWithNoMatches );

   printf( "scores:\n" );
   for( int nScore = 0; nScore <= aScoreHistogram.nGetEndIndex(); ++nScore ) {
      if ( aScoreHistogram[ nScore ] != 0 ) {
         printf( "%d: %d\n", nScore, aScoreHistogram[ nScore ] );
      }
   }

   printf( "linkers:\n" );
   for( int nLinker = 0; nLinker < aLinkerHistogram.length(); ++nLinker ) {
      printf( "%d: %d\n", nLinker, aLinkerHistogram[ nLinker ] );
   }


   printf( "number of deletions in linker: %s\n", 
           soAddCommas( pCP->nDebugging_ ).data() );

   printf( "number of deletions in reads: %s\n",
           soAddCommas( pCP->nDebugging2_ ).data() );

   printf( "number of mismatches: %s\n",
           soAddCommas( pCP->nDebugging3_ ).data() );


   // end information output

   const int nLEFT = 0;

   RWTValVector<RWCString> aLeftOrRight( 2, "" );
   aLeftOrRight[0] = pCP->soPaired454LeftReadExtension_;
   aLeftOrRight[1] = pCP->soPaired454RightReadExtension_;


   // this will set it to be big enough for all the reads to be paired reads
   aDesiredReads_.soName_ = "aDesiredReads_";
   aDesiredReads_.resize( aRawSffReads_.length() * 2 );


   assert( aMatches.length() == aRawSffReads_.length() );


   // I'm indexing this by the same integer as the index of aRawSffReads_
   // instead of pointers just to save some space

   aNonPairedReads_.soName_ = "aNonPairedReads_";
   aNonPairedReads_.resize( aRawSffReads_.length() * 2 );


   for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) {
      matchLinker& myMatch = aMatches[ nMatch ];

      // aRawSffReads_ is indexed the same as aMatches
      basesAndQualitiesAndPeaks* pSeqRead = aRawSffReads_[ nMatch ];


      if ( myMatch.nNumberOfMatches_ == 0 ) {

         // such a read is a non-paired read

         aNonPairedReads_.insert( nMatch );

      }
      else if ( myMatch.nNumberOfMatches_ == 1 ) {

         // first check if either the left or the right read is too small

         bool bTooSmall = false;

         int nLeftOrRight;
         for( nLeftOrRight = 0; nLeftOrRight <= 1; ++nLeftOrRight ) {

            int n0Left;
            int n0Right;
            if ( nLeftOrRight == nLEFT ) {
               n0Left = 0;
               n0Right = myMatch.n0SubjectStart_ - 1; 
               // the reason for the -1 is that n0SubjectStart_
               // is the beginning of the linker
            }
            else {
               n0Left = myMatch.n0SubjectEnd_ + 1;
               // the reason for the +1 is that n0SubjectEnd_
               // is the end of the linker
               n0Right = pSeqRead->soBases_.length() - 1;
            }

            if ( !bIntersect( n0Left,
                              n0Right,
                              pSeqRead->n1ClipLeft_ - 1,
                              pSeqRead->n1ClipRight_ - 1,
                              n0Left,
                              n0Right ) ) {
//                cerr << "warning:  read " << pSeqRead->soSequenceName_ <<
//                   " has no untrimmed sequence for read on " <<
//                   aLeftOrRight[ nLeftOrRight ] << endl;

               bTooSmall = true;
            }
            else {

               int nReadLength = n0Right - n0Left + 1;
               if ( nReadLength < nMinimumReadLength ) {
                  bTooSmall = true;
               }
            }
         }

         if ( bTooSmall ) {
            aNonPairedReads_.insert( nMatch );
            continue;
         }


         for( nLeftOrRight = 0; nLeftOrRight <= 1; ++nLeftOrRight ) {

            int n0Left;
            int n0Right;
            if ( nLeftOrRight == nLEFT ) {
               n0Left = 0;
               n0Right = myMatch.n0SubjectStart_ - 1; 
               // the reason for the -1 is that n0SubjectStart_
               // is the beginning of the linker
            }
            else {
               n0Left = myMatch.n0SubjectEnd_ + 1;
               // the reason for the +1 is that n0SubjectEnd_
               // is the end of the linker
               n0Right = pSeqRead->soBases_.length() - 1;
            }

            if ( !bIntersect( n0Left,
                              n0Right,
                              pSeqRead->n1ClipLeft_ - 1,
                              pSeqRead->n1ClipRight_ - 1,
                              n0Left,
                              n0Right ) ) {
               cerr << "warning:  read " << pSeqRead->soSequenceName_ <<
                  " has no untrimmed sequence for read on " <<
                  aLeftOrRight[ nLeftOrRight ] << endl;
               continue;
            }


            basesAndQualitiesAndPeaks* pNewRead = 
               new basesAndQualitiesAndPeaks();

            pNewRead->soSequenceName_ = pSeqRead->soSequenceName_ + 
               aLeftOrRight[ nLeftOrRight ];

            pNewRead->soBases_ = pSeqRead->soBases_( n0Left, 
                                                     n0Right - n0Left + 1 );

            // the clipping positions are just the beginning and end of the
            // read

            pNewRead->n1ClipLeft_ = 1;
            pNewRead->n1ClipRight_ = pNewRead->soBases_.length();

            pNewRead->aQualities_.substring( pSeqRead->aQualities_,
                                             n0Left,
                                             n0Right - n0Left + 1 );

            pNewRead->setPointPosArraySubstring( pSeqRead,
                                                 n0Left,
                                                 n0Right - n0Left + 1 );

            pNewRead->nTraceArrayMaxIndex_ = pSeqRead->nTraceArrayMaxIndex_;

            if ( nLeftOrRight == nLEFT ) {
               pNewRead->reverseComplementThyself();
            }

            aDesiredReads_.insert( pNewRead );

         } //  
      } //       else if ( myMatch.nNumberOfMatches_ == 1 ) {
      

   } //    for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) {


   filterNonPairedReadsAgainstPuc19();

   writePhdBall( aDesiredReads_ );
                 
} // void sff2PhdBall :: doIt() {


void sff2PhdBall :: readLinkerSequencesAndComplement( 
           RWTPtrOrderedVector<seqJustBases>& aArrayOfLinkerSequences ) {

   FILE* pLinkers = fopen( pCP->fil454LinkerSequences_.data(), "r" );
   if ( !pLinkers ) {
      THROW_FILE_ERROR( pCP->fil454LinkerSequences_ );
   }

   bool bStart = true;
   

   seqJustBases* pSeq;

   while( fgets( soLine.data(), nMaxLineSize, pLinkers ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      if ( bStart &&  ( soLine[0] != '>' ) ) {
            RWCString soErrorMessage = pCP->fil454LinkerSequences_ +
               " did not start with > so it must not be in fasta format";
            THROW_ERROR( soErrorMessage );
      }

      if ( soLine[0] == '>' ) {
         if ( bStart ) {
            bStart = false;
         }
         else {
            // finish last sequence--I think there is nothing to finish

         }


         pSeq = new seqJustBases();

         int nTokenStart;
         int nTokenEnd;
         bool bError = false;
         getNextToken( soLine.data(), 
                       1, // start here
                       nTokenStart,
                       nTokenEnd,
                       bError );

         // added June 2009 since Torsten Thomas t.thomas@unsw.edu.au
         // had this problem

         if ( bError ) {
            RWCString soError = pCP->fil454LinkerSequences_ +
               " (linker sequence) has nothing following the > but there should be a sequence name";
            THROW_ERROR( bError );
         }

         //         assert( !bError );

         pSeq->soSequenceName_ = 
            soLine( nTokenStart, nTokenEnd - nTokenStart + 1 );

         aArrayOfLinkerSequences.insert( pSeq );
      }
      else {
         soLine.stripTrailingWhitespaceFast();
         pSeq->soBases_ += soLine.returnToLower();
      }
   } //  while( fgets( soLine.data(), nMaxLineSize, pLinkers ) != NULL ) {

   fclose( pLinkers );

   // now add complements to these

   int nNumberOfUncomplementedSequences = aArrayOfLinkerSequences.length();

   for( int nSeq = 0; nSeq < nNumberOfUncomplementedSequences; ++nSeq ) {
      seqJustBases* pSeq = aArrayOfLinkerSequences[ nSeq ];

      // check if the sequence is palindromic.  If it is, then
      // no need to check for its complement.

      RWCString soReverseComplement = soComplementSO( pSeq->soBases_ );

      if ( soReverseComplement == pSeq->soBases_ ) {
         cerr << "not complementing " << pSeq->soSequenceName_ <<
            " because it is palindromic" << endl;
         continue;
      }

      cerr << "complementing " << pSeq->soSequenceName_ << 
         " because it is not palindromic" << endl;

      seqJustBases* pSeqComp = new seqJustBases();
      pSeqComp->soSequenceName_ = pSeq->soSequenceName_ + ".comp";

      pSeqComp->soBases_ = soComplementSO( pSeq->soBases_ );
      aArrayOfLinkerSequences.insert( pSeqComp );
   }


}

   
void sff2PhdBall :: setMinScoreForLinkers( 
          RWTPtrOrderedVector<seqJustBases> aArrayOfLinkerSequences, 
          RWTValOrderedVector<int>& aMinScoreForLinker ) {


   for( int nLinker = 0; nLinker < aArrayOfLinkerSequences.length(); 
        ++nLinker ) {

      int nLength = aArrayOfLinkerSequences[ nLinker ]->soBases_.length();

      aMinScoreForLinker.insert( nLength - 
                      nMaxIndels* pCP->n454LinkerAlignmentIndelScore_ );


   }
}


void sff2PhdBall :: writePhdBall( const RWTPtrOrderedVector<basesAndQualitiesAndPeaks> aReads ) {

   cerr << "writing phd ball" << endl;

   RWCString soSffFileWithoutPath = filSffFullPath_.soGetBasename();

   char* szTimeStamp = szGetTime();

   RWCString soDateTime = soGetDateTime( nNoSlashes );


   for( int nRead = 0; nRead < aReads.length(); ++nRead ) {

      if ( nRead % 10000 == 0 ) {
         cerr << ".";
         cerr.flush();
      }

      basesAndQualitiesAndPeaks* pSeqRead = aReads[ nRead ];


      fprintf( pPhdBall_, "BEGIN_SEQUENCE %s 1\n\n", pSeqRead->soSequenceName_.data() );
      fprintf( pPhdBall_, "BEGIN_COMMENT\n\n" );

      RWCString soReadNameWithoutLeftOrRight = pSeqRead->soSequenceName_;
      bool bLeft = false;
      bool bRight = false;
      if ( soReadNameWithoutLeftOrRight.bEndsWithAndRemove( 
            pCP->soPaired454LeftReadExtension_ ) ) {
         bLeft = true;
      }
      else if ( soReadNameWithoutLeftOrRight.bEndsWithAndRemove( 
            pCP->soPaired454RightReadExtension_ ) ) {
         bRight = true;
      }

      fprintf( pPhdBall_, "CHROMAT_FILE: sff:%s%s:%s\n",
               ( bLeft ? "-f:" : "" ),
               soSffFileWithoutPath.data(),
               soReadNameWithoutLeftOrRight.data() );

      fprintf( pPhdBall_, "TIME: %s\n", szTimeStamp );

      fprintf( pPhdBall_, "CHEM: 454\n" );
      fprintf( pPhdBall_, "TRACE_ARRAY_MIN_INDEX: 0\n" );
      fprintf( pPhdBall_, "TRACE_ARRAY_MAX_INDEX: %d\n",
               pSeqRead->nTraceArrayMaxIndex_ );

      fprintf( pPhdBall_, "\nEND_COMMENT\n\nBEGIN_DNA\n" );

      assert( 0 <= pSeqRead->n1ClipLeft_ - 1 );

      assert( pSeqRead->n1ClipRight_ <= pSeqRead->soBases_.length() );

      for( int n0Base = pSeqRead->n1ClipLeft_ - 1; 
           n0Base <= pSeqRead->n1ClipRight_ - 1; ++n0Base ) {

         fprintf( pPhdBall_, "%c %d %d\n",
                  tolower( pSeqRead->soBases_[ n0Base ] ),
                  pSeqRead->aQualities_[ n0Base ],
                  pSeqRead->nGetPointPos( n0Base ) );
      }

      fprintf( pPhdBall_, "END_DNA\n\nEND_SEQUENCE\n\n" );


      if ( bLeft || bRight ) {
         // should write WR items so assembly view will know the
         // reads are part of a pair

         // newbler write like this:
         // WR{
         // template newbler 000727:123348
         // name: FGXQUUK01DBP5M
         // lib: 454PairedRead
         // }

         fprintf( pPhdBall_, "WR{\n" );
         fprintf( pPhdBall_, "template consed-sff2PhdBall %s\n", 
                  soDateTime.data() );

         fprintf( pPhdBall_, "name: %s\n", 
                  soReadNameWithoutLeftOrRight.data()  );

         fprintf( pPhdBall_, "lib: 454PairedRead\n" );
         fprintf( pPhdBall_, "}\n\n" );

         // WR{
         // primer newbler 000727:123348
         // type: univ fwd
         // }

         fprintf( pPhdBall_, "WR{\n" );
         fprintf( pPhdBall_, "primer consed-sff2PhdBall %s\n", 
                  soDateTime.data() );
         
         if ( bLeft ) {
            fprintf( pPhdBall_, "type: univ fwd\n" );
         }
         else {
            fprintf( pPhdBall_, "type: univ rev\n" );
         }

         fprintf( pPhdBall_, "}\n\n" );

      }


   }

   fclose( pPhdBall_ );

   cerr << endl;

   cerr << "see " << filNewPhdBallFullPath_ << endl;
               
}


void sff2PhdBall :: filterNonPairedReadsAgainstPuc19() {


   printf("number of non-paired reads: %d\n", aNonPairedReads_.length() );

   // fix for Cliff Han (LANL) bug in which all reads are paired
   if ( aNonPairedReads_.length() == 0 ) return;

   // write fasta file for use for searching non-paired reads against puc19

   openFastaForCrossMatch();

   for( int n = 0; n < aNonPairedReads_.length(); ++n ) {
      int nIndex = aNonPairedReads_[n];
      basesAndQualitiesAndPeaks* pRead = aRawSffReads_[ nIndex ];

      writeToFastaForCrossMatch( pRead );
   }
   
   fclose( pFasta_ );

   runCrossMatch();

   readCrossMatchOutputAndAddToDesiredReads();

}


void sff2PhdBall :: openFastaForCrossMatch() {

   filFastaToCrossMatch_ = soPrefixForTemporaryFiles_ + ".fa";

   pFasta_ = fopen( filFastaToCrossMatch_.data(), "w" );
   if ( !pFasta_ ) {

      THROW_FILE_ERROR( filFastaToCrossMatch_ + " for screening against puc19" );
   }

}


void sff2PhdBall :: writeToFastaForCrossMatch( basesAndQualitiesAndPeaks* pRead ) {
   
   fprintf( pFasta_, ">%s\n", pRead->soSequenceName_.data() );

   const int nMaxBasesOnLine = 50;

   for( int nBase = 0; nBase < pRead->soBases_.length(); 
        nBase += nMaxBasesOnLine ) {
      

      int nNumberOfBasesOnLine = nMaxBasesOnLine;
      if ( nBase + nNumberOfBasesOnLine > pRead->soBases_.length() ) {
         // check:  if length() = 20, and nBase = 19, then we should
         // write 1 base and 20 - 19 = 1
         nNumberOfBasesOnLine = pRead->soBases_.length() - nBase;
      }

      fprintf( pFasta_, "%.*s\n", nNumberOfBasesOnLine, 
               pRead->soBases_.data() + nBase );
   }

}


void sff2PhdBall :: runCrossMatch() {

   filCrossMatchLinkerOutput_ = soPrefixForTemporaryFiles_ + ".linker.cross";
   filCrossMatchPucOutput_ = soPrefixForTemporaryFiles_ + ".puc.cross";

   RWCString soCommand = 
      pCP->filFullPathnameOfFilter454ReadsScript_ +
      " " + filFastaToCrossMatch_ + 
      " " + pCP->fil454LinkerSequences_ +
      " " + pCP->filFilter454ReadsAgainstThis_ + 
      " " + filCrossMatchLinkerOutput_ +
      " " + filCrossMatchPucOutput_;

   int nRetStat = system( soCommand.data() );

   if ( nRetStat != 0 ) {
      RWCString soError = "Something went wrong running ";
      soError += soCommand;
      soError += "  Gave error code ";
      soError += soGetErrno();
      soError += "\nSee xterm for more detail on what went wrong or trying running the command directly from the command line.";
      THROW_ERROR( soError );
   }
}


void sff2PhdBall :: readCrossMatchOutputAndAddToDesiredReads() {


   FILE* pPucCross = fopen( filCrossMatchPucOutput_.data(), "r" );
   if ( !pPucCross ) {
      THROW_FILE_ERROR( filCrossMatchPucOutput_ + " cross_match output file" );
   }

   cerr << "reading " << filCrossMatchPucOutput_ << endl;

   int nNumberOfMatchingReads = 0;

   while( fgets( soLine.data(), nMaxLineSize, pPucCross ) ) {

      if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) == 0 ) {
         ++nNumberOfMatchingReads;
      }
   }

   
   mbtValOrderedVectorOfRWCString aMatchingReads( (size_t) nNumberOfMatchingReads );


   rewind( pPucCross );

   while( fgets( soLine.data(), nMaxLineSize, pPucCross ) ) {

      if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) != 0 ) continue;

      soLine.nCurrentLength_ = strlen( soLine.data() );

      // looks like this:
      // 

   // ALIGNMENT    25  3.57 0.00 0.00  s_1_1_886_8        1    28 (0)  C chr5   (854045) 180003821 180003794  
   // 0            1   2    3    4     5                  6    7  8    9 10      11      12        13
   // ALIGNMENT    27  0.00 0.00 0.00  s_1_1_300_628        1    28 (0)  C chr5   (126239857) 54618009 54617982  
   // 0            1   2    3    4     5                    6    7  8    9 10     11          12       13

      // ALIGNMENT   148  0.00 0.00 0.00  puc19.454        1   153 (2533)    FGXQUUK01D4S1E      122   274 (0
      // 0           1    2    3    4     5                6   7   8         9                   10    11  12


      // really all we need is whether the read matches or not

      RWCTokenizer tok( soLine );
      for( int n = 0; n < 9; ++n ) {
         tok();
      }

      RWCString soRead = tok();
      if ( soRead == "C" ) {
         soRead = tok();
      }

      aMatchingReads.insert( soRead );
   }

   fclose( pPucCross );


   cerr << "start sorting" << endl;

   aMatchingReads.resort();

   cerr << "done sorting" << endl;

   // now make a pass through the nonpaired reads, only making it a aDesired
   // read if it has no match to puc19.

   int nReadsMatchingPuc19 = 0;
   int nReadsNotMatchingPuc19 = 0;


   aNonPairedReadsHasPuc_.reshape( aNonPairedReads_.length(), false );

   for( int n = 0; n < aNonPairedReads_.length(); ++n ) {
      int nIndex = aNonPairedReads_[n];
      RWCString soReadName = aRawSffReads_[ nIndex ]->soSequenceName_;

      if ( aMatchingReads.bContains( soReadName ) ) {
         aNonPairedReadsHasPuc_[n] = true;
         ++nReadsMatchingPuc19;
      }
      else {
         ++nReadsNotMatchingPuc19;
      }
   }
   
   cerr << nReadsNotMatchingPuc19 << " unpaired reads not matching puc19" << endl;
   cerr << nReadsMatchingPuc19 << " unpaired reads matching puc19" << endl;
   cerr << "done passing through the full list of nonpaired reads" << endl;


   // now look at linker matches to see if the clipping positions
   // need to be changed


   FILE* pLinkerCross = fopen( filCrossMatchLinkerOutput_.data(), "r" );
   if ( !pLinkerCross ) {
      THROW_FILE_ERROR( filCrossMatchLinkerOutput_ + " cross_match output file" );
   }

   cerr << "reading " << filCrossMatchLinkerOutput_ << endl;

   int nNumberOfUnpairedReadsMatchingLinker = 0;
   int nNonPairedReadsIndex = -1; // will first ++

   RWCString soLastRead = "";

   int nLine = 0;
   while( fgets( soLine.data(), nMaxLineSize, pLinkerCross ) ) {
      ++nLine;
      
      if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) != 0 ) continue;

      ++nNumberOfUnpairedReadsMatchingLinker;

      soLine.nCurrentLength_ = strlen( soLine.data() );

      // looks like this:
      // 


      // ALIGNMENT    24  0.00 0.00 0.00  FGXQUUK01AXTGF       27    51 (239)    linker       20    44 (0)  
      // 0            1    2   3    4     5                    6     7
      // ALIGNMENT    22  0.00 0.00 9.76  FGXQUUK01EDYEW      195   235 (34)  C linker   (7)    37     1  


      RWCTokenizer tok( soLine );
      
      for( int n = 0; n <= 4; ++n ) {
         tok();
      }

      RWCString soRead = tok();

      if ( soRead == soLastRead ) {

         cerr << "duplicated read " << soRead << " line " << nLine << endl;

         continue;
      }

      RWCString soReadStart = tok();
      RWCString soReadEnd = tok();
      RWCString soLeftover = tok();

      int n1ReadStart = atoi( soReadStart.data() );
      int n1ReadEnd = atoi( soReadEnd.data() );

      // find this read in the list of aNonPairedReads_

      basesAndQualitiesAndPeaks* pSeq;
      bool bFoundRead = false;
      while( !bFoundRead ) {
         ++nNonPairedReadsIndex;
         if ( nNonPairedReadsIndex >= aNonPairedReads_.length() ) {
            RWCString soError = "could not find read ";
            soError += soRead;
            soError += " in list aNonPairedReads_.  This might be due to someone editing the perl script filter454Reads.perl--the list of reads must come before the linker sequence. ";
            soError += RWCString( (long) nLine );
            THROW_ERROR( soError );
         }

         int nRawSffReadsIndex = aNonPairedReads_[ nNonPairedReadsIndex ];

         pSeq = aRawSffReads_[ nRawSffReadsIndex ];
         if ( pSeq->soSequenceName_ == soRead ) {
            bFoundRead = true;
         }
         else {
            // this read has no linker sequence in it.
            // Thus if it has no puc, we want it:

            if ( !aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) {

               // add it as is (no additional clipping) to the desired reads

               aDesiredReads_.insert( pSeq );               

            }
         }
      }

      // set up for next time:

      soLastRead = soRead;

      // check if this read is already eliminated due to puc

      if ( aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) continue;

      // see which is larger

      // -------llllll-------
      // a     b      c     d

      int n1A;
      int n1B;
      int nLeftLength = 0;
      if ( bIntersect( pSeq->n1ClipLeft_,
                       pSeq->n1ClipRight_,
                       1,
                       n1ReadStart - 1,
                       n1A,
                       n1B ) ) {
         nLeftLength = n1B - n1A + 1;
      }

      int n1C;
      int n1D;
      int nRightLength = 0;
      if ( bIntersect( pSeq->n1ClipLeft_,
                       pSeq->n1ClipRight_,
                       n1ReadEnd + 1,
                       pSeq->soBases_.length(),
                       n1C,
                       n1D ) ) {
         nRightLength = n1D - n1C + 1;
      }

      if ( nLeftLength > nRightLength ) {
         
         pSeq->n1ClipLeft_ = n1A;
         pSeq->n1ClipRight_ = n1B;
      }
      else {

         pSeq->n1ClipLeft_ = n1C;
         pSeq->n1ClipRight_ = n1D;
      }


      if ( pSeq->n1ClipRight_ - pSeq->n1ClipLeft_ + 1 >= nMinimumReadLength )
         aDesiredReads_.insert( pSeq );
      
   } //    while( fgets( soLine.data(), nMaxLineSize, pLinkerCross ) ) {


   // there may be some nonpaired reads leftover after looking through all
   // the linkers.  Many of these (the ones without puc19) should be made into
   // unpaired reads

   for( ++nNonPairedReadsIndex; 
        nNonPairedReadsIndex < aNonPairedReads_.length(); 
        ++nNonPairedReadsIndex ) {
      
      // this read has no linker sequence in it.
      // Thus if it has no puc, we want it:

      if ( !aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) {
         
         int nRawSffReadsIndex = aNonPairedReads_[ nNonPairedReadsIndex ];

         basesAndQualitiesAndPeaks* pSeq = aRawSffReads_[ nRawSffReadsIndex ];

         // add it as is (no additional clipping) to the desired reads

         aDesiredReads_.insert( pSeq );               
      }
   }


   if ( pCP->bFilter454ReadsDeleteCrossMatchOutput_ ) {

      int nRet;
      nRet = unlink( filCrossMatchLinkerOutput_.data() );
      if ( nRet != 0 ) {
         cerr << "Warning:  could not delete cross_match output file " <<
            filCrossMatchLinkerOutput_ << endl;
         cerr << soGetErrno() << endl;
      }

      nRet = unlink( filCrossMatchPucOutput_.data() );
      if ( nRet != 0 ) {
         cerr << "Warning:  could not delete cross_match output file " << filCrossMatchPucOutput_ << endl;
         cerr << soGetErrno() << endl;
      }

      FileName filLogFile = filFastaToCrossMatch_ + ".log";

      nRet = unlink( filLogFile.data() );
      if ( nRet != 0 ) {
         cerr << "Warning: could not delete cross_match log file " <<
            filLogFile << endl;
         cerr << soGetErrno() << endl;
      }
      
   }

}