/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "phaster2PhdBall.h"
#include    <stdio.h>
#include    <ctype.h>
#include    "mbt_exception.h"
#include    "soDecodePhasterBasesToRead.h"
#include    "soDecodePhasterBasesToGenomic.h"
#include    "mbtValOrderedVectorOfInt.h"
#include    "consedParameters.h"
#include    "soGetDateTime.h"
#include    "szGetTime.h"
#include    "complement_so.h"
#include    "rwtvalvector.h"
#include    "nAmbiguityCodeToNumber2.h"
#include    "rwcregexp.h"
#include    "bIsNumeric.h"
#include    "getAllTokens.h"
#include    "rwctokenizer.h"
#include    "max.h"
#include    "bIntervalsIntersectLong.h"
#include    "soAddCommas.h"
#include    "bIntersectLong.h"


void phaster2PhdBall :: doIt() {

   setAmbiguityCodeTables();


   FILE* pPhasterFOF = fopen( filPhasterFOF_.data(), "r" );
   if ( !pPhasterFOF ) {
      THROW_FILE_ERROR( filPhasterFOF_ );
   }


   pPhdBallFOF_ = fopen( filPhdBallFOF_.data(), "w" );
   if ( !pPhdBallFOF_ ) {
      THROW_FILE_ERROR( filPhdBallFOF_ );
   }


   if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) {

      FileName filNewLocations = filPhasterLocations_ + ".new";

      pNewLocations_ = fopen( filNewLocations.data(), "w" );
      if ( !pNewLocations_ ) {
         THROW_FILE_ERROR( filNewLocations );
      }
   }


   parseLocationsFile();

   openPhdBalls();

   soDateTimeForTimestamp_ = szGetTime();
   soDateTimeForWRItems_ = soGetDateTime( nColonInMiddle );
   

   FileName filPhasterFile( (size_t) 1000 );

   while( fgets( filPhasterFile.data(), nMAXLINESIZEB, pPhasterFOF ) != NULL ) {
      filPhasterFile.nCurrentLength_ = strlen( filPhasterFile.data() );

      filPhasterFile.stripTrailingWhitespaceFast();

      searchOnePhasterFile( filPhasterFile );
   }

   if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) {
      writeNewLocationsFile();
      fclose( pNewLocations_ );
   }

   fclose( pPhasterFOF );

   closePhdBalls();
   fclose( pPhdBallFOF_ );
}


static long* pGenomicLocations;


static int nCompareGenomicLocations( const int* pIndexA,
                                     const int* pIndexB ) {

   if ( pGenomicLocations[*pIndexA] < pGenomicLocations[*pIndexB] )
      return -1;
   else if ( pGenomicLocations[*pIndexA] > pGenomicLocations[*pIndexB] )
      return 1;
   else return 0;
}


void phaster2PhdBall :: searchOnePhasterFile( const FileName& filPhasterFile ) {


   cerr << "searching phaster file " << filPhasterFile << endl;


   int nHitsThisPhasterFile = 0;
   int nSavedReadPairsThisPhasterFile = 0;

   filCurrentPhasterFile_ = filPhasterFile;
   
   FILE* pPhasterFile = fopen( filPhasterFile.data(), "r" );
   if ( !pPhasterFile ) {
      THROW_FILE_ERROR( filPhasterFile );
   }

   #define nSZLINE_SIZE 10000
   char szLine[ nSZLINE_SIZE ];

   int nMaxReadLength;
   bool bFoundReadLength = false;
   while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) {
      
      // looks like:
      // Sequence file: s_5_78_export.txt  337002 entries, max used read length 76, calf_type 1

      if ( szLine[0] == 'S' ) {

         char* szMax = strstr( szLine, "max used read length" );

         if ( szMax != NULL ) {

            int nTokens = 
               sscanf( szMax, "max used read length %d,", &nMaxReadLength );

            if ( nTokens == 1 ) {
               bFoundReadLength = true;
               break;
            }
         }
      }
   }

   if ( !bFoundReadLength ) {
      THROW_ERROR2( "couldn't find \"max used read length\" in " +
                    filPhasterFile );
   }


   // now look for the cref header
   // first look for .cref file
   // then look for # Run date:time


   // looks like this:
   // .cref file: /net/grc/vol3/np_testing/dgordon/phast_combineSnps1000Genomes/hg18_cref_type2_no_ins.cref, type: 2 (bases per byte: 2), Header:
   // # /wt1/gordon/next_phred/100409/make_phast -cref_type:2 -ref_genome:hg18_cref_type2_no_ins snp_chr1.fa snp_chr2.fa snp_chr3.fa snp_chr4.fa snp_chr5.fa snp_chr6.fa snp_chr7.fa snp_chr8.fa snp_chr9.fa snp_chr10.fa snp_chr11.fa snp_chr12.fa snp_chr13.fa snp_chr14.fa snp_chr15.fa snp_chr16.fa snp_chr17.fa snp_chr18.fa snp_chr19.fa snp_chr20.fa snp_chr21.fa snp_chr22.fa snp_chrM.fa snp_chrX.fa snp_chrY.fa snp_chr1_random.fa snp_chr2_random.fa snp_chr3_random.fa snp_chr4_random.fa snp_chr5_random.fa snp_chr6_random.fa snp_chr7_random.fa snp_chr8_random.fa snp_chr9_random.fa snp_chr10_random.fa snp_chr11_random.fa snp_chr13_random.fa snp_chr15_random.fa snp_chr16_random.fa snp_chr17_random.fa snp_chr18_random.fa snp_chr19_random.fa snp_chr21_random.fa snp_chr22_random.fa snp_chrX_random.fa
   // # VERSION 1.100409
   // # Run date:time  100524:093415
   // >chr1  COORDS:1-247249719
   // >chr2  COORDS:247249720-490200868
   // >chr3  COORDS:490200869-689702695
   // >chr4  COORDS:689702696-880975758
   // >chr5  COORDS:880975759-1061833624
   // >chr6  COORDS:1061833625-1232733616


   bool bFoundCrefHeader = false;
   while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) {

      // looks like
      // .cref file: /net/grc/vol3/np_testing/dgordon/phast_combineSnps1000Genomes/hg18_cref_type2_no_ins.cref, type: 2 (bases per byte: 2), Header:

      if ( szLine[0] == '.' ) {
         if ( memcmp( szLine, ".cref file:", 11 ) == 0 ) {
            bFoundCrefHeader = true;
            break;
         }
      }
   }

   if ( !bFoundCrefHeader ) {
      THROW_ERROR2( "couldn't find \".cref file:\" in " + 
                    filPhasterFile );
   }

   // now looking for:
   // 

   bool bFoundRunDateTime = false;
   while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) {

      // looks like:
      // # Run date:time  100524:093415\

      if ( szLine[0] == '#' ) {
         if ( memcmp( szLine, "# Run date:time ", 16 ) == 0 ) {
            bFoundRunDateTime = true;
            break;
         }
      }
   }

   
   if ( !bFoundRunDateTime ) {
      THROW_ERROR2( "couldn't find \"# Run date:time \" in " +
                    filPhasterFile );
   }


   // now load up the conversion table

   long long llStart;
   long long llEnd;

   int nConversionIndex = -1; // ready for ++

   int nReturnStatus;

   RWCString soChromosome( (size_t) 1000 );


   while( 
         ( nReturnStatus = fscanf( pPhasterFile, ">%s  COORDS:%lld-%lld\n",
                                   soChromosome.data(),
                                   &llStart,
                                   &llEnd )
           ) == 3  ) {
      soChromosome.nCurrentLength_ = strlen( soChromosome.data() );

      if ( !bConversionTableSet_ ) {
         aConvertChromosomes_.append( soChromosome );
         aConvertStarts_.append( llStart );
         aConvertEnds_.append( llEnd );

      }
      else {
         ++nConversionIndex;
         if ( nConversionIndex >= aConvertChromosomes_.length() ) {
            THROW_ERROR2( "phaster output file " + filPhasterFile 
                          + " uses a different cref file than the first phaster output file" );
         }

         if ( ! ( 
                 ( aConvertChromosomes_[ nConversionIndex ] == soChromosome ) &&
                 ( aConvertStarts_[ nConversionIndex ] == llStart ) &&
                 ( aConvertEnds_[ nConversionIndex ] == llEnd ) 
                 ) ) {
            THROW_ERROR2( "phaster file " + filPhasterFile
                          + " uses a different cref file than the first phaster output file" );
         }
      }
   }


   if ( !bConversionTableSet_ ) {

      setGenomicLocationsOfDesiredLocations();

      if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) {
         setReadDepthArrays();
      }
   }


   aDesiredLocations_.resort();


   // if reached here, we've set aConvertChromosomes_ etc.
   bConversionTableSet_ = true;

   
   bool bFoundStartAlignments = false;
   while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) {
      if ( szLine[0] == 'S' && szLine[1] == 'T' ) {
         if ( memcmp( szLine, "START_ALIGNMENTS ", 17 ) == 0 ) {
            bFoundStartAlignments = true;
            break;
         }
      }
   }

   assert( bFoundStartAlignments );

   // looks like:
   // START_ALIGNMENTS 259303
   // ^pos 0           ^pos 17

   int nNumberOfAlignments = atoi( szLine + 17 );

   if ( nNumberOfAlignments == 0 ) {
      fclose( pPhasterFile );
      return;
   }


   desiredLocation dummyDL();


   soLine_.increaseButNotDecreaseMaxLength( 10000 );

   while( fgets( soLine_.data(), nMAXLINESIZEB, pPhasterFile ) != NULL ) {
      soLine_.nCurrentLength_ = strlen( soLine_.data() );

      bCurrentPhasterLineIsFinelyParsed_ = false;

      if ( soLine_[0] == 'E' ) {
         if ( soLine_.bStartsWith( "END_ALIGNMENTS" ) ) {
            break;
         }
      }

      parsePhasterLineCrudely();


      bool bSavedReadPair = false;
      considerEachSnpForThisReadPair( bSavedReadPair );
      if ( bSavedReadPair ) 
         ++nHitsThisPhasterFile;
   }


   fclose( pPhasterFile );
   cerr << nHitsThisPhasterFile << " hits this phaster file" << endl;


}


void phaster2PhdBall :: parsePhasterLineCrudely() {

   lGenomicLeftRead1_ = 0;
   lGenomicLeftRead2_ = 0;

   int nTokens = 
      sscanf( soLine_.data(), "%*s %*s %*s %*s %ld %*s %*s %*s %*s %*s %ld ", 
                    &lGenomicLeftRead1_, &lGenomicLeftRead2_ );


   assert( nTokens > 0 );

   bTwoReads_ = ( nTokens == 2 ? true : false );

   const int nMoreThanMaxAlignmentLength = 1000;
   
   lGenomicFarRightRead1_ = lGenomicLeftRead1_ + nMoreThanMaxAlignmentLength;
   lGenomicFarRightRead2_ = lGenomicLeftRead2_ + nMoreThanMaxAlignmentLength;


   lMaxRight_ = 0;
   if ( lGenomicLeftRead1_ != 0 )
      lMaxRight_ = MAX( lMaxRight_, lGenomicFarRightRead1_ );

   if ( lGenomicLeftRead2_ != 0 )
      lMaxRight_ = MAX( lMaxRight_, lGenomicFarRightRead2_ );


}


void phaster2PhdBall :: parsePhasterLineFinely() {


   // since there is a reasonable chance that we will save this read,
   // let's get all the fields we need all at once

   // looks like this:
   // 2_48 (1) read name
   // 264  (2) mapping quality score
   // 76   (3) read starts or ends
   // 1    (4) read starts or ends
   // 1098731449 (5) genomic position of left end of alignment
   // --#!'#--)2--3-333'3--3-3---!!--!-'3-'!''3!''0'''-''!'3-'33-33'3''!3'''33---3 (6)  encoded read and genomic bases
   // 1*&&)%)($('.'%34,#.//%%%54.04.>5)/1:(*A5*%9;$EE6:990776A:<2:5<<9;@8-2@0<9?*1 (7)  encoded read qualities
   // 289 (8) 2nd read mapping quality score
   // 77  (9) 2nd read start or end cycle #
   // 152 (10)  2nd read start ora end cycle #
   // 1098731401 (11) genomic position of left end of alignment
   // !!!'!3-3'!---!!!33-'3'''-!!33!-!-!'!-!'!--'!-!'!--!!)"--'3--3-333'3--2-3---" (12) encoded read and genomic bases
   // -227;2?606A329877=?>;->@?29:;7C;A;57=;-,?A13@@/49A4>/$=(.4:@24>88%3@$(.'61)' (13) encoded qualities


   // just to reassure that these are large enough
   soReadName_.increaseButNotDecreaseMaxLength( 1000 );
   soEncodedBasesRead1_.increaseButNotDecreaseMaxLength( 1000 );
   soEncodedBasesRead2_.increaseButNotDecreaseMaxLength( 1000 );
   soEncodedQualitiesRead1_.increaseButNotDecreaseMaxLength( 1000 );
   soEncodedQualitiesRead2_.increaseButNotDecreaseMaxLength( 1000 );


   int nTokens = 
      sscanf( soLine_.data(), 
              "%s %*s %d %d %lld %s %s %*s %d %d %lld %s %s\n",
              soReadName_.data(),
              &nRead1Left_,
              &nRead1Right_,
              &lGenomicLeftRead1_,
              soEncodedBasesRead1_.data(),
              soEncodedQualitiesRead1_.data(),
              &nRead2Left_,
              &nRead2Right_,
              &lGenomicLeftRead2_,
              soEncodedBasesRead2_.data(),
              soEncodedQualitiesRead2_.data() );

   if ( nTokens == 6 )
      bTwoReads_ = false;
   else if ( nTokens == 11 )
      bTwoReads_ = true;
   else {
      THROW_ERROR2( "there should have been 6 or 11 tokens but there were " +
                    RWCString( (long) nTokens ) );


   }


   soReadName_.nCurrentLength_ = strlen( soReadName_.data() );

   soEncodedBasesRead1_.nCurrentLength_ = strlen( soEncodedBasesRead1_.data() );
   soEncodedBasesRead2_.nCurrentLength_ = strlen( soEncodedBasesRead2_.data() );

   soEncodedQualitiesRead1_.nCurrentLength_ = strlen( soEncodedQualitiesRead1_.data() );
   soEncodedQualitiesRead2_.nCurrentLength_ = strlen( soEncodedQualitiesRead2_.data() );


   const int nMoreThanMaxAlignmentLength = 1000;
   
   lGenomicFarRightRead1_ = lGenomicLeftRead1_ + nMoreThanMaxAlignmentLength;
   lGenomicFarRightRead2_ = lGenomicLeftRead2_ + nMoreThanMaxAlignmentLength;


   soDecodedBasesRead1_ = soDecodePhasterBasesToRead( soEncodedBasesRead1_ );
   soDecodedBasesRead2_ = soDecodePhasterBasesToRead( soEncodedBasesRead2_ );

   soDecodedGenomicBasesRead1_ = 
      soDecodePhasterBasesToGenomic( soEncodedBasesRead1_ );
   soDecodedGenomicBasesRead2_ = 
      soDecodePhasterBasesToGenomic( soEncodedBasesRead2_ );


   int n;
   int nNonGapsRead1 = 0;
   int nNonGapsRead2 = 0;

   for( n = 0; n < soDecodedGenomicBasesRead1_.length(); ++n ) {
      if ( soDecodedGenomicBasesRead1_[n] != '-' ) ++nNonGapsRead1;
   }
   for( n = 0; n < soDecodedGenomicBasesRead2_.length(); ++n ) {
      if ( soDecodedGenomicBasesRead2_[n] != '-' ) ++nNonGapsRead2;
   }

   lGenomicRightRead1_ = lGenomicLeftRead1_ + nNonGapsRead1 - 1;
   lGenomicRightRead2_ = lGenomicLeftRead2_ + nNonGapsRead2 - 1;


//    int nIndex;
//    if ( ( nIndex = soDecodedGenomicBasesRead1_.index( "-" ) ) != RW_NPOS ) {

//       if ( ( soDecodedGenomicBasesRead1_.length() > ( nIndex + 1 )  ) &&
//            ( soDecodedGenomicBasesRead1_[ nIndex + 1 ] != '-' ) ) {
      
//          RWCString soChromosome;
//          int n1ChrPos;
//          convertFromGenomic( lGenomicLeftRead1_, soChromosome, n1ChrPos );

//          printf( "convertFromGenomic %s %d %ld\n", soChromosome.data(), n1ChrPos, lGenomicLeftRead1_ );

//          printf( "read 1 of %s has a gap in the genomic at %d after gap: %ld\n", 
//               soReadName_.data(),
//               nIndex,
//                  lGenomicLeftRead1_ + nIndex );

//          int nGaps = soDecodedGenomicBasesRead1_.nGetNumberOfCopiesOfChar( '-' );

//          printf( "left end = %s unpadded right end: %s\n",
//                  soAddCommas( lGenomicLeftRead1_ ).data(),
//                  soAddCommas( lGenomicLeftRead1_ + 
//                               soDecodedGenomicBasesRead1_.length() - 1 -
//                               nGaps ).data()  );

//          int n1ChrPosLeft;
//          int n1ChrPosRight;
//          RWCString soChromosomeTemp;
//          convertFromGenomic( lGenomicLeftRead1_, soChromosomeTemp, n1ChrPosLeft );

//          assert( soChromosomeTemp == soChromosome );
//          convertFromGenomic(  lGenomicLeftRead1_ + 
//                               soDecodedGenomicBasesRead1_.length() - 1 -
//                               nGaps,
//                               soChromosomeTemp,
//                               n1ChrPosRight );
//          assert( soChromosomeTemp == soChromosome );

//          printf( "in 1chr: left end = %s right = %s\n",
//                  soAddCommas( n1ChrPosLeft ).data(),
//                  soAddCommas( n1ChrPosRight ).data() );
         
                 
//          printf( "%s\n", soDecodedGenomicBasesRead1_.data() );
//          printf( "%s\n", soDecodedBasesRead1_.data() );
//       }
//    }

//    if ( ( nIndex = soDecodedGenomicBasesRead2_.index( "-" ) ) != RW_NPOS ) {

//       if ( ( soDecodedGenomicBasesRead2_.length() > ( nIndex + 1 ) ) &&
//            ( soDecodedGenomicBasesRead2_[ nIndex + 1 ] != '-' ) ) {
      
//          RWCString soChromosome;
//          int n1ChrPos;
//          convertFromGenomic( lGenomicLeftRead2_, soChromosome, n1ChrPos );

//          printf( "%s %d :\n", soChromosome.data(), n1ChrPos );
//          printf( "read 2 of %s has a gap in the genomic at %d after gap: %ld\n", 
//               soReadName_.data(),
//               nIndex,
//                  lGenomicLeftRead2_ + nIndex );
//          printf( "%s\n", soDecodedGenomicBasesRead2_.data() );
//          printf( "%s\n", soDecodedBasesRead2_.data() );
//       }
//    }


   bCurrentPhasterLineIsFinelyParsed_ = true;
}


bool phaster2PhdBall :: bAlleleOK( desiredLocation* pDL, 
                                   const char cWhichRead,
                                   const char cWhichSite ) {

   
   if ( !bCurrentPhasterLineIsFinelyParsed_ ) {
      parsePhasterLineFinely();
   }


   RWCString* pDecodedReadBases;
   RWCString* pDecodedGenomicBases;
   if ( cWhichRead == cREAD1 ) {
      pDecodedReadBases = &soDecodedBasesRead1_;
      pDecodedGenomicBases = &soDecodedGenomicBasesRead1_;
   }
   else {
      pDecodedReadBases = &soDecodedBasesRead2_;
      pDecodedGenomicBases = &soDecodedGenomicBasesRead2_;
   }


   int nAllowedAlleles;

   if ( cWhichSite == cSITEA ) {
      nAllowedAlleles = nAmbiguityCodeToNumber2[ pDL->cAllowedAllelesA_ ];
   }
   else {
      nAllowedAlleles = nAmbiguityCodeToNumber2[ pDL->cAllowedAllelesB_ ];
   }

   // now need to find the read base at the snp position

   long lGenomicOfSnp = ( cWhichSite == cSITEA ?
                          pDL->lGenomicLocationA_ : pDL->lGenomicLocationB_ );

   long lGenomicOfRead = ( cWhichRead == cREAD1 ?
                           lGenomicLeftRead1_ :
                           lGenomicLeftRead2_ );


   // -----------------
   // ^         ^ snp is here
   // read starts here

   // but alignment has pads in the genomic:
   // -----****------------
   // ^             ^ snp is here
   // read starts here

   // so the snp position is moved over

   int nNumberOfUnpaddedGenomicBasesToSnp = lGenomicOfSnp - lGenomicOfRead + 1;

   char cReadBaseAtSnpPosition = 0;

   int nNumberOfUnpaddedGenomicBasesSoFar = 0;

   int n0UnpaddedBasePosition = -1; // 0 will be first unpadded base
   for( int n0PaddedBasePos = 0; 
        n0PaddedBasePos < pDecodedGenomicBases->length();
        ++n0PaddedBasePos ) {

      if ( (*pDecodedGenomicBases)[ n0PaddedBasePos ] != '-' ) {
         ++nNumberOfUnpaddedGenomicBasesSoFar;

         if ( nNumberOfUnpaddedGenomicBasesSoFar == 
              nNumberOfUnpaddedGenomicBasesToSnp ) {

            cReadBaseAtSnpPosition = (*pDecodedReadBases)[ n0PaddedBasePos ];
            break;
         }
      }
   }


   // case in which the alignment does not extend as far as the snp
   if ( !cReadBaseAtSnpPosition ) return false;


   int nReadBaseAtSnpPosition = nAmbiguityCodeToNumber2[ cReadBaseAtSnpPosition ];

   if ( ( nReadBaseAtSnpPosition & nAllowedAlleles ) == 0 ) {
      return false;
   }
   else {

      return true;
   }

}

                                   
static desiredLocation dummyDL;


void phaster2PhdBall :: considerEachSnpForThisReadPair( bool& bSavedReadPair ) {
   bSavedReadPair = false;

   long lLeftMostGenomic;
   bool bOnly1AlignedRead = false;


   if ( bTwoReads_ ) {
      if ( ( lGenomicLeftRead1_ == 0 ) && ( lGenomicLeftRead2_ == 0 ) ) 
         return;
      else if ( lGenomicLeftRead1_ == 0 ) {
         lLeftMostGenomic = lGenomicLeftRead2_;
         bOnly1AlignedRead = true;
      }
      else if ( lGenomicLeftRead2_ == 0 ) {
         lLeftMostGenomic = lGenomicLeftRead1_;
         bOnly1AlignedRead = true;
      }
      else {
         lLeftMostGenomic = MIN( lGenomicLeftRead1_, lGenomicLeftRead2_ );
      }
   }
   else {
      if ( lGenomicLeftRead1_ == 0 )
         return;

      lLeftMostGenomic = lGenomicLeftRead1_;
      bOnly1AlignedRead = true;
   }


   // used to do this:

   // <-----bbbbbbbbbbb----->   
   //       ^ lGenomicLeft1
   //  bbbb is the read bases
   //  ---- is part of the range
   // look for snps anywhere in this window
   // ^ or after
   //                        ^ but not here or after


   // due to a bug (100916), I'm just going to check all of the snps

   int nIndexOfFoundSnp = -666;
   bool bWantThisReadPair = false;
   for( int nIndexOfSnp = 0; ( nIndexOfSnp < aDesiredLocations_.length() ) && 
      !bWantThisReadPair; ++nIndexOfSnp ) {

      desiredLocation* pDL = aDesiredLocations_[ nIndexOfSnp ];


      if ( pDL->bMateUnmapped_ &&
           ( lGenomicLeftRead1_ != 0 ) && ( lGenomicLeftRead2_ != 0 ) ) {
         // this location line won't work since it requires
         // an unmapped mate, but both reads are mapped

         continue;
      }


      if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchOneSite ) {

         if (( lGenomicLeftRead1_ != 0 ) &&
             ( lGenomicLeftRead1_ <= pDL->lGenomicLocationA_ ) &&
             ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead1_ ) ) {

            // need to check allele

            if ( bAlleleOK( pDL, cREAD1, cSITEA ) ) {
               nIndexOfFoundSnp = nIndexOfSnp;
               bWantThisReadPair = true;
               break;
            }
         }


         // even if we've tried the 1st read, try the second read
         if ( 
             bTwoReads_ &&
             ( lGenomicLeftRead2_ != 0 ) &&
             ( lGenomicLeftRead2_ <= pDL->lGenomicLocationA_ ) &&
             ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead2_ ) ) {

            // need to check allele

            if ( bAlleleOK( pDL, cREAD2, cSITEA ) ) {
               nIndexOfFoundSnp = nIndexOfSnp;
               bWantThisReadPair = true;


               break;
            } 
         }  //  if ( bTwoReads_ ...
      } //       if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchOneSite ) {
      else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchBothSites ) {


         // there are a variety of ways this can work:
         // siteA in read1, siteA in read2
         // siteB in read1, siteB in read2
         // all 4 combinations of these:
         // siteA in read1, siteB in read1
         // siteA in read1, siteB in read2
         // siteA in read2, siteB in read1
         // siteA in read2, siteB in read2

         bool bSiteAMatches = false;

         if ( lGenomicLeftRead1_ != 0 ) {
            if ( ( lGenomicLeftRead1_ <= pDL->lGenomicLocationA_ ) &&
                 ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead1_ ) ) {
               if ( bAlleleOK( pDL, cREAD1, cSITEA ) ) {
                  bSiteAMatches = true;
               }
            }
         }

         if ( !bSiteAMatches ) {
            // let's try the other read

            if ( bTwoReads_ && ( lGenomicLeftRead2_ != 0  ) ) {
                  
               if ( ( lGenomicLeftRead2_ <= pDL->lGenomicLocationA_ ) &&
                    ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead2_ ) ) {
                  if ( bAlleleOK( pDL, cREAD2, cSITEA ) ) {
                     bSiteAMatches = true;
                  }
               }
            }
         }

         
         // if neither read matches site A of this snp, no point
         // in looking at site B of this snp since this snp requires
         // both sites to match.  So go on to a different snp.

         if ( bSiteAMatches ) {

            bool bSiteBMatches = false;

            if ( lGenomicLeftRead1_ != 0 ) {
               if ( ( lGenomicLeftRead1_ <= pDL->lGenomicLocationB_ ) &&
                    ( pDL->lGenomicLocationB_ <= lGenomicFarRightRead1_ ) ) {
                  if ( bAlleleOK( pDL, cREAD1, cSITEB ) ) {
                     bSiteBMatches = true;
                  }
               }
            }

            if ( !bSiteBMatches ) {
               // let's try the other read

               if ( 
                   bTwoReads_ &&
                   ( lGenomicLeftRead2_ != 0 ) &&
                   ( lGenomicLeftRead2_ <= pDL->lGenomicLocationB_ ) &&
                   ( pDL->lGenomicLocationB_ <= lGenomicFarRightRead2_ ) ) {

                  if ( bAlleleOK( pDL, cREAD2, cSITEB ) ) {
                     bSiteBMatches = true;
                  }
               }
            }

            if ( bSiteAMatches && bSiteBMatches ) {
               nIndexOfFoundSnp = nIndexOfSnp;
               bWantThisReadPair = true;
            }

         }
      } //       else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchBothSites ) {

      else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchWindow ) {


         if ( lGenomicLeftRead1_ != 0 ) {
            if ( bIntervalsIntersectLong( lGenomicLeftRead1_,
                                      lGenomicFarRightRead1_,
                                      pDL->lGenomicLocationA_,
                                      pDL->lGenomicLocationB_ ) ) {

               if ( !bCurrentPhasterLineIsFinelyParsed_ ) {
                  parsePhasterLineFinely();
               }


               if ( bIntervalsIntersectLong( lGenomicLeftRead1_,
                                         lGenomicRightRead1_,
                                         pDL->lGenomicLocationA_,
                                         pDL->lGenomicLocationB_ ) ) {

                  nIndexOfFoundSnp = nIndexOfSnp;
                  bWantThisReadPair = true;
               }
            }
         }

         if ( !bWantThisReadPair ) {
            // try the other read

            if ( 
                bTwoReads_ &&
                ( lGenomicLeftRead2_ != 0 ) &&
                ( bIntervalsIntersectLong( lGenomicLeftRead2_,
                                       lGenomicFarRightRead2_,
                                       pDL->lGenomicLocationA_,
                                       pDL->lGenomicLocationB_ ) ) ) {

               if ( !bCurrentPhasterLineIsFinelyParsed_ ) {
                  parsePhasterLineFinely();
               }


               if ( bIntervalsIntersectLong( lGenomicLeftRead2_,
                                         lGenomicRightRead2_,
                                         pDL->lGenomicLocationA_,
                                         pDL->lGenomicLocationB_ ) ) {


                  nIndexOfFoundSnp = nIndexOfSnp;
                  bWantThisReadPair = true;
               }
            }
         }
      } //  else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchWindow ) {

   } //    while( nIndexOfSnp < aGenomicLocations_.length() ) {


   if ( bWantThisReadPair ) {
      saveReadPair( nIndexOfFoundSnp );
      bSavedReadPair = true;
      ++nHits_;
   }


} // void phaster2PhdBall :: considerEachSnpForThisReadPair() {


void phaster2PhdBall :: saveReadPair( const int nIndexOfLocation ) {


   if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) {
      desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ];
      
      long lIntersectLeft;
      long lIntersectRight;

      long lReadLeft;
      long lReadRight;

      if ( lGenomicLeftRead1_ != 0 ) {
         lReadLeft = lGenomicLeftRead1_;
         lReadRight = lGenomicRightRead1_;
      }
      else {
         lReadLeft = lGenomicLeftRead2_;
         lReadRight = lGenomicRightRead2_;
      }
      
      assert( bIntersectLong( pDL->lGenomicLocationA_,
                              pDL->lGenomicLocationB_,
                              lReadLeft,
                              lReadRight,
                              lIntersectLeft,
                              lIntersectRight ) );

      ++(*(pDL->paReadDepth_))[lIntersectLeft - pDL->lGenomicLocationA_ ];
      --(*(pDL->paReadDepth_))[lIntersectRight + 1 - pDL->lGenomicLocationA_ ];

   }


   if ( pCP->bPhaster2PhdBallSaveInPhasterFormat_ ) {
      desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ];

      fprintf( pDL->pPhdBall_, "%s", soLine_.data() );
      return;
   }


   RWTValOrderedVector<int> aQualitiesRead1( soDecodedBasesRead1_.length(),
                                             "aQualitiesRead1" );

   RWTValOrderedVector<int> aQualitiesRead2( soDecodedBasesRead2_.length(),
                                             "aQualitiesRead2" );

   convertQualities( soEncodedQualitiesRead1_, aQualitiesRead1 );

   if (bTwoReads_ )
   convertQualities( soEncodedQualitiesRead2_, aQualitiesRead2 );

   assert( soDecodedBasesRead1_.length() == aQualitiesRead1.length() );

   if (bTwoReads_ )
   assert( soDecodedBasesRead2_.length() == aQualitiesRead2.length() );

   // eliminate any gap characters

   RWCString soBasesWithoutGapsRead1( soDecodedBasesRead1_ );
   RWCString soBasesWithoutGapsRead2( soDecodedBasesRead2_ );

   int nBase;
   for( nBase = soBasesWithoutGapsRead1.length() - 1; nBase >= 0; --nBase ) {
      if ( soBasesWithoutGapsRead1[ nBase ] == '-' ) {
         soBasesWithoutGapsRead1.remove( nBase, 1 );
         aQualitiesRead1.removeAt( nBase );
      }
   }

   if ( bTwoReads_ ) {
      for( nBase = soBasesWithoutGapsRead2.length() - 1; nBase >= 0; --nBase ) {
         if ( soBasesWithoutGapsRead2[ nBase ] == '-' ) {
            soBasesWithoutGapsRead2.remove( nBase, 1 );
            aQualitiesRead2.removeAt( nBase );
         }
      }
   }

   // reverse complement bottom strand reads

   if ( nRead1Left_ > nRead1Right_ ) {
      soBasesWithoutGapsRead1 = soComplementSO( soBasesWithoutGapsRead1 );
      aQualitiesRead1.reverseElementOrder();
   }

   if ( bTwoReads_ && (nRead2Left_ > nRead2Right_ ) ) {
      soBasesWithoutGapsRead2 = soComplementSO( soBasesWithoutGapsRead2 );
      aQualitiesRead2.reverseElementOrder();
   }
   

   // fix read names so they have the lane and tile as well
   // phaster phout file looks like this:
   // /codon/gordon/for_phil/100205_GRC5_0002_614VMAAXX/run_100524/lane4/4_62.uncalibrated.clusters.phout

   RWCString soPhasterPhoutWithoutDirectory = 
      filCurrentPhasterFile_.soGetBasename();

   int nContainsDot = soPhasterPhoutWithoutDirectory.first( '.' );

   RWCString soLaneAndTile;
   if ( nContainsDot == RW_NPOS ) {
      soLaneAndTile = soPhasterPhoutWithoutDirectory;
   }
   else {
      soLaneAndTile = soPhasterPhoutWithoutDirectory( 0, nContainsDot );
   }

   RWCString soReadName1 = soLaneAndTile + "_" + soReadName_;
   RWCString soReadName2 = soLaneAndTile + "_" + soReadName_ + ".2";


   if ( ( lGenomicLeftRead1_ == 0 ) || 
        ( pCP->soPhaster2PhdBallSaveWhichMate_ == "both" ) ) {
      writeReadToPhdBall( nIndexOfLocation, soReadName1, 
                          soBasesWithoutGapsRead1, 
                          aQualitiesRead1, cREAD1 );
   }


   if ( bTwoReads_ ) {
      
      if ( ( lGenomicLeftRead2_ == 0 ) ||
           ( pCP->soPhaster2PhdBallSaveWhichMate_ == "both" ) ) {

         writeReadToPhdBall( nIndexOfLocation, 
                          soReadName2, soBasesWithoutGapsRead2, 
                          aQualitiesRead2, cREAD2 );
      }
   }

}


void phaster2PhdBall :: writeReadToPhdBall( 
                             const int nIndexOfLocation,
                             const RWCString& soReadName,
                             const RWCString& soBases,
                             const RWTValOrderedVector<int>& aQualities,
                             const char cFirstOrSecondRead ) {


   desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ];


   fprintf( pDL->pPhdBall_, "BEGIN_SEQUENCE %s 1\nBEGIN_COMMENT\n",
            soReadName.data() );

   // no CHROMAT_FILE, ABI_THUMBPRINT, PHRED_VERSION, CALL_METHOD, or
   // QUALITY_VALUES for solexa reads

   fprintf( pDL->pPhdBall_, "TIME: %s\n", soDateTimeForTimestamp_.data() );
   fprintf( pDL->pPhdBall_, "CHEM: solexa\n" );
   fprintf( pDL->pPhdBall_, "END_COMMENT\n" );


   fprintf( pDL->pPhdBall_, "BEGIN_DNA\n" );

   assert( soBases.length() == aQualities.length() );

   for( int nBase = 0; nBase < soBases.length(); ++nBase ) {
      fprintf( pDL->pPhdBall_, "%c %d\n",
               soBases[ nBase ],
               aQualities[ nBase ] );

   }

   fprintf( pDL->pPhdBall_, "END_DNA\n" );
   fprintf( pDL->pPhdBall_, "END_SEQUENCE\n\n" );


   fprintf( pDL->pPhdBall_, "WR{\n" );
   fprintf( pDL->pPhdBall_, "template phaster2PhdBall %s\n",
            soDateTimeForWRItems_.data() );

   RWCString soTemplateName( soReadName );
   soTemplateName.bEndsWithAndRemove( ".2" );

   fprintf( pDL->pPhdBall_, "name: %s\n", soTemplateName.data() );
   fprintf( pDL->pPhdBall_, "}\n\n" );

   fprintf( pDL->pPhdBall_, "WR{\n" );
   fprintf( pDL->pPhdBall_, "primer phaster2PhdBall %s\n",
            soDateTimeForWRItems_.data() );
   fprintf( pDL->pPhdBall_, "type: univ %s\n",
            ( cFirstOrSecondRead == cREAD1 ? "fwd" : "rev" ) );

   fprintf( pDL->pPhdBall_, "}\n\n" );


   if ( 
       ( ( cFirstOrSecondRead == cREAD1 ) && 
         ( lGenomicLeftRead1_ == 0 ) ) 
        ||
       ( ( cFirstOrSecondRead == cREAD2 ) &&
         ( lGenomicLeftRead2_  == 0 ) )
       ) {
      fprintf( pDL->pPhdBall_, "WR{\n" );
      fprintf( pDL->pPhdBall_, "unmapped phaster2PhdBall %s\n",
               soDateTimeForWRItems_.data() );
      fprintf( pDL->pPhdBall_, "}\n\n" );
   }

   // just so I can see something at the beginning
   if ( nHits_ < 200 ) {
      fflush( pDL->pPhdBall_ );
   }

}


void phaster2PhdBall :: convertQualities( 
                            const RWCString& soEncodedQualities,
                            RWTValOrderedVector<int>& aQualities ) {

   for( int n = 0; n < soEncodedQualities.length(); ++n ) {
      int nQuality = (int) soEncodedQualities[n] - 33;

      aQualities.append( nQuality );
   }
}

      
long phaster2PhdBall :: lConvertToGenomic( const RWCString& soChromosome,
                                           const long l1ChrPos ) {


   if ( soChromosome == "genomic" )
      return l1ChrPos;

   int nChromosomeIndex;
   if ( soLastChromosome_ == soChromosome ) {
      nChromosomeIndex = nLastChromosomeIndex_;
   }
   else {
      nChromosomeIndex = aConvertChromosomes_.index( soChromosome );
      if ( nChromosomeIndex == RW_NPOS ) {
         cerr << "dump of chromosomes: " << endl;
         for( int n = 0; n < aConvertChromosomes_.length(); ++n ) {
            cerr << aConvertChromosomes_[n] << endl;
         }

         THROW_ERROR2( "cannot find chromosome " + soChromosome + " in cref list" );
      }

      soLastChromosome_ = soChromosome;
      nLastChromosomeIndex_ = nChromosomeIndex;
   }

   // if reached here, have set nChromosomeIndex

   long l1GenomicPos = aConvertStarts_[ nChromosomeIndex ] + l1ChrPos - 1;

   if ( l1GenomicPos > aConvertEnds_[ nChromosomeIndex ] ) {
      THROW_ERROR2( soChromosome + " cannot go to pos " + 
                    RWCString( (long) l1ChrPos ) + " largest is " +
                    RWCString( (long) ( aConvertEnds_[ nChromosomeIndex ]
                                        - 
                                        aConvertStarts_[ nChromosomeIndex ]
                                        + 1 ) ) );
   }

   return l1GenomicPos;
}


// for testing
void phaster2PhdBall :: convertFromGenomic( const long lGenomic,
                                            RWCString& soChromosome,
                                            int& n1ChrPos ) {

   for( int nChr = 0; nChr < aConvertStarts_.length(); ++nChr ) {
      if ( ( aConvertStarts_[ nChr ] <= lGenomic ) &&
           ( lGenomic < aConvertEnds_[ nChr ] ) ) {

         // found the right chromosome

         soChromosome = aConvertChromosomes_[ nChr ];
         
         n1ChrPos = lGenomic - aConvertStarts_[ nChr ] + 1;
         return;
      }
   }

   soChromosome = "";
   return;
}
      

// - is not a special character so doesn't need \\ in front of it
RWCRegexp regWindow( "^[0-9,]+-[0-9,]+$" );

      
void phaster2PhdBall :: parseLocationsFile() {

   FILE* pPhasterLocations = fopen( filPhasterLocations_, "r" );
   if ( !pPhasterLocations ) {
      THROW_FILE_ERROR( filPhasterLocations_ );
   }


   bool bFirstTime = true;
   while( fgets( soLine_.data(), nMAXLINESIZEB, pPhasterLocations ) != NULL ) {

      soLine_.nCurrentLength_ = strlen( soLine_.data() );
      soLine_.stripTrailingWhitespaceFast();
      if ( soLine_.bIsNull() ) continue;
      if ( soLine_[0] == '#' ) continue;

      // looks like:
      // chrX 1234

      // or

      // chrX 1234 M

      // or

      // chrX 1234 1334

      // or

      // chrX 1234 M 1334 A

      // chrX 1234-2589

      // "chrX" in each case above can be replace by "genomic"
      // "unmapped" can follow any of these in which case the pair
      // will only be accepted if one of the mates is unmapped

      mbtValOrderedVectorOfRWCString aWords;
      getAllTokens( soLine_, aWords );


      // join tokens into a file name

      FileName filPhdBall;
      for( int nToken = 0; nToken < aWords.length(); ++nToken ) {
         filPhdBall += aWords[nToken];
         if ( nToken != ( aWords.length() - 1 ) )
            filPhdBall += "_";
      }
      filPhdBall += ".phdball";


      if ( aWords.length() < 2 ) {
         RWCString soError = "not enough tokens in location line: " +
            soLine_;
         THROW_ERROR2( soError );
      }

      bool bMateUnmapped = false;
      int nIndexOfUnmapped = aWords.index( "unmapped" );
      if ( nIndexOfUnmapped != RW_NPOS ) {
         bMateUnmapped = true;
         aWords.removeAt( nIndexOfUnmapped );
      }

      desiredLocation* pDL = NULL;

      if ( aWords.length() == 2 ) {
         if ( ( aWords[1] ).bContains( regWindow ) ) {

            // case of
            // charX 1234-5678

            RWCTokenizer tok( aWords[1] );
            RWCString soStartWindow = tok('-' );
            RWCString soEndWindow = tok('-' );


            soStartWindow.removeAllCommas();
            soEndWindow.removeAllCommas();

            long l1ChrPosLeft;
            long l1ChrPosRight;
         
            l1ChrPosLeft = atol( soStartWindow.data() );
            l1ChrPosRight = atol( soEndWindow.data() );

            pDL = 
               new desiredLocation( bMateUnmapped,
                                    aWords[0],
                                    l1ChrPosLeft,
                                    desiredLocation::cMatchBothSites,
                                    filPhdBall );


            pDL->l1ChrPosB_ = l1ChrPosRight;
            //         pDL->cAllowedAlleles_ = 'n';
            pDL->cTypeOfMatch_ = desiredLocation::cMatchWindow;

         }
         else {
            
            // case of 
            // chrX 1234

            aWords[1].removeAllCommas();

            if ( !bIsNumeric( aWords[1] ) ) {
               RWCString soError = "2nd token is not numeric in " + 
                  soLine_;
               THROW_ERROR( soError );
            }

            long l1ChrPos = atol( aWords[1].data() );

      
            pDL = 
               new desiredLocation( bMateUnmapped,
                                    aWords[0],
                                    l1ChrPos,
                                    desiredLocation::cMatchOneSite,
                                    filPhdBall );

         }
      }
      else if ( aWords.length() == 3 ) {
         if ( ( aWords[2].length() == 1  ) &&
              ( isalpha( (aWords[2] )[0] ) || 
                ( aWords[2] == "-" ) ) ) {

            // case of
            // chrX 1234 M

            if ( aWords[2].length() != 1 ) {
               RWCString soError = "3rd token (ambiguity code) is not of length 1: " +
                  soLine_;
               THROW_ERROR2( soError );
            }

            aWords[1].removeAllCommas();

            if ( !bIsNumeric( aWords[1] ) ) {
               RWCString soError = "2nd token is not numeric in " + 
                  soLine_;
               THROW_ERROR( soError );
            }

            long l1ChrPos = atol( aWords[1].data() );


            pDL = 
               new desiredLocation( bMateUnmapped,
                                    aWords[0],
                                    l1ChrPos,
                                    desiredLocation::cMatchOneSite,
                                    filPhdBall
                                    );

            pDL->cAllowedAllelesA_ = (aWords[2])[0];

         }
         else {
            // case of 
            // chrX 1234 1334

            aWords[1].removeAllCommas();
            aWords[2].removeAllCommas();

            if ( !bIsNumeric( aWords[1] ) ) {
               RWCString soError = "unrecognized format A: " + soLine_;
               THROW_ERROR2( soError );
            }

            if ( !bIsNumeric( aWords[2] ) ) {
               RWCString soError = "unrecognized format B: " + soLine_;
               THROW_ERROR2( soError );
            }

            long l1ChrPosA = atol( aWords[1].data() );
            long l1ChrPosB = atol( aWords[2].data() );

            if ( l1ChrPosA > l1ChrPosB ) {
               long lTemp = l1ChrPosA;
               l1ChrPosA = l1ChrPosB;
               l1ChrPosB = lTemp;
            }

            pDL =
               new desiredLocation( bMateUnmapped,
                                    aWords[0],
                                    l1ChrPosA,
                                    desiredLocation::cMatchBothSites,
                                    filPhdBall );

            pDL->l1ChrPosB_ = l1ChrPosB;

         }
      }
      else if ( aWords.length() == 5 ) {

         // chrX 1234 M 1334 A

         aWords[1].removeAllCommas();
         aWords[3].removeAllCommas();


         if ( ! (
                 bIsNumeric( aWords[1] ) &&
                 bIsNumeric( aWords[3] ) &&
                 ( aWords[2].length() == 1 ) &&
                 ( aWords[2].length() == 1 ) ) ) {

            RWCString soError = "5 tokens but unrecognized format a: " + soLine_;

            THROW_ERROR2( soError );

         }
         
         if ( !isalpha( (aWords[2])[0] ) ||
              !isalpha( (aWords[4])[0] ) ) {
            RWCString soError = "5 tokens but unrecognized format b: " + 
               soLine_;
            THROW_ERROR2( soError );
         }

         int l1ChrPosA = atol( aWords[1].data() );
         int l1ChrPosB = atol( aWords[3].data() );

         if ( l1ChrPosA > l1ChrPosB ) {
            long lTemp = l1ChrPosA;
            l1ChrPosA = l1ChrPosB;
            l1ChrPosB = lTemp;
         }

         pDL =
            new desiredLocation( bMateUnmapped,
                                 aWords[0],
                                 l1ChrPosA,
                                 desiredLocation::cMatchBothSites,
                                 filPhdBall 
                                 );

         pDL->l1ChrPosB_ = l1ChrPosB;

         pDL->cAllowedAllelesA_ = (aWords[2])[0];
         pDL->cAllowedAllelesB_ = (aWords[4])[0];
      }
      else {
         RWCString soError = "unrecognized format--wrong # of tokens: " +
            soLine_;
         THROW_ERROR2( soError );
      }


      aDesiredLocations_.append( pDL );
   }


   fclose( pPhasterLocations );


} //void phaster2PhdBall :: parseLocationsFile() {


void phaster2PhdBall :: setGenomicLocationsOfDesiredLocations() {

   // set nGenomicLocationA_ and nGenomicLocationB_

   for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) {
      desiredLocation* pDL = aDesiredLocations_[nLoc];

      pDL->lGenomicLocationA_ = lConvertToGenomic( pDL->soChromosome_,
                                                   pDL->l1ChrPosA_ );

      pDL->lGenomicLocationB_ = lConvertToGenomic( pDL->soChromosome_,
                                                   pDL->l1ChrPosB_ );

      cerr << "desired location: " << pDL->soChromosome_ << " " << 
         pDL->l1ChrPosA_ << " " << pDL->lGenomicLocationA_ << endl;

   }
}


void phaster2PhdBall :: openPhdBalls() {
   
   for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) {
      desiredLocation* pDL = aDesiredLocations_[ nLoc ];

      pDL->pPhdBall_ = fopen( pDL->filPhdBall_.data(), "w" );
      if ( !pDL->pPhdBall_ ) {
         THROW_FILE_ERROR( pDL->filPhdBall_ );
      }

      fprintf( pDL->pPhdBall_, "# %s\n", pDL->filPhdBall_.data() );

      fprintf( pPhdBallFOF_, "%s\n", pDL->filPhdBall_.data() );
   }
}


void phaster2PhdBall :: closePhdBalls() {
   
   for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) {
      desiredLocation* pDL = aDesiredLocations_[ nLoc ];

      fclose( pDL->pPhdBall_ );
   }

}

void phaster2PhdBall :: setReadDepthArrays() {

   for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) {
      desiredLocation* pDL = aDesiredLocations_[ nLoc ];

      pDL->paReadDepth_ = new RWTValVector<int>( pDL->lGenomicLocationB_ -
                                        pDL->lGenomicLocationA_ + 2,    
                                        0,
                                        "paReadDepth" );
   }
}


void phaster2PhdBall :: writeNewLocationsFile() {

   const char cNotInRegion = 'n';
   const char cInARegion = 'i';


   for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) {
      desiredLocation* pDL = aDesiredLocations_[ nLoc ];

      int n;
      for( n = 1; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) {
         (*(pDL->paReadDepth_))[n] +=
            (*(pDL->paReadDepth_))[n-1];
      }

      // this check makes sure that we always finish the last
      // region

      if ( (*(pDL->paReadDepth_))[ pDL->paReadDepth_->nGetEndIndex() ] !=
           0 ) {
         cerr << "read depth at end should be zero but was " <<
            (*(pDL->paReadDepth_))[ pDL->paReadDepth_->nGetEndIndex() ]
              << endl;
         cerr << "nLoc = " << nLoc << " end index = " <<  pDL->paReadDepth_->nGetEndIndex()
              << endl;
         assert( false );

      }

      long lRegionLeft;
      long lRegionRight;
      char cState = cNotInRegion;
      int nMaxDepthOfCoverage = 0;
      for( n = 0; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) {
         bool bHasReads = ( (*(pDL->paReadDepth_))[n] > 0 );
         
         if ( bHasReads && (cState == cNotInRegion ) ) {
            lRegionLeft = n + pDL->lGenomicLocationA_;
            nMaxDepthOfCoverage =  (*(pDL->paReadDepth_))[n];
            cState = cInARegion;
         }
         else if ( bHasReads && ( cState == cInARegion ) ) {
            nMaxDepthOfCoverage = MAX( nMaxDepthOfCoverage,
                                       (*(pDL->paReadDepth_))[n] );

         }
         else if ( !bHasReads && ( cState == cInARegion ) ) {
            lRegionRight = n - 1 + pDL->lGenomicLocationA_;

            fprintf( pNewLocations_, "genomic %ld-%ld unmapped %d\n",
                     lRegionLeft,
                     lRegionRight,
                     nMaxDepthOfCoverage );

            cState = cNotInRegion;

         }
      } //  for( n = 0; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) {
   }


}