/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#ifndef MISC_PROGRAM
#include    "miscProgram.h"

void miscProgram() {}


#else


#include    "miscProgram.h"
#include    "soLine.h"
#include    "bIntervalsIntersect.h"
#include    "rwcstring.h"
#include    "rwctokenizer.h"
#include    "mbtPtrOrderedVector.h"
#include    "filename.h"
#include    <stdio.h>
#include    "mbt_exception.h"
#include    "bIsNumericMaybeWithWhitespace.h"
#include    "min.h"
#include    "max.h"
#include    "bIntersect.h"
#include    "rwtvalorderedvector.h"
#include    <math.h>
#include    "mbtValOrderedVectorOfRWCString.h"
#include    "complement_so.h"
#include    "setNumberFromBaseTable.h"
#include    "nNumberFromBase.h"
#include    <stdlib.h>
#include    "soAddCommas.h"
#include    "rwtvalvector.h"
#include    <unistd.h>
#include    "cComplementBase.h"




class region {
public:
   RWCString soChromosome_;
   int nStart_;
   int nEnd_;
   bool bOKToUse_;
   bool bMultipleOverlaps_;
public:
   region( const RWCString& soChromosome,
           const int nStart,
           const int nEnd ) :
      soChromosome_( soChromosome ),
      nStart_( nStart ),
      nEnd_( nEnd ),
      bOKToUse_( true ),
      bMultipleOverlaps_( false ) {}


   bool operator==( const region& myRegion ) {
      return( ( soChromosome_ == myRegion.soChromosome_ ) &&
              ( nStart_ == myRegion.nStart_ ) &&
              ( nEnd_ == myRegion.nEnd_ ) );
   }

   bool operator<( const region& myRegion ) const {
      if ( soChromosome_ != myRegion.soChromosome_ ) {
         return( soChromosome_ < myRegion.soChromosome_ );
      }
      else {
         return( nStart_ < myRegion.nStart_ );
      }
   }

   int nGetLength() {
      return( nEnd_ - nStart_ + 1 );
   }
};




static bool bRegionsOverlap( region* pRegion1, region* pRegion2 ) {
   if ( pRegion1->soChromosome_ != pRegion2->soChromosome_ )
      return false;

   return ( bIntervalsIntersect( pRegion1->nStart_,
                                 pRegion1->nEnd_,
                                 pRegion2->nStart_,
                                 pRegion2->nEnd_ ) );
}


static bool bRegionsLessThan( region* pRegion1, region* pRegion2 ) {
   if ( pRegion1->soChromosome_ < pRegion2->soChromosome_ )
      return true;
   else if ( pRegion1->soChromosome_ > pRegion2->soChromosome_)
      return false;
   else {
      if ( pRegion1->nEnd_ < pRegion2->nStart_ )
         return true;
      else
         return false;
   }
}


static bool bRegionContainedWithinRegion( region* pRegion1, region* pRegion2 ) {
   if ( pRegion1->soChromosome_ != pRegion2->soChromosome_ ) 
      return false;

   if ( ( pRegion2->nStart_ <= pRegion1->nStart_ ) &&
        ( pRegion1->nEnd_ <= pRegion2->nEnd_ ) ) {
      return true;
   }
   else
      return false;
}


      



static void run_100812() {
   // used to compare Ian and Tristan's coordinates


//    FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof";

//    FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" );

//    if ( !pHumanGenomeFOF ) {
//       THROW_FILE_ERROR( filHumanGenomeFOF );
//    }

//    while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) {
//       FileName filChromosome( soLine.data() );

//       // how big is this chromosome?

//       FILE* pChr = fopen( filChromosome.data(), "r" );
//       if ( !pChr ) {
//          THROW_FILE_ERROR( filChromosome );
//       }

//       int nChrLength = 0;
//       while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) {

//          soLine.nCurrentLength_ = strlen( soLine.data() );

//          soLine.stripTrailingWhitespaceFast();

//          if ( soLine[0] == '>' ) continue;

//          nChrLength += soLine.length();
//       }

//       fclose( filChromosome.data() );

//       printf( "%s has %d bases\n", filChromosome.data(), nChrLength );

      
//       mbtValVectorOfBool aIan( nChrLength, 1, "aIan" );
//       mbtValVectorOfBool aTristan( nChrLength, 1, "aTristan" );

//       apply( filChromosome, aIanRegions, aIan );
//       apply( filChromosome, aTristanRegions, aTristan );

//       mbtValVector<unsigned char> aJustIan( nChrLength, 1, 0, "aJustIan" );
//       mbtValVector<unsigned char> aJustTristan( nChrLength, 1, 0, "aJustTristan" );

//       for( int nChrPos = 1; nChrPos <= nChrLength; ++nChrPos ) {
//          if ( aIan[nChrPos] != aTristan[nChrPos] ) {
//             if ( aIan[ nChrPos ] ) {
//                aJustIan[ nChrPos ] = 1;
//             }
//             else {
//                aJustTristan[ nChrPos ] = 1;
//             }
//          }
//       }

//    }


   // used to compare Ian and Tristan's coordinates

   FileName filTristan = "/codon/gordon/misc/geneClassifications/exomeSegments/exome_coords.txt";

   FileName filIan     = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list";

   FILE* pIan = fopen( filIan.data(), "r" );
   if ( !pIan ) {
      THROW_FILE_ERROR( filIan );
   }

   FILE* pTristan = fopen( filTristan.data(), "r" );
   if ( !pTristan ) {
      THROW_FILE_ERROR( filTristan );
   }


   
   mbtPtrOrderedVector<region> aTristanRegion( 160000 );
   mbtPtrOrderedVector<region> aIanRegion( 160000 );

   
   while( fgets( soLine.data(), nMaxLineSize, pTristan ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      // looks like:
      // chr1    58953   59871

      RWCTokenizer tok( soLine );
      RWCString soChromosome = tok();
      RWCString soStart = tok();
      RWCString soEnd = tok();

      if ( soEnd.bIsNull() ) {
         THROW_ERROR2( "LINE " + soLine + " should be of form chr start end" );
      }


      int nStart;
      int nEnd;

      assert( bIsNumericMaybeWithWhitespace( soStart, nStart ) );
      assert( bIsNumericMaybeWithWhitespace( soEnd, nEnd ) );

      region* pRegion = new region( soChromosome, nStart, nEnd );
      aTristanRegion.append( pRegion );
   }


   fclose( pTristan );

   RWCString soChromosome( (size_t) 1000);

   int nStart;
   int nEnd;

   int nTokens;
   int nLinesRead = 1;

   while( fgets( soLine.data(), nMaxLineSize, pIan ) != NULL ) {
      soLine.nCurrentLength_ =  strlen( soLine.data() );

      int nColon = soLine.index( ":" );
      assert( nColon != RW_NPOS );

      int nDash = soLine.index( "-", nColon + 1 );
      assert( nDash != RW_NPOS );

      RWCString soChromosome = soLine( 0, nColon );

      nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d",
                        &nStart, 
                        &nEnd );

      if ( nTokens != 2 ) {

         soLine.nCurrentLength_ =  strlen( soLine.data() );
         soChromosome.nCurrentLength_ = strlen( soChromosome.data() );

         THROW_ERROR2( "data error with Ian, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine  + " line " + RWCString( (long) nLinesRead )  );
      }

      ++nLinesRead;

      region* pRegion = new region( soChromosome, nStart, nEnd );
      aIanRegion.append( pRegion );
   }

   fclose( pIan );

   aTristanRegion.resort();
   aIanRegion.resort();


   printf( "original tristan regions: %d\n", aTristanRegion.length() );
   printf( "original ian regions: %d\n", aIanRegion.length() );


   // do any ian regions overlap?

   
   int nIntersectingIanRegions = 0;
   int nIan;
   for( nIan = 1; nIan < aIanRegion.length(); ++nIan ) {
      
      region* pIan1 = aIanRegion[nIan-1];
      region* pIan2 = aIanRegion[nIan];

      if ( bRegionsOverlap( pIan1, pIan2 ) ) {
         ++nIntersectingIanRegions;
      }
   }

   printf( "ian regions overlapping: %d\n", nIntersectingIanRegions );


   // how many tristan regions are there that don't overlap any ian regions?

   int nTristanRegionsWithoutIan = 0;

   int nTristan;
   for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) {
      region* pTristan = aTristanRegion[ nTristan ];

      bool bFoundOverlapWithIan = false;

      int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( pTristan );
      if ( nIan == RW_NPOS ) nIan = 0;
      while( ( nIan < aIanRegion.length() ) ) {
         region* pIan = aIanRegion[ nIan ];

         if ( bRegionsOverlap( pTristan, pIan ) ) {
            bFoundOverlapWithIan = true;
            break;
         }

         if ( bRegionsLessThan( pTristan, pIan ) ) {
            break;
         }
         
         ++nIan;
      }

      if ( !bFoundOverlapWithIan ) {
         ++nTristanRegionsWithoutIan;
      }
   }

   printf( "tristan regions without ian: %d\n", nTristanRegionsWithoutIan );


   // how many ian regions are there without any tristan?

   int nIanRegionsWithoutTristan = 0;

   for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
      region* pIan = aIanRegion[ nIan ];

      bool bFoundOverlapWithTristan = false;

      int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan );

      if ( nTristan == RW_NPOS ) nTristan = 0;

      while( ( nTristan < aTristanRegion.length() ) ) {
         region* pTristan = aTristanRegion[ nTristan ];

         if ( bRegionsOverlap( pIan, pTristan ) ) {
            bFoundOverlapWithTristan = true;
            break;
         }

         if ( bRegionsLessThan( pIan, pTristan ) ) {
            break;
         }
         
         ++nTristan;
      }

      if ( !bFoundOverlapWithTristan ) {
         printf( "ian region without overlap with tristan: %s %d %d\n",
                 pIan->soChromosome_.data(),
                 pIan->nStart_,
                 pIan->nEnd_ );

         ++nIanRegionsWithoutTristan;
      }
   }

   printf( "ian regions without tristan: %d\n", nIanRegionsWithoutTristan );
   


   // go through and eliminate ian regions that overlap

   int nIntersectingRegions = 0;
   for( nTristan = 1; nTristan < aTristanRegion.length(); ++nTristan ) {
      
      region* pTristan1 = aTristanRegion[nTristan-1];
      region* pTristan2 = aTristanRegion[nTristan];

      if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_  ) &&
           ( bIntervalsIntersect( pTristan1->nStart_,
                                  pTristan1->nEnd_,
                                  pTristan2->nStart_,
                                  pTristan2->nEnd_ ) ) ) {
         // how much do they overlap by?

         int nOverlap = pTristan1->nEnd_ - pTristan2->nStart_ + 1;

         printf( "overlap by %d\n", nOverlap );


         pTristan1->bOKToUse_ = false;
         pTristan2->bOKToUse_ = false;

         ++nIntersectingRegions;

         // find corresponding region in aIan

         region* pDummy = new region( pTristan1->soChromosome_,
                                      MIN( pTristan1->nStart_, pTristan2->nStart_ ),
                                      MAX( pTristan1->nEnd_, pTristan2->nEnd_ )
                                      );

         int nIndex = aIanRegion.nFindIndexOfMatchOrPredecessor( pDummy );
         if ( nIndex == RW_NPOS ) nIndex = 0;
         while( nIndex < aIanRegion.length() ) {

            region* pIanRegion = aIanRegion[ nIndex ];
            if ( bRegionsOverlap( pDummy, pIanRegion ) ) {
               pIanRegion->bOKToUse_ = false;
            }
            else {

               if ( pDummy->soChromosome_ <
                    pIanRegion->soChromosome_ ) {
                  break;
               }
               else if ( pDummy->soChromosome_ ==
                         pIanRegion->soChromosome_ ) {
                  if ( pDummy->nEnd_ < pIanRegion->nStart_ )
                     break;
               }
            }
            ++nIndex;
         }
      } //  if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_  ) &&
   }
   
   printf( "tristan regions that overlap previous region: %d\n",
           nIntersectingRegions );

   mbtPtrOrderedVector<region> aTristanRegionTemp = aTristanRegion;
   mbtPtrOrderedVector<region> aIanRegionTemp = aIanRegion;


   aTristanRegion.clear();
   aIanRegion.clear();

   int n;
   for( n = 0; n < aTristanRegionTemp.length(); ++n ) {
      region* pRegion = aTristanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aTristanRegion.append( pRegion );
      }
   }

   for( n = 0; n < aIanRegionTemp.length(); ++n ) {
      region* pRegion = aIanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aIanRegion.append( pRegion );
      }
   }


   printf( "now tristan: %d\n", aTristanRegion.length() );
   printf( "now ian:     %d\n", aIanRegion.length() );

   
   aTristanRegionTemp.clear();
   aIanRegionTemp.clear();


   // look for regions in Ian that have many points than in Tristan

   int nRegionsWithLotsOfExtraBases = 0;
   for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
      region* pIan = aIanRegion[ nIan ];

      int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan );
      if ( nTristan == RW_NPOS ) nTristan = 0;

      while( nTristan < aTristanRegion.length() ) {

         region* pTristan = aTristanRegion[ nTristan ];
         if ( bRegionsLessThan( pIan, pTristan ) ) break;
         
         if ( bRegionsOverlap( pTristan, pIan ) ) {
            // how much do they overlap and how much do they
            // not overlap?

            int nIntersectStart;
            int nIntersectEnd;

            assert( bIntersect( pTristan->nStart_,
                                pTristan->nEnd_,
                                pIan->nStart_,
                                pIan->nEnd_,
                                nIntersectStart,
                                nIntersectEnd ) );

            int nIntersectingSize = nIntersectEnd - nIntersectStart + 1;

            int nNonIntersectingSizeAtBeginning = 
               MAX( pTristan->nStart_,
                    pIan->nStart_ ) 
               -
               MIN( pTristan->nStart_,
                    pIan->nStart_ );


            int nNonIntersectingSizeAtEnd =
               MAX( pTristan->nEnd_,
                    pIan->nEnd_ )
               -
               MIN( pTristan->nEnd_,
                    pIan->nEnd_ );

            assert( pIan->nStart_ < pTristan->nStart_ );
            assert( pTristan->nEnd_ < pIan->nEnd_ );

            printf( "intersecting size: %d noninter left: %d noninter right %d",
                    nIntersectingSize,
                    nNonIntersectingSizeAtBeginning,
                    nNonIntersectingSizeAtEnd );

            if ( ( nNonIntersectingSizeAtBeginning +
                   nNonIntersectingSizeAtEnd ) > 3 ) {
               pTristan->bOKToUse_ = false;
               pIan->bOKToUse_ = false;
               ++nRegionsWithLotsOfExtraBases;
               printf( " big " );
            }
            printf( "\n" );
         } //  if ( bRegionsOverlap( pTristan, pIan ) ) {


         ++nTristan;
      } //         while( nTristan < aTristanRegion.length() ) {
   } //    for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) {


   printf( "regions with lots of extra bases: %d\n", nRegionsWithLotsOfExtraBases );

   // again eliminate not used

   aTristanRegionTemp = aTristanRegion;
   aIanRegionTemp = aIanRegion;


   aTristanRegion.clear();
   aIanRegion.clear();

   for( n = 0; n < aTristanRegionTemp.length(); ++n ) {
      region* pRegion = aTristanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aTristanRegion.append( pRegion );
      }
   }

   for( n = 0; n < aIanRegionTemp.length(); ++n ) {
      region* pRegion = aIanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aIanRegion.append( pRegion );
      }
   }

   
   aTristanRegionTemp.clear();
   aIanRegionTemp.clear();
   

   printf( "tristan regions: %d\n", aTristanRegion.length() );
   printf( "ian regions:     %d\n", aIanRegion.length() );



      



   fflush( stdout );

   


//    for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
//       region* pIan = aIanRegion[ nIan ];

//       int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan );

//       if ( nTristan == RW_NPOS ) {
//          nTristan = aTristanRegion.nFindIndexOfMatchOrSuccessor( pIan );
         
//          if ( nTristan == RW_NPOS ) continue;
//       }


//       for( int nLoop = 0; nLoop < 2; ++nLoop ) {
//          if ( nLoop == 1 ) {
//             ++nTristan;
//             if ( nTristan >= aTristanRegion.length() ) break;
//          }
//          region* pTristan = aTristanRegion[ nTristan ];
//          if ( bRegionsOverlap( pTristan, pIan ) ) {
//             // how much do they overlap and how much do they
//             // not overlap?

//             int nIntersectStart;
//             int nIntersectEnd;

//             assert( bIntersect( pTristan->nStart_,
//                                 pTristan->nEnd_,
//                                 pIan->nStart_,
//                                 pIan->nEnd_,
//                                 nIntersectStart,
}





static void run_100813() {

   // used to compare Ian and Tristan's coordinates

   FileName filTristan = "/codon/gordon/misc/geneClassifications/exomeSegments/exome_coords.txt";

   FileName filIan     = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list";

   FILE* pIan = fopen( filIan.data(), "r" );
   if ( !pIan ) {
      THROW_FILE_ERROR( filIan );
   }

   FILE* pTristan = fopen( filTristan.data(), "r" );
   if ( !pTristan ) {
      THROW_FILE_ERROR( filTristan );
   }


   
   mbtPtrOrderedVector<region> aTristanRegion( 160000 );
   mbtPtrOrderedVector<region> aIanRegion( 160000 );


   RWTValOrderedVector<int> aHistogramSizesTristan( 10000 );
   RWTValOrderedVector<int> aHistogramSizesIan( 10000 );

   int nLongestTristanRegion = 0;
   
   while( fgets( soLine.data(), nMaxLineSize, pTristan ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      // looks like:
      // chr1    58953   59871

      RWCTokenizer tok( soLine );
      RWCString soChromosome = tok();
      RWCString soStart = tok();
      RWCString soEnd = tok();

      if ( soEnd.bIsNull() ) {
         THROW_ERROR2( "LINE " + soLine + " should be of form chr start end" );
      }


      int nStart;
      int nEnd;

      assert( bIsNumericMaybeWithWhitespace( soStart, nStart ) );
      assert( bIsNumericMaybeWithWhitespace( soEnd, nEnd ) );

      // note:  converting Tristan regions to 1-based

      region* pRegion = new region( soChromosome, nStart + 1, nEnd );
      aTristanRegion.append( pRegion );

      if ( pRegion->nGetLength() > nLongestTristanRegion ) 
         nLongestTristanRegion = pRegion->nGetLength();

      while( pRegion->nGetLength() >= aHistogramSizesTristan.length() ) {
         aHistogramSizesTristan.append( 0 );
      }
      
      ++aHistogramSizesTristan[ pRegion->nGetLength() ];
   }


   fclose( pTristan );

   printf( "histogram of sizes of tristan regions:\n" );
   int nSize;
   for( nSize = 0; nSize < aHistogramSizesTristan.length(); ++nSize ) {
      if ( aHistogramSizesTristan[ nSize ] != 0 ) {
         printf( "Tristan histogram %d : %d\n", nSize, aHistogramSizesTristan[ nSize ] );
      }
   }




   RWCString soChromosome( (size_t) 1000);

   int nStart;
   int nEnd;

   int nTokens;
   int nLinesRead = 1;

   int nLongestIanRegion = 0;

   while( fgets( soLine.data(), nMaxLineSize, pIan ) != NULL ) {
      soLine.nCurrentLength_ =  strlen( soLine.data() );

      int nColon = soLine.index( ":" );
      assert( nColon != RW_NPOS );

      int nDash = soLine.index( "-", nColon + 1 );
      assert( nDash != RW_NPOS );

      RWCString soChromosome = soLine( 0, nColon );

      nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d",
                        &nStart, 
                        &nEnd );

      if ( nTokens != 2 ) {

         soLine.nCurrentLength_ =  strlen( soLine.data() );
         soChromosome.nCurrentLength_ = strlen( soChromosome.data() );

         THROW_ERROR2( "data error with Ian, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine  + " line " + RWCString( (long) nLinesRead )  );
      }

      ++nLinesRead;

      region* pRegion = new region( soChromosome, nStart, nEnd );
      aIanRegion.append( pRegion );


      if ( pRegion->nGetLength() > nLongestIanRegion ) 
         nLongestIanRegion = pRegion->nGetLength();
   }

   fclose( pIan );

   aTristanRegion.resort();
   aIanRegion.resort();


   printf( "original tristan regions: %d\n", aTristanRegion.length() );
   printf( "original ian regions: %d\n", aIanRegion.length() );


   // do any ian regions overlap?

   
   int nIntersectingIanRegions = 0;
   int nIan;
   for( nIan = 1; nIan < aIanRegion.length(); ++nIan ) {
      
      region* pIan1 = aIanRegion[nIan-1];
      region* pIan2 = aIanRegion[nIan];

      if ( bRegionsOverlap( pIan1, pIan2 ) ) {
         ++nIntersectingIanRegions;
      }
   }

   printf( "ian regions overlapping: %d\n", nIntersectingIanRegions );


   // how many tristan regions are there that don't overlap any ian regions?

   int nTristanRegionsWithoutIan = 0;
   int nTristan;
   for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) {
      region* pTristan = aTristanRegion[ nTristan ];

      bool bFoundOverlapWithIan = false;

      int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( pTristan );
      if ( nIan == RW_NPOS ) nIan = 0;
      while( ( nIan < aIanRegion.length() ) ) {
         region* pIan = aIanRegion[ nIan ];

         if ( bRegionsOverlap( pTristan, pIan ) ) {
            bFoundOverlapWithIan = true;
            break;
         }

         if ( bRegionsLessThan( pTristan, pIan ) ) {
            break;
         }
         
         ++nIan;
      }

      if ( !bFoundOverlapWithIan ) {
         ++nTristanRegionsWithoutIan;
      }
   }

   printf( "tristan regions without ian: %d\n", nTristanRegionsWithoutIan );


   // how many ian regions are there without any tristan?

   int nIanRegionsWithoutTristan = 0;

   for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
      region* pIan = aIanRegion[ nIan ];


      region myRegion = *pIan;

      myRegion.nStart_ -= nLongestTristanRegion;

      bool bFoundOverlapWithTristan = false;

      int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( &myRegion );

      if ( nTristan == RW_NPOS ) nTristan = 0;

      while( ( nTristan < aTristanRegion.length() ) ) {
         region* pTristan = aTristanRegion[ nTristan ];

         if ( bRegionsOverlap( pIan, pTristan ) ) {
            bFoundOverlapWithTristan = true;
            break;
         }

         if ( bRegionsLessThan( pIan, pTristan ) ) {
            break;
         }
         
         ++nTristan;
      }

      if ( !bFoundOverlapWithTristan ) {
         printf( "ian region without overlap with tristan: %s %d %d\n",
                 pIan->soChromosome_.data(),
                 pIan->nStart_,
                 pIan->nEnd_ );

         ++nIanRegionsWithoutTristan;
      }
   }

   printf( "ian regions without tristan: %d\n", nIanRegionsWithoutTristan );
   

   // ian regions overlapping multiple Tristan regions

   RWTValOrderedVector<int> aHistogramIan;

   for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
      region* pIan = aIanRegion[ nIan ];

      region myRegion = *pIan;

      myRegion.nStart_ -= nLongestTristanRegion;


      int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( &myRegion );

      RWTPtrOrderedVector<region> aTristanRegionsWithOverlap;

      
      if ( nTristan == RW_NPOS ) nTristan = 0;
      int nNumberOfOverlapsWithThisIanRegion = 0;
      while( ( nTristan < aTristanRegion.length() ) ) {
         region* pTristan = aTristanRegion[ nTristan ];

         if ( bRegionsOverlap( pIan, pTristan ) ) {
            ++nNumberOfOverlapsWithThisIanRegion;
            aTristanRegionsWithOverlap.append( pTristan );
         }

         if ( bRegionsLessThan( pIan, pTristan ) ) {
            break;
         }
         
         ++nTristan;
      }

      while( nNumberOfOverlapsWithThisIanRegion >= aHistogramIan.length() ) {
         aHistogramIan.append( 0 );
      }

      ++aHistogramIan[ nNumberOfOverlapsWithThisIanRegion ];

      if ( nNumberOfOverlapsWithThisIanRegion > 1 ) {
         pIan->bMultipleOverlaps_ = true;

         for( int n = 0; n < aTristanRegionsWithOverlap.length(); ++n ) {
            region* pTristan = aTristanRegionsWithOverlap[n];
            pTristan->bMultipleOverlaps_ = true;
         }
      }
   }

   
   printf( "histogram\n" );
   printf( "# of overlaps   # of ian regions with this # of overlaps\n" );
   int nNumberOfOverlaps;
   for( nNumberOfOverlaps = 0; nNumberOfOverlaps < aHistogramIan.length();
        ++nNumberOfOverlaps ) {
      printf( "%d               %d\n",
              nNumberOfOverlaps,
              aHistogramIan[ nNumberOfOverlaps ] );
   }


   // Tristan regions overlapping multiple Ian regions

   RWTValOrderedVector<int> aHistogramTristan;

   for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) {
      region* pTristan = aTristanRegion[ nTristan ];

      region myRegion = *pTristan;

      myRegion.nStart_ -= nLongestIanRegion;


      int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( &myRegion );

      
      if ( nIan == RW_NPOS ) nIan = 0;
      int nNumberOfOverlapsWithThisTristanRegion = 0;
      while( ( nIan < aIanRegion.length() ) ) {
         region* pIan = aIanRegion[ nIan ];

         if ( bRegionsOverlap( pTristan, pIan ) ) {
            ++nNumberOfOverlapsWithThisTristanRegion;
         }

         if ( bRegionsLessThan( pTristan, pIan ) ) {
            break;
         }
         
         ++nIan;
      }

      while( nNumberOfOverlapsWithThisTristanRegion >= aHistogramTristan.length() ) {
         aHistogramTristan.append( 0 );
      }

      ++aHistogramTristan[ nNumberOfOverlapsWithThisTristanRegion ];
   }

   
   printf( "histogram\n" );
   printf( "# of overlaps   # of Tristan regions with this # of overlaps\n" );
   for( nNumberOfOverlaps = 0; nNumberOfOverlaps < aHistogramTristan.length();
        ++nNumberOfOverlaps ) {
      printf( "%d               %d\n",
              nNumberOfOverlaps,
              aHistogramTristan[ nNumberOfOverlaps ] );
   }


   // eliminate regions that have multiple overlaps

   mbtPtrOrderedVector<region> aTristanRegionTemp = aTristanRegion;
   mbtPtrOrderedVector<region> aIanRegionTemp = aIanRegion;


   aTristanRegion.clear();
   aIanRegion.clear();
   int n;
   for( n = 0; n < aTristanRegionTemp.length(); ++n ) {
      region* pRegion = aTristanRegionTemp[ n ];
      if ( !pRegion->bMultipleOverlaps_ ) {
         aTristanRegion.append( pRegion );
      }
   }

   for( n = 0; n < aIanRegionTemp.length(); ++n ) {
      region* pRegion = aIanRegionTemp[ n ];
      if ( !pRegion->bMultipleOverlaps_ ) {
         aIanRegion.append( pRegion );
      }
   }

   printf( "after eliminating Tristan and Ian regions in which any Ian region overlaps multiple Tristan regions\n" );

   printf( "now tristan: %d\n", aTristanRegion.length() );
   printf( "now ian:     %d\n", aIanRegion.length() );







   // go through and eliminate ian regions that overlap

   int nIntersectingRegions = 0;
   for( nTristan = 1; nTristan < aTristanRegion.length(); ++nTristan ) {
      
      region* pTristan1 = aTristanRegion[nTristan-1];
      region* pTristan2 = aTristanRegion[nTristan];

      if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_  ) &&
           ( bIntervalsIntersect( pTristan1->nStart_,
                                  pTristan1->nEnd_,
                                  pTristan2->nStart_,
                                  pTristan2->nEnd_ ) ) ) {
         // how much do they overlap by?

         int nOverlap = pTristan1->nEnd_ - pTristan2->nStart_ + 1;

         printf( "overlap by %d\n", nOverlap );


         pTristan1->bOKToUse_ = false;
         pTristan2->bOKToUse_ = false;

         ++nIntersectingRegions;

         // find corresponding region in aIan

         region regionToSearch( pTristan1->soChromosome_,
                                MIN( pTristan1->nStart_, pTristan2->nStart_ ) -
                                nLongestIanRegion,
                                MAX( pTristan1->nEnd_, pTristan2->nEnd_ )
                                      );

         region* pCombinedTristan = new region( pTristan1->soChromosome_,
                                      MIN( pTristan1->nStart_, pTristan2->nStart_ ),
                                      MAX( pTristan1->nEnd_, pTristan2->nEnd_ )
                                      );

         int nIndex = aIanRegion.nFindIndexOfMatchOrPredecessor( &regionToSearch );
         if ( nIndex == RW_NPOS ) nIndex = 0;
         while( nIndex < aIanRegion.length() ) {

            region* pIanRegion = aIanRegion[ nIndex ];
            if ( bRegionsOverlap( pCombinedTristan, pIanRegion ) ) {
               pIanRegion->bOKToUse_ = false;
            }
            else {
               if ( bRegionsLessThan( pCombinedTristan, pIanRegion ) )
                  break;
            }
            ++nIndex;
         }
      } //  if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_  ) &&
   }
   
   printf( "tristan regions that overlap previous region: %d\n",
           nIntersectingRegions );

















   aTristanRegionTemp = aTristanRegion;
   aIanRegionTemp = aIanRegion;


   aTristanRegion.clear();
   aIanRegion.clear();

   for( n = 0; n < aTristanRegionTemp.length(); ++n ) {
      region* pRegion = aTristanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aTristanRegion.append( pRegion );
      }
   }

   for( n = 0; n < aIanRegionTemp.length(); ++n ) {
      region* pRegion = aIanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aIanRegion.append( pRegion );
      }
   }


   printf( "now tristan: %d\n", aTristanRegion.length() );
   printf( "now ian:     %d\n", aIanRegion.length() );

   
   aTristanRegionTemp.clear();
   aIanRegionTemp.clear();


   // look for regions in Ian that have many points than in Tristan
   
   int nIanWithinTristan = 0;
   int nTristanWithinIan = 0;
   int nNotEither = 0;
   

   int nRegionsWithLotsOfExtraBases = 0;
   for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) {
      region* pIan = aIanRegion[ nIan ];

      region regionToSearch = *pIan;
      regionToSearch.nStart_ -= nLongestTristanRegion;


      int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( &regionToSearch );
      if ( nTristan == RW_NPOS ) nTristan = 0;

      while( nTristan < aTristanRegion.length() ) {

         region* pTristan = aTristanRegion[ nTristan ];
         if ( bRegionsLessThan( pIan, pTristan ) ) break;
         
         if ( bRegionsOverlap( pTristan, pIan ) ) {
            // how much do they overlap and how much do they
            // not overlap?

            int nIntersectStart;
            int nIntersectEnd;

            assert( bIntersect( pTristan->nStart_,
                                pTristan->nEnd_,
                                pIan->nStart_,
                                pIan->nEnd_,
                                nIntersectStart,
                                nIntersectEnd ) );

            int nIntersectingSize = nIntersectEnd - nIntersectStart + 1;

            int nNonIntersectingSizeAtBeginning = 
               MAX( pTristan->nStart_,
                    pIan->nStart_ ) 
               -
               MIN( pTristan->nStart_,
                    pIan->nStart_ );


            int nNonIntersectingSizeAtEnd =
               MAX( pTristan->nEnd_,
                    pIan->nEnd_ )
               -
               MIN( pTristan->nEnd_,
                    pIan->nEnd_ );

            assert( pIan->nStart_ < pTristan->nStart_ );
            assert( pTristan->nEnd_ < pIan->nEnd_ );

            printf( "intersecting size: %d noninter left: %d noninter right %d bp ",
                    nIntersectingSize,
                    nNonIntersectingSizeAtBeginning,
                    nNonIntersectingSizeAtEnd );


            if ( bRegionContainedWithinRegion( pTristan, pIan ) ) {
               ++nTristanWithinIan;
            }
            else if ( bRegionContainedWithinRegion( pIan, pTristan ) ) {
               ++nIanWithinTristan;
            }
            else {
               ++nNotEither;
            }




            if ( ( nNonIntersectingSizeAtBeginning +
                   nNonIntersectingSizeAtEnd ) > 4 ) {
               pTristan->bOKToUse_ = false;
               pIan->bOKToUse_ = false;
               ++nRegionsWithLotsOfExtraBases;
               printf( " big " );
            }
            printf( "\n" );
         } //  if ( bRegionsOverlap( pTristan, pIan ) ) {


         ++nTristan;
      } //         while( nTristan < aTristanRegion.length() ) {
   } //    for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) {


   printf( "regions with lots of extra bases: %d\n", nRegionsWithLotsOfExtraBases );


   printf( "tristan within ian: %d\n", nTristanWithinIan );
   printf( "ian within tristan: %d\n", nIanWithinTristan );
   printf( "not either        : %d\n", nNotEither );

   // again eliminate not used

   aTristanRegionTemp = aTristanRegion;
   aIanRegionTemp = aIanRegion;


   aTristanRegion.clear();
   aIanRegion.clear();

   for( n = 0; n < aTristanRegionTemp.length(); ++n ) {
      region* pRegion = aTristanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aTristanRegion.append( pRegion );
      }
   }

   for( n = 0; n < aIanRegionTemp.length(); ++n ) {
      region* pRegion = aIanRegionTemp[ n ];
      if ( pRegion->bOKToUse_ ) {
         aIanRegion.append( pRegion );
      }
   }

   
   aTristanRegionTemp.clear();
   aIanRegionTemp.clear();
   

   printf( "tristan regions: %d\n", aTristanRegion.length() );
   printf( "ian regions:     %d\n", aIanRegion.length() );


   fflush( stdout );

}





static RWCString soChromosomeBases;
static mbtValOrderedVectorOfRWCString aHumanGenomeFOF( (size_t) 50 );

static void loadChromosome( const RWCString& soChromosome ) {


   RWCString soPattern = soChromosome + ".fa";

   // find the corresponding file

   FileName filChromosome;
   for( int n = 0; n < aHumanGenomeFOF.length(); ++n ) {

      FileName filChromosomeTemp = aHumanGenomeFOF[n];
      if ( filChromosomeTemp.bContains( soPattern ) ) {
         filChromosome = filChromosomeTemp;
         break;
      }
   }

   assert( !filChromosome.bIsNull() );

   FILE* pChromosome = fopen( filChromosome.data(), "r" );
   if ( !pChromosome ) {
      THROW_FILE_ERROR( filChromosome );
   }

   soChromosomeBases.increaseButNotDecreaseMaxLength( 260000000 );

   soChromosomeBases = " "; // make 1-based

   // throw away header
   assert( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL );
   assert( soLine.sz_[0] == '>' );

   while( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      soChromosomeBases += soLine;
   }

   fclose( pChromosome );

}
         




// this is for examining motifs within the exome

static void run_100814() {

   setNumberFromBaseTable();

   // looks like:
   // chr10:82995-84056
   const FileName filExomeRegions = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list";

   
   const FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof";

   const FileName filScoreMatrix = "/codon/gordon/misc/geneClassifications/exomeMotifs/run_100819/scoreMatrix.txt";

   const int nScoreMatrixColumns = 9;

   const float fBinSize = 1.0;


   FILE* pExome = fopen( filExomeRegions.data(), "r" );
   if ( !pExome ) {
      THROW_FILE_ERROR( filExomeRegions );
   }


   mbtPtrOrderedVector<region> aExomeRegion( 160000 );


   RWCString soChromosome( (size_t) 1000);

   int nStart;
   int nEnd;

   int nTokens;
   int nLinesRead = 1;

   int nLongestExomeRegion = 0;

   while( fgets( soLine.data(), nMaxLineSize, pExome ) != NULL ) {
      soLine.nCurrentLength_ =  strlen( soLine.data() );

      int nColon = soLine.index( ":" );
      assert( nColon != RW_NPOS );

      int nDash = soLine.index( "-", nColon + 1 );
      assert( nDash != RW_NPOS );

      RWCString soChromosome = soLine( 0, nColon );

      nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d",
                        &nStart, 
                        &nEnd );

      if ( nTokens != 2 ) {

         soLine.nCurrentLength_ =  strlen( soLine.data() );
         soChromosome.nCurrentLength_ = strlen( soChromosome.data() );

         THROW_ERROR2( "data error with Exome, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine  + " line " + RWCString( (long) nLinesRead )  );
      }

      ++nLinesRead;

      region* pRegion = new region( soChromosome, nStart, nEnd );
      aExomeRegion.append( pRegion );


      if ( pRegion->nGetLength() > nLongestExomeRegion ) 
         nLongestExomeRegion = pRegion->nGetLength();
   }

   fclose( pExome );

   aExomeRegion.resort();
   

   FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" );
   if ( !pHumanGenomeFOF ) {
      THROW_FILE_ERROR( filHumanGenomeFOF );
   }


   while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      aHumanGenomeFOF.append( soLine );
   }

   
   fclose( pHumanGenomeFOF );


   float fScoreMatrix[nScoreMatrixColumns][4];



   int nCountMatrix[nScoreMatrixColumns][4];
   for( int nCol = 0; nCol < nScoreMatrixColumns; ++nCol ) {
      for( int nRow = 0; nRow < 4; ++nRow ) {
         nCountMatrix[nCol][nRow] = 0;
      }
   }

   FILE* pScoreMatrix = fopen( filScoreMatrix.data(), "r" );
   if ( !pScoreMatrix ) {
      THROW_FILE_ERROR( filScoreMatrix );
   }


   int nRow = -1; // ready for ++
   while( fgets( soLine.data(), nMaxLineSize, pScoreMatrix ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );
      soLine.stripTrailingWhitespaceFast();
      if ( soLine.bIsNull() ) continue;

      ++nRow;


      RWCTokenizer tok( soLine );

      printf( "parsing score matrix line %s\n", soLine.data() );

      for( int n = 0; n < nScoreMatrixColumns; ++n ) {
         RWCString soNumber = tok();
         if ( n == 0 && (soNumber == "A" || soNumber == "C" ||
                         soNumber == "G" || soNumber == "T" ) ) {
            soNumber = tok();
         }
         
         printf( "%s ", soNumber.data() );
         fflush( stdout );
         assert( !soNumber.bIsNull() );

         float f;
         errno = 0;
         char* szEndPtr;
#ifndef SOLARIS
         f = strtof( soNumber.data(), &szEndPtr );
         assert( errno == 0 );
         if ( *szEndPtr != 0 ) {
            RWCString soError = "floating point number " + soNumber +
               " didn't not convert ok, left over was " +
               RWCString( szEndPtr );
            THROW_ERROR( soError );
         }
#else
         f = atof( soNumber.data() );
#endif

         fScoreMatrix[ n ][ nRow ] = f;
      }

      printf( "\n" );
   }

   // 0 to 3 for the 4 nucleotides
   assert( nRow == 3 );


   RWCString soExome( (size_t) 100000000 );

   RWCString soCurrentChromosome;

   RWTValVector<int> aExomeBaseCounts( 4, 0, "aExomeBaseCounts" );
   

   for( int nRegion = 0; nRegion < aExomeRegion.length(); ++nRegion ) {
      region* pRegion = aExomeRegion[ nRegion ];

      if ( pRegion->soChromosome_ != soCurrentChromosome ) {

         // load another chromosome
         cerr << "reading " << pRegion->soChromosome_ << endl;
         loadChromosome( pRegion->soChromosome_ );
         cerr << "done" << endl;

         soCurrentChromosome = pRegion->soChromosome_;

      }

      soExome += soChromosomeBases( pRegion->nStart_, 
                                    pRegion->nEnd_ - pRegion->nStart_ + 1 );

      for( int n = pRegion->nStart_; n <= pRegion->nEnd_; ++n ) {
         ++aExomeBaseCounts[ nNumberFromBase[ soChromosomeBases[n] ] ];
      }

      soExome += "X";
   }

   cerr << "done reading exome bases" << endl;


   // and now reverse complement this exome

   // there will be two X's in the middle of the string and
   // no X on the end

   soExome += soComplementSO( soExome );


   cerr << "done complementing exome bases" << endl;


   // calculate lowest possible score so we can figure out the
   // offset


   float fTotalLeast = 0.0;
   for( int nColumn = 0; nColumn < nScoreMatrixColumns; ++nColumn ) {
      
      float fLeast = fScoreMatrix[nColumn][0];
      for( int nBase = 1; nBase < 4; ++nBase ) {
         fLeast = MIN( fLeast, fScoreMatrix[nColumn][nBase] );
      }

      fTotalLeast += fLeast;
   }

   int nHistogramOffset = -( floor( fTotalLeast / fBinSize ) );

   cerr << "histogram offset = " << nHistogramOffset << endl;



   RWTValOrderedVector<int> aHistogram;
   aHistogram.soName_ = "histogram";

   

   // debugging
   int nTempCount = 0;
   // end debugging


   int nLastExomePos = soExome.length() - nScoreMatrixColumns;

   for( int nExomePos = 0; nExomePos <= nLastExomePos; ++nExomePos ) {

      // check if this has an X


      float fTotalScore = 0.0;
      
      bool bFoundX = false;
      int n;
      for( n = nScoreMatrixColumns - 1; n >= 0; --n ) {
         if ( soExome[ nExomePos + n ] == 'X' ) {
            bFoundX = true;
            break;
         }
         fTotalScore += 
            fScoreMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ];

      }

      if ( bFoundX ) continue;

      // debugging
      if ( fTotalScore >= 20.0 ) {
         RWCString soPartOfExome = soExome( nExomePos, nScoreMatrixColumns );
         cerr << soPartOfExome << " with score " << fTotalScore << endl;
      }

      if ( ( -20 <= fTotalScore) && (fTotalScore < -15 ) ) {
         ++nTempCount;
      }

      for( n = nScoreMatrixColumns - 1; n >= 0; --n ) {
         ++nCountMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ];
      }

      // end debugging
      
      int nBin = floor( fTotalScore / fBinSize ) + nHistogramOffset;

      assert( nBin >= 0 );

      while( nBin >= aHistogram.length() ) {
         aHistogram.append( 0 );
      }


      assert( nBin < aHistogram.length() );
      ++aHistogram[ nBin ];
   }

   // debugging
   cerr << "-20 <= x < -15: " << nTempCount << endl;
   // end debugging

   cerr << "done calculating histogram" << endl;
   cerr.flush();

   // print matrix

   printf( "score matrix:\n" );
   int nBase;
   for( nBase = 0; nBase < 4; ++nBase ) {
      printf( "base: %d ", nBase );
      for( int n = 0; n < nScoreMatrixColumns; ++n ) {
         printf( " %4.1f", fScoreMatrix[n][nBase] );
      }
      printf( "\n" );
   }

   printf( "count matrix:\n" );
   for( nBase = 0; nBase < 4; ++nBase ) {
      printf( "base: %d ", nBase );
      for( int n = 0; n < nScoreMatrixColumns; ++n ) {
         printf( " %10s", soAddCommas( nCountMatrix[n][nBase] ).data() );
      }
      printf( "\n" );
   }

   int nExomeSize = 0;
   printf( "base counts in exome:\n" );
   for( nBase = 0; nBase < 4; ++nBase ) {
      printf( "base %d : %d\n", nBase, aExomeBaseCounts[nBase] );
      nExomeSize += aExomeBaseCounts[nBase];
   }

   float fGCPerCent = ( aExomeBaseCounts[1] + aExomeBaseCounts[2] ) * 100.0 /
      ( aExomeBaseCounts[0] + aExomeBaseCounts[1] + aExomeBaseCounts[2] +
        aExomeBaseCounts[3] );

   printf( "gc : %.1f %%\n", fGCPerCent );


   // debugging
   cerr << "aHistogram.length() = " << aHistogram.length() << endl;
   // end debugging

   // print histogram
   int nCumulativeCounts = 0;
   for( int nBin = aHistogram.length() - 1; nBin >= 0; --nBin ) {

      int nBin2 = nBin - nHistogramOffset;

      float fUpperBoundOfBin = ( nBin2 + 1 ) * fBinSize;


      int nCountsThisBin = aHistogram[ nBin ];
      nCumulativeCounts += nCountsThisBin;

      int nMeanDistanceBetweenMotifs = 2*nExomeSize / nCumulativeCounts;

      // skipping negative values
      if ( fUpperBoundOfBin < 0 ) continue;




      printf( "%6.1f <= x < %6.1f : %10s %15s %10s\n",
              nBin2 * fBinSize,
              ( nBin2 + 1 ) * fBinSize,
              soAddCommas( nCountsThisBin ).data(),
              soAddCommas( nCumulativeCounts ).data(),
              soAddCommas( nMeanDistanceBetweenMotifs ).data() );
              

   }
    
   printf( "score                   counts in bin    cum counts     mean dist between motifs\n" );

   // so that stdout buffer is flushed
   //exit(0);
   fflush( stdout );
}




   const int nNumberOfPutativeExons = 160000;
   
static   RWTValOrderedVector<int> aExomeToHumanGenome0E( (size_t) 2*nNumberOfPutativeExons );
static   RWTValOrderedVector<RWCString> aExomeToHumanGenomeChrName( (size_t) 2*nNumberOfPutativeExons );
static   RWTValOrderedVector<int> aExomeToHumanGenome1ChrPos( (size_t) 2*nNumberOfPutativeExons );




static void convertExomePosToChromosome( const int nExomePos, 
                                         RWCString& soChromosome, 
                                         int& n1ChrPos ) {

   for( int nExome = 0; nExome < aExomeToHumanGenome0E.length() - 1; ++nExome ) {

      if ( ( aExomeToHumanGenome0E[ nExome ] <= nExomePos ) &&
           ( nExomePos < aExomeToHumanGenome0E[ nExome + 1 ] ) ) {

         soChromosome = aExomeToHumanGenomeChrName[ nExome ];

         if ( soChromosome.bEndsWith( ".comp" ) ) {
            n1ChrPos = aExomeToHumanGenome1ChrPos[ nExome ] -
               ( nExomePos - aExomeToHumanGenome0E[ nExome ] );

         }
         else {
            n1ChrPos = nExomePos - aExomeToHumanGenome0E[ nExome ] +
               aExomeToHumanGenome1ChrPos[ nExome ];
         }

         return;
      }
   }


   // if reached here, something is wrong

   RWCString soError = "trying to find chr pos for exome pos "  +
      RWCString( (long) nExomePos ) + " but the largest is " +
      RWCString( (long) aExomeToHumanGenome0E.last() ) + " " +
      aExomeToHumanGenomeChrName.last() + " " +
      RWCString( (long) aExomeToHumanGenome1ChrPos.last() );

   THROW_ERROR( soError );
}




// this is for examining motifs within the exome

static void run_100820() {

   cerr << "in run_100820" << endl;
   cerr.flush();
   setNumberFromBaseTable();

   // looks like:
   // chr10:82995-84056
   const FileName filExomeRegions = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list";

   
   const FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof";

   const FileName filScoreMatrix = "scoreMatrix.txt";

   const int nScoreMatrixColumns = 11;
   const int nSizeOfMotif = nScoreMatrixColumns;
   const float fScoreThreshold = 11.0;

   const float fBinSize = 1.0;


   FILE* pExome = fopen( filExomeRegions.data(), "r" );
   if ( !pExome ) {
      THROW_FILE_ERROR( filExomeRegions );
   }


   mbtPtrOrderedVector<region> aExomeRegion( 160000 );


   RWCString soChromosome( (size_t) 1000);

   int nStart;
   int nEnd;

   int nTokens;
   int nLinesRead = 1;

   int nLongestExomeRegion = 0;

   while( fgets( soLine.data(), nMaxLineSize, pExome ) != NULL ) {
      soLine.nCurrentLength_ =  strlen( soLine.data() );

      int nColon = soLine.index( ":" );
      assert( nColon != RW_NPOS );

      int nDash = soLine.index( "-", nColon + 1 );
      assert( nDash != RW_NPOS );

      RWCString soChromosome = soLine( 0, nColon );

      nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d",
                        &nStart, 
                        &nEnd );

      if ( nTokens != 2 ) {

         soLine.nCurrentLength_ =  strlen( soLine.data() );
         soChromosome.nCurrentLength_ = strlen( soChromosome.data() );

         THROW_ERROR2( "data error with Exome, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine  + " line " + RWCString( (long) nLinesRead )  );
      }

      ++nLinesRead;

      region* pRegion = new region( soChromosome, nStart, nEnd );
      aExomeRegion.append( pRegion );


      if ( pRegion->nGetLength() > nLongestExomeRegion ) 
         nLongestExomeRegion = pRegion->nGetLength();
   }

   fclose( pExome );

   aExomeRegion.resort();
   

   FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" );
   if ( !pHumanGenomeFOF ) {
      THROW_FILE_ERROR( filHumanGenomeFOF );
   }


   while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      aHumanGenomeFOF.append( soLine );
   }

   
   fclose( pHumanGenomeFOF );


   float fScoreMatrix[nScoreMatrixColumns][4];



   int nCountMatrix[nScoreMatrixColumns][4];
   for( int nCol = 0; nCol < nScoreMatrixColumns; ++nCol ) {
      for( int nRow = 0; nRow < 4; ++nRow ) {
         nCountMatrix[nCol][nRow] = 0;
      }
   }

   FILE* pScoreMatrix = fopen( filScoreMatrix.data(), "r" );
   if ( !pScoreMatrix ) {
      THROW_FILE_ERROR( filScoreMatrix );
   }


   int nRow = -1; // ready for ++
   while( fgets( soLine.data(), nMaxLineSize, pScoreMatrix ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );
      soLine.stripTrailingWhitespaceFast();
      if ( soLine.bIsNull() ) continue;

      ++nRow;


      RWCTokenizer tok( soLine );

      printf( "parsing score matrix line %s\n", soLine.data() );

      for( int n = 0; n < nScoreMatrixColumns; ++n ) {
         RWCString soNumber = tok();
         if ( n == 0 && (soNumber == "A" || soNumber == "C" ||
                         soNumber == "G" || soNumber == "T" ) ) {
            soNumber = tok();
         }
         
         printf( "%s ", soNumber.data() );
         fflush( stdout );
         assert( !soNumber.bIsNull() );

         float f;
         errno = 0;
         char* szEndPtr;
#ifndef SOLARIS
         f = strtof( soNumber.data(), &szEndPtr );
         assert( errno == 0 );
         if ( *szEndPtr != 0 ) {
            RWCString soError = "floating point number " + soNumber +
               " didn't not convert ok, left over was " +
               RWCString( szEndPtr );
            THROW_ERROR( soError );
         }
#else
         f = atof( soNumber.data() );
#endif

         fScoreMatrix[ n ][ nRow ] = f;
      }

      printf( "\n" );
   }

   // 0 to 3 for the 4 nucleotides
   assert( nRow == 3 );


   RWCString soExome( (size_t) 100000000 );

   RWCString soCurrentChromosome;

   RWTValVector<int> aExomeBaseCounts( 4, 0, "aExomeBaseCounts" );
   
   

   aExomeToHumanGenome0E.soName_ = "aExomeToHumanGenome0E";
   aExomeToHumanGenomeChrName.soName_ = "aExomeToHumanGenomeChrName";
   aExomeToHumanGenome1ChrPos.soName_ = "aExomeToHumanGenome1ChrPos";


   int nRegion;
   for( nRegion = 0; nRegion < aExomeRegion.length(); ++nRegion ) {
      region* pRegion = aExomeRegion[ nRegion ];

      if ( pRegion->soChromosome_ != soCurrentChromosome ) {

         // load another chromosome
         cerr << "reading " << pRegion->soChromosome_ << endl;
         loadChromosome( pRegion->soChromosome_ );
         cerr << "done" << endl;

         soCurrentChromosome = pRegion->soChromosome_;

      }


      aExomeToHumanGenome0E.append( soExome.length() );
      aExomeToHumanGenomeChrName.append( pRegion->soChromosome_ );
      aExomeToHumanGenome1ChrPos.append( pRegion->nStart_ );


      soExome += soChromosomeBases( pRegion->nStart_, 
                                    pRegion->nEnd_ - pRegion->nStart_ + 1 );


      for( int n = pRegion->nStart_; n <= pRegion->nEnd_; ++n ) {
         ++aExomeBaseCounts[ nNumberFromBase[ soChromosomeBases[n] ] ];
      }

      soExome += "X";
   }


   cerr << "done reading exome bases" << endl;


   // and now reverse complement this exome

   // there will be two X's in the middle of the string and
   // no X on the end

   int nExomeSizeBeforeAddingComplement = soExome.length();

   soExome += soComplementSO( soExome );


   cerr << "done complementing exome bases" << endl;


   // need to add all of the complemented exons

   //  there are 2 X's in a row at the
   //  boundary. nExomeSizeBeforeAddingComplement points to the 2nd X,
   //  and the +1 points to the first base in the complemented exome

   int n0ExomeStartPos = nExomeSizeBeforeAddingComplement + 1;

   int nMatch = 0;
   int nMismatch = 0;

   for( nRegion = aExomeRegion.length() - 1; nRegion >= 0; --nRegion ) {
      region* pRegion = aExomeRegion[ nRegion ];

      assert( soExome[ n0ExomeStartPos - 1 ] == 'X' );

      aExomeToHumanGenome0E.append( n0ExomeStartPos );
      aExomeToHumanGenomeChrName.append( pRegion->soChromosome_ + ".comp" );
      aExomeToHumanGenome1ChrPos.append( pRegion->nEnd_ );

      // sanity check--checks chr1 and chrY
      if ( ( pRegion->soChromosome_ == "chr1" ) &&
           ( pRegion->soChromosome_ != soCurrentChromosome ) ) {
         loadChromosome( pRegion->soChromosome_ );
         soCurrentChromosome = pRegion->soChromosome_;
      }
         
      
      if ( pRegion->soChromosome_ == soCurrentChromosome ) {
         if ( soExome[ n0ExomeStartPos ] == 
              cComplementBase( soChromosomeBases[ pRegion->nEnd_ ] ) ) {
            ++nMatch;
         }
         else {
            ++nMismatch;
            cerr << soExome[ n0ExomeStartPos ] << " " <<
               cComplementBase( soChromosomeBases[ pRegion->nEnd_ ] ) << endl;
         }
      }
      // end sanity check


      int nLength = pRegion->nEnd_ - pRegion->nStart_ + 1;

      n0ExomeStartPos += nLength;
      n0ExomeStartPos += 1; // for the "X"
   }

   assert( nMismatch == 0 );

   cerr << "match: " << nMatch << " mismatch: " << nMismatch << endl;
      


   // add a sentinel

   aExomeToHumanGenome0E.append( soExome.length() );
   aExomeToHumanGenomeChrName.append( "SENTINEL" );
   aExomeToHumanGenome1ChrPos.append( 1 );



   // calculate lowest possible score so we can figure out the
   // offset


   float fTotalLeast = 0.0;
   for( int nColumn = 0; nColumn < nScoreMatrixColumns; ++nColumn ) {
      
      float fLeast = fScoreMatrix[nColumn][0];
      for( int nBase = 1; nBase < 4; ++nBase ) {
         fLeast = MIN( fLeast, fScoreMatrix[nColumn][nBase] );
      }

      fTotalLeast += fLeast;
   }

   int nHistogramOffset = -( floor( fTotalLeast / fBinSize ) );

   cerr << "histogram offset = " << nHistogramOffset << endl;



   RWTValOrderedVector<int> aHistogram;
   aHistogram.soName_ = "histogram";


   FILE* pLocations = fopen( "locations.txt", "w" );
   if ( !pLocations ) {
      THROW_FILE_ERROR( "locations.txt" );
   }
   
   

   int nLastExomePos = soExome.length() - nScoreMatrixColumns;

   for( int nExomePos = 0; nExomePos <= nLastExomePos; ++nExomePos ) {


      // check if this has an X


      float fTotalScore = 0.0;
      
      bool bFoundX = false;
      int n;
      for( n = nScoreMatrixColumns - 1; n >= 0; --n ) {
         if ( soExome[ nExomePos + n ] == 'X' ) {
            bFoundX = true;
            break;
         }
         fTotalScore += 
            fScoreMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ];

      }

      if ( bFoundX ) continue;

      if ( fTotalScore >= fScoreThreshold ) {
         RWCString soPartOfExome = soExome( nExomePos, nScoreMatrixColumns );
         
         RWCString soChromosome;
         int n1ChrPos;
         
         convertExomePosToChromosome( nExomePos, soChromosome, n1ChrPos );

         printf( "exome position %d score %.1f %s %s %d\n",
                 nExomePos,
                 fTotalScore,
                 soPartOfExome.data(),  
                 soChromosome.data(),
                 n1ChrPos );

         RWCString soChromosomeWithoutComp= soChromosome;


         int n1ChrPosAtEndOfMotif = n1ChrPos;
         
         if ( soChromosomeWithoutComp.bEndsWithAndRemove( ".comp" ) ) {
            n1ChrPosAtEndOfMotif -= ( nSizeOfMotif - 1);
         }
         else {
            n1ChrPosAtEndOfMotif += ( nSizeOfMotif - 1 );
         }

         fprintf( pLocations, "%s %d\n", 
                  soChromosomeWithoutComp.data(),
                  n1ChrPosAtEndOfMotif );
      }

      
      int nBin = floor( fTotalScore / fBinSize ) + nHistogramOffset;

      assert( nBin >= 0 );

      while( nBin >= aHistogram.length() ) {
         aHistogram.append( 0 );
      }


      assert( nBin < aHistogram.length() );
      ++aHistogram[ nBin ];
   }


   fclose( pLocations );


   // print histogram
   int nCumulativeCounts = 0;
   for( int nBin = aHistogram.length() - 1; nBin >= 0; --nBin ) {

      int nBin2 = nBin - nHistogramOffset;

      float fUpperBoundOfBin = ( nBin2 + 1 ) * fBinSize;


      int nCountsThisBin = aHistogram[ nBin ];
      nCumulativeCounts += nCountsThisBin;

      //int nMeanDistanceBetweenMotifs = 2*nExomeSize / nCumulativeCounts;

      // skipping negative values
      if ( fUpperBoundOfBin < 0 ) continue;




      printf( "%6.1f <= x < %6.1f : %10s %15s\n",
              nBin2 * fBinSize,
              ( nBin2 + 1 ) * fBinSize,
              soAddCommas( nCountsThisBin ).data(),
              soAddCommas( nCumulativeCounts ).data() );
      //              soAddCommas( nMeanDistanceBetweenMotifs ).data() );
              

   }
    
   printf( "score                   counts in bin    cum counts     mean dist between motifs\n" );

   





   // so that stdout buffer is flushed
   fflush( stdout );
}





void miscProgram() {

   cerr << "in miscProgram" << endl;
   cerr.flush();

   //   run_100812();
   // run_100813();
   //run_100814();
   run_100820();


}
#endif