/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "geneClassifications.h"
#include    "cGetProtein.h"
#include    <stdio.h>
#include    "soLine.h"
#include    "rwctokenizer.h"
#include    "max.h"
#include    "bIntersect.h"
#include    "cComplementBase.h"
#include    "soGetDateTime.h"
#include    "soAddCommas.h"
#include    <stdlib.h>
#include    "assert.h"


const int cUTR = 'u';
const int cINTERGENIC = 'n';
const int cINTRON = 'i';
const int cSPLICE_SITE = 'p';
const int cSYN_CODING = 's';
const int cNONSYN_CODING = 'o';
const int cFROM_STOP_CODON = 'f';
const int cTO_STOP_CODON = 't';


#define szTranslateGeneType( CODE ) (  \
                                     ( CODE == cUTR ) ? "utr" : (( CODE == cINTERGENIC ) ? "intergenic" : (( CODE == cINTRON ) ? "intron" : ( CODE == cSPLICE_SITE ) ? "splice" : ((( CODE == cSYN_CODING ) ? "syn" : (( CODE == cNONSYN_CODING ) ? "nonsyn" : (( CODE == cFROM_STOP_CODON ) ? "from_stop" : (( CODE == cTO_STOP_CODON ) ? "to_stop" : "???" ))))))))


void geneClassifications :: doIt() {

   
   FileName filOutputFile = filGenomicLocations_.filReplaceExtension( "class" );

   pOutputFile_ = fopen( filOutputFile.data(), "w" );
   if ( !pOutputFile_ ) {
      THROW_FILE_ERROR( filOutputFile );
   }


   readKnownGene();


   readChromosomesFOF();

   readPhyloFOF();

   FILE* pGenomicLocations = fopen( filGenomicLocations_.data(), "r" );
   if ( !pGenomicLocations ) {
      THROW_FILE_ERROR( filGenomicLocations_ );
   }


   RWCString soChromosome( (size_t) 200 ); // sscanf is used on this
   RWCString soLastChromosome;
   RWCString soSnpLine( (size_t) nMaxLineSize );
   int n1ChrPos;
   RWCString soAlternativeAlleles( (size_t) 200 ); // sscanf is used on this

   while( 1 ) {
      if ( fgets( soSnpLine.data(), nMaxLineSize, pGenomicLocations ) 
           == NULL ) {
         soChromosome = "LAST";
      }
      else {
         soSnpLine.nCurrentLength_ = strlen( soSnpLine.data() );

         soSnpLine.stripTrailingWhitespaceFast();

         if ( soSnpLine.bIsNull() ) continue;

         parseCurrentLine( soSnpLine, soChromosome, n1ChrPos, soAlternativeAlleles );
      }

      if ( soChromosome != soLastChromosome ) {
         //         finishLastChromosome( );

         if ( soChromosome == "LAST" ) break;

         startNextChromosome( soChromosome );
         soLastChromosome = soChromosome;
      }

      processCurrentLine( soChromosome, n1ChrPos, soAlternativeAlleles );
   }


   fclose( pOutputFile_ );

   cerr << "see " <<  filOutputFile << endl;

}


void geneClassifications :: readChromosomesFOF() {

   FILE* pChromosomesFOF = fopen( filChromosomesFOF_.data(), "r" );
   if ( !pChromosomesFOF ) {
      THROW_FILE_ERROR( filChromosomesFOF_ );
   }


   while( fgets( soLine.data(), nMaxLineSize, pChromosomesFOF ) != NULL ) {
      soLine.nCurrentLength_  = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      aChromosomeFiles_.append( soLine );
   }

   fclose( pChromosomesFOF );
}


void geneClassifications :: readPhyloFOF() {

   if ( filPhyloFOF_.bIsNull() ) return;

   FILE* pPhyloFOF = fopen( filPhyloFOF_.data(), "r" );
   if ( !pPhyloFOF ) {
      THROW_FILE_ERROR( filPhyloFOF_ );
   }

   while( fgets( soLine.data(), nMaxLineSize, pPhyloFOF ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );

      soLine.stripTrailingWhitespaceFast();

      aPhyloFiles_.append( soLine );
   }

   fclose( pPhyloFOF );


}


void geneClassifications :: parseCurrentLine( const RWCString& soLine,
                                              RWCString& soChromosome,
                                              int& n1ChrPos,
                                              RWCString& soAlternativeAlleles ) {

   // assuming line looks like:
   // chr1 38383

   soChromosome = "                  "; // just so there is enough room

   sscanf( soLine.data(), "%s %d %s\n", soChromosome.data(), &n1ChrPos,
           soAlternativeAlleles.data() );

   soChromosome.nCurrentLength_ = strlen( soChromosome.data() );
   soAlternativeAlleles.nCurrentLength_ = strlen( soAlternativeAlleles.data() );


}


void geneClassifications :: startNextChromosome( const RWCString& soChromosome ) {

   RWCString soPattern = soChromosome + ".fa";

   RWCString soFoundChromosomeFile;
   int nChr;
   for( nChr = 0; nChr < aChromosomeFiles_.length(); ++nChr ) {
      RWCString soChromosomeFile = aChromosomeFiles_[ nChr ];
      if ( soChromosomeFile.bContains( soPattern ) ) {
         soFoundChromosomeFile = soChromosomeFile;
         break;
      }
   }

   cerr << "reading " << soFoundChromosomeFile << endl;
   cerr.flush();


   FILE* pChromosome = fopen( soFoundChromosomeFile.data(), "r" );
   if ( !pChromosome ) {
      THROW_FILE_ERROR( soFoundChromosomeFile );
   }

   assert( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL );

   assert( soLine[0] == '>' );

   soChromosomeBases_ = " "; // to make index 1-based
   while( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ) {
      soLine.nCurrentLength_ = strlen( soLine.data() );
      soLine.stripTrailingWhitespaceFast();
      soChromosomeBases_ += soLine;
   }

   fclose( pChromosome );

   cerr << "done" << endl;
   cerr.flush();


   if ( !filPhyloFOF_.bIsNull() ) {

      aConservationScores_.increaseButNotDecreaseMaxLength( soChromosomeBases_.length() + 1 );
      aConservationScores_.nCurrentLength_ = soChromosomeBases_.length() + 1;
      

      for( int n = 1; n <= soChromosomeBases_.length(); ++n ) {
         aConservationScores_[n] = -666.0;
      }

      // find Phylo file:

      RWCString soPhyloPattern = soChromosome + ".";

      FileName filFoundPhyloFile;
      for( nChr = 0; nChr < aPhyloFiles_.length(); ++nChr ) {
         if ( aPhyloFiles_[nChr].bContains( soPhyloPattern ) ) {
            filFoundPhyloFile = aPhyloFiles_[ nChr ];
            break;
         }
      }


      cerr << "reading " << filFoundPhyloFile << endl;
      cerr.flush();

      
      FILE* pPhylo = fopen( filFoundPhyloFile.data(), "r" );
      if ( !pPhylo ) {
         THROW_FILE_ERROR( filFoundPhyloFile );
      }

      char szChromosome[100];
      float fConsScore;
      int nNextChrPos;
      int nChrPos;
      while( fgets( soLine.data(), nMaxLineSize, pPhylo ) != NULL ) {
         //          soLine.nCurrentLength_ = strlen( soLine.data() );
         
         if ( sscanf( soLine.data(), "%f", &fConsScore ) == 1 ) {

            ++nChrPos;

            aConservationScores_[ nChrPos ] = fConsScore;
         }
         else if ( sscanf( soLine.data(), "fixedStep chrom=%s start=%d step=1",
                           szChromosome,
                           &nNextChrPos ) ) {

            assert( soChromosome == RWCString( szChromosome ) );

            nChrPos = nNextChrPos - 1;
         }
         else {
            soLine.nCurrentLength_ = strlen( soLine.data() );

            RWCString soError = "do not understand conservation line " +
               soLine;
            THROW_ERROR( soError );
         }
      }

      fclose( pPhylo );

      cerr << "done" << endl;
      cerr.flush();

   }

}

         
static int nCompareLocations( const RWCString& soChromosome,
                              const int n1ChrPos,
                              const RWCString& soKnownGeneLine ) {


   // looks like:
   // uc001aaa.2      chr1    +       1115    4121 
   // 0               1       2       3       4

   RWCTokenizer tok( soKnownGeneLine );
   tok();
   RWCString soKnownGeneChromosome = tok();
   tok(); // strand
   RWCString so0Start = tok(); // 0-based


   if ( soChromosome < soKnownGeneChromosome )
      return -1;
   else if ( soChromosome > soKnownGeneChromosome )
      return 1;
   else {
      int n1KnownGeneChrPos = atoi( so0Start.data() ) + 1;
      if ( n1ChrPos < n1KnownGeneChrPos )
         return -1;
      else if ( n1ChrPos > n1KnownGeneChrPos )
         return 1;
      else
         return 0;
   }
}

            
void geneClassifications :: readKnownGene() {
         
   FILE* pKnownGene = fopen( filKnownGene_.data(), "r" );
   if ( !pKnownGene ){
      THROW_FILE_ERROR( filKnownGene_ );
   }

   // this is in ctor
   aKnownGene_.resize( 70000 );

   const int nMaxKnownGeneLineSize = 1000000;
   RWCString soKnownGeneLine( (size_t) nMaxKnownGeneLineSize + 2 );

   int nMaxLengthSoFar = 0;

   while( fgets( soKnownGeneLine.data(), nMaxKnownGeneLineSize, pKnownGene ) != NULL ) {
      soKnownGeneLine.nCurrentLength_ = strlen( soKnownGeneLine.data() );

      if ( soKnownGeneLine.cGetLastChar() != '\n' ) {
         THROW_ERROR2( "line " + soKnownGeneLine + " didn't end with a cr" );
      }

      soKnownGeneLine.removeFinalCR();

      if ( nMaxLengthSoFar < soKnownGeneLine.length() ) {
         nMaxLengthSoFar = soKnownGeneLine.length();
      }

      aKnownGene_.insert( soKnownGeneLine );
      
   }

   printf( "longest line %d\n", nMaxLengthSoFar );
   
   fclose( pKnownGene );

   cerr << "done" << endl;
   cerr.flush();

   // check that knownGene is sorted

   for( int nKnownGene = 1; nKnownGene < aKnownGene_.length(); ++nKnownGene ) {
      RWCString soKnownGene1 = aKnownGene_[nKnownGene - 1];
      RWCString soKnownGene2 = aKnownGene_[nKnownGene];

      RWCTokenizer tok1( soKnownGene1 );
      RWCTokenizer tok2( soKnownGene2 );

      // looks like:
      // uc001aaa.2      chr1    +       1115    4121 
      // 0               1       2       3       4

      tok1('\t');
      tok2('\t');

      RWCString soChromosome1 = tok1('\t');
      RWCString soChromosome2 = tok2('\t');

      //      printf( "dddd %s\n", soChromosome1.data() );
      //      printf( "eeee %s\n", soChromosome2.data() );

      if ( soChromosome1 > soChromosome2 ) {
         printf( "known genes at index %d has %s then %s\n",
                 nKnownGene,
                 soChromosome1.data(),
                 soChromosome2.data() );
      }

      tok1('\t');
      tok2('\t');

      RWCString so0Start1 = tok1('\t');
      RWCString so0Start2 = tok2('\t');

      int n1ChrPos1 = atoi( so0Start1.data() ) + 1;


      if ( nCompareLocations( soChromosome1,
                              n1ChrPos1,
                              soKnownGene2 ) > 0 ) {
         printf( "genes out of order: %s %s %s %s\n",
                 soChromosome1.data(),
                 soChromosome2.data(),
                 so0Start1.data(),
                 so0Start2.data() );
      }
      

   } // for( int nKnownGene = 1;  

}

      
void geneClassifications :: processCurrentLine( const RWCString& soChromosome,
                                                const int n1ChrPos,
                                                const RWCString& soAlternativeAlleles ) {


   char cNormalAA = '?';
   char cSnpAA = '?';


   char cGeneType = cFindGeneTypeAllGenes( soChromosome, n1ChrPos, 
                                           soAlternativeAlleles,
                                           cNormalAA,
                                           cSnpAA );
      

   if ( filPhyloFOF_.bIsNull() ) {
      fprintf( pOutputFile_, "%s %d %s %s %c %c\n", soChromosome.data(), n1ChrPos, 
            soAlternativeAlleles.data(), 
            szTranslateGeneType( cGeneType ),
            cNormalAA,
            cSnpAA );
   }
   else {
   
      fprintf( pOutputFile_, "%s %d %s %s %c %c %.3f\n", soChromosome.data(), n1ChrPos, 
            soAlternativeAlleles.data(), 
            szTranslateGeneType( cGeneType ),
            cNormalAA,
            cSnpAA,
            aConservationScores_[ n1ChrPos ] );
   }

}


char geneClassifications :: cFindGeneTypeAllGenes( 
                                 const RWCString& soChromosome,
                                 const int n1ChrPos,
                                 const RWCString& soAlternativeAlleles,
                                 char& cNormalAA,
                                 char& cSnpAA ) {

   char cAlternativeAllele;

   // just so the compiler doesn't complain, make a copy
   RWCString soAlternativeAlleles2 = soAlternativeAlleles;

   if ( soAlternativeAlleles2.bStartsWith( "del" ) )
      cAlternativeAllele = 'a';
   else if ( soAlternativeAlleles2.bStartsWith( "ins" ) )
      cAlternativeAllele = 'a';
   else {
      cAlternativeAllele = soAlternativeAlleles2[0];
   }


   int nIndex = nFindBeginningOfGenesPossiblyOverlappingPos( soChromosome,
                                                             n1ChrPos );

   if ( nIndex == RW_NPOS ) {
      return cINTERGENIC;
   }

   char cGeneType = cINTERGENIC;

   for( ;  nIndex < aKnownGene_.length(); ++nIndex ) {

      if ( nCompareLocations( soChromosome, n1ChrPos, aKnownGene_[ nIndex ] )
           < 0 ) {
         // this gene is to the right of n1ChrPos so we need look at
         // no more genes
         break;
      }

      char cTempNormalAA;
      char cTempSnpAA;

      char cGeneTypeThisGene = 
         cFindGeneType( soChromosome, 
                        n1ChrPos, 
                        cAlternativeAllele,
                        aKnownGene_[ nIndex],  
                        cTempNormalAA,
                        cTempSnpAA );
      
      cGeneType = cCombineGeneTypes( cGeneType, cGeneTypeThisGene );

      // this check will determine if the current gene is the one
      // that dominates
      if ( cGeneType == cGeneTypeThisGene ) {

         cNormalAA = cTempNormalAA;
         cSnpAA = cTempSnpAA;
      }
   }

   return cGeneType;
}
      
   
int geneClassifications :: nFindBeginningOfGenesPossiblyOverlappingPos( 
         const RWCString& soChromosome,
         const int n1ChrPos ) {

   
   // largest gene is 2304634


   // ------ ---    -------- 
   //                     x position to find genes overlapping
   //     x position after subtracting nLargestGene
   

   const int nLargestGene = 2400000;
   int n1ChrPosToLookFor = MAX( 1, n1ChrPos - nLargestGene );


   int nIndex = nFindIndexOfMatchOrSuccessor( soChromosome, 
                                              n1ChrPosToLookFor );

   if ( nIndex != RW_NPOS ) {
      if ( nCompareLocations( soChromosome, n1ChrPosToLookFor,
                              aKnownGene_[ nIndex ] ) == 0 ) {

         // try backing up as far as can go and still have the 
         // 2 locations equal

         while( nIndex > 0 ) {
            --nIndex;
            
            if (  nCompareLocations( soChromosome, n1ChrPosToLookFor,
                              aKnownGene_[ nIndex ] ) != 0 ) {
               ++nIndex;
               break;
            }
         }
         // reach here either because nIndex == 0
         // or because backed up until the locations were different
         // and then ++nIndex so that the locations at the same


      }
   }


   return nIndex;
}


int geneClassifications :: nFindIndexOfMatchOrSuccessor( 
         const RWCString& soChromosome,
         const int n1ChrPosToLookFor ) {


   // this routine doesn't worry about whether the gene overlaps the
   // position or not--it is just simply concerned with finding the
   // first gene that starts at or after n1ChrPosToLookFor 

   if ( aKnownGene_.isEmpty() )
      return RW_NPOS;


   // these labels don't yet apply
   int nMaxIndex = aKnownGene_.length() - 1 ;


   // consider case in with last gene is too far left
   if ( nCompareLocations( soChromosome,
                           n1ChrPosToLookFor,
                           aKnownGene_[ nMaxIndex ] ) > 0 )
      return RW_NPOS;

   // if reached here,
   // . ----> (last gene)
   // or
   // . (n1ChrPosToLookFor
   // -----> (last gene)

   if ( nCompareLocations( soChromosome,
                           n1ChrPosToLookFor,
                           aKnownGene_[0] ) <= 0 ) 
      return 0;

   // if reached here, the gene at 0 must be to the left of the position:
   // ----->
   //  .
   // so the gene at 0 is too far to the left:

   int nTooSmallIndex = 0;

   while( true ) {
      
      if ( nMaxIndex - nTooSmallIndex <= 1 ) 
         return( nMaxIndex );
      else {
         int nTestIndex = ( nTooSmallIndex + nMaxIndex ) / 2;
         if ( nCompareLocations( soChromosome,
                                 n1ChrPosToLookFor,
                                 aKnownGene_[ nTestIndex ] ) > 0 )
            nTooSmallIndex = nTestIndex;
         else
            nMaxIndex = nTestIndex;
      }
   }
}


char geneClassifications :: cFindGeneType( 
         const RWCString& soChromosome, 
         const int n1ChrPos, 
         const char cAlternativeAllele,
         const RWCString& soKnownGeneLine,
         char& cNormalAA,
         char& cSnpAA ) {

//   `name` varchar(255) NOT NULL default '',
//   `chrom` varchar(255) NOT NULL default '',
//   `strand` char(1) NOT NULL default '',
//   `txStart` int(10) unsigned NOT NULL default '0',
//   `txEnd` int(10) unsigned NOT NULL default '0',
//   `cdsStart` int(10) unsigned NOT NULL default '0',
//   `cdsEnd` int(10) unsigned NOT NULL default '0',
//   `exonCount` int(10) unsigned NOT NULL default '0',
//   `exonStarts` longblob NOT NULL,
//   `exonEnds` longblob NOT NULL,
//   `proteinID` varchar(40) NOT NULL default '',
//   `alignID` varchar(255) NOT NULL default '',

   cNormalAA = '?';
   cSnpAA = '?';


   char cGeneType = cINTERGENIC;

   RWCTokenizer tok( soKnownGeneLine );

   tok(); // name
   RWCString soKnownGeneChromosome = tok();
   if ( soChromosome != soKnownGeneChromosome )
      return  cINTERGENIC;

   RWCString soStrand = tok();
   RWCString so0Start = tok();
   RWCString so1End = tok();
   RWCString so0CodingStart = tok();
   RWCString so1CodingEnd = tok();
   RWCString soNumberOfExons = tok();
   RWCString soExonStarts = tok();
   RWCString soExonEnds = tok();

   assert( !soExonEnds.bIsNull() );

   // check whether it is within the gene

   int n1GeneStart = atoi( so0Start.data() ) + 1;
   int n1GeneEnd = atoi( so1End.data() );


   if ( ( n1ChrPos < n1GeneStart  ) ||
        ( n1GeneEnd < n1ChrPos ) ) {
      return cINTERGENIC;
   }

   // now we know it is within an exon or intron

   // check whether it is in an exon or intron

   int nNumberOfExons = atoi( soNumberOfExons.data() );

   RWCTokenizer tokExonStarts( soExonStarts );
   RWCTokenizer tokExonEnds( soExonEnds );

   bool bIsInSpliceSite = false;
   bool bIsInExon = false;
   for( int nExon = 0; nExon < nNumberOfExons; ++nExon ) {
      RWCString so0ExonStart = tokExonStarts(',' );
      RWCString so1ExonEnd = tokExonEnds(',' );
      assert( !so1ExonEnd.bIsNull() );

      int n1ExonStart = atoi( so0ExonStart.data() ) + 1;
      int n1ExonEnd = atoi( so1ExonEnd.data() );

      // first exon    intron    last exon
      // ---------ssiiiiiiiiiiiss-----------
      //          ^^           ^^ look these places in introns
      //         ^n1ExonEnd
      //           ^ n1ExonEnd + 2
      //                      ^n1ExonStart -2
      //                         ^n1ExonStart
      
      
      if ( nExon != 0 ) {
         if ( ( ( n1ExonStart - 2 ) <=  n1ChrPos ) &&
              ( n1ChrPos < n1ExonStart ) ) {
            bIsInSpliceSite = true;
            break;
         }
      }

      if ( nExon != ( nNumberOfExons - 1 ) ) {
         if ( ( n1ExonEnd < n1ChrPos ) &&
              ( n1ChrPos <= ( n1ExonEnd + 2 ) ) ) {
            bIsInSpliceSite = true;
            break;
         }
      }


      if ( n1ChrPos < n1ExonStart ) {
         break;
      }
      

      if ( ( n1ExonStart <= n1ChrPos ) &&
           ( n1ChrPos <= n1ExonEnd ) ) {
         bIsInExon = true;
         break;
      }

   }

   if ( bIsInSpliceSite )
      return cSPLICE_SITE;


   if ( !bIsInExon ) 
      return  cINTRON;

   putExonsIntoAnArray( soExonStarts, soExonEnds, nNumberOfExons );


   n1ChrPosCodingRegionLeft_ = atoi( so0CodingStart.data() ) + 1;
   n1ChrPosCodingRegionRight_   = atoi( so1CodingEnd.data() );

   if ( ( n1ChrPos < n1ChrPosCodingRegionLeft_ ) ||
        ( n1ChrPosCodingRegionRight_ < n1ChrPos ) ) 
      return cUTR;

   
   // in coding region.  We need to find whether it is a synonomous
   // or nonsynonomous snp.

   // get the codons 


   int n1CodingPositionOfSnp = 0;
   bool bFoundCodingPositionOfSnp = false;

   if ( soStrand == "+" )
      bTopNotBottomStrand_ = true;
   else if ( soStrand == "-" )
      bTopNotBottomStrand_ = false;
   else
      assert( false );

   if ( bTopNotBottomStrand_ ) {
      
      for( int nExon = 0; nExon < aExonStarts_.length(); ++nExon ) {

         int n1ExonStart = aExonStarts_[ nExon ];
         int n1ExonEnd = aExonEnds_[ nExon ];

         int n1ChrPosCodingRegionExonStart;
         int n1ChrPosCodingRegionExonEnd;
         if ( !bIntersect( n1ExonStart,
                           n1ExonEnd,
                           n1ChrPosCodingRegionLeft_,
                           n1ChrPosCodingRegionRight_,
                           n1ChrPosCodingRegionExonStart,
                           n1ChrPosCodingRegionExonEnd ) ) {
            // this exon is totally within a utr
            continue;
         }


         if ( n1ChrPosCodingRegionExonEnd < n1ChrPos ) {
            n1CodingPositionOfSnp += 
               ( n1ChrPosCodingRegionExonEnd 
                 - n1ChrPosCodingRegionExonStart + 1 );
         }
         else {
            assert( n1ChrPosCodingRegionExonStart <= n1ChrPos );
            assert( n1ChrPos <= n1ChrPosCodingRegionExonEnd );

            n1CodingPositionOfSnp += 
               ( n1ChrPos - n1ChrPosCodingRegionExonStart + 1 );
            bFoundCodingPositionOfSnp = true;
            break;
         }
      }
   }
   else {


      for( int nExon = aExonStarts_.length() - 1; nExon >= 0; --nExon ) {
         int n1ExonStart = aExonStarts_[ nExon ];
         int n1ExonEnd = aExonEnds_[ nExon ];


         int n1ChrPosCodingRegionExonStart;
         int n1ChrPosCodingRegionExonEnd;

         if ( !bIntersect( n1ExonStart,
                           n1ExonEnd,
                           n1ChrPosCodingRegionLeft_,
                           n1ChrPosCodingRegionRight_,
                           n1ChrPosCodingRegionExonStart,
                           n1ChrPosCodingRegionExonEnd ) ) {
            // this exon is total within a utr
            continue;
         }

         
         // -------- -------- ----- ------- coding region parts of exons
         // n1ChrPos is somewhere here because we've
         // already established that it is in an exon
         // and within the coding region boundaries


         if ( n1ChrPos < n1ChrPosCodingRegionExonStart ) {
            // the snp is not within this exon

            n1CodingPositionOfSnp += 
               ( n1ChrPosCodingRegionExonEnd
                 - n1ChrPosCodingRegionExonStart + 1 );
         }
         else {

            assert( n1ChrPosCodingRegionExonStart <= n1ChrPos );
            assert( n1ChrPos <= n1ChrPosCodingRegionExonEnd );

            n1CodingPositionOfSnp +=
               ( n1ChrPosCodingRegionExonEnd - n1ChrPos + 1 );
            bFoundCodingPositionOfSnp = true;
            break;
         }
      }
   }


   assert( bFoundCodingPositionOfSnp );


   // now need to find the 2 other bases in the codon.  Do this
   // by first finding the coordinates in terms of coding region coordinates
   // of the 2 other bases and convert these to chromosome positions

   int nCodonPosition = n1CodingPositionOfSnp % 3;

   // nCodonPosition is thus 1, 2, or 0, with 0 being the 3rd codon
   // position

   int n1CodingPosition[2];

   if ( nCodonPosition == 1 ) {
      n1CodingPosition[0] = n1CodingPositionOfSnp + 1;
      n1CodingPosition[1] = n1CodingPositionOfSnp + 2;
   }
   else if ( nCodonPosition == 2 ) {
      n1CodingPosition[0] = n1CodingPositionOfSnp - 1;
      n1CodingPosition[1] = n1CodingPositionOfSnp + 1;
   }
   else if ( nCodonPosition == 0 ) {
      n1CodingPosition[0] = n1CodingPositionOfSnp - 2;
      n1CodingPosition[1] = n1CodingPositionOfSnp - 1;
   }
   else {
      assert( false );
   }


   int n1ChrPosOfOtherPositions[2];

   for( int n = 0; n < 2; ++n ) {

      n1ChrPosOfOtherPositions[n] = 
         nConvertCodingPositionToChrPos( n1CodingPosition[n] );

   }

   RWCString soNormalCodon = "   ";
   RWCString soSnpCodon    = "   ";

   char cRefAlleleBase = soChromosomeBases_[n1ChrPos];

   if ( nCodonPosition == 1 ) {
      soNormalCodon[0] = cRefAlleleBase;
      soNormalCodon[1] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ];
      soNormalCodon[2] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ];

      soSnpCodon = soNormalCodon;
      soSnpCodon[0] = cAlternativeAllele;


   }
   else if ( nCodonPosition == 2 ) {
      soNormalCodon[0] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ];
      soNormalCodon[1] = cRefAlleleBase;
      soNormalCodon[2] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ];

      soSnpCodon = soNormalCodon;
      soSnpCodon[1] = cAlternativeAllele;

   }
   else if ( nCodonPosition == 0 ) {
         
      soNormalCodon[0] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ];
      soNormalCodon[1] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ];
      soNormalCodon[2] = cRefAlleleBase;

      soSnpCodon = soNormalCodon;
      soSnpCodon[2] = cAlternativeAllele;

   }
   else
      assert( false );

   if ( soStrand == "-" ) {
      // I believe that we do not want to reverse-complement these codons,
      // but rather to just complement them.  The reason is that
      // they are already in bottom-strand orientation by virtue
      // of the way the codons are assembled above.  For example,
      // n1CodingPosition[0] < n1CodingPosition[1] always, but
      // for bottom strand genes, nConvertCodingPositionToChrPos
      // will reverse this ordering.  So the codon will be assembled
      // from right to left.  That is the way we want bottom strand
      // codons.  So we just need to complement them, but not reverse them.

      for( int n = 0; n < 3; ++n ) {
         soNormalCodon[n] = cComplementBase( soNormalCodon[n] );
         soSnpCodon[n]    = cComplementBase( soSnpCodon[n] );
      }

   }
   

   cNormalAA = cGetProtein( soNormalCodon[0], soNormalCodon[1], soNormalCodon[2] );
   cSnpAA    = cGetProtein( soSnpCodon[0], soSnpCodon[1], soSnpCodon[2] );


   if ( cNormalAA == cSnpAA ) {
      return cSYN_CODING;
   }
   else {
      if ( cNormalAA == '*' ) {
         return cFROM_STOP_CODON;
      }
      else if ( cSnpAA == '*' ) {
         return cTO_STOP_CODON;
      }
      else {
         return cNONSYN_CODING;
      }
   }
}


void geneClassifications :: putExonsIntoAnArray( const RWCString& soExonStarts,
                                                const RWCString& soExonEnds,
                                                const int nNumberOfExons ) {

   aExonStarts_.clear();
   aExonEnds_.clear();

   RWCTokenizer tokExonStarts( soExonStarts );
   RWCTokenizer tokExonEnds( soExonEnds );
   
   for( int nExon = 0; nExon < nNumberOfExons; ++nExon ) {

      RWCString so0ExonStart = tokExonStarts(',' );
      RWCString so1ExonEnd = tokExonEnds(',' );
      assert( !so1ExonEnd.bIsNull() );

      int n1ExonStart = atoi( so0ExonStart.data() ) + 1;
      int n1ExonEnd = atoi( so1ExonEnd.data() );
      
      aExonStarts_.append( n1ExonStart );
      aExonEnds_.append( n1ExonEnd );
   }

   assert( aExonStarts_.length() == nNumberOfExons );
}

      
int geneClassifications :: nConvertCodingPositionToChrPos( const int n1CodingPos ) {
   

   bool bFoundCodingPos = false;

   int nCodingBasesSoFar = 0;

   if ( bTopNotBottomStrand_ ) {
      
      for( int nExon = 0; nExon < aExonStarts_.length(); ++nExon ) {

         int n1ExonStart = aExonStarts_[ nExon ];
         int n1ExonEnd = aExonEnds_[ nExon ];

         int n1ChrPosCodingRegionExonStart;
         int n1ChrPosCodingRegionExonEnd;
         if ( !bIntersect( n1ExonStart,
                           n1ExonEnd,
                           n1ChrPosCodingRegionLeft_,
                           n1ChrPosCodingRegionRight_,
                           n1ChrPosCodingRegionExonStart,
                           n1ChrPosCodingRegionExonEnd ) ) {

            // this exon is totally within a utr
            continue;
         }
         
         int nCodingBasesIncludingThisExon =
            nCodingBasesSoFar + 
            ( n1ChrPosCodingRegionExonEnd 
              - n1ChrPosCodingRegionExonStart + 1 );

         if ( nCodingBasesIncludingThisExon < n1CodingPos ) {
            nCodingBasesSoFar = nCodingBasesIncludingThisExon;
         }
         else {
            assert( nCodingBasesSoFar < n1CodingPos );

            int nAdditionalBasesThisExon = n1CodingPos - nCodingBasesSoFar;
            return n1ChrPosCodingRegionExonStart + nAdditionalBasesThisExon - 1;
         }
      }

      // should never get here
      assert( false );
   }
   else {

      for( int nExon = aExonStarts_.length() - 1; nExon >= 0; --nExon ) {
         
         int n1ExonStart = aExonStarts_[ nExon ];
         int n1ExonEnd = aExonEnds_[ nExon ];

         int n1ChrPosCodingRegionExonStart;
         int n1ChrPosCodingRegionExonEnd;
         if ( !bIntersect( n1ExonStart,
                           n1ExonEnd,
                           n1ChrPosCodingRegionLeft_,
                           n1ChrPosCodingRegionRight_,
                           n1ChrPosCodingRegionExonStart,
                           n1ChrPosCodingRegionExonEnd ) ) {
            // this exon is totally within a utr
            continue;
         }

         int nCodingBasesIncludingThisExon =
            nCodingBasesSoFar +
            ( n1ChrPosCodingRegionExonEnd
              - n1ChrPosCodingRegionExonStart + 1 );

         if ( nCodingBasesIncludingThisExon < n1CodingPos ) {
            nCodingBasesSoFar = nCodingBasesIncludingThisExon;
         }
         else {
            assert( nCodingBasesSoFar < n1CodingPos );

            int nAdditionalBasesThisExon = n1CodingPos - nCodingBasesSoFar;
            
            return n1ChrPosCodingRegionExonEnd - nAdditionalBasesThisExon + 1;
         }
      }

      // should never get here
      assert( false );
   }
}

         
char geneClassifications :: cCombineGeneTypes( char cGeneType1, 
                                               char cGeneType2 ) {

   if ( ( cGeneType1 == cTO_STOP_CODON ) ||
        ( cGeneType2 == cTO_STOP_CODON ) )
      return cTO_STOP_CODON;

   if ( ( cGeneType1 == cFROM_STOP_CODON ) ||
        ( cGeneType2 == cFROM_STOP_CODON ) )
      return cFROM_STOP_CODON;

   // notice that a splice site is even higher priority
   // than a nonsynonomous coding substitution

   if ( ( cGeneType1 == cSPLICE_SITE ) ||
        ( cGeneType2 == cSPLICE_SITE ) )
      return cSPLICE_SITE;

   if ( ( cGeneType1 == cNONSYN_CODING ) ||
        ( cGeneType2 == cNONSYN_CODING ) )
      return cNONSYN_CODING;

   if ( ( cGeneType1 == cSYN_CODING ) ||
        ( cGeneType2 == cSYN_CODING ) )
      return cSYN_CODING;

   if ( ( cGeneType1 == cUTR ) ||
        ( cGeneType2 == cUTR ) )
      return cUTR;

   if ( ( cGeneType1 == cINTRON ) ||
        ( cGeneType2 == cINTRON ) )
      return cINTRON;

   return cINTERGENIC;

}