/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.                           
#                                                                           
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#   
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error prone and not simple which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
# 
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may takes months for you to notice these other side-effects
#   which may not seen connected at all.  It is not feasable for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/
#include    "rwcstring.h"
#include    "rwtptrvector.h"
#include    "rwtvalorderedvector.h"
#include    "assert.h"
#include    <stdio.h>

#include    "consedParameters.h"
#include    "findAncestralTrees.h"


// input:  RWCString -- a string of bases for the species, which can
//   be acgtn, they must be in the order: PPan PTro HSap GGor PPyg MMul
// output: an array of RWCString which has, in Newick format, the
//   most parsimonious 

const RWCString soAllowedBases( "acgt*" );
const int nNumberOfBases = 5;

// how does this contain multiple paths?  Shouldn't there be an array 
// for each of the 4 bases?  Yes.

class oneBaseOfNodeClass {
 public:
   int nMinNumberOfMutations_;
   RWTValOrderedVector<char> aLeftBelowBases_;
   RWTValOrderedVector<char> aRightBelowBases_;
   RWTValOrderedVector<char> aMMulBases_;  // only used for top ancestral
   // node
 public:
   oneBaseOfNodeClass() : nMinNumberOfMutations_( 0 ) {}
};


class nodeClass {
 public:
   oneBaseOfNodeClass aOneBaseOfNodeClass_[nNumberOfBases ];
   bool bUnknown_; // only applies to the bottom row
 public:
   nodeClass() : bUnknown_( false ) {}
};

static char cIntegerToBase[nNumberOfBases];
static int nBaseToInteger[256];
static bool bBaseToIntegerAndIntegerToBaseSet = false;

static void setBaseToIntegerAndIntegerToBase() {

   for( int n = 0; n < 256; ++n ) {
      nBaseToInteger[n] = -1;
   }

   nBaseToInteger['a'] = 0;
   nBaseToInteger['c'] = 1;
   nBaseToInteger['g'] = 2;
   nBaseToInteger['t'] = 3;
   nBaseToInteger['*'] = 4;

   nBaseToInteger['A'] = 0;
   nBaseToInteger['C'] = 1;
   nBaseToInteger['G'] = 2;
   nBaseToInteger['T'] = 3;

   cIntegerToBase[0] = 'a';
   cIntegerToBase[1] = 'c';
   cIntegerToBase[2] = 'g';
   cIntegerToBase[3] = 't';
   cIntegerToBase[4] = '*';

}



// -1 since PPan and PTro go into same internal node, -1 because MMul
// and PPyg go into same internal node, and -1 because index starts at
// 0


static bool bFirstTimeFindAncestralTrees = true;

static RWTPtrVector<nodeClass> aInternalNodes( (size_t) nMaxInternalNode + 1, 
                                               0,
                                               "aInternalNodes" );

static RWTPtrVector<nodeClass> aBottomRow( (size_t) nNumberOfSpecies, 0,
                                    "aBottomRow" );

static RWTValOrderedVector<RWCString> aTreesStatic;



static void tracebackToGetTrees( const RWCString& soTreeSoFarLeft,
                           const RWCString& soTreeSoFarRight,
                           const int nCurrentInternalNode,
                           const char cCurrentBaseAtCurrentInternalNode ) {



   nodeClass* pCurrentNode = aInternalNodes[ nCurrentInternalNode ];


   oneBaseOfNodeClass& myOneBaseOfNodeClass =
      pCurrentNode->aOneBaseOfNodeClass_[ nBaseToInteger[ cCurrentBaseAtCurrentInternalNode ] ];


   if ( nCurrentInternalNode == 0 ) {
      // case in which the descendant nodes are PPan and PTro

      nodeClass* pPPanBottomNode = aBottomRow[0];
      nodeClass* pPTroBottomNode = aBottomRow[1];

      for( int nPPanBase = 0;
           nPPanBase < myOneBaseOfNodeClass.aLeftBelowBases_.length();
           ++nPPanBase ) {

         RWCString soPPanBase = 
            myOneBaseOfNodeClass.aLeftBelowBases_[ nPPanBase ];

         if ( pPPanBottomNode->bUnknown_ )
            soPPanBase += "?";


         for( int nPTroBase = 0;
              nPTroBase < myOneBaseOfNodeClass.aRightBelowBases_.length();
              ++nPTroBase ) {

            RWCString soPTroBase =
               myOneBaseOfNodeClass.aRightBelowBases_[ nPTroBase ];

            if ( pPTroBottomNode->bUnknown_ )
               soPTroBase += "?";

            RWCString soCompleteTree =
               soTreeSoFarLeft + 
               RWCString( "(" ) + 
               soPPanBase +
               RWCString( "," ) +
               soPTroBase +
               RWCString( ")" ) +
               soTreeSoFarRight;

            aTreesStatic.insert( soCompleteTree );

         }
      }
   } //    if ( nCurrentInternalNode == 0 ) {
   else if ( nCurrentInternalNode == nMaxInternalNode ) {
      // special case for top node

      nodeClass* pPPygBottomNode = aBottomRow[4];
      nodeClass* pMMulBottomNode = aBottomRow[5];

      for( int nMMulBase = 0;
           nMMulBase < myOneBaseOfNodeClass.aMMulBases_.length();
           ++nMMulBase ) {

         RWCString soMMulBase = myOneBaseOfNodeClass.aMMulBases_[ nMMulBase ];
         if ( pMMulBottomNode->bUnknown_ )
            soMMulBase += "?";
         

         for( int nLeftBase = 0; 
              nLeftBase < myOneBaseOfNodeClass.aLeftBelowBases_.length();
              ++nLeftBase ) {

            char cLeftBase =
               myOneBaseOfNodeClass.aLeftBelowBases_[ nLeftBase ];

            for( int nPPygBase = 0;
                 nPPygBase < myOneBaseOfNodeClass.aRightBelowBases_.length();
                 ++nPPygBase ) {

               RWCString soPPygBase = 
                  myOneBaseOfNodeClass.aRightBelowBases_[ nPPygBase ];

               if ( pPPygBottomNode->bUnknown_ )
                  soPPygBase += "?";

               RWCString soNewTreeSoFarLeft =
                  soTreeSoFarLeft + RWCString( "(" ) + RWCString( (char) cLeftBase );

               RWCString soNewTreeSoFarRight =
                  RWCString( "," ) +
                  soPPygBase +
                  RWCString( "," ) +
                  soMMulBase +
                  RWCString( ")" ) +
                  soTreeSoFarRight;

               tracebackToGetTrees( soNewTreeSoFarLeft,
                                    soNewTreeSoFarRight,
                                    nCurrentInternalNode - 1,
                                    cLeftBase );

            } // for( int nPPygBase
         } // for( int nLeftBase
      } // for( int nMMulBase
   } //   if ( nCurrentInternalNode < nMaxInternalNode ) { else
   else {

      // e.g., if internal node is 1, we will take 0th internal node 
      // and the 2st bottom node

      int nBelowRightBottomNode = nCurrentInternalNode + 1;

      nodeClass* pBelowRightBottomNode = aBottomRow[ nBelowRightBottomNode ];

      for( int nLeftBase = 0;
           nLeftBase < myOneBaseOfNodeClass.aLeftBelowBases_.length();
           ++nLeftBase ) {
      
         char cLeftBase =
            myOneBaseOfNodeClass.aLeftBelowBases_[ nLeftBase ];

         for( int nRightBase = 0;
              nRightBase < myOneBaseOfNodeClass.aRightBelowBases_.length();
              ++nRightBase ) {

            RWCString soRightBase =
               myOneBaseOfNodeClass.aRightBelowBases_[ nRightBase ];

            if ( pBelowRightBottomNode->bUnknown_ )
               soRightBase += "?";

            RWCString soNewTreeSoFarLeft =
               soTreeSoFarLeft + RWCString( "(" ) + RWCString( (char) cLeftBase );

            RWCString soNewTreeSoFarRight =
               RWCString( "," ) +
               soRightBase + ")" + soTreeSoFarRight;

            tracebackToGetTrees( soNewTreeSoFarLeft,
                                 soNewTreeSoFarRight,
                                 nCurrentInternalNode - 1,
                                 cLeftBase );
         }
      }
   }
                  




}






// input:  RWCString -- a string of bases for the species, which can
//   be acgtn, they must be in the order: PPan PTro HSap GGor PPyg MMul
// output: an array of RWCString which has, in Newick format, the
//   most parsimonious 



void findAncestralTrees( const RWCString& soPresentDayBasesHumanInFront,
                         RWTValOrderedVector<RWCString>& aTrees ) {


   RWCString soPresentDayBasesPPanInFront = soPresentDayBasesHumanInFront;
   
   // PPan
   soPresentDayBasesPPanInFront[0] = soPresentDayBasesHumanInFront[1];
   // PTro
   soPresentDayBasesPPanInFront[1] = soPresentDayBasesHumanInFront[2];
   // HSap
   soPresentDayBasesPPanInFront[2] = soPresentDayBasesHumanInFront[0];


   if ( !bBaseToIntegerAndIntegerToBaseSet ) {
      setBaseToIntegerAndIntegerToBase();
      bBaseToIntegerAndIntegerToBaseSet = true;
   }

   
   aTrees.clear();
   aTreesStatic.clear();

   if ( bFirstTimeFindAncestralTrees ) {
      bFirstTimeFindAncestralTrees = false;
   }
   else {
      aBottomRow.clearAndDestroy();

      // aBottomRow[0] and aInternalNode[0] are the same, and thus
      // must not be deleted twice
      for( int nInternalNode = 1; nInternalNode < aInternalNodes.length();
           ++nInternalNode ) {
         delete aInternalNodes[ nInternalNode ];
      }
      aInternalNodes.clear();

   }


   for( int nSpecies = 0; nSpecies < nNumberOfSpecies; ++nSpecies ) {
      char cBasePresentDay = soPresentDayBasesPPanInFront[ nSpecies ];

      nodeClass* pNodeClass = new nodeClass();

      if ( cBasePresentDay == 'n' || cBasePresentDay == '?' ) {
         pNodeClass->bUnknown_ = true;
      }


      // for each species, set a value for each base at that position.
      // If the base of a species is known (not an 'n'), then
      // set a large penalty for it to be anything other than the known
      // base, forcing the tree to end at that base.  But if the
      // base is not known, then have no penalty for each possible
      // base at that position.  The dynamic programming algorithm
      // will actually figure out what the base should be.


      for( int nBase = 0; nBase < nNumberOfBases; ++nBase ) {
         if ( pNodeClass->bUnknown_ ) {
            pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ = 0;
         }
         else if ( nBaseToInteger[ cBasePresentDay ] == nBase ) {
            pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ = 0;
         }
         else {
            pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ =
               100;
         }

         // pNodeClass->aLeftBelowBases_ has length 0
         // pNodeClass->aRightBelowBases_ has length 0
      }

      aBottomRow[nSpecies] = pNodeClass;
   }

   for( int nInternalNode = 0; nInternalNode < aInternalNodes.length();
        ++nInternalNode ) {

      // problem:  the topology here is wrong, unless
      // the order of the bases is:
      // PPan PTro HSap GGor PPyg MMul
      
      // internal node 0 uses bottom nodes 0 and 1 (PPan and PTro)
      // internal node 1 uses internal node 0 and bottom node 2 (HSap)
      // internal node 2 uses internal node 1 and bottom node 3 (GGor)
      // internal node 3 uses internal node 2 and bottom node 4 (PPyg) and
      //                                          bottom node 5 (MMul)

      RWTPtrOrderedVector<nodeClass> aNodesToBeCombined( (size_t) 3 );

      if ( nInternalNode == 0 ) {
         aNodesToBeCombined.insert( aBottomRow[0] );
         aNodesToBeCombined.insert( aBottomRow[1] );
      }
      else if ( nInternalNode == nMaxInternalNode ) {
         aNodesToBeCombined.insert( aInternalNodes[nInternalNode - 1] );
         aNodesToBeCombined.insert( aBottomRow[nNumberOfSpecies - 2] ); // PPyg
         aNodesToBeCombined.insert( aBottomRow[nNumberOfSpecies - 1] ); // MMul
      }
      else {
         aNodesToBeCombined.insert( aInternalNodes[nInternalNode - 1] );
         aNodesToBeCombined.insert( aBottomRow[nInternalNode + 1 ] );
      }


      nodeClass* pCurrentNode = new nodeClass();
      aInternalNodes[nInternalNode] = pCurrentNode;
      
      for( int nAncestralBase = 0; nAncestralBase < nNumberOfBases; ++nAncestralBase ) {

         // suppose that the internal node is base nAncestralBase.
         // How many mutations did it take to get there?


         int nLeastNumberOfMutationsFromAllNodesToBeCombined = 0;

         oneBaseOfNodeClass& myOneBaseOfNodeClass =
            pCurrentNode->aOneBaseOfNodeClass_[ nAncestralBase ];

         for( int nNodeToBeCombined = 0; nNodeToBeCombined <
                 aNodesToBeCombined.length(); ++nNodeToBeCombined ) {

            nodeClass* pNodeToBeCombined = 
               aNodesToBeCombined[ nNodeToBeCombined ];


            int nLeastNumberOfMutationsFromThisNodeToBeCombined = 10000;
            RWTValOrderedVector<char> aHowToComeFromNodeToBeCombined;

            for( int nBelowBase = 0; nBelowBase < nNumberOfBases; ++nBelowBase ) {
               int nMutations = pNodeToBeCombined->aOneBaseOfNodeClass_[ nBelowBase ].nMinNumberOfMutations_;

               if ( nBelowBase != nAncestralBase ) {
                  ++nMutations;
               }
               
               if ( nMutations < nLeastNumberOfMutationsFromThisNodeToBeCombined ) {
                  nLeastNumberOfMutationsFromThisNodeToBeCombined = nMutations;
                  aHowToComeFromNodeToBeCombined.clear();
                  aHowToComeFromNodeToBeCombined.insert( cIntegerToBase[nBelowBase] );
               }
               else if ( nMutations == nLeastNumberOfMutationsFromThisNodeToBeCombined ) {
                  aHowToComeFromNodeToBeCombined.insert( cIntegerToBase[nBelowBase] );
               }
            }

            nLeastNumberOfMutationsFromAllNodesToBeCombined +=
               nLeastNumberOfMutationsFromThisNodeToBeCombined;


            if ( nNodeToBeCombined == 0 ) {
               myOneBaseOfNodeClass.aLeftBelowBases_ = aHowToComeFromNodeToBeCombined;
            }
            else if ( nNodeToBeCombined == 1 ) {
               myOneBaseOfNodeClass.aRightBelowBases_ = aHowToComeFromNodeToBeCombined;
            }
            else if ( nNodeToBeCombined == 2 ) {
               myOneBaseOfNodeClass.aMMulBases_ = aHowToComeFromNodeToBeCombined;
            }
            else
               assert( false );
         } //   for( int nNodeToBeCombined = 0; nNodeToBeCombined <

         myOneBaseOfNodeClass.nMinNumberOfMutations_ =
            nLeastNumberOfMutationsFromAllNodesToBeCombined;

      } //  for( int nAncestralBase = 0; 
   } //     for( int nInternalNode = 0; 

   
   // now print the most parsimonious trees


   // to do that, determine which base (or bases) at the top internal
   // node are most parsimonious

   int nTopInternalNode = aInternalNodes.length() - 1;
   nodeClass* pTopInternalNode = aInternalNodes[ nTopInternalNode ];

   int nLeastMutations = 1000;
   RWTValOrderedVector<int> aBasesOfLeastMutations;
   for( int nTopBase = 0; nTopBase < nNumberOfBases; ++nTopBase ) {
      oneBaseOfNodeClass& myOneBaseOfNodeClass =
         pTopInternalNode->aOneBaseOfNodeClass_[ nTopBase ];

      if ( nLeastMutations > myOneBaseOfNodeClass.nMinNumberOfMutations_ ) {
         aBasesOfLeastMutations.clear();
         aBasesOfLeastMutations.insert( nTopBase );
         nLeastMutations = myOneBaseOfNodeClass.nMinNumberOfMutations_;
      }
      else if ( nLeastMutations == myOneBaseOfNodeClass.nMinNumberOfMutations_ ) {
         aBasesOfLeastMutations.insert( nTopBase );
      }
   }

   // so aBasesOfLeastMutations is the list of bases at the top 
   // ancestral internal node that all have
   // nLeastMutations mutations, which is the fewest mutations
   // that can explain each of the soPresentDayBases


   RWCString soRightTreeSoFar = "";
   int nCurrentInternalNode = aInternalNodes.length() - 1;
   for( int nBaseOfLeastMutations = 0; 
        nBaseOfLeastMutations < aBasesOfLeastMutations.length(); 
        ++nBaseOfLeastMutations ) {

      char cCurrentBase = 
         cIntegerToBase[ aBasesOfLeastMutations[nBaseOfLeastMutations] ];

      RWCString soLeftTreeSoFar( cCurrentBase );

      tracebackToGetTrees( soLeftTreeSoFar, soRightTreeSoFar,
                     nCurrentInternalNode, cCurrentBase );


      // result from tracebackToGetTrees is appended onto aTreesStatic

   }

   aTrees = aTreesStatic;

}