/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "rwcstring.h" #include "rwtptrvector.h" #include "rwtvalorderedvector.h" #include "assert.h" #include #include "consedParameters.h" #include "findAncestralTrees.h" // input: RWCString -- a string of bases for the species, which can // be acgtn, they must be in the order: PPan PTro HSap GGor PPyg MMul // output: an array of RWCString which has, in Newick format, the // most parsimonious const RWCString soAllowedBases( "acgt*" ); const int nNumberOfBases = 5; // how does this contain multiple paths? Shouldn't there be an array // for each of the 4 bases? Yes. class oneBaseOfNodeClass { public: int nMinNumberOfMutations_; RWTValOrderedVector aLeftBelowBases_; RWTValOrderedVector aRightBelowBases_; RWTValOrderedVector aMMulBases_; // only used for top ancestral // node public: oneBaseOfNodeClass() : nMinNumberOfMutations_( 0 ) {} }; class nodeClass { public: oneBaseOfNodeClass aOneBaseOfNodeClass_[nNumberOfBases ]; bool bUnknown_; // only applies to the bottom row public: nodeClass() : bUnknown_( false ) {} }; static char cIntegerToBase[nNumberOfBases]; static int nBaseToInteger[256]; static bool bBaseToIntegerAndIntegerToBaseSet = false; static void setBaseToIntegerAndIntegerToBase() { for( int n = 0; n < 256; ++n ) { nBaseToInteger[n] = -1; } nBaseToInteger['a'] = 0; nBaseToInteger['c'] = 1; nBaseToInteger['g'] = 2; nBaseToInteger['t'] = 3; nBaseToInteger['*'] = 4; nBaseToInteger['A'] = 0; nBaseToInteger['C'] = 1; nBaseToInteger['G'] = 2; nBaseToInteger['T'] = 3; cIntegerToBase[0] = 'a'; cIntegerToBase[1] = 'c'; cIntegerToBase[2] = 'g'; cIntegerToBase[3] = 't'; cIntegerToBase[4] = '*'; } // -1 since PPan and PTro go into same internal node, -1 because MMul // and PPyg go into same internal node, and -1 because index starts at // 0 static bool bFirstTimeFindAncestralTrees = true; static RWTPtrVector aInternalNodes( (size_t) nMaxInternalNode + 1, 0, "aInternalNodes" ); static RWTPtrVector aBottomRow( (size_t) nNumberOfSpecies, 0, "aBottomRow" ); static RWTValOrderedVector aTreesStatic; static void tracebackToGetTrees( const RWCString& soTreeSoFarLeft, const RWCString& soTreeSoFarRight, const int nCurrentInternalNode, const char cCurrentBaseAtCurrentInternalNode ) { nodeClass* pCurrentNode = aInternalNodes[ nCurrentInternalNode ]; oneBaseOfNodeClass& myOneBaseOfNodeClass = pCurrentNode->aOneBaseOfNodeClass_[ nBaseToInteger[ cCurrentBaseAtCurrentInternalNode ] ]; if ( nCurrentInternalNode == 0 ) { // case in which the descendant nodes are PPan and PTro nodeClass* pPPanBottomNode = aBottomRow[0]; nodeClass* pPTroBottomNode = aBottomRow[1]; for( int nPPanBase = 0; nPPanBase < myOneBaseOfNodeClass.aLeftBelowBases_.length(); ++nPPanBase ) { RWCString soPPanBase = myOneBaseOfNodeClass.aLeftBelowBases_[ nPPanBase ]; if ( pPPanBottomNode->bUnknown_ ) soPPanBase += "?"; for( int nPTroBase = 0; nPTroBase < myOneBaseOfNodeClass.aRightBelowBases_.length(); ++nPTroBase ) { RWCString soPTroBase = myOneBaseOfNodeClass.aRightBelowBases_[ nPTroBase ]; if ( pPTroBottomNode->bUnknown_ ) soPTroBase += "?"; RWCString soCompleteTree = soTreeSoFarLeft + RWCString( "(" ) + soPPanBase + RWCString( "," ) + soPTroBase + RWCString( ")" ) + soTreeSoFarRight; aTreesStatic.insert( soCompleteTree ); } } } // if ( nCurrentInternalNode == 0 ) { else if ( nCurrentInternalNode == nMaxInternalNode ) { // special case for top node nodeClass* pPPygBottomNode = aBottomRow[4]; nodeClass* pMMulBottomNode = aBottomRow[5]; for( int nMMulBase = 0; nMMulBase < myOneBaseOfNodeClass.aMMulBases_.length(); ++nMMulBase ) { RWCString soMMulBase = myOneBaseOfNodeClass.aMMulBases_[ nMMulBase ]; if ( pMMulBottomNode->bUnknown_ ) soMMulBase += "?"; for( int nLeftBase = 0; nLeftBase < myOneBaseOfNodeClass.aLeftBelowBases_.length(); ++nLeftBase ) { char cLeftBase = myOneBaseOfNodeClass.aLeftBelowBases_[ nLeftBase ]; for( int nPPygBase = 0; nPPygBase < myOneBaseOfNodeClass.aRightBelowBases_.length(); ++nPPygBase ) { RWCString soPPygBase = myOneBaseOfNodeClass.aRightBelowBases_[ nPPygBase ]; if ( pPPygBottomNode->bUnknown_ ) soPPygBase += "?"; RWCString soNewTreeSoFarLeft = soTreeSoFarLeft + RWCString( "(" ) + RWCString( (char) cLeftBase ); RWCString soNewTreeSoFarRight = RWCString( "," ) + soPPygBase + RWCString( "," ) + soMMulBase + RWCString( ")" ) + soTreeSoFarRight; tracebackToGetTrees( soNewTreeSoFarLeft, soNewTreeSoFarRight, nCurrentInternalNode - 1, cLeftBase ); } // for( int nPPygBase } // for( int nLeftBase } // for( int nMMulBase } // if ( nCurrentInternalNode < nMaxInternalNode ) { else else { // e.g., if internal node is 1, we will take 0th internal node // and the 2st bottom node int nBelowRightBottomNode = nCurrentInternalNode + 1; nodeClass* pBelowRightBottomNode = aBottomRow[ nBelowRightBottomNode ]; for( int nLeftBase = 0; nLeftBase < myOneBaseOfNodeClass.aLeftBelowBases_.length(); ++nLeftBase ) { char cLeftBase = myOneBaseOfNodeClass.aLeftBelowBases_[ nLeftBase ]; for( int nRightBase = 0; nRightBase < myOneBaseOfNodeClass.aRightBelowBases_.length(); ++nRightBase ) { RWCString soRightBase = myOneBaseOfNodeClass.aRightBelowBases_[ nRightBase ]; if ( pBelowRightBottomNode->bUnknown_ ) soRightBase += "?"; RWCString soNewTreeSoFarLeft = soTreeSoFarLeft + RWCString( "(" ) + RWCString( (char) cLeftBase ); RWCString soNewTreeSoFarRight = RWCString( "," ) + soRightBase + ")" + soTreeSoFarRight; tracebackToGetTrees( soNewTreeSoFarLeft, soNewTreeSoFarRight, nCurrentInternalNode - 1, cLeftBase ); } } } } // input: RWCString -- a string of bases for the species, which can // be acgtn, they must be in the order: PPan PTro HSap GGor PPyg MMul // output: an array of RWCString which has, in Newick format, the // most parsimonious void findAncestralTrees( const RWCString& soPresentDayBasesHumanInFront, RWTValOrderedVector& aTrees ) { RWCString soPresentDayBasesPPanInFront = soPresentDayBasesHumanInFront; // PPan soPresentDayBasesPPanInFront[0] = soPresentDayBasesHumanInFront[1]; // PTro soPresentDayBasesPPanInFront[1] = soPresentDayBasesHumanInFront[2]; // HSap soPresentDayBasesPPanInFront[2] = soPresentDayBasesHumanInFront[0]; if ( !bBaseToIntegerAndIntegerToBaseSet ) { setBaseToIntegerAndIntegerToBase(); bBaseToIntegerAndIntegerToBaseSet = true; } aTrees.clear(); aTreesStatic.clear(); if ( bFirstTimeFindAncestralTrees ) { bFirstTimeFindAncestralTrees = false; } else { aBottomRow.clearAndDestroy(); // aBottomRow[0] and aInternalNode[0] are the same, and thus // must not be deleted twice for( int nInternalNode = 1; nInternalNode < aInternalNodes.length(); ++nInternalNode ) { delete aInternalNodes[ nInternalNode ]; } aInternalNodes.clear(); } for( int nSpecies = 0; nSpecies < nNumberOfSpecies; ++nSpecies ) { char cBasePresentDay = soPresentDayBasesPPanInFront[ nSpecies ]; nodeClass* pNodeClass = new nodeClass(); if ( cBasePresentDay == 'n' || cBasePresentDay == '?' ) { pNodeClass->bUnknown_ = true; } // for each species, set a value for each base at that position. // If the base of a species is known (not an 'n'), then // set a large penalty for it to be anything other than the known // base, forcing the tree to end at that base. But if the // base is not known, then have no penalty for each possible // base at that position. The dynamic programming algorithm // will actually figure out what the base should be. for( int nBase = 0; nBase < nNumberOfBases; ++nBase ) { if ( pNodeClass->bUnknown_ ) { pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ = 0; } else if ( nBaseToInteger[ cBasePresentDay ] == nBase ) { pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ = 0; } else { pNodeClass->aOneBaseOfNodeClass_[nBase].nMinNumberOfMutations_ = 100; } // pNodeClass->aLeftBelowBases_ has length 0 // pNodeClass->aRightBelowBases_ has length 0 } aBottomRow[nSpecies] = pNodeClass; } for( int nInternalNode = 0; nInternalNode < aInternalNodes.length(); ++nInternalNode ) { // problem: the topology here is wrong, unless // the order of the bases is: // PPan PTro HSap GGor PPyg MMul // internal node 0 uses bottom nodes 0 and 1 (PPan and PTro) // internal node 1 uses internal node 0 and bottom node 2 (HSap) // internal node 2 uses internal node 1 and bottom node 3 (GGor) // internal node 3 uses internal node 2 and bottom node 4 (PPyg) and // bottom node 5 (MMul) RWTPtrOrderedVector aNodesToBeCombined( (size_t) 3 ); if ( nInternalNode == 0 ) { aNodesToBeCombined.insert( aBottomRow[0] ); aNodesToBeCombined.insert( aBottomRow[1] ); } else if ( nInternalNode == nMaxInternalNode ) { aNodesToBeCombined.insert( aInternalNodes[nInternalNode - 1] ); aNodesToBeCombined.insert( aBottomRow[nNumberOfSpecies - 2] ); // PPyg aNodesToBeCombined.insert( aBottomRow[nNumberOfSpecies - 1] ); // MMul } else { aNodesToBeCombined.insert( aInternalNodes[nInternalNode - 1] ); aNodesToBeCombined.insert( aBottomRow[nInternalNode + 1 ] ); } nodeClass* pCurrentNode = new nodeClass(); aInternalNodes[nInternalNode] = pCurrentNode; for( int nAncestralBase = 0; nAncestralBase < nNumberOfBases; ++nAncestralBase ) { // suppose that the internal node is base nAncestralBase. // How many mutations did it take to get there? int nLeastNumberOfMutationsFromAllNodesToBeCombined = 0; oneBaseOfNodeClass& myOneBaseOfNodeClass = pCurrentNode->aOneBaseOfNodeClass_[ nAncestralBase ]; for( int nNodeToBeCombined = 0; nNodeToBeCombined < aNodesToBeCombined.length(); ++nNodeToBeCombined ) { nodeClass* pNodeToBeCombined = aNodesToBeCombined[ nNodeToBeCombined ]; int nLeastNumberOfMutationsFromThisNodeToBeCombined = 10000; RWTValOrderedVector aHowToComeFromNodeToBeCombined; for( int nBelowBase = 0; nBelowBase < nNumberOfBases; ++nBelowBase ) { int nMutations = pNodeToBeCombined->aOneBaseOfNodeClass_[ nBelowBase ].nMinNumberOfMutations_; if ( nBelowBase != nAncestralBase ) { ++nMutations; } if ( nMutations < nLeastNumberOfMutationsFromThisNodeToBeCombined ) { nLeastNumberOfMutationsFromThisNodeToBeCombined = nMutations; aHowToComeFromNodeToBeCombined.clear(); aHowToComeFromNodeToBeCombined.insert( cIntegerToBase[nBelowBase] ); } else if ( nMutations == nLeastNumberOfMutationsFromThisNodeToBeCombined ) { aHowToComeFromNodeToBeCombined.insert( cIntegerToBase[nBelowBase] ); } } nLeastNumberOfMutationsFromAllNodesToBeCombined += nLeastNumberOfMutationsFromThisNodeToBeCombined; if ( nNodeToBeCombined == 0 ) { myOneBaseOfNodeClass.aLeftBelowBases_ = aHowToComeFromNodeToBeCombined; } else if ( nNodeToBeCombined == 1 ) { myOneBaseOfNodeClass.aRightBelowBases_ = aHowToComeFromNodeToBeCombined; } else if ( nNodeToBeCombined == 2 ) { myOneBaseOfNodeClass.aMMulBases_ = aHowToComeFromNodeToBeCombined; } else assert( false ); } // for( int nNodeToBeCombined = 0; nNodeToBeCombined < myOneBaseOfNodeClass.nMinNumberOfMutations_ = nLeastNumberOfMutationsFromAllNodesToBeCombined; } // for( int nAncestralBase = 0; } // for( int nInternalNode = 0; // now print the most parsimonious trees // to do that, determine which base (or bases) at the top internal // node are most parsimonious int nTopInternalNode = aInternalNodes.length() - 1; nodeClass* pTopInternalNode = aInternalNodes[ nTopInternalNode ]; int nLeastMutations = 1000; RWTValOrderedVector aBasesOfLeastMutations; for( int nTopBase = 0; nTopBase < nNumberOfBases; ++nTopBase ) { oneBaseOfNodeClass& myOneBaseOfNodeClass = pTopInternalNode->aOneBaseOfNodeClass_[ nTopBase ]; if ( nLeastMutations > myOneBaseOfNodeClass.nMinNumberOfMutations_ ) { aBasesOfLeastMutations.clear(); aBasesOfLeastMutations.insert( nTopBase ); nLeastMutations = myOneBaseOfNodeClass.nMinNumberOfMutations_; } else if ( nLeastMutations == myOneBaseOfNodeClass.nMinNumberOfMutations_ ) { aBasesOfLeastMutations.insert( nTopBase ); } } // so aBasesOfLeastMutations is the list of bases at the top // ancestral internal node that all have // nLeastMutations mutations, which is the fewest mutations // that can explain each of the soPresentDayBases RWCString soRightTreeSoFar = ""; int nCurrentInternalNode = aInternalNodes.length() - 1; for( int nBaseOfLeastMutations = 0; nBaseOfLeastMutations < aBasesOfLeastMutations.length(); ++nBaseOfLeastMutations ) { char cCurrentBase = cIntegerToBase[ aBasesOfLeastMutations[nBaseOfLeastMutations] ]; RWCString soLeftTreeSoFar( cCurrentBase ); tracebackToGetTrees( soLeftTreeSoFar, soRightTreeSoFar, nCurrentInternalNode, cCurrentBase ); // result from tracebackToGetTrees is appended onto aTreesStatic } aTrees = aTreesStatic; }