/*****************************************************************************
#  Copyright (C) 1994-2008 by David Gordon.
#  All rights reserved.
#
#  This software is part of a beta-test version of the Consed/Autofinish
#  package.  It should not be redistributed or
#  used for any commercial purpose, including commercially funded
#  sequencing, without written permission from the author and the
#  University of Washington.
#
#  This software is provided ``AS IS'' and any express or implied
#  warranties, including, but not limited to, the implied warranties of
#  merchantability and fitness for a particular purpose, are disclaimed.
#  In no event shall the authors or the University of Washington be
#  liable for any direct, indirect, incidental, special, exemplary, or
#  consequential damages (including, but not limited to, procurement of
#  substitute goods or services; loss of use, data, or profits; or
#  business interruption) however caused and on any theory of liability,
#  whether in contract, strict liability, or tort (including negligence
#  or otherwise) arising in any way out of the use of this software, even
#  if advised of the possibility of such damage.
#
#  Building Consed from source is error prone and not simple, which is
#  why I provide executables.  Due to time limitations I cannot
#  provide any assistance in building Consed.  Even if you do not
#  modify the source, you may introduce errors due to using a
#  different version of the compiler, a different version of motif,
#  different versions of other libraries than I used, etc.  For this
#  reason, if you discover Consed bugs, I can only offer help with
#  those bugs if you first reproduce those bugs with an executable
#  provided by me--not an executable you have built.
#
#  Modifying Consed is also difficult.  Although Consed is modular,
#  some modules are used by many other modules.  Thus making a change
#  in one place can have unforeseen effects on many other features.
#  It may take months for you to notice these other side-effects,
#  which may not seem connected at all.  It is not feasible for me to
#  provide help with modifying Consed sources because of the
#  potentially huge amount of time involved.
#
#*****************************************************************************/
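
// This module implements Contig::countAllMutationsML and its helpers: for
// each acceptably flanked consensus column it builds a "combined read" for
// each non-human species (PTro, PPan, GGor, PPyg, MMul), enumerates the
// possible ancestral trees for that column, weights each tree by a
// probability (see calculateTreeProbabilities), and accumulates per-branch
// counts of non-mutations, substitutions, insertions, deletions, and CpG
// deamination mutations, which are printed at the end in the
// "allMutations" block.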
#include "contig.h"
#include "consedParameters.h"
#include "locatedFragment.h"
#include "bIntersect.h"
#include "mbtValVectorOfBool.h"
#include "mbtValVector.h"
#include "rwtptrorderedvector.h"
#include "ntide.h"
#include "fileDefines.h"
#include "soAddCommas.h"
#include "rwctokenizer.h"
#include "autoReport.h"
#include "consed.h"
#include "assembly.h"
#include "nNumberFromBase.h"
#include "setErrorRateFromQuality.h"
#include "dErrorRateFromQuality.h"
#include "singletInfo.h"
#include "readPHDAgainForHighQualitySegment.h"
#include
using namespace std;
#include
#include
#include "rwcregexp.h"
#include "getAllSinglets.h"
#include "mbt_val_ord_offset_vec.h"
#include "soLine.h"
#include "max.h"
#include
#include "findAncestralTrees.h"
#include "transitionsTransversions.h"

static int nEstimatedTotalMutations = 0;
static int nConsPosConsidered = 0;

// const int n256 = 256;  already defined in transitionsTransversions.h

static RWTValOrderedVector<RWCString> aArrayOfNamesOfBranches;
static RWTValOrderedVector<float> aArrayOfNumberOfNonMutationsAtBranch;
static RWTValOrderedVector<float> aArrayOfNumberOfDeaminationMutationsAtBranch;
static RWTValOrderedVector<float> aArrayOfNumberOfInsertionMutationsAtBranch;
static RWTValOrderedVector<float> aArrayOfNumberOfDeletionMutationsAtBranch;
static RWTValOrderedVector<float>* pArraysAtBranch[n256][n256];

typedef RWTValVector<float> typeArrayOfNucleotideCounts;

static RWTPtrVector<typeArrayOfNucleotideCounts>
    aArrayOfInternalNodes( nMaxInternalNode + 1,
                           NULL,
                           "aArrayOfInternalNodes" );

static int nBaseToIndex[n256];

static const int nOnBackbone = 1;
static const int nDownToRight = 2;

static int nGetNumberOfDistinctBasesNotNs( const RWCString& soBases ) {

    RWCString soBases2( soBases );
    soBases2.removeSingleCharacterAllOccurrences( 'n' );

    // check how many bases remain
    const int nBaseTypes = 5;
    bool bBases[nBaseTypes];
    int n;
    for( n =0; n ( 10, // array size
                        soName ) ; } } }

    // now consider indels
    int nBase;
    for( nBase = 0; nBase < soBasesForSubstitutions.length(); ++nBase ) {
        char cBase = soBasesForSubstitutions[ nBase ];
        pArraysAtBranch[ cBase ][ '*' ] = &aArrayOfNumberOfDeletionMutationsAtBranch;
        pArraysAtBranch[ '*' ][ cBase ] = &aArrayOfNumberOfInsertionMutationsAtBranch;
    }

    // what about ['*']['*'] ?
    // This will not count as anything

    for( int nInternalNode2 = 0; nInternalNode2 <= (nMaxInternalNode + 1 ); ++nInternalNode2 ) {

        int nInternalNode = nInternalNode2;
        if ( nInternalNode == ( nMaxInternalNode + 1 ) ) {
            nInternalNode = -1;
        }

        for( int nBranchBelowInternalNode = 0; nBranchBelowInternalNode <= 1; ++nBranchBelowInternalNode ) {

            // just a trick so that there is only one MMul branch
            if ( nInternalNode == -1 && nBranchBelowInternalNode == 1 ) continue;

            int nOnBackboneOrDownToRight;
            if ( nBranchBelowInternalNode == 0 )
                nOnBackboneOrDownToRight = nOnBackbone;
            else
                nOnBackboneOrDownToRight = nDownToRight;

            aArrayOfNamesOfBranches.insert( soGetBranchNameWithoutTree( nInternalNode,
                                                                        nOnBackboneOrDownToRight ) );
            aArrayOfNumberOfNonMutationsAtBranch.insert( 0.0 );
            aArrayOfNumberOfDeaminationMutationsAtBranch.insert( 0.0 );
            aArrayOfNumberOfInsertionMutationsAtBranch.insert( 0.0 );
            aArrayOfNumberOfDeletionMutationsAtBranch.insert( 0.0 );

            for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) {
                char cLeft = soBasesForSubstitutions[ nLeft ];
                for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) {
                    if ( nRight == nLeft ) continue;
                    char cRight = soBasesForSubstitutions[ nRight ];
                    ( pArraysAtBranch[ cLeft ][ cRight ])->insert( 0.0 );
                }
            }
        } // for( int nBranchBelowInternalNode = 0; ...
    } // for( int nInternalNode2 = 0; nInternalNode2 <= (nMaxInternalNode +

    for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) {
        RWCString soName = "aArrayOfInternalNodes nInternalNode = " + RWCString( (long) nInternalNode );
        aArrayOfInternalNodes[ nInternalNode ] =
            new RWTValVector<float>( soBasesWithPad.length(), // capacity: acgt*
                                     0.0, // initial value
                                     soName );
    }

    int n;
    for( n = 0; n < n256; ++n ) {
        nBaseToIndex[n] = -1;
    }

    nBaseToIndex['a'] = 0;
    nBaseToIndex['c'] = 1;
    nBaseToIndex['g'] = 2;
    nBaseToIndex['t'] = 3;
    nBaseToIndex['*'] = 4;

} // static void initializeArrays() {

const unsigned char ucPTro = 1;
const unsigned char ucPPan = 2;
const unsigned char ucGGor = 4;
const unsigned char ucPPyg = 8;
const unsigned char ucMMul = 16;

static unsigned char ucGetMaskForSpecies( RWCString& soSpecies ) {

    unsigned char ucSpeciesMask;
    if ( soSpecies == "PTro" ) {
        ucSpeciesMask = ucPTro;
    }
    else if ( soSpecies == "PPan" ) {
        ucSpeciesMask = ucPPan;
    }
    else if ( soSpecies == "GGor" ) {
        ucSpeciesMask = ucGGor;
    }
    else if ( soSpecies == "PPyg" ) {
        ucSpeciesMask = ucPPyg;
    }
    else if ( soSpecies == "MMul" ) {
        ucSpeciesMask = ucMMul;
    }
    else {
        ucSpeciesMask = 0;
    }

    return ucSpeciesMask;
}

const unsigned char ucPTroOrPPan = ucPTro | ucPPan;
const unsigned char ucPPygOrMMul = ucPPyg | ucMMul;

static bool bAllNeededSpecies( unsigned char ucSpeciesMask ) {

    if ( ( ucSpeciesMask & ucPTroOrPPan ) &&
         ( ucSpeciesMask & ucPPygOrMMul ) &&
         ( ucSpeciesMask & ucGGor ) )
        return true;
    else
        return false;
}

static bool bNoMutationThisColumn( const RWCString& soBases ) {

    char cBase = 0;
    for( int n = 0; n < soBases.length(); ++n ) {
        if ( soBases[n] == 'n' ) continue;

        if ( !cBase ) {
            cBase = soBases[n];
        }
        else {
            if ( cBase != soBases[n] ) {
                return false;
            }
        }
    }

    return true;
}
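
// Layout of the linearized tree strings used by the routines below (see the
// diagrams in the comments of checkTreesAndRecordDeaminationMutationsML):
// once the '(' ')' ',' and '?' characters are stripped, string positions
// 0-3 are the internal (ancestral) nodes from the root down, position 4 is
// PPan, and positions 5-9 are PTro, HSap, GGor, PPyg and MMul.  The
// present-day base paired with internal node N on its down-to-right branch
// is at position nSubtractInternalNodeFromThisToGetPresentDayIndex - N, and
// node 0 additionally has the MMul branch at the position one past that.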
static RWCString soGetTreeWithNoMutation( const RWCString soPresentDayBasesHumanInFront ) {

    RWCString soPresentDayBasesPPanInFront = soPresentDayBasesHumanInFront;

    // PPan
    soPresentDayBasesPPanInFront[0] = soPresentDayBasesHumanInFront[1];
    // PTro
    soPresentDayBasesPPanInFront[1] = soPresentDayBasesHumanInFront[2];
    // HSap
    soPresentDayBasesPPanInFront[2] = soPresentDayBasesHumanInFront[0];

    // find base
    char cBase = 0;
    int n;
    for( n = 0; n < soPresentDayBasesPPanInFront.length(); ++n ) {
        if ( soPresentDayBasesPPanInFront[n] != 'n' ) {
            cBase = soPresentDayBasesPPanInFront[n];
            break;
        }
    }
    assert( cBase );

    // trees look like:
    // g(g(t(t(t(t,t),t),t),g?),g)

    RWCString soTreeLeft;
    RWCString soTreeRight;

    for( n = soPresentDayBasesPPanInFront.length() - 1; n >= 1; --n ) {
        soTreeLeft += RWCString( cBase );
        soTreeLeft += "(";
        soTreeRight = "," + RWCString( cBase ) +
            ( soPresentDayBasesPPanInFront[n] == 'n' ? "?" : "" ) +
            ")" + soTreeRight;
    }

    // we have to handle PPan separately from the loop above because it
    // might have a "?" next to it

    RWCString soTree = soTreeLeft + RWCString( cBase ) +
        ( soPresentDayBasesPPanInFront[0] == 'n' ? "?" : "" ) +
        soTreeRight;

    return( soTree );
}

static float fGetProbabilityOfDeaminationMutation(
    const int nInternalNode,
    const int nOnBackboneOrDownToRight,
    const int nTree,
    mbtValOrderedVectorOfRWCString* pArrayOfDeaminationMutations,
    RWTValOrderedVector<float>* pArrayOfProbabilitiesOfDeaminationMutations ) {

    if ( !pArrayOfDeaminationMutations ) return( 0.0 );

    RWCString soBranchNameWithTree = soGetBranchNameWithTree( nInternalNode,
                                                              nOnBackboneOrDownToRight,
                                                              nTree );

    int nIndexOfDeaminationMutation = pArrayOfDeaminationMutations->index( soBranchNameWithTree );
    if ( nIndexOfDeaminationMutation == RW_NPOS ) return( 0.0 );

    assert( pArrayOfProbabilitiesOfDeaminationMutations );

    return( (*pArrayOfProbabilitiesOfDeaminationMutations)[ nIndexOfDeaminationMutation ] );
}

static void countNucleotidesAtInternalNodes( RWTValOrderedVector<RWCString>* pTrees ) {

    for( int nTree = 0; nTree < pTrees->length(); ++nTree ) {

        RWCString soTree = (*pTrees)[ nTree ];

        soTree.removeSingleCharacterAllOccurrences( '?' );
        soTree.removeSingleCharacterAllOccurrences( '(' );
        soTree.removeSingleCharacterAllOccurrences( ')' );
        soTree.removeSingleCharacterAllOccurrences( ',' );

        // now look for mutations
        // nInternal node is the string offset and also that used in
        // soGetBranchNameWithoutTree but without using -1

        for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) {

            // notice that "nInternalNode" refers to the string position
            // within the tree.  So 0 is the top node and 4 is the node
            // going to PPan and PTro

            char cAncestralBase = soTree[ nInternalNode ];

            RWTValVector<float>* pArrayOfCounts = aArrayOfInternalNodes[ nInternalNode ];

            int nBaseIndex = nBaseToIndex[ cAncestralBase ];
            if ( nBaseIndex != -1 ) {
                // will need to be changed for maximum likelihood model
                (*pArrayOfCounts)[ nBaseIndex ] += 1.0 / pTrees->length();
            }
        }
    }
}

const int nNumberOfBranches = 9;

// note: this is the probability of a particular substitution
// that is a transversion (e.g., g->t) --it is not the probability of
// a transversion (the sum of the probs of g->t, t->g, ...
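
// The nine branch indices used below appear to correspond to the branches
// whose rates are assigned in setBranchProbabilities(): PPan_ex, PTro_ex,
// HSap_in, HSap_ex, GGor_in, GGor_ex, PPyg_in, PPyg_ex and MMul_ex
// (presumably "_in" = internal backbone branch, "_ex" = external branch to
// a present-day species).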
static RWTValVector aTransversionRateByBranch( nNumberOfBranches, 0.0, // initial "aTransversionRateByBranch" ); static RWTValVector aTransitionRateByBranch( nNumberOfBranches, 0.0, // initial "aTransitionRateByBranch" ); static RWTValVector aInsertionRateByBranch( nNumberOfBranches, 0.0, // initial "aInsertionRateByBranch" ); static RWTValVector aDeletionRateByBranch( nNumberOfBranches, 0.0, // initial "aDeletionRateByBranch" ); static RWTValVector aSameRateByBranch( nNumberOfBranches, 0.0, //initial "aSameRateByBranch" ); static RWTValVector aNotInsertionRateByBranch( nNumberOfBranches, 0.0, // initial "aNonsenseRateByBranch" ); static RWTValVector* pWhichArrayForPairOfBases[n256][n256]; static void setBranchProbabilities() { RWCString soValid( "acgt*" ); RWCString soBases( "acgt" ); for( int n1 = 0; n1 < n256; ++n1 ) { char c1 = n1; for( int n2 = 0; n2 < n256; ++n2 ) { char c2 = n2; if ( soValid.index( c1 ) == RW_NPOS || soValid.index( c2 ) == RW_NPOS ) { pWhichArrayForPairOfBases[n1][n2] = 0; } else if ( c1 == '*' && c2 != '*' ) { pWhichArrayForPairOfBases[n1][n2] = &aInsertionRateByBranch; } else if ( c1 != '*' && c2 == '*' ) { pWhichArrayForPairOfBases[n1][n2] = &aDeletionRateByBranch; } else if ( c1 == '*' && c2 == '*' ) { pWhichArrayForPairOfBases[n1][n2] = &aNotInsertionRateByBranch; } else if ( n1 == n2 ) { pWhichArrayForPairOfBases[n1][n2] = &aSameRateByBranch; } else { if ( cTransitionsTransversions[n1][n2] == 'i' ) { pWhichArrayForPairOfBases[n1][n2] = &aTransitionRateByBranch; } else if ( cTransitionsTransversions[n1][n2] == 'v' ) { pWhichArrayForPairOfBases[n1][n2] = &aTransversionRateByBranch; } } } // for( int n2 ... } // for( int n1 ... // now let's set the actual numbers int nBranchIndex; float fTotal; nBranchIndex = nGetBranchIndexFromName( "PPan_ex" ); fTotal = 292999.3; aInsertionRateByBranch[ nBranchIndex ] = 21.4 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 40.6 / fTotal; aSameRateByBranch[ nBranchIndex ] = 292525.7 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 218.6 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 159.7 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "PTro_ex" ); fTotal = 292999.3; aInsertionRateByBranch[ nBranchIndex ] = 36.5 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 41.9 / fTotal; aSameRateByBranch[ nBranchIndex ] = 292519.8 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 226.8 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 163.8 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "HSap_in" ); fTotal = 293024.1; aInsertionRateByBranch[ nBranchIndex ] = 49.7 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 74.5 / fTotal; aSameRateByBranch[ nBranchIndex ] = 291906.5 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 573.5 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 352.8 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "HSap_ex" ); fTotal = 293024.1; aInsertionRateByBranch[ nBranchIndex ] = 55.8 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 55.9 / fTotal; aSameRateByBranch[ nBranchIndex ] = 291475.7 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 853.1 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 487.0 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = 
nGetBranchIndexFromName( "GGor_in" ); fTotal = 293031.7; aInsertionRateByBranch[ nBranchIndex ] = 18.7 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 26.3 / fTotal; aSameRateByBranch[ nBranchIndex ] = 292494.9 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 295.6 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 153.1 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "GGor_ex" ); fTotal = 293031.6; aInsertionRateByBranch[ nBranchIndex ] = 85.4 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 133.2 / fTotal; aSameRateByBranch[ nBranchIndex ] = 291000.5 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 1098.5 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 630.5 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "PPyg_in" ); fTotal = 293054.2; aInsertionRateByBranch[ nBranchIndex ] = 146.1 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 169.4 / fTotal; aSameRateByBranch[ nBranchIndex ] = 289815.1 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 1834.1 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 983.7 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "PPyg_ex" ); fTotal = 293054.2; aInsertionRateByBranch[ nBranchIndex ] = 127.1 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 193.4 / fTotal; aSameRateByBranch[ nBranchIndex ] = 289803.3 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 1852.1 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 939.3 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; nBranchIndex = nGetBranchIndexFromName( "MMul_ex" ); fTotal = 293054.3; aInsertionRateByBranch[ nBranchIndex ] = 348.6 / fTotal; aDeletionRateByBranch[ nBranchIndex ] = 380.9 / fTotal; aSameRateByBranch[ nBranchIndex ] = 285118.6 / fTotal; aTransitionRateByBranch[ nBranchIndex ] = 4609.2 / fTotal; aTransversionRateByBranch[ nBranchIndex ] = 2513.6 / fTotal; aNotInsertionRateByBranch[ nBranchIndex ] = 1.0 - aInsertionRateByBranch[ nBranchIndex ]; } static float fGetProbabilityOfOneTree( const RWCString& soTree2 ) { // sure getting tired of always removing all these characters // each time we need to computer-use them RWCString soTree( soTree2 ); // just so can change it soTree.removeSingleCharacterAllOccurrences('?'); soTree.removeSingleCharacterAllOccurrences('('); soTree.removeSingleCharacterAllOccurrences(')'); soTree.removeSingleCharacterAllOccurrences(','); // now compute the product of the branch lengths (where the // branch lengths are expressed as the probability of a mutation // per base position float fProduct = 1.0; for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. 
So 0 is the top node and 4 is the node // going to PPan and PTro char cAncestralBase = soTree[ nInternalNode ]; // branch down-to-right int nIndexOfPresentDayBase = nSubtractInternalNodeFromThisToGetPresentDayIndex - nInternalNode; char cPresentDayBase = soTree[ nIndexOfPresentDayBase ]; RWTValVector* pArrayOfProbsByBranch = pWhichArrayForPairOfBases[cAncestralBase][cPresentDayBase]; int nIndexOfBranchDownAndToRight = nGetIndexForBranch( nInternalNode, nDownToRight ); fProduct *= (*pArrayOfProbsByBranch)[ nIndexOfBranchDownAndToRight ]; // branch down-to-left char cNextAncestralBaseOrPPan = soTree[ nInternalNode + 1 ]; pArrayOfProbsByBranch = pWhichArrayForPairOfBases[cAncestralBase][cNextAncestralBaseOrPPan]; int nIndexOfBranchDownAndToLeft = nGetIndexForBranch( nInternalNode, nOnBackbone ); fProduct *= (*pArrayOfProbsByBranch)[ nIndexOfBranchDownAndToLeft ]; if ( nInternalNode == 0 ) { char cMMulBase = soTree[ nSubtractInternalNodeFromThisToGetPresentDayIndex + 1]; pArrayOfProbsByBranch = pWhichArrayForPairOfBases[cAncestralBase][cMMulBase]; int nIndexForMMulBranch = nGetIndexForBranch( -1, 0 ); fProduct *= (*pArrayOfProbsByBranch)[ nIndexForMMulBranch ]; } } // nInternalNode return( fProduct ); } static void calculateTreeProbabilities( RWTValOrderedVector* pTrees, RWTValOrderedVector*& pTreeProbabilities ) { RWTValOrderedVector aProbabilityOfTree( pTrees->length() ); float fSumOfProbabilities = 0.0; int nTree; for( nTree = 0; nTree < pTrees->length(); ++nTree ) { RWCString soTree = (*pTrees)[ nTree ]; // debug(leave)ging Jan 2008 don't know why the change but // leave it this way unless I'm working on this // float fProbOfOneTree = fGetProbabilityOfOneTree( soTree ); float fProbOfOneTree = 1.0; // end debug(leave)ging fSumOfProbabilities += fProbOfOneTree; aProbabilityOfTree.insert( fProbOfOneTree ); } // now convert these to normalized probabilities (add to 1) for( nTree = 0; nTree < pTrees->length(); ++nTree ) { RWCString soTree = (*pTrees)[ nTree ]; float fNormalizedProbability = aProbabilityOfTree[ nTree ] / fSumOfProbabilities; assert( pTreeProbabilities->length() == nTree ); pTreeProbabilities->insert( fNormalizedProbability ); } } void Contig :: countAllMutationsML( autoReport* pAutoReport ) { setTransitionsTransversionsTableIfNecessary(); initializeArrays(); setBranchProbabilities(); // this differs from printFlankedColumns3 in that it records the // trees and attempts to identify which mutations are deamination // mutations // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // the point of the look below is to set aSomeCombinedReadHasNoPad // and to set the single signal arrays 
(aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't have a human read (rare, and // pathological), ignore this contig. if ( !pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... 
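
// At this point aSomeCombinedReadHasNoPad is true wherever the human read or
// some species' combined read (the forward and reverse reads merged by
// bGetCombinedReadAtBase) has a non-pad base; columns that are pads in every
// combined read are removed next.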
// remove columns in which all combined reads have a pad // first do it to single signal arrays int nRead; for( nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." 
 ) ) {
                continue;
            }

            if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue;

            // check if this is a good read
            if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue;

            if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) {
                pForwardRead = pLocFrag;
            }
            else {
                pReverseRead = pLocFrag;
            }
        }

        // If we are in a contig that doesn't contain a human read,
        // ignore the contig
        if ( !pHumanRead ) return;

        // find single signal arrays
        MBTValOrderedOffsetVector<char>* pForwardReadSingleSignalArray = NULL;
        MBTValOrderedOffsetVector<char>* pReverseReadSingleSignalArray = NULL;

        for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) {
            if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) {
                pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ];
            }
            else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) {
                pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ];
            }
        }

        if ( pForwardRead ) {
            assert( pForwardReadSingleSignalArray );
        }
        if ( pReverseRead ) {
            assert( pReverseReadSingleSignalArray );
        }

        // at this point, we have whichever reads are available for that
        // species and their single-signal arrays

        unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies );

        for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) {

            // the consensus and human read should be equal
            assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) );

            char cCombinedBase = 0;
            int nCombinedQuality = -666;

            if ( !bGetCombinedReadAtBase( pForwardRead,
                                          pForwardReadSingleSignalArray,
                                          pReverseRead,
                                          pReverseReadSingleSignalArray,
                                          nConsPos,
                                          cCombinedBase,
                                          nCombinedQuality ) ) continue;

            pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ),
                                                   cCombinedBase );
            pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ),
                                                         nCombinedQuality );

            aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask;

            // agreeing pads do not count as flanking bases.  To be
            // sure, explicitly check for this.  (Even though we deleted
            // columns of pads, there might be some columns not removed
            // that have a pad in human and some species but not others.)

            if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) &&
                 !ntGetCons( nConsPos ).bIsPad() ) {
                aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask;
            }
        } // for( int nConsPos = nGetStartConsensusIndex();
    } // for( nSpecies = 0; nSpecies < ...
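
// Each entry of the two mask arrays set above is a bitwise OR of the
// per-species masks from ucGetMaskForSpecies() (ucPTro, ucPPan, ucGGor,
// ucPPyg, ucMMul), so bAllNeededSpecies() can test with a few AND operations
// whether a column is supported by at least one of PTro/PPan, at least one
// of PPyg/MMul, and by GGor.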
// now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns mbtValVector aSpeciesInAcceptableColumns( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + 4, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ) ) { // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ]; // now look at all columns between these flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not use the same column more than once if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies ) ) { unsigned char ucFlankedSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aSpeciesInAcceptableColumns[ nConsPos ] != 0 && ( ( aSpeciesInAcceptableColumns[ nConsPos ] & ucFlankedSpeciesAtThisColumn ) != aSpeciesInAcceptableColumns[ nConsPos ] ) ) continue; aSpeciesInAcceptableColumns[ nConsPos ] |= ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // } } // if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); // now used aSpeciesInAcceptableColumns to print out which // species will be used for each position int nArraySize = nGetEndConsensusIndex() - nGetStartConsensusIndex() + 1; RWTPtrVector aArrayOfTreesAtEachConsPos( nArraySize + 1, // + 1so we can use nConsPos (1-based) directly as an index 0, "aArrayOfTreesAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( nArraySize + 1, // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); typedef RWTValOrderedVector arrayOfFloat; RWTPtrVector aArrayOfProbabilitiesOfTreesAtEachConsPos( nArraySize + 1, // + 1 
so we can use nConsPos (1-based) directly as an index 0, "aArrayOfProbabilitiesOfTreesAtEachConsPos" ); for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! aSpeciesInAcceptableColumns[ nConsPos ] ) continue; RWCString soColumnOfBases( (size_t) 11 ); // 6 bases and 5 spaces soColumnOfBases = ntGetCons( nConsPos ).cGetBase(); // start with human for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); char cBase; if ( ucSpeciesMask & aSpeciesInAcceptableColumns[ nConsPos ] ) { // if reached here, soSpecies is a flanked species // which, at this columns, meets the criteria // so print out the base LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // always the same number of bases on each line cBase = 'n'; } soColumnOfBases += cBase; } // for( int nSpecies = 0; nSpecies < aArrayOfBasesAtEachConsPos[ nConsPos ] = soColumnOfBases; RWTValOrderedVector aTrees; findAncestralTrees( soColumnOfBases, aTrees ); arrayOfRWCString* pTrees = new RWTValOrderedVector( aTrees ); aArrayOfTreesAtEachConsPos[ nConsPos ] = pTrees; RWCString soName = "pTreesProbabilities at "; soName += RWCString( (long) nConsPos ); RWTValOrderedVector* pTreeProbabilities = new RWTValOrderedVector( pTrees->length(), soName ); calculateTreeProbabilities( pTrees, pTreeProbabilities ); aArrayOfProbabilitiesOfTreesAtEachConsPos[ nConsPos ] = pTreeProbabilities; } // for( nConsPos = nGetStartConsensusIndex(); // record all deamination mutations in these arrays RWTPtrVector aArrayOfDeaminationMutationsAtEachConsPos( nPaddedContigLength + 1, // so can // use nConsPos directly 0, // initial "aArrayOfDeaminationMutationsAtEachConsPos" ); typedef RWTValOrderedVector arrayOfFloat; RWTPtrVector aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos( nPaddedContigLength + 1, // so can use nConsPos directly 0, // initial value "aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos" ); for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aArrayOfTreesAtEachConsPos[ nConsPos - 1 ] ) continue; if ( !aArrayOfTreesAtEachConsPos[ nConsPos ] ) continue; checkTreesAndRecordDeaminationMutationsML( aArrayOfTreesAtEachConsPos[ nConsPos - 1 ], aArrayOfTreesAtEachConsPos[ nConsPos ], aArrayOfProbabilitiesOfTreesAtEachConsPos[ nConsPos - 1 ], aArrayOfProbabilitiesOfTreesAtEachConsPos[ nConsPos ], aArrayOfDeaminationMutationsAtEachConsPos[ nConsPos - 1 ], aArrayOfDeaminationMutationsAtEachConsPos[ nConsPos ], aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos - 1 ], aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos ] ); } const int nMaxNumberOfTrees = 50; int nTreeFrequency[nMaxNumberOfTrees + 1]; int n; for( n = 0; n <= nMaxNumberOfTrees; ++n ) { nTreeFrequency[n] = 0; } for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aArrayOfTreesAtEachConsPos[ nConsPos ] ) continue; countNucleotidesAtInternalNodes( aArrayOfTreesAtEachConsPos[ nConsPos ] ); // count number of trees, but only count this for // positions in which there is a mutation if ( nGetNumberOfDistinctBasesNotNs( aArrayOfBasesAtEachConsPos[ nConsPos ] ) > 1 ) { int nNumberOfTrees = aArrayOfTreesAtEachConsPos[ nConsPos ]->length(); if ( 
nNumberOfTrees > nMaxNumberOfTrees ) nNumberOfTrees = nMaxNumberOfTrees; ++nTreeFrequency[ nNumberOfTrees ]; } } // print out the frequency of trees fprintf( pAO, "tree frequencies:" ); for( n = 0; n <= nMaxNumberOfTrees; ++n ) { if ( nTreeFrequency[n] > 0 ) { fprintf( pAO, " %d : %d", n, nTreeFrequency[n] ); } } fprintf( pAO, "\n" ); for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aArrayOfTreesAtEachConsPos[ nConsPos ] ) continue; countMutationsAtConsPosML( aArrayOfTreesAtEachConsPos[ nConsPos ], aArrayOfBasesAtEachConsPos[ nConsPos ], aArrayOfProbabilitiesOfTreesAtEachConsPos[ nConsPos ], nConsPos, aArrayOfDeaminationMutationsAtEachConsPos[ nConsPos ], aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos ] ); } // print out estimate (sanity check) fprintf( pAO, "estimate: %d mutations out of %d positions\n", nEstimatedTotalMutations, nConsPosConsidered ); // now print out the counts for each branch fprintf( pAO, "allMutations {\n" ); // nInternalNode means the same as it does in soGetBranchNameWithoutTree for( int nInternalNode2 = 0; nInternalNode2 <= ( nMaxInternalNode + 1); ++nInternalNode2 ) { // fuss so that nInternalNode goes as 0, 1, 2, 3, -1 int nInternalNode = nInternalNode2; if ( nInternalNode2 == ( nMaxInternalNode + 1 ) ) { nInternalNode = -1; } int nRealInternalNode = ( nInternalNode == -1 ) ? 0 : nInternalNode; RWTValVector* pArrayOfNucleotideCounts = aArrayOfInternalNodes[ nRealInternalNode ]; for( int nBranchBelowInternalNode = 0; nBranchBelowInternalNode <= 1; ++nBranchBelowInternalNode ) { // just a trick so that there is only one MMul branch if ( nInternalNode == -1 && nBranchBelowInternalNode == 1 ) continue; int nOnBackboneOrDownToRight; if ( nBranchBelowInternalNode == 0 ) nOnBackboneOrDownToRight = nOnBackbone; else nOnBackboneOrDownToRight = nDownToRight; RWCString soBranchName = soGetBranchNameWithoutTree( nInternalNode, nOnBackboneOrDownToRight ); int nIndexOfBranch = nGetIndexForBranch( nInternalNode, nOnBackboneOrDownToRight ); assert( aArrayOfNamesOfBranches[ nIndexOfBranch ] == soBranchName ); float fTotalSubstitutions = 0.0; int nLeft; for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) { char cLeft = soBasesForSubstitutions[ nLeft ]; for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) { if ( nLeft == nRight ) continue; char cRight = soBasesForSubstitutions[ nRight ]; fTotalSubstitutions += (*(pArraysAtBranch[ cLeft ][ cRight ]))[ nIndexOfBranch ]; } } float fTotalAtBranch = aArrayOfNumberOfNonMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfDeletionMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfInsertionMutationsAtBranch[ nIndexOfBranch ] + fTotalSubstitutions; // to avoid division by zero if ( fTotalAtBranch == 0.0 ) { fTotalAtBranch = 0.1; } fprintf( pAO, "%s: same: %.2f subst: %.2f deam: %.2f ins: %.2f del: %.2f deam rate %.2f nondeam subst rate: %.2f %s %.1f\n", aArrayOfNamesOfBranches[ nIndexOfBranch ].data(), aArrayOfNumberOfNonMutationsAtBranch[ nIndexOfBranch ], fTotalSubstitutions, aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfInsertionMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfDeletionMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ] * 100.0 / fTotalAtBranch, fTotalSubstitutions * 100.0 / fTotalAtBranch, soGetName().data(), fTotalAtBranch ); fprintf( pAO, "nucleotides at 
source node:" ); for( int nSourceBase = 0; nSourceBase < soBasesWithPad.length(); ++nSourceBase ) { char c = soBasesWithPad[ nSourceBase ]; float fSourceBases = (*pArrayOfNucleotideCounts)[ nSourceBase ]; fprintf( pAO, " %c: %.3f", c, fSourceBases ); } fprintf( pAO, "\n" ); for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) { char cLeft = soBasesForSubstitutions[ nLeft ]; float fSourceBases = (*pArrayOfNucleotideCounts)[ nLeft ]; for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) { if ( nLeft == nRight ) continue; char cRight = soBasesForSubstitutions[ nRight ]; fprintf( pAO, " subst %c -> %c %.2f %c: %.1f\n", cLeft, cRight, (*(pArraysAtBranch[ cLeft ][ cRight ]))[ nIndexOfBranch ], cLeft, fSourceBases ); } } } // for( int nBranchBelowInternalNode = 0; } // for( int nInternalNode2 = 0; fprintf( pAO, "} allMutations\n" ); } static void addDeaminationMutation( const int nInternalNode, const int nDownToRightOrOnBackbone, const int nTree, const float fProbability, mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutations, RWTValOrderedVector*& pArrayOfProbsOfDeaminationMutations ) { if ( !pArrayOfDeaminationMutations ) { pArrayOfDeaminationMutations = new mbtValOrderedVectorOfRWCString( 5 ); // initial size pArrayOfProbsOfDeaminationMutations = new RWTValOrderedVector( 5, "pArrayOfProbsOfDeaminationMutations" ); } RWCString soBranchName = soGetBranchNameWithTree( nInternalNode, nDownToRightOrOnBackbone, nTree ); int nIndex = pArrayOfDeaminationMutations->index( soBranchName ); if ( nIndex == RW_NPOS ) { pArrayOfDeaminationMutations->insert( soBranchName ); pArrayOfProbsOfDeaminationMutations->insert( fProbability ); } else { (*pArrayOfProbsOfDeaminationMutations)[ nIndex ] += fProbability; } } void Contig :: checkTreesAndRecordDeaminationMutationsML( RWTValOrderedVector* pTreesPositionA, RWTValOrderedVector* pTreesPositionB, RWTValOrderedVector* pProbabilitiesOfTreesPositionA, RWTValOrderedVector* pProbabilitiesOfTreesPositionB, mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionA, mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionB, RWTValOrderedVector*& pArrayOfProbsOfDeaminationMutationsPositionA, RWTValOrderedVector*& pArrayOfProbsOfDeaminationMutationsPositionB ) { mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionA; mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionB; for( int nTreeA = 0; nTreeA < pTreesPositionA->length(); ++nTreeA ) { RWCString soTreeA = (*pTreesPositionA)[ nTreeA ]; float fProbabilityOfTreeA = (*pProbabilitiesOfTreesPositionA)[ nTreeA ]; // get rid of ? for now soTreeA.removeSingleCharacterAllOccurrences('?'); soTreeA.removeSingleCharacterAllOccurrences('('); soTreeA.removeSingleCharacterAllOccurrences(')'); soTreeA.removeSingleCharacterAllOccurrences(','); for( int nTreeB = 0; nTreeB < pTreesPositionB->length(); ++nTreeB ) { RWCString soTreeB = (*pTreesPositionB)[ nTreeB ]; float fProbabilityOfTreeB = (*pProbabilitiesOfTreesPositionB)[ nTreeB ]; soTreeB.removeSingleCharacterAllOccurrences( '?' ); soTreeB.removeSingleCharacterAllOccurrences( '(' ); soTreeB.removeSingleCharacterAllOccurrences( ')' ); soTreeB.removeSingleCharacterAllOccurrences( ',' ); // now we have a pair of trees, soTreeA and soTreeB // abcdef // ^top internal node, leading to MMul // ^ next internal node, leading to PPyg // ^ next internal node, leading to GGor // ^ next internal node, leading to HSap, I hope! // ^ next internal node, leading to PTro // ^ PPan // ^ PTro // ^ GGor // etc. 
// g(g(g(g(g,g),g),a),g?,g) // is changed to // gggggggagg // a(c(c(c(*,c),c),c),a,*) // is changed to: // accc*ccca* // ^ node leading to MMul and PPyg // ^ node leading to GGor // ^ node leading to HSap // ^ node leading to PTro and PPan // ^ PPan // ^ PTro // ^ HSap // ^ GGor // ^ PPyg // ^ MMul // so there are 4 ancestral nodes to be concerned about // it turns out (lucky?) that this nInternalNode actually // is the same as that in soGetBranchNameWithoutTree and // is also the string position in the tree. There is // no nInternalNode == -1 here, though. // 0 refers to the internal node where PPyg and MMul come together. for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 4 is the node // going to PPan and PTro RWCString soAncestralSequence = RWCString( soTreeA[ nInternalNode ] ) + RWCString( soTreeB[ nInternalNode ] ); if ( soAncestralSequence != "cg" ) continue; // if reached here, ancestral sequence is a CpG // is there a mutation from this position? // first, check if there is a mutation down-right to // the corresponding present-day node int nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex - nInternalNode; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { addDeaminationMutation( nInternalNode, nDownToRight, nTreeB, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionB, pArrayOfProbsOfDeaminationMutationsPositionB ); } else if ( soPresentDayBases == "tg" ) { addDeaminationMutation( nInternalNode, nDownToRight, nTreeA, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionA, pArrayOfProbsOfDeaminationMutationsPositionA ); } if ( nInternalNode == 0 ) { // case of top internal node, leading to both MMul and PPyg // we've already considered the branch leading to PPyg (above) // in which nIndexOfPresentDayBases = 8 // Now consider the branch leading to MMul, with // nIndexOfPresentDayBases = 9 nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex + 1; soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { addDeaminationMutation( -1, nDownToRight, nTreeB, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionB, pArrayOfProbsOfDeaminationMutationsPositionB ); } else if ( soPresentDayBases == "tg" ) { addDeaminationMutation( -1, nDownToRight, nTreeA, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionA, pArrayOfProbsOfDeaminationMutationsPositionA ); } } // if nInternalNode is 0-3, the next node is the // descendant internal node. 
if nInternalNode is // 4, the next node is PPan RWCString soDescendantInternalNode = RWCString( soTreeA[ nInternalNode + 1 ] ) + RWCString( soTreeB[ nInternalNode + 1 ] ); if ( soDescendantInternalNode == "ca" ) { addDeaminationMutation( nInternalNode, nOnBackbone, nTreeB, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionB, pArrayOfProbsOfDeaminationMutationsPositionB ); } else if ( soDescendantInternalNode == "tg" ) { addDeaminationMutation( nInternalNode, nOnBackbone, nTreeA, fProbabilityOfTreeA * fProbabilityOfTreeB, pArrayOfDeaminationMutationsPositionA, pArrayOfProbsOfDeaminationMutationsPositionA ); } } // for( int nInternalNode = 0; nInternalNode < 5; } // for( int nTreeB = 0; nTreeB < pTreesPositionB->length } // for( int nTreeA = 0; nTreeA < pTreesPositionA->length() } // void Contig :: checkTreesAndRecordDeaminationMutationsML( void Contig :: countMutationsAtConsPosML( RWTValOrderedVector* pTrees, const RWCString& soPresentDayBases, RWTValOrderedVector* pProbabilitiesOfTrees, const int nConsPos, mbtValOrderedVectorOfRWCString* pArrayOfDeaminationMutations, RWTValOrderedVector* pArrayOfProbabilitiesOfDeaminationMutations ) { float fRoughMutationCount = 0.0; assert( pTrees ); assert( pProbabilitiesOfTrees ); assert( pTrees->length() == pProbabilitiesOfTrees->length() ); mbtValOrderedVectorOfRWCString aListOfMutations; for( int nTree = 0; nTree < pTrees->length(); ++nTree ) { RWCString soTree = (*pTrees)[ nTree ]; float fProbabilityOfTree = (*pProbabilitiesOfTrees)[ nTree ]; soTree.removeSingleCharacterAllOccurrences( '?' ); soTree.removeSingleCharacterAllOccurrences( '(' ); soTree.removeSingleCharacterAllOccurrences( ')' ); soTree.removeSingleCharacterAllOccurrences( ',' ); // now look for mutations // nInternal node is the string offset and also that used in // soGetBranchNameWithoutTree but without using -1 for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 3 is the node // going to PPan and PTro char cAncestralBase = soTree[ nInternalNode ]; // there are two paths--one to the next ancestral node, // the other to a present-day node. (In the case of the 0th // node, there is another path--that to MMul.) int nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex - nInternalNode; char cPresentDayBase = soTree[ nIndexOfPresentDayBases ]; char cNextAncestralBaseOrPPan = soTree[ nInternalNode + 1 ]; // first consider the branch on the backbone RWCString soBackboneBranchName = soGetBranchNameWithoutTree( nInternalNode, nOnBackbone ); int nIndexOfBackboneBranch = nGetIndexForBranch( nInternalNode, nOnBackbone ); assert( soBackboneBranchName == aArrayOfNamesOfBranches[ nIndexOfBackboneBranch ] ); if ( ! 
( cAncestralBase == '*' && cNextAncestralBaseOrPPan == '*' ) ) { float fProbabilityOfDeaminationMutation = fGetProbabilityOfDeaminationMutation( nInternalNode, nOnBackbone, nTree, pArrayOfDeaminationMutations, pArrayOfProbabilitiesOfDeaminationMutations ); (*(pArraysAtBranch[ cAncestralBase ][ cNextAncestralBaseOrPPan ] ))[ nIndexOfBackboneBranch ] += ( 1.0 - fProbabilityOfDeaminationMutation ) * fProbabilityOfTree; // note: the above includes non-mutations and indel mutations, // as well as the 12 substitutions aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBackboneBranch ] += fProbabilityOfDeaminationMutation * fProbabilityOfTree; // just for sanity check if ( cAncestralBase != cNextAncestralBaseOrPPan ) { fRoughMutationCount += 1.0 / pTrees->length(); if ( nInternalNode == 3 && cAncestralBase != '*' && cNextAncestralBaseOrPPan != '*' ) { // PPan // want output to look like: // PPan hs1-15256499 247 aagaaa i // find project name RWCString soProjectName2 = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetDirectory(); // get rid of last /edit_dir/ assert( soProjectName2.bEndsWithAndRemove("/edit_dir/") ); RWCRegexp regFasta("[^/]+$"); RWCString soProjectName = soProjectName2( regFasta ); fprintf( pAO, "PPan %15s %4d %s %c %.2f\n", soProjectName.data(), nUnpaddedIndex( nConsPos ), soPresentDayBases.data(), cTransitionsTransversions[cAncestralBase][ cNextAncestralBaseOrPPan], ( 1.0 - fProbabilityOfDeaminationMutation ) / pTrees->length() ); } // if ( nInternalNode == 0 ) { } // end sanity check } // now consider the branch down and to right int nIndexForBranchDownAndToRight = nGetIndexForBranch( nInternalNode, nDownToRight ); RWCString soBranchNameDownAndToRight = soGetBranchNameWithoutTree( nInternalNode, nDownToRight ); assert( soBranchNameDownAndToRight == aArrayOfNamesOfBranches[ nIndexForBranchDownAndToRight ] ); if ( !( cAncestralBase == '*' && cPresentDayBase == '*' ) ) { float fProbabilityOfDeaminationMutation = fGetProbabilityOfDeaminationMutation( nInternalNode, nDownToRight, nTree, pArrayOfDeaminationMutations, pArrayOfProbabilitiesOfDeaminationMutations ); (*(pArraysAtBranch[ cAncestralBase ][ cPresentDayBase ] ))[ nIndexForBranchDownAndToRight ] += ( 1.0 - fProbabilityOfDeaminationMutation ) * fProbabilityOfTree; aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexForBranchDownAndToRight ] += fProbabilityOfDeaminationMutation * fProbabilityOfTree; // sanity check if ( cAncestralBase != cPresentDayBase ) { fRoughMutationCount += 1.0 / pTrees->length(); if ( nInternalNode == 3 && cAncestralBase != '*' && cPresentDayBase != '*' ) { // PTro // want output to look like: // PTro hs1-15256499 247 aagaaa i // find project name RWCString soProjectName2 = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetDirectory(); // get rid of last /edit_dir/ assert( soProjectName2.bEndsWithAndRemove("/edit_dir/") ); RWCRegexp regFasta("[^/]+$"); RWCString soProjectName = soProjectName2( regFasta ); fprintf( pAO, "PTro %15s %4d %s %c %.2f\n", soProjectName.data(), nUnpaddedIndex( nConsPos ), soPresentDayBases.data(), cTransitionsTransversions[cAncestralBase][cPresentDayBase], ( 1.0 - fProbabilityOfDeaminationMutation ) / pTrees->length() ); } // if ( nInternalNode == 0 ) { } // end sanity check } // one more branch to consider: that going to MMul if ( nInternalNode == 0 ) { char cMMulBase = soTree[ nSubtractInternalNodeFromThisToGetPresentDayIndex + 1]; int nIndexForMMulBranch = nGetIndexForBranch( -1, 0 ); if ( !( cAncestralBase == '*' && cMMulBase == '*' ) ) { // mutation. 
could it be a deamination mutation? float fProbabilityOfDeaminationMutation = fGetProbabilityOfDeaminationMutation( -1, // nInternalNode for MMul branch nDownToRight, nTree, pArrayOfDeaminationMutations, pArrayOfProbabilitiesOfDeaminationMutations ); (*(pArraysAtBranch[ cAncestralBase ][ cMMulBase ] ) )[ nIndexForMMulBranch ] += ( 1.0 - fProbabilityOfDeaminationMutation ) * fProbabilityOfTree; aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexForMMulBranch ] += fProbabilityOfDeaminationMutation * fProbabilityOfTree; // sanity check if ( cAncestralBase != cMMulBase ) { fRoughMutationCount += 1.0 / pTrees->length(); } // end sanity check } // if ( !( cAncestralBase == '*' && cMMulBase == '*' } // if ( nInternalNode == 0 ) { } // for( int nInternalNode = 0; nInternalNode < 5; ... } // for( int nTree = 0; nTree < pTrees->length(); ++nTree ) { // sanity check on the number of mutations int nNumberOfDistinctBases = nGetNumberOfDistinctBasesNotNs( soPresentDayBases ); float fDiff = fabs( nNumberOfDistinctBases - 1 - fRoughMutationCount ); if ( fDiff > 0.001 ) { fprintf( pAO, "sanity check failed: %d distinct bases %.6f rough mutations %s %s %d\n", nNumberOfDistinctBases, fRoughMutationCount, soPresentDayBases.data(), soGetName().data(), nUnpaddedIndex( nConsPos ) ); } nEstimatedTotalMutations += ( nNumberOfDistinctBases - 1 ); ++nConsPosConsidered; } // countMutationsAtConsPos(
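
// Illustrative example of the CpG deamination bookkeeping above: if, for
// adjacent columns A and B, one pair of trees has ancestral bases 'c'
// (column A) and 'g' (column B) at internal node 2, and the corresponding
// present-day HSap bases are 't' (column A) and 'g' (column B), then
// checkTreesAndRecordDeaminationMutationsML records a deamination mutation
// for column A on node 2's down-to-right branch with weight
// P(treeA) * P(treeB); countMutationsAtConsPosML later down-weights the
// ordinary c->t substitution count for that branch by that probability and
// adds it to the deamination count instead.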