/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "consedParameters.h" #include "locatedFragment.h" #include "bIntersect.h" #include "mbtValVectorOfBool.h" #include "mbtValVector.h" #include "rwtptrorderedvector.h" #include "ntide.h" #include "fileDefines.h" #include "soAddCommas.h" #include "rwctokenizer.h" #include "autoReport.h" #include "consed.h" #include "assembly.h" #include "nNumberFromBase.h" #include "setErrorRateFromQuality.h" #include "dErrorRateFromQuality.h" #include "singletInfo.h" #include "readPHDAgainForHighQualitySegment.h" #include #include #include #include "rwcregexp.h" #include "getAllSinglets.h" #include "mbt_val_ord_offset_vec.h" #include "soLine.h" #include "max.h" #include #include "findAncestralTrees.h" #include "transitionsTransversions.h" #include "soGetBranchNameWithoutTree.h" #include "primateSpecies.h" static int nEstimatedTotalMutations = 0; static int nConsPosConsidered = 0; //const int n256 = 256; already defined in transitionsTransversions.h static RWTValOrderedVector aArrayOfNamesOfBranches; static RWTValOrderedVector aArrayOfNumberOfNonMutationsAtBranch; static RWTValOrderedVector aArrayOfNumberOfDeaminationMutationsAtBranch; static RWTValOrderedVector aArrayOfNumberOfInsertionMutationsAtBranch; static RWTValOrderedVector aArrayOfNumberOfDeletionMutationsAtBranch; static RWTValOrderedVector* pArraysAtBranch[n256][n256]; typedef RWTValVector typeArrayOfNucleotideCounts; static RWTPtrVector aArrayOfInternalNodes( nMaxInternalNode + 1, NULL, "aArrayOfInternalNodes" ); static int nBaseToIndex[n256]; static int nGetNumberOfDistinctBasesNotNs( const RWCString& soBases ) { RWCString soBases2( soBases ); soBases2.removeSingleCharacterAllOccurrences( 'n' ); // check how many bases remain const int nBaseTypes = 5; bool bBases[nBaseTypes]; int n; for( n =0; n ( 10, // array size soName ) ; } } } // now consider indels int nBase; for( nBase = 0; nBase < soBasesForSubstitutions.length(); ++nBase ) { char cBase = soBasesForSubstitutions[ nBase ]; pArraysAtBranch[ cBase ][ '*' ] = &aArrayOfNumberOfDeletionMutationsAtBranch; pArraysAtBranch[ '*' ][ cBase ] = &aArrayOfNumberOfInsertionMutationsAtBranch; } // what about ['*']['*'] ? This will not count as anything for( int nInternalNode2 = 0; nInternalNode2 <= (nMaxInternalNode + 1 ); ++nInternalNode2 ) { int nInternalNode = nInternalNode2; if ( nInternalNode == ( nMaxInternalNode + 1 ) ) { nInternalNode = -1; } for( int nBranchBelowInternalNode = 0; nBranchBelowInternalNode <= 1; ++nBranchBelowInternalNode ) { // just a trick so that there is only one MMul branch if ( nInternalNode == -1 && nBranchBelowInternalNode == 1 ) continue; int nOnBackboneOrDownToRight; if ( nBranchBelowInternalNode == 0 ) nOnBackboneOrDownToRight = nOnBackbone; else nOnBackboneOrDownToRight = nDownToRight; aArrayOfNamesOfBranches.insert( soGetBranchNameWithoutTree( nInternalNode, nOnBackboneOrDownToRight ) ); aArrayOfNumberOfNonMutationsAtBranch.insert( 0.0 ); aArrayOfNumberOfDeaminationMutationsAtBranch.insert( 0.0 ); aArrayOfNumberOfInsertionMutationsAtBranch.insert( 0.0 ); aArrayOfNumberOfDeletionMutationsAtBranch.insert( 0.0 ); for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) { char cLeft = soBasesForSubstitutions[ nLeft ]; for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) { if ( nRight == nLeft ) continue; char cRight = soBasesForSubstitutions[ nRight ]; ( pArraysAtBranch[ cLeft ][ cRight ])->insert( 0.0 ); } } } // for( int nBranchBelowInternalNode = 0; ... } // for( int nInternalNode2 = 0; nInternalNode2 <= (nMaxInternalNode + for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { RWCString soName = "aArrayOfInternalNodes nInternalNode = " + RWCString( (long) nInternalNode ); aArrayOfInternalNodes[ nInternalNode ] = new RWTValVector( soBasesWithPad.length(), // capacity: acgt* 0.0, // initial value soName ); } int n; for( n = 0; n < n256; ++n ) { nBaseToIndex[n] = -1; } nBaseToIndex['a'] = 0; nBaseToIndex['c'] = 1; nBaseToIndex['g'] = 2; nBaseToIndex['t'] = 3; nBaseToIndex['*'] = 4; } // static void initializeArrays() { static unsigned char ucGetMaskForSpecies( RWCString& soSpecies ) { unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } return ucSpeciesMask; } static bool bAllNeededSpecies( unsigned char ucSpeciesMask ) { if ( ( ucSpeciesMask & ucPTroOrPPan ) && ( ucSpeciesMask & ucPPygOrMMul ) && ( ucSpeciesMask & ucGGor ) ) return true; else return false; } static bool bNoMutationThisColumn( const RWCString& soBases ) { char cBase = 0; for( int n = 0; n < soBases.length(); ++n ) { if ( soBases[n] == 'n' ) continue; if ( !cBase ) { cBase = soBases[n]; } else { if ( cBase != soBases[n] ) { return false; } } } return true; } static RWCString soGetTreeWithNoMutation( const RWCString soPresentDayBasesHumanInFront ) { RWCString soPresentDayBasesPPanInFront = soPresentDayBasesHumanInFront; // PPan soPresentDayBasesPPanInFront[0] = soPresentDayBasesHumanInFront[1]; // PTro soPresentDayBasesPPanInFront[1] = soPresentDayBasesHumanInFront[2]; // HSap soPresentDayBasesPPanInFront[2] = soPresentDayBasesHumanInFront[0]; // find base char cBase = 0; int n; for( n = 0; n < soPresentDayBasesPPanInFront.length(); ++n ) { if ( soPresentDayBasesPPanInFront[n] != 'n' ) { cBase = soPresentDayBasesPPanInFront[n]; break; } } assert( cBase ); // trees look like: // g(g(t(t(t(t,t),t),t),g?),g) RWCString soTreeLeft; RWCString soTreeRight; for( n = soPresentDayBasesPPanInFront.length() - 1; n >= 1; --n ) { soTreeLeft += RWCString( cBase ); soTreeLeft += "("; soTreeRight = "," + RWCString( cBase ) + ( soPresentDayBasesPPanInFront[n] == 'n' ? "?" : "" ) + ")" + soTreeRight; } // we have to handle PPan separately from the loop above because it // might have a "?" next to it RWCString soTree = soTreeLeft + RWCString( cBase ) + ( soPresentDayBasesPPanInFront[0] == 'n' ? "?" : "" ) + soTreeRight; return( soTree ); } static bool bIsDeaminationMutation( const int nInternalNode, const int nOnBackboneOrDownToRight, const RWCString& soDeaminationMutations ) { RWCString soBranchName = soGetBranchNameWithoutTree( nInternalNode, nOnBackboneOrDownToRight ); if ( soDeaminationMutations.index( soBranchName ) == RW_NPOS ) return false; else return true; } static float fGetProbabilityOfDeaminationMutation( const int nInternalNode, const int nOnBackboneOrDownToRight, const int nTree, mbtValOrderedVectorOfRWCString* pArrayOfDeaminationMutations ) { RWCString soBranchNameWithTree = soGetBranchNameWithTree( nInternalNode, nOnBackboneOrDownToRight, nTree ); if ( !pArrayOfDeaminationMutations ) return( 0.0 ); int nIndexOfDeaminationMutation = pArrayOfDeaminationMutations->index( soBranchNameWithTree ); if ( nIndexOfDeaminationMutation == RW_NPOS ) return( 0.0 ); else return( 1.0 ); } static void countNucleotidesAtInternalNodes( const RWCString& soOneTree2 ) { RWCString soOneTree = soOneTree2; soOneTree.removeSingleCharacterAllOccurrences( '?' ); soOneTree.removeSingleCharacterAllOccurrences( '(' ); soOneTree.removeSingleCharacterAllOccurrences( ')' ); soOneTree.removeSingleCharacterAllOccurrences( ',' ); // now look for mutations // nInternal node is the string offset and also that used in // soGetBranchNameWithoutTree but without using -1 for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 4 is the node // going to PPan and PTro char cAncestralBase = soOneTree[ nInternalNode ]; RWTValVector* pArrayOfCounts = aArrayOfInternalNodes[ nInternalNode ]; int nBaseIndex = nBaseToIndex[ cAncestralBase ]; if ( nBaseIndex != -1 ) { // will need to be changed for maximum likelihood model (*pArrayOfCounts)[ nBaseIndex ] += 1.0; } } } void Contig :: countAllMutations( autoReport* pAutoReport ) { setTransitionsTransversionsTableIfNecessary(); initializeArrays(); RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); mbtValVector aSpeciesInAcceptableColumns( 1, // will be changed 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); RWTValVector aArrayOfOneTreeAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfOneTreeAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); RWTValVector aArrayOfLenientlyFilteredBasesAtEachConsPos( 1, // size--will be changed // + 1 so we can use nConsPos (1-based) directly as an index "", // initial value "aArrayOfLenientlyFilteredBasesAtEachConsPos" ); RWTValVector aDeaminationMutationsAtEachConsPos( 1, // size--will be changed "", // initial value "aDeaminationMutationsAtEachConsPos" ); // this is different than above in that if a CpG occurs, both // columns are marked, rather than just the first column RWTValVector aAncestralCpGAtEachConsPos2( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); LocatedFragment* pHumanRead; applyFilters( pAutoReport, aCombinedReads, aSpeciesInAcceptableColumns, aArrayOfOneTreeAtEachConsPos, aArrayOfBasesAtEachConsPos, aArrayOfLenientlyFilteredBasesAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2, pHumanRead ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aArrayOfOneTreeAtEachConsPos[ nConsPos ].isNull() ) continue; countNucleotidesAtInternalNodes( aArrayOfOneTreeAtEachConsPos[ nConsPos ] ); } for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aArrayOfOneTreeAtEachConsPos[ nConsPos ].isNull() ) continue; // for comparison with Graham's data if ( pCP->bAutoReportPrintPositionsForGraham_ ) { if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucPTro ) ) continue; if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucMMul ) ) continue; if ( aAncestralCpGAtEachConsPos2[ nConsPos ] != cNoCpG ) continue; } // end for comparison with Graham's data fprintf( pAO, "using site %d %s\n", nUnpaddedIndex( nConsPos ), aArrayOfBasesAtEachConsPos[ nConsPos ].data() ); countMutationsAtConsPos( aArrayOfOneTreeAtEachConsPos[ nConsPos ], aArrayOfBasesAtEachConsPos[ nConsPos ], nConsPos, aDeaminationMutationsAtEachConsPos[ nConsPos ] ); } // print out estimate (sanity check) fprintf( pAO, "estimate: %d mutations out of %d positions\n", nEstimatedTotalMutations, nConsPosConsidered ); // now print out the counts for each branch fprintf( pAO, "allMutations {\n" ); // nInternalNode means the same as it does in soGetBranchNameWithoutTree for( int nInternalNode2 = 0; nInternalNode2 <= ( nMaxInternalNode + 1); ++nInternalNode2 ) { // fuss so that nInternalNode goes as 0, 1, 2, 3, -1 int nInternalNode = nInternalNode2; if ( nInternalNode2 == ( nMaxInternalNode + 1 ) ) { nInternalNode = -1; } int nRealInternalNode = ( nInternalNode == -1 ) ? 0 : nInternalNode; RWTValVector* pArrayOfNucleotideCounts = aArrayOfInternalNodes[ nRealInternalNode ]; for( int nBranchBelowInternalNode = 0; nBranchBelowInternalNode <= 1; ++nBranchBelowInternalNode ) { // just a trick so that there is only one MMul branch if ( nInternalNode == -1 && nBranchBelowInternalNode == 1 ) continue; int nOnBackboneOrDownToRight; if ( nBranchBelowInternalNode == 0 ) nOnBackboneOrDownToRight = nOnBackbone; else nOnBackboneOrDownToRight = nDownToRight; RWCString soBranchName = soGetBranchNameWithoutTree( nInternalNode, nOnBackboneOrDownToRight ); int nIndexOfBranch = nGetIndexForBranch( nInternalNode, nOnBackboneOrDownToRight ); assert( aArrayOfNamesOfBranches[ nIndexOfBranch ] == soBranchName ); float fTotalSubstitutions = 0.0; int nLeft; for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) { char cLeft = soBasesForSubstitutions[ nLeft ]; for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) { if ( nLeft == nRight ) continue; char cRight = soBasesForSubstitutions[ nRight ]; fTotalSubstitutions += (*(pArraysAtBranch[ cLeft ][ cRight ]))[ nIndexOfBranch ]; } } float fTotalAtBranch = aArrayOfNumberOfNonMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfDeletionMutationsAtBranch[ nIndexOfBranch ] + aArrayOfNumberOfInsertionMutationsAtBranch[ nIndexOfBranch ] + fTotalSubstitutions; // to avoid division by zero if ( fTotalAtBranch == 0.0 ) { fTotalAtBranch = 0.1; } fprintf( pAO, "%s: same: %.2f subst: %.2f deam: %.2f ins: %.2f del: %.2f deam rate %.2f nondeam subst rate: %.2f %s %.1f\n", aArrayOfNamesOfBranches[ nIndexOfBranch ].data(), aArrayOfNumberOfNonMutationsAtBranch[ nIndexOfBranch ], fTotalSubstitutions, aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfInsertionMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfDeletionMutationsAtBranch[ nIndexOfBranch ], aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBranch ] * 100.0 / fTotalAtBranch, fTotalSubstitutions * 100.0 / fTotalAtBranch, soGetName().data(), fTotalAtBranch ); fprintf( pAO, "nucleotides at source node:" ); for( int nSourceBase = 0; nSourceBase < soBasesWithPad.length(); ++nSourceBase ) { char c = soBasesWithPad[ nSourceBase ]; float fSourceBases = (*pArrayOfNucleotideCounts)[ nSourceBase ]; fprintf( pAO, " %c: %.3f", c, fSourceBases ); } fprintf( pAO, "\n" ); for( nLeft = 0; nLeft < soBasesForSubstitutions.length(); ++nLeft ) { char cLeft = soBasesForSubstitutions[ nLeft ]; float fSourceBases = (*pArrayOfNucleotideCounts)[ nLeft ]; for( int nRight = 0; nRight < soBasesForSubstitutions.length(); ++nRight ) { if ( nLeft == nRight ) continue; char cRight = soBasesForSubstitutions[ nRight ]; fprintf( pAO, " subst %c -> %c %.2f %c: %.1f\n", cLeft, cRight, (*(pArraysAtBranch[ cLeft ][ cRight ]))[ nIndexOfBranch ], cLeft, fSourceBases ); } } } // for( int nBranchBelowInternalNode = 0; } // for( int nInternalNode2 = 0; fprintf( pAO, "} allMutations\n" ); } void Contig :: checkTreesAndRecordDeaminationMutations( RWTValOrderedVector* pTreesPositionA, RWTValOrderedVector* pTreesPositionB, mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionA, mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionB, RWTValOrderedVector*& pArrayOfProbsOfDeaminationMutationsPositionA, RWTValOrderedVector*& pArrayOfProbsOfDeaminationMutationsPositionB ) { mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionA; mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionB; for( int nTreeA = 0; nTreeA < pTreesPositionA->length(); ++nTreeA ) { RWCString soTreeA = (*pTreesPositionA)[ nTreeA ]; // get rid of ? for now soTreeA.removeSingleCharacterAllOccurrences('?'); soTreeA.removeSingleCharacterAllOccurrences('('); soTreeA.removeSingleCharacterAllOccurrences(')'); soTreeA.removeSingleCharacterAllOccurrences(','); for( int nTreeB = 0; nTreeB < pTreesPositionB->length(); ++nTreeB ) { RWCString soTreeB = (*pTreesPositionB)[ nTreeB ]; soTreeB.removeSingleCharacterAllOccurrences( '?' ); soTreeB.removeSingleCharacterAllOccurrences( '(' ); soTreeB.removeSingleCharacterAllOccurrences( ')' ); soTreeB.removeSingleCharacterAllOccurrences( ',' ); // now we have a pair of trees, soTreeA and soTreeB // abcdef // ^top internal node, leading to MMul // ^ next internal node, leading to PPyg // ^ next internal node, leading to GGor // ^ next internal node, leading to HSap, I hope! // ^ next internal node, leading to PTro // ^ PPan // ^ PTro // ^ GGor // etc. // g(g(g(g(g,g),g),a),g?,g) // is changed to // gggggggagg // a(c(c(c(*,c),c),c),a,*) // is changed to: // accc*ccca* // ^ node leading to MMul and PPyg // ^ node leading to GGor // ^ node leading to HSap // ^ node leading to PTro and PPan // ^ PPan // ^ PTro // ^ HSap // ^ GGor // ^ PPyg // ^ MMul if ( pCP->bAutoReportDeaminationMutationsDeterminedByMoreAccurateMethod_ ) { // so there are 4 ancestral nodes to be concerned about // it turns out (lucky?) that this nInternalNode actually // is the same as that in soGetBranchNameWithoutTree and // is also the string position in the tree. There is // no nInternalNode == -1 here, though. // 0 refers to the internal node where PPyg and MMul come together. for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 4 is the node // going to PPan and PTro RWCString soAncestralSequence = RWCString( soTreeA[ nInternalNode ] ) + RWCString( soTreeB[ nInternalNode ] ); // special case of MMul in which the deamination // mutation occurs on the short part of the branch // that goes down to the left to the top internal node. // In such a case, the MMul present day base is // actually older than the top internal node. // Thus we look for a CpG at the MMul present day bases // and some deamination mutation at the top internal node. if ( nInternalNode == 0 ) { int nIndexOfPresentDayBases = 9; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "cg" ) { if ( soAncestralSequence == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeB ) ); } else if ( soAncestralSequence == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeA ) ); } } } if ( soAncestralSequence != "cg" ) continue; // if reached here, ancestral sequence is a CpG // is there a mutation from this position? // first, check if there is a mutation down-right to // the corresponding present-day node int nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex - nInternalNode; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithTree( nInternalNode, nDownToRight, nTreeB ) ); } else if ( soPresentDayBases == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithTree( nInternalNode, nDownToRight, nTreeA ) ); } if ( nInternalNode == 0 ) { // case of top internal node, leading to both MMul and PPyg // we've already considered the branch leading to PPyg (above) // in which nIndexOfPresentDayBases = 8 // Now consider the branch leading to MMul, with // nIndexOfPresentDayBases = 9 nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex + 1; soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeB ) ); } else if ( soPresentDayBases == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeA ) ); } } // if nInternalNode is 0-3, the next node is the // descendant internal node. if nInternalNode is // 4, the next node is PPan RWCString soDescendantInternalNode = RWCString( soTreeA[ nInternalNode + 1 ] ) + RWCString( soTreeB[ nInternalNode + 1 ] ); if ( soDescendantInternalNode == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithTree( nInternalNode, nOnBackbone, nTreeB ) ); } else if ( soDescendantInternalNode == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithTree( nInternalNode, nOnBackbone, nTreeA ) ); } } // for( int nInternalNode = 0; nInternalNode < 5; } // if ( pCP->bAutoReportDeaminationMutationsDeterminedByMoreAccurateMethod_ ) { else { // ^ PPan // ^ PTro // ^ HSap // ^ GGor // ^ PPyg // ^ MMul RWCString soPTro = RWCString( soTreeA.cGetLastChar(-5) ) + RWCString( soTreeB.cGetLastChar(-5) ); RWCString soMMul = RWCString( soTreeA.cGetLastChar(-1) ) + RWCString( soTreeB.cGetLastChar(-1) ); RWCString soHSap = RWCString( soTreeA.cGetLastChar(-4) ) + RWCString( soTreeB.cGetLastChar(-4) ); int nNumberOfCpGs = 0; int nNumberOfTpGs = 0; int nNumberOfCpAs = 0; for( int n = 1; n <= 3; ++n ) { RWCString soDinucleotide( (size_t) 2 ); if ( n == 1 ) soDinucleotide = soPTro; else if ( n == 2 ) soDinucleotide = soHSap; else if ( n == 3 ) soDinucleotide = soMMul; if ( soDinucleotide == "cg" ) ++nNumberOfCpGs; else if ( soDinucleotide == "tg" ) ++nNumberOfTpGs; else if ( soDinucleotide == "ca" ) ++nNumberOfCpAs; } int nNumberNotEqualZero = 0; if ( nNumberOfCpGs != 0 ) ++nNumberNotEqualZero; if ( nNumberOfTpGs != 0 ) ++nNumberNotEqualZero; if ( nNumberOfCpAs != 0 ) ++nNumberNotEqualZero; if ( nNumberNotEqualZero >= 2 ) { // declare it a CpG deamination mutation! if ( nNumberOfTpGs != 0 ) { // a CpG deamination mutation in the left position // don't bother finding where it occurred for now aDeaminationMutationsPositionA.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeA ) ); } if ( nNumberOfCpAs != 0 ) { // a CpG deamination mutation in the right position. // don't bother finding where it occurred for now aDeaminationMutationsPositionB.insert( soGetBranchNameWithTree( -1, nDownToRight, nTreeB ) ); } } } } // for( int nTreeB = 0; nTreeB < pTreesPositionB->length } // for( int nTreeA = 0; nTreeA < pTreesPositionA->length() aDeaminationMutationsPositionA.resort(); aDeaminationMutationsPositionB.resort(); // are there any deamination mutations that are in every pair of // trees? Count each. RWCString soLastDeaminationMutation; int nNumberOfPairsOfTreesWithCurrentDeaminationMutation; int nMutation; // add sentinel aDeaminationMutationsPositionA.insert( "sentinel" ); for( nMutation = 0; nMutation < aDeaminationMutationsPositionA.length(); ++nMutation ) { RWCString soCurrentDeaminationMutation = aDeaminationMutationsPositionA[ nMutation ]; if ( soCurrentDeaminationMutation == soLastDeaminationMutation ) { ++nNumberOfPairsOfTreesWithCurrentDeaminationMutation; } else { // just changed--first finish off last deamination mutation if ( !soLastDeaminationMutation.isNull() ) { float fFractionOfTreesInOtherColumnSupportingThisAsADeaminationMutation = (float) nNumberOfPairsOfTreesWithCurrentDeaminationMutation / (float) pTreesPositionB->length(); // yep, B if ( !pArrayOfDeaminationMutationsPositionA ) { pArrayOfDeaminationMutationsPositionA = new mbtValOrderedVectorOfRWCString( 5 ); // initial size pArrayOfProbsOfDeaminationMutationsPositionA = new RWTValOrderedVector( 5, "pArrayOfProbsOfDeaminationMutationsPositionA" ); } pArrayOfDeaminationMutationsPositionA->insert( soLastDeaminationMutation ); pArrayOfProbsOfDeaminationMutationsPositionA->insert( fFractionOfTreesInOtherColumnSupportingThisAsADeaminationMutation ); } // now set up for next deamination mutation soLastDeaminationMutation = soCurrentDeaminationMutation; nNumberOfPairsOfTreesWithCurrentDeaminationMutation = 1; } } soLastDeaminationMutation = ""; aDeaminationMutationsPositionB.insert( "sentinel" ); for( nMutation = 0; nMutation < aDeaminationMutationsPositionB.length(); ++nMutation ) { RWCString soCurrentDeaminationMutation = aDeaminationMutationsPositionB[ nMutation ]; if ( soCurrentDeaminationMutation == soLastDeaminationMutation ) { ++nNumberOfPairsOfTreesWithCurrentDeaminationMutation; } else { // just changed--first finish off last deamination mutation if ( !soLastDeaminationMutation.isNull() ) { float fFractionOfTreesInOtherColumnSupportingThisAsADeaminationMutation = (float) nNumberOfPairsOfTreesWithCurrentDeaminationMutation / (float) pTreesPositionA->length(); // yep, A if ( !pArrayOfDeaminationMutationsPositionB ) { pArrayOfDeaminationMutationsPositionB = new mbtValOrderedVectorOfRWCString( 5 ); // initial size pArrayOfProbsOfDeaminationMutationsPositionB = new RWTValOrderedVector( 5, "pArrayOfProbsOfDeaminationMutationsPositionB" ); } pArrayOfDeaminationMutationsPositionB->insert( soLastDeaminationMutation ); pArrayOfProbsOfDeaminationMutationsPositionB->insert( fFractionOfTreesInOtherColumnSupportingThisAsADeaminationMutation ); } // now set up for next deamination mutation soLastDeaminationMutation = soCurrentDeaminationMutation; nNumberOfPairsOfTreesWithCurrentDeaminationMutation = 1; } } } // void Contig :: checkTreesAndRecordDeaminationMutations( // void Contig :: checkTreesAndRecordDeaminationMutations( // const RWCString& soTreePositionA, // const RWCString& soTreePositionB, // mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionA, // mbtValOrderedVectorOfRWCString*& pArrayOfDeaminationMutationsPositionB ) { // mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionA; // mbtValOrderedVectorOfRWCString aDeaminationMutationsPositionB; // RWCString soTreeA = soTreePositionA; // RWCString soTreeB = soTreePositionB; // // just for backward compatibility // int nTreeA = 0; // int nTreeB = 0; // // get rid of ? for now // soTreeA.removeSingleCharacterAllOccurrences('?'); // soTreeA.removeSingleCharacterAllOccurrences('('); // soTreeA.removeSingleCharacterAllOccurrences(')'); // soTreeA.removeSingleCharacterAllOccurrences(','); // soTreeB.removeSingleCharacterAllOccurrences( '?' ); // soTreeB.removeSingleCharacterAllOccurrences( '(' ); // soTreeB.removeSingleCharacterAllOccurrences( ')' ); // soTreeB.removeSingleCharacterAllOccurrences( ',' ); // // now we have a pair of trees, soTreeA and soTreeB // // abcdef // // ^top internal node, leading to MMul // // ^ next internal node, leading to PPyg // // ^ next internal node, leading to GGor // // ^ next internal node, leading to HSap, I hope! // // ^ next internal node, leading to PTro // // ^ PPan // // ^ PTro // // ^ GGor // // etc. // // g(g(g(g(g,g),g),a),g?,g) // // is changed to // // gggggggagg // // a(c(c(c(*,c),c),c),a,*) // // is changed to: // // accc*ccca* // // ^ node leading to MMul and PPyg // // ^ node leading to GGor // // ^ node leading to HSap // // ^ node leading to PTro and PPan // // ^ PPan // // ^ PTro // // ^ HSap // // ^ GGor // // ^ PPyg // // ^ MMul // // so there are 4 ancestral nodes to be concerned about // // it turns out (lucky?) that this nInternalNode actually // // is the same as that in soGetBranchNameWithoutTree and // // is also the string position in the tree. There is // // no nInternalNode == -1 here, though. // // 0 refers to the internal node where PPyg and MMul come together. // for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; // ++nInternalNode ) { // // notice that "nInternalNode" refers to the string position // // within the tree. So 0 is the top node and 4 is the node // // going to PPan and PTro // RWCString soAncestralSequence = // RWCString( soTreeA[ nInternalNode ] ) + // RWCString( soTreeB[ nInternalNode ] ); // // special case of MMul in which the deamination // // mutation occurs on the short part of the branch // // that goes down to the left to the top internal node. // // In such a case, the MMul present day base is // // actually older than the top internal node. // // Thus we look for a CpG at the MMul present day bases // // and some deamination mutation at the top internal node. // if ( nInternalNode == 0 ) { // int nIndexOfPresentDayBases = 9; // RWCString soPresentDayBases = // RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + // RWCString( soTreeB[ nIndexOfPresentDayBases ] ); // if ( soPresentDayBases == "cg" ) { // if ( soAncestralSequence == "ca" ) { // aDeaminationMutationsPositionB.insert( // soGetBranchNameWithTree( -1, // nDownToRight, // nTreeB ) ); // } // else if ( soAncestralSequence == "tg" ) { // aDeaminationMutationsPositionA.insert( // soGetBranchNameWithTree( -1, // nDownToRight, // nTreeA ) ); // } // } // } // if ( soAncestralSequence != "cg" ) continue; // // if reached here, ancestral sequence is a CpG // // is there a mutation from this position? // // first, check if there is a mutation down-right to // // the corresponding present-day node // int nIndexOfPresentDayBases = // nSubtractInternalNodeFromThisToGetPresentDayIndex - // nInternalNode; // RWCString soPresentDayBases = // RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + // RWCString( soTreeB[ nIndexOfPresentDayBases ] ); // if ( soPresentDayBases == "ca" ) { // aDeaminationMutationsPositionB.insert( // soGetBranchNameWithTree( nInternalNode, nDownToRight, // nTreeB ) ); // } // else if ( soPresentDayBases == "tg" ) { // aDeaminationMutationsPositionA.insert( // soGetBranchNameWithTree( nInternalNode, nDownToRight, // nTreeA ) // ); // } // if ( nInternalNode == 0 ) { // // case of top internal node, leading to both MMul and PPyg // // we've already considered the branch leading to PPyg (above) // // in which nIndexOfPresentDayBases = 8 // // Now consider the branch leading to MMul, with // // nIndexOfPresentDayBases = 9 // nIndexOfPresentDayBases = // nSubtractInternalNodeFromThisToGetPresentDayIndex + 1; // soPresentDayBases = // RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + // RWCString( soTreeB[ nIndexOfPresentDayBases ] ); // if ( soPresentDayBases == "ca" ) { // aDeaminationMutationsPositionB.insert( // soGetBranchNameWithTree( -1, nDownToRight, nTreeB ) // ); // } // else if ( soPresentDayBases == "tg" ) { // aDeaminationMutationsPositionA.insert( // soGetBranchNameWithTree( -1, nDownToRight, nTreeA ) // ); // } // } // // if nInternalNode is 0-3, the next node is the // // descendant internal node. if nInternalNode is // // 4, the next node is PPan // RWCString soDescendantInternalNode = // RWCString( soTreeA[ nInternalNode + 1 ] ) + // RWCString( soTreeB[ nInternalNode + 1 ] ); // if ( soDescendantInternalNode == "ca" ) { // aDeaminationMutationsPositionB.insert( // soGetBranchNameWithTree( nInternalNode, nOnBackbone, nTreeB ) // ); // } // else if ( soDescendantInternalNode == "tg" ) { // aDeaminationMutationsPositionA.insert( // soGetBranchNameWithTree( nInternalNode, nOnBackbone, nTreeA ) // ); // } // } // for( int nInternalNode = 0; nInternalNode < 5; // if ( aDeaminationMutationsPositionA.length() > 0 && // !pArrayOfDeaminationMutationsPositionA ) { // pArrayOfDeaminationMutationsPositionA = // new mbtValOrderedVectorOfRWCString( 5 ); // initial size // } // int nDeam; // for( nDeam = 0; nDeam < aDeaminationMutationsPositionA.length(); // ++nDeam ) { // pArrayOfDeaminationMutationsPositionA->insert( aDeaminationMutationsPositionA[ nDeam ] ); // } // if ( aDeaminationMutationsPositionB.length() > 0 && // !pArrayOfDeaminationMutationsPositionB ) { // pArrayOfDeaminationMutationsPositionB = // new mbtValOrderedVectorOfRWCString( 5 ); // initial size // } // for( nDeam = 0; nDeam < aDeaminationMutationsPositionB.length(); // ++nDeam ) { // pArrayOfDeaminationMutationsPositionB->insert( aDeaminationMutationsPositionB[ nDeam ] ); // } // } void Contig :: countMutationsAtConsPos( const RWCString& soOneTree2, const RWCString& soPresentDayBases, const int nConsPos, const RWCString& soDeaminationMutations ) { RWCString soOneTree = soOneTree2; float fRoughMutationCount = 0.0; assert( !soOneTree.isNull() ); mbtValOrderedVectorOfRWCString aListOfMutations; soOneTree.removeSingleCharacterAllOccurrences( '?' ); soOneTree.removeSingleCharacterAllOccurrences( '(' ); soOneTree.removeSingleCharacterAllOccurrences( ')' ); soOneTree.removeSingleCharacterAllOccurrences( ',' ); // now look for mutations // nInternal node is the string offset and also that used in // soGetBranchNameWithoutTree but without using -1 // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 3 is the node // going to PPan and PTro. Node 1 leads to GGor, node 2 leads // to HSap. for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { // notice that "nInternalNode" refers to the string position // within the tree. So 0 is the top node and 3 is the node // going to PPan and PTro char cAncestralBase = soOneTree[ nInternalNode ]; // there are two paths--one to the next ancestral node, // the other to a present-day node. (In the case of the 0th // node, there is another path--that to MMul.) int nIndexOfPresentDayBases = nSubtractInternalNodeFromThisToGetPresentDayIndex - nInternalNode; char cPresentDayBase = soOneTree[ nIndexOfPresentDayBases ]; char cNextAncestralBaseOrPPan = soOneTree[ nInternalNode + 1 ]; // first consider the branch on the backbone RWCString soBackboneBranchName = soGetBranchNameWithoutTree( nInternalNode, nOnBackbone ); int nIndexOfBackboneBranch = nGetIndexForBranch( nInternalNode, nOnBackbone ); assert( soBackboneBranchName == aArrayOfNamesOfBranches[ nIndexOfBackboneBranch ] ); if ( ! ( cAncestralBase == '*' && cNextAncestralBaseOrPPan == '*' ) ) { bool bIsDeaminationMutationn = bIsDeaminationMutation( nInternalNode, nOnBackbone, soDeaminationMutations ); if ( bIsDeaminationMutationn ) { aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexOfBackboneBranch ] += 1; } else { (*(pArraysAtBranch[ cAncestralBase ][ cNextAncestralBaseOrPPan ] ))[ nIndexOfBackboneBranch ] += 1.0; // note: the above includes non-mutations and indel mutations, // as well as the 12 substitutions } // just for sanity check if ( cAncestralBase != cNextAncestralBaseOrPPan ) { fRoughMutationCount += 1.0; if ( cAncestralBase != '*' && cNextAncestralBaseOrPPan != '*' ) { RWCString soLocus = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); fprintf( pAO, "mutation %s %15s %4d %s %c %s %c\n", soBackboneBranchName.data(), soLocus.data(), nUnpaddedIndex( nConsPos ), soPresentDayBases.data(), cTransitionsTransversions[cAncestralBase][ cNextAncestralBaseOrPPan], ( bIsDeaminationMutationn ? "yes_deam" : "no_deam"), toupper( ntGetCons( nConsPos ).cGetBase() ) ); } // if ( nInternalNode == 0 ) { } // end sanity check } // now consider the branch down and to right int nIndexForBranchDownAndToRight = nGetIndexForBranch( nInternalNode, nDownToRight ); RWCString soBranchNameDownAndToRight = soGetBranchNameWithoutTree( nInternalNode, nDownToRight ); assert( soBranchNameDownAndToRight == aArrayOfNamesOfBranches[ nIndexForBranchDownAndToRight ] ); fprintf( pAO, "branch: %s %d %c %c 1\n", soBranchNameDownAndToRight.data(), nUnpaddedIndex( nConsPos ), cAncestralBase, cPresentDayBase ); if ( !( cAncestralBase == '*' && cPresentDayBase == '*' ) ) { bool bIsDeaminationMutationn = bIsDeaminationMutation( nInternalNode, nDownToRight, soDeaminationMutations ); if ( bIsDeaminationMutationn ) { aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexForBranchDownAndToRight ] += 1; } else { (*(pArraysAtBranch[ cAncestralBase ][ cPresentDayBase ] ))[ nIndexForBranchDownAndToRight ] += 1.0; } // sanity check if ( cAncestralBase != cPresentDayBase ) { fRoughMutationCount += 1.0; if ( cAncestralBase != '*' && cPresentDayBase != '*' ) { RWCString soLocus = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); fprintf( pAO, "mutation %s %15s %4d %s %c %s %c\n", soBranchNameDownAndToRight.data(), soLocus.data(), nUnpaddedIndex( nConsPos ), soPresentDayBases.data(), cTransitionsTransversions[cAncestralBase][cPresentDayBase], ( bIsDeaminationMutationn ? "yes_deam" : "no_deam" ), toupper( ntGetCons( nConsPos ).cGetBase() ) ); } } // end sanity check } // one more branch to consider: that going to MMul if ( nInternalNode == 0 ) { char cMMulBase = soOneTree[ nSubtractInternalNodeFromThisToGetPresentDayIndex + 1]; int nIndexForMMulBranch = nGetIndexForBranch( -1, 0 ); if ( !( cAncestralBase == '*' && cMMulBase == '*' ) ) { // mutation. could it be a deamination mutation? bool bIsDeaminationMutationn = bIsDeaminationMutation( -1, // nInternalNode for MMul branch nDownToRight, soDeaminationMutations ); if ( bIsDeaminationMutationn ) { aArrayOfNumberOfDeaminationMutationsAtBranch[ nIndexForMMulBranch ] += 1; } else { (*(pArraysAtBranch[ cAncestralBase ][ cMMulBase ] ) )[ nIndexForMMulBranch ] += 1.0; } // sanity check if ( cAncestralBase != cMMulBase ) { fRoughMutationCount += 1.0; RWCString soLocus = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); fprintf( pAO, "mutation MMul_ex %15s %4d %s %c %s %c\n", soLocus.data(), nUnpaddedIndex( nConsPos ), soPresentDayBases.data(), cTransitionsTransversions[cAncestralBase][cPresentDayBase], ( bIsDeaminationMutationn ? "yes_deam" : "no_deam" ), toupper( ntGetCons( nConsPos ).cGetBase() ) ); } // end sanity check } // if ( !( cAncestralBase == '*' && cMMulBase == '*' } // if ( nInternalNode == 0 ) { } // for( int nInternalNode = 0; nInternalNode < 5; ... // sanity check on the number of mutations int nNumberOfDistinctBases = nGetNumberOfDistinctBasesNotNs( soPresentDayBases ); float fDiff = fabs( nNumberOfDistinctBases - 1 - fRoughMutationCount ); if ( fDiff > 0.001 ) { fprintf( pAO, "sanity check failed: %d distinct bases %.6f rough mutations %s %s %d\n", nNumberOfDistinctBases, fRoughMutationCount, soPresentDayBases.data(), soGetName().data(), nUnpaddedIndex( nConsPos ) ); } nEstimatedTotalMutations += ( nNumberOfDistinctBases - 1 ); ++nConsPosConsidered; } // countMutationsAtConsPos( void Contig :: printMutationsWithContext( autoReport* pAutoReport ) { setTransitionsTransversionsTableIfNecessary(); initializeArrays(); // this differs from printFlankedColumns3 in that it records the // trees and attempts to identify which mutations are deamination // mutations // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // the point of the look below is to set aSomeCombinedReadHasNoPad // and to set the single signal arrays (aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't have a human read (rare, and // pathological), ignore this contig. if ( !pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... // remove columns in which all combined reads have a pad // first do it to single signal arrays int nRead; for( nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); mbtValVector aProblemsWithSpeciesAtConsPos( nGetPaddedConsLength(), 1, // starts at 1 "iiiii", // initial value, "i" for initial "Contig::aProblemsWithSpeciesAtConsPos" ); mbtValVector aProblemsForFlankingWithSpeciesAtConsPos( nGetPaddedConsLength(), 1, // starts at 1 "iiiii", // initial value, "i" for initial "Contig::aProblemsForFlankingWithSpeciesAtConsPos" ); mbtValVector aProblemsIncludingFlankingWithSpeciesAtConsPos( nGetPaddedConsLength(), 1, // starts at 1 "iiiii", // initial value, "i" for initial "Contig::aProblemsWithSpeciesAtConsPos" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // case in which both forward and reverse reads are not good-matching if ( !pForwardRead && !pReverseRead ) { for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { ( aProblemsWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ] = cProblemNoGoodRead; } continue; // next species } // If we are in a contig that doesn't contain a human read, // ignore the contig if ( !pHumanRead ) return; // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whiever reads are available for that // species and their single-signal arrays unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; char cProblem = ' '; bGetCombinedReadAtBaseAndReportProblem( cNotForFlanking, pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, cProblem ); ( aProblemsWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ] = cProblem; pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); if ( cProblem == cProblemNone ) { aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask; } bGetCombinedReadAtBaseAndReportProblem( cForFlanking, pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, cProblem ); if ( cProblem == cProblemNone ) { // agreeing pads do not count as flanking bases. To be // sure, explicitly check for this. (Even though we deleted // columns of pads, the might be some columns not removed // that have a pad in human and some species but not others.) if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } else { cProblem = cProblemFlankingNotAgreeing; } } ( aProblemsForFlankingWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ] = cProblem; } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < ... // now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns mbtValVector aAcceptableSpeciesAtConsPos( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aAcceptableSpeciesAtConsPos" ); int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex() - 2; ++nConsPosOfStartingColumn ) { int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; // only interested in cases in which the 2 flanking columns which // some species in common. Otherwise there is not a single // species meeting the flanking criteria. if ( aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ) { int nConsPos = nConsPosOfStartingColumn + 1; // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ; assert( ucFlankingColumnSpecies ); if ( ucFlankingColumnSpecies & aMaskOfSpeciesMeetingCriteria[ nConsPos ] ) { unsigned char ucAcceptableSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; aAcceptableSpeciesAtConsPos[ nConsPos ] = ucAcceptableSpeciesAtThisColumn; // the other species at this position can fail in 2 ways: // no flanking bases or not meeting criteria at this position // no flanking bases can be due to failing the flanking criteria // (e.g., in low quality segment) or not agreeing with human // is this the point at which we should record this info? for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); char cProblem = ' '; if ( ucAcceptableSpeciesAtThisColumn & ucSpeciesMask ) { cProblem = cProblemNone; } else if ( !( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucSpeciesMask ) ) { // this base/species has a problem cProblem = ( aProblemsWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ]; } else { // so the base itself is ok. So the flanking // bases must have a problem with them assert( !( ucFlankingColumnSpecies & ucSpeciesMask ) ); // fails flanking. Does it fail due to the flanking // bases not agreeing with human or due to them not // not meeting flanking criteria? if ( ( aProblemsForFlankingWithSpeciesAtConsPos[ nConsPos - 1 ] )[ nSpecies ] != cProblemNone ) { cProblem = ( aProblemsForFlankingWithSpeciesAtConsPos[ nConsPos - 1 ] )[ nSpecies ]; } else if ( ( aProblemsForFlankingWithSpeciesAtConsPos[ nConsPos + 1 ] )[ nSpecies ] != cProblemNone ) { cProblem = ( aProblemsForFlankingWithSpeciesAtConsPos[ nConsPos + 1 ] )[ nSpecies ]; } else { assert( false ); } // if the problem is with a flanking base, we don't // report the specific problem, but rather that the // problem was with the flanking base if ( cProblem != cProblemFlankingNotAgreeing ) { cProblem = cProblemFlankingOther; } } ( aProblemsIncludingFlankingWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ] = cProblem; } // for( int nSpecies = 0; nSpecies < } // if ( ucFlankingColumnSpecies & aMaskOfSpeciesMeetingCriteria[ } // if ( aMaskOfSpeciesMeetingFlankingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ... } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); // now get ancestral trees and hence deamination mutations RWTPtrVector aArrayOfTreesAtEachConsPos( nPaddedContigLength + 1, // + 1so we can use nConsPos (1-based) directly as an index 0, "aArrayOfTreesAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( nPaddedContigLength + 1, // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! aAcceptableSpeciesAtConsPos[ nConsPos ] ) continue; RWCString soColumnOfBases( (size_t) 11 ); // 6 bases and 5 spaces soColumnOfBases = ntGetCons( nConsPos ).cGetBase(); // start with human for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); char cBase; if ( ucSpeciesMask & aAcceptableSpeciesAtConsPos[ nConsPos ] ) { // if reached here, soSpecies is a flanked species // which, at this columns, meets the criteria // so print out the base LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // always the same number of bases on each line cBase = 'n'; } soColumnOfBases += cBase; } // for( int nSpecies = 0; nSpecies < aArrayOfBasesAtEachConsPos[ nConsPos ] = soColumnOfBases; RWTValOrderedVector aTrees; findAncestralTrees( soColumnOfBases, aTrees ); arrayOfRWCString* pTrees = new RWTValOrderedVector( aTrees ); aArrayOfTreesAtEachConsPos[ nConsPos ] = pTrees; } // for( nConsPos = nGetStartConsensusIndex(); // record all deamination mutations in these arrays RWTPtrVector aArrayOfDeaminationMutationsAtEachConsPos( nPaddedContigLength + 1, // so can // use nConsPos directly 0, // initial "aArrayOfDeaminationMutationsAtEachConsPos" ); typedef RWTValOrderedVector arrayOfFloat; RWTPtrVector aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos( nPaddedContigLength + 1, // so can use nConsPos directly 0, // initial value "aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos" ); for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aArrayOfTreesAtEachConsPos[ nConsPos - 1 ] ) continue; if ( !aArrayOfTreesAtEachConsPos[ nConsPos ] ) continue; checkTreesAndRecordDeaminationMutations( aArrayOfTreesAtEachConsPos[ nConsPos - 1 ], aArrayOfTreesAtEachConsPos[ nConsPos ], aArrayOfDeaminationMutationsAtEachConsPos[ nConsPos - 1 ], aArrayOfDeaminationMutationsAtEachConsPos[ nConsPos ], aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos - 1 ], aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos ] ); } // now used aAcceptableSpeciesAtConsPos to print out which // species will be used for each position for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex() - 1; ++nConsPos ) { if ( ! aAcceptableSpeciesAtConsPos[ nConsPos ] ) continue; RWCString soColumnOfBases( (size_t) 6 ); // 6 bases and no spaces RWCString soColumnOfProblems( (size_t) 5 ); // 5 non-human soColumnOfBases = ntGetCons( nConsPos ).cGetBase(); // start with human char cHumanBase = ntGetCons( nConsPos ).cGetBase(); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); char cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); soColumnOfProblems += ( aProblemsIncludingFlankingWithSpeciesAtConsPos[ nConsPos ] )[ nSpecies ]; soColumnOfBases += cBase; } // for( int nSpecies = 0; nSpecies < // now look for any deamination mutations RWTValOrderedVector* pArrayOfProbabilitiesOfDeaminationMutationsAtEachSpecies = aArrayOfProbabilitiesOfDeaminationMutationsAtEachConsPos[ nConsPos ]; RWCString soDeaminationMutationFlags; bool bDeam = false; if ( pArrayOfProbabilitiesOfDeaminationMutationsAtEachSpecies ) { for( int nDeam = 0; nDeam < pArrayOfProbabilitiesOfDeaminationMutationsAtEachSpecies->length(); ++nDeam ) { if ( (*pArrayOfProbabilitiesOfDeaminationMutationsAtEachSpecies)[ nDeam ] >= 0.5 ) { bDeam = true; break; } } } char cBaseBefore = ntGetCons( nConsPos - 1 ).cGetBase(); char cBaseAfter = ntGetCons( nConsPos + 1 ).cGetBase(); fprintf( pAO, "%c%c %s e%s %4d %s\n", cBaseBefore, cBaseAfter, soColumnOfBases.data(), soColumnOfProblems.data(), nUnpaddedIndex( nConsPos ), ( bDeam ? "deam " : "nodeam" ) ); } // for( nConsPos = nGetStartConsensusIndex(); } void Contig :: printCrudeChimpHumanMutations( autoReport* pAutoReport ) { setTransitionsTransversionsTableIfNecessary(); RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); mbtValVector aSpeciesInAcceptableColumns( 1, // will be changed 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); RWTValVector aArrayOfOneTreeAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfOneTreeAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); RWTValVector aArrayOfLenientlyFilteredBasesAtEachConsPos( 1, // size--will be changed // + 1 so we can use nConsPos (1-based) directly as an index "", // initial value "aArrayOfLenientlyFilteredBasesAtEachConsPos" ); RWTValVector aDeaminationMutationsAtEachConsPos( 1, // size--will be changed "", // initial value "aDeaminationMutationsAtEachConsPos" ); RWTValVector aAncestralCpGAtEachConsPos( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); // this is different than above in that if a CpG occurs, both // columns are marked, rather than just the first column RWTValVector aAncestralCpGAtEachConsPos2( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); LocatedFragment* pHumanRead; applyFilters( pAutoReport, aCombinedReads, aSpeciesInAcceptableColumns, aArrayOfOneTreeAtEachConsPos, aArrayOfBasesAtEachConsPos, aArrayOfLenientlyFilteredBasesAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2, pHumanRead ); RWCString soLocus = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aArrayOfOneTreeAtEachConsPos[ nConsPos ].isNull() ) continue; // for comparison with Graham's data if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucPTro ) ) continue; if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucMMul ) ) continue; if ( aAncestralCpGAtEachConsPos2[ nConsPos ] != cNoCpG ) continue; // end for comparison with Graham's data RWCString soBases = aArrayOfBasesAtEachConsPos[ nConsPos ]; char cPTro = soBases[nPTro]; char cHSap = soBases[nHSap]; char cMMul = soBases[nMMul]; // can't exclude it here, since some of these do not have indels // in Grahams data. // Exclude it in perl. // we are not interested in indels // if ( cPTro == '*' || // cHSap == '*' || // cMMul == '*' ) continue; // if ( cPTro == cHSap && cHSap == cMMul ) continue; RWCString soBranch; char cBase1; char cBase2; if ( cPTro == cHSap && cHSap == cMMul ) { soBranch = "o"; } else if ( cPTro == cMMul ) { soBranch = "HSap_ex"; } else if ( cHSap == cMMul ) { soBranch = "PTro_ex"; } else { // neither human nor chimp matches mmul--not useful for this analysis soBranch = "o"; } // change pads from * to - soBases.translate( "*", "-" ); fprintf( pAO, "mutation %s %15s %4d %s %c\n", soBranch.data(), soLocus.data(), nUnpaddedIndex( nConsPos ), soBases.data(), cTransitionsTransversions[cHSap][cPTro] ); } }