/*****************************************************************************
#   Copyright (C) 1994-2008 by David Gordon.
#   All rights reserved.
#
#   This software is part of a beta-test version of the Consed/Autofinish
#   package.  It should not be redistributed or
#   used for any commercial purpose, including commercially funded
#   sequencing, without written permission from the author and the
#   University of Washington.
#
#   This software is provided ``AS IS'' and any express or implied
#   warranties, including, but not limited to, the implied warranties of
#   merchantability and fitness for a particular purpose, are disclaimed.
#   In no event shall the authors or the University of Washington be
#   liable for any direct, indirect, incidental, special, exemplary, or
#   consequential damages (including, but not limited to, procurement of
#   substitute goods or services; loss of use, data, or profits; or
#   business interruption) however caused and on any theory of liability,
#   whether in contract, strict liability, or tort (including negligence
#   or otherwise) arising in any way out of the use of this software, even
#   if advised of the possibility of such damage.
#
#   Building Consed from source is error-prone and not simple, which is
#   why I provide executables.  Due to time limitations I cannot
#   provide any assistance in building Consed.  Even if you do not
#   modify the source, you may introduce errors due to using a
#   different version of the compiler, a different version of motif,
#   different versions of other libraries than I used, etc.  For this
#   reason, if you discover Consed bugs, I can only offer help with
#   those bugs if you first reproduce those bugs with an executable
#   provided by me--not an executable you have built.
#
#   Modifying Consed is also difficult.  Although Consed is modular,
#   some modules are used by many other modules.  Thus making a change
#   in one place can have unforeseen effects on many other features.
#   It may take months for you to notice these other side effects,
#   which may not seem connected at all.  It is not feasible for me to
#   provide help with modifying Consed sources because of the
#   potentially huge amount of time involved.
#
#*****************************************************************************/

#include "contig.h"
#include "consedParameters.h"
#include "locatedFragment.h"
#include "bIntersect.h"
#include "mbtValVectorOfBool.h"
#include "mbtValVector.h"
#include "rwtptrorderedvector.h"
#include "ntide.h"
#include "fileDefines.h"
#include "soAddCommas.h"
#include "rwctokenizer.h"
#include "autoReport.h"
#include "consed.h"
#include "assembly.h"
#include "nNumberFromBase.h"
#include "setErrorRateFromQuality.h"
#include "dErrorRateFromQuality.h"
#include "singletInfo.h"
#include "readPHDAgainForHighQualitySegment.h"
#include
using namespace std;
#include
#include
#include "rwcregexp.h"
#include "getAllSinglets.h"
#include "mbt_val_ord_offset_vec.h"
#include "soLine.h"
#include "max.h"
#include
#include "findAncestralTrees.h"
#include "ucGetMaskForSpecies.h"
#include "bAllNeededSpecies.h"
#include "assert.h"
#include "kimuraBranchProb.h"
#include "soGetBranchNameWithoutTree.h"

static const RWCString soHCClade("hc" );
static const RWCString soHGClade("hg" );
static const RWCString soCGClade("cg" );

static char cGetAgreeingBasesNoNs( const char cA, const char cB ) {

    if ( cA == cB && cA != 'n' ) {
        return( cA );
    }
    else if ( cA != cB ) {
        if ( cA == 'n' )
            return( cB );
        else if ( cB == 'n' )
            return( cA );
        else {
            return( 0 );
        }
    }
    else {
        // bases both equal n
        assert( false );
        return( 0 ); // not reached, but avoids falling off the end if asserts are compiled out
    }
}

static bool bIsAllPadsOrNs( const RWCString& soBases ) {

    for( int n = 0; n < soBases.length(); ++n ) {
        if ( soBases[n] != '*' && soBases[n] != 'n' ) {
            return false;
        }
    }

    return true;
}

void Contig :: chooseTreesUsingBadData( const RWCString& soColumnOfBadBases2,
                                        RWTValOrderedVector<RWCString>& aTrees,
                                        RWTValVector<bool>* pChosenTrees ) {

    // convert soColumnOfBadBases2 to a form with the following order:
    // from:
    //    HSap PPan PTro GGor PPyg MMul
    // to this form:
    //    PPan PTro HSap GGor PPyg MMul

    char cHSap = soColumnOfBadBases2[0];

    RWCString soColumnOfBadBases = soColumnOfBadBases2.soGetRestOfString( 1 );
    soColumnOfBadBases.insert( 2, RWCString( cHSap ) );

    RWTValVector<bool> aTreesMatching( (size_t) aTrees.length(),
                                       false,
                                       "aTreesMatching" );

    int nNumberOfMatches = 0;

    int nTree;
    for( nTree = 0; nTree < aTrees.length(); ++nTree ) {

        if ( !(*pChosenTrees)[ nTree ] ) continue;

        RWCString soTree = aTrees[ nTree ];
        soTree.removeSingleCharacterAllOccurrences('?');
        soTree.removeSingleCharacterAllOccurrences('(');
        soTree.removeSingleCharacterAllOccurrences(')');
        soTree.removeSingleCharacterAllOccurrences(',');

        // After stripping, the tree string looks like:
        //
        //    index:  0  1  2  3  4    5    6    7    8    9
        //                        PPan PTro HSap GGor PPyg MMul
        //
        //    index 0: ancestral base leading to PPyg and MMul
        //    index 1: ancestral base leading to GGor
        //    index 2: ancestral base leading to HSap
        //    index 3: ancestral base leading to PPan and PTro
        //    indices 4-9: present-day bases, PPan PTro HSap GGor PPyg MMul

        RWCString soPresentDayBasesFromTree = soTree(4, 6 );

        if ( soColumnOfBadBases.length() != soPresentDayBasesFromTree.length() ) {
            RWCString soError = "bad bases: " + soColumnOfBadBases +
                " not same length as present-day " + soPresentDayBasesFromTree;
            THROW_ERROR( soError );
        }

        bool bMatches = true;
        for( int nSpecies = 0; nSpecies < soColumnOfBadBases.length(); ++nSpecies ) {
            if ( ( soColumnOfBadBases[nSpecies] != 'n' ) &&
                 ( soColumnOfBadBases[nSpecies] != soPresentDayBasesFromTree[nSpecies] ) ) {
                bMatches = false;
                break;
            }
        }

        if ( bMatches ) {
            aTreesMatching[ nTree ] = true;
            ++nNumberOfMatches;
        }
    }

    if ( nNumberOfMatches == 0 ) {
        // do not use bad data to choose the tree
        return;
    }
    else {
        for( nTree = 0; nTree < aTreesMatching.length(); ++nTree ) {
            (*pChosenTrees)[ nTree ] = aTreesMatching[ nTree ];
        }
    }
}

void Contig :: printFlankedColumns2( autoReport* pAutoReport ) {

    // read all of the good reads

    RWCString soGoodReads =
pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // the point of the look below is to set aSomeCombinedReadHasNoPad // and to set the single signal arrays (aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't have a human read (rare, and // pathological), ignore this contig. if ( !pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... 
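    // At this point aSomeCombinedReadHasNoPad[ nConsPos ] is true wherever
    // the human read has a non-pad base or some species contributed a
    // non-pad combined base.  Columns where it is still false have a pad
    // in the human read and no species contributed a non-pad combined
    // base, e.g. (one species, '*' = pad):
    //
    //    human read:     a * c
    //    combined read:  a * c
    //
    // and the middle column is removed below.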
// remove columns in which all combined reads have a pad // first do it to single signal arrays int nRead; for( nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." 
) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't contain a human read, // ignore the contig if ( !pHumanRead ) return; // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whiever reads are available for that // species and their single-signal arrays unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask; // agreeing pads do not count as flanking bases. To be // sure, explicitly check for this. (Even though we deleted // columns of pads, the might be some columns not removed // that have a pad in human and some species but not others.) if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < ... 
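    // A note on the masks just filled in: each species contributes one bit,
    // whatever ucGetMaskForSpecies() assigns it.  For illustration only,
    // suppose PPan = 0x01, PTro = 0x02 and GGor = 0x04; a column where PPan
    // and GGor met the criteria then holds 0x05, and
    // bAllNeededSpecies( 0x05 ) asks whether that subset of species is
    // sufficient.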
// now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns mbtValVector aColumnsAlreadyUsedSpecies( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aColumnsAlreadyUsedSpecies" ); int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + 4, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ) ) { // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ]; // now look at all columns between these flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not use the same column more than once if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies ) ) { unsigned char ucFlankedSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aColumnsAlreadyUsedSpecies[ nConsPos ] != 0 && ( ( aColumnsAlreadyUsedSpecies[ nConsPos ] & ucFlankedSpeciesAtThisColumn ) != aColumnsAlreadyUsedSpecies[ nConsPos ] ) ) continue; aColumnsAlreadyUsedSpecies[ nConsPos ] |= ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // } } // if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); fprintf( pAO, "printFlankedColumns {\n" ); fprintf( pAO, "order of species: HSap" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; fprintf( pAO, " %s", soSpecies.data() ); } fprintf( pAO, "\n" ); // now used aColumnsAlreadyUsedSpecies to print out which // species will be used for each position for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! 
aColumnsAlreadyUsedSpecies[ nConsPos ] ) continue; fprintf( pAO, "%d %d %c", nUnpaddedIndex( nConsPos ), nConsPos, ntGetCons( nConsPos ).cGetBase() ); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); if ( ucSpeciesMask & aColumnsAlreadyUsedSpecies[ nConsPos ] ) { // if reached here, soSpecies is a flanked species // which, at this columns, meets the criteria // so print out the base LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); fprintf( pAO, " %c", pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ); } else { // always the same number of bases on each line fprintf( pAO, " ?" ); } } // for( int nSpecies = 0; nSpecies < // print contig name (which has the locus name in it fprintf( pAO, " %s\n", soGetName().data() ); } fprintf( pAO, "} printFlankedColumns\n" ); } static bool bNoMutationThisColumn( const RWCString& soBases ) { char cBase = 0; for( int n = 0; n < soBases.length(); ++n ) { if ( soBases[n] == 'n' ) continue; if ( !cBase ) { cBase = soBases[n]; } else { if ( cBase != soBases[n] ) { return false; } } } return true; } void Contig :: printFlankedColumns3( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // the point of the look below is to set aSomeCombinedReadHasNoPad // and to set the single signal arrays (aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't have a human read (rare, and // pathological), ignore this contig. 
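      // (The human read is used as the reference for every consensus
      // position below--see the asserts on bIsInAlignedPartOfRead()--so
      // nothing useful can be computed without it.)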
if ( !pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... // remove columns in which all combined reads have a pad // first do it to single signal arrays int nRead; for( nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; 
LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't contain a human read, // ignore the contig if ( !pHumanRead ) return; // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whiever reads are available for that // species and their single-signal arrays unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask; // agreeing pads do not count as flanking bases. To be // sure, explicitly check for this. (Even though we deleted // columns of pads, the might be some columns not removed // that have a pad in human and some species but not others.) if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < ... 
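    // Flanking scheme applied below: a column is usable for a species only
    // if it lies strictly between two columns, 2 to 4 positions apart,
    // where that species (and every other needed species) both met the
    // criteria and agreed with the human consensus base, and the species
    // meets the criteria in the column itself.  Rough sketch for a single
    // species, ignoring the all-needed-species requirement
    // ( 'F' = criteria and agrees with human, 'c' = criteria only, '.' = neither ):
    //
    //    consensus position:  1 2 3 4 5
    //    status:              F c c F .
    //
    // positions 2 and 3 are flanked by 1 and 4 and are marked usable;
    // position 5 is not.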
// now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns mbtValVector aColumnsAlreadyUsedSpecies( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aColumnsAlreadyUsedSpecies" ); int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + 4, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ) ) { // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ]; // now look at all columns between these flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not use the same column more than once if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies ) ) { unsigned char ucFlankedSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aColumnsAlreadyUsedSpecies[ nConsPos ] != 0 && ( ( aColumnsAlreadyUsedSpecies[ nConsPos ] & ucFlankedSpeciesAtThisColumn ) != aColumnsAlreadyUsedSpecies[ nConsPos ] ) ) continue; aColumnsAlreadyUsedSpecies[ nConsPos ] |= ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // } } // if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); fprintf( pAO, "printFlankedColumns {\n" ); fprintf( pAO, "order of species: HSap" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; fprintf( pAO, " %s", soSpecies.data() ); } fprintf( pAO, "\n" ); // now used aColumnsAlreadyUsedSpecies to print out which // species will be used for each position for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! 
aColumnsAlreadyUsedSpecies[ nConsPos ] ) continue; RWCString soColumnOfBases( (size_t) 11 ); // 6 bases and 5 spaces fprintf( pAO, "%d %d %c", nUnpaddedIndex( nConsPos ), nConsPos, ntGetCons( nConsPos ).cGetBase() ); soColumnOfBases = ntGetCons( nConsPos ).cGetBase(); // start with human for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); char cBase; if ( ucSpeciesMask & aColumnsAlreadyUsedSpecies[ nConsPos ] ) { // if reached here, soSpecies is a flanked species // which, at this columns, meets the criteria // so print out the base LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // always the same number of bases on each line cBase = 'n'; } soColumnOfBases += cBase; fprintf( pAO, " %c", cBase ); } // for( int nSpecies = 0; nSpecies < // print contig name (which has the locus name in it fprintf( pAO, " %s", soGetName().data() ); if ( bNoMutationThisColumn( soColumnOfBases ) ) { fprintf( pAO, " same\n" ); } else { RWTValOrderedVector aTrees; findAncestralTrees( soColumnOfBases, aTrees ); for( int nTree = 0; nTree < aTrees.length(); ++nTree ) { fprintf( pAO, " %s", aTrees[ nTree ].data() ); } fprintf( pAO, "\n" ); } } // for( nConsPos = nGetStartConsensusIndex(); fprintf( pAO, "} printFlankedColumns\n" ); } void Contig :: countAcceptableColumnsWithNoneOnLeft( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // the point of the look below is to set aSomeCombinedReadHasNoPad // and to set the single signal arrays (aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." 
) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't have a human read (rare, and // pathological), ignore this contig. if ( !pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... 
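    // From here on the column preparation mirrors printFlankedColumns2/3:
    // drop the all-pad columns, build one combined read per species, mark
    // the acceptable (flanked) columns, and then count runs of acceptable
    // columns per species below.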
// remove columns in which all combined reads have a pad // first do it to single signal arrays int nRead; for( nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." 
) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // If we are in a contig that doesn't contain a human read, // ignore the contig if ( !pHumanRead ) return; // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whiever reads are available for that // species and their single-signal arrays unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask; // agreeing pads do not count as flanking bases. To be // sure, explicitly check for this. (Even though we deleted // columns of pads, the might be some columns not removed // that have a pad in human and some species but not others.) if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < ... 
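    // The counts printed at the end summarize how the acceptable columns
    // cluster.  For one species, an acceptable-column pattern of
    //
    //    . X X . X X X .
    //
    // has 5 acceptable columns, 2 of them with none on the left (the first
    // column of each run) and 4 with none on at least one side (only the
    // middle column of the run of three is flanked on both sides).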
// now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns mbtValVector aSpeciesInAcceptableColumns( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + 4, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ] ) ) { // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfEndingColumn ]; // now look at all columns between these flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not use the same column more than once if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies ) ) { unsigned char ucFlankedSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aSpeciesInAcceptableColumns[ nConsPos ] != 0 && ( ( aSpeciesInAcceptableColumns[ nConsPos ] & ucFlankedSpeciesAtThisColumn ) != aSpeciesInAcceptableColumns[ nConsPos ] ) ) continue; aSpeciesInAcceptableColumns[ nConsPos ] |= ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // } } // if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); fprintf( pAO, "printFlankedColumns {\n" ); fprintf( pAO, "order of species: HSap" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; fprintf( pAO, " %s", soSpecies.data() ); } fprintf( pAO, "\n" ); // now use aSpeciesInAcceptableColumns to print out which // species will be used for each position for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( 
soSpecies ); int nColumnsWithNoneOnLeft = 0; int nColumnsWithSomeOrNoneOnLeft = 0; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aSpeciesInAcceptableColumns[ nConsPos ] & ucSpeciesMask ) { ++nColumnsWithSomeOrNoneOnLeft; if ( nConsPos == nGetStartConsensusIndex() ) { ++nColumnsWithNoneOnLeft; } else if ( !( aSpeciesInAcceptableColumns[ nConsPos - 1 ] & ucSpeciesMask ) ) { ++nColumnsWithNoneOnLeft; } } } fprintf( pAO, "%s %d usable columns with none on left out of %d acceptable\n", soSpecies.data(), nColumnsWithNoneOnLeft, nColumnsWithSomeOrNoneOnLeft ); } for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); int nColumnsWithNoneOnOneSide = 0; int nColumnsWithSomeOrNoneOnOneSide = 0; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aSpeciesInAcceptableColumns[ nConsPos ] & ucSpeciesMask ) { ++nColumnsWithSomeOrNoneOnOneSide; if ( nConsPos == nGetStartConsensusIndex() || nConsPos == nGetEndConsensusIndex()) { ++nColumnsWithNoneOnOneSide; } else if ( !( aSpeciesInAcceptableColumns[ nConsPos - 1 ] & ucSpeciesMask ) || !( aSpeciesInAcceptableColumns[ nConsPos + 1 ] & ucSpeciesMask ) ) { ++nColumnsWithNoneOnOneSide; } } } fprintf( pAO, "%s %d usable columns with none on one side out of %d acceptable\n", soSpecies.data(), nColumnsWithNoneOnOneSide, nColumnsWithSomeOrNoneOnOneSide ); } int nColumnsWithNoneOnOneSide = 0; int nColumnsWithSomeOrNoneOnOneSide = 0; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aSpeciesInAcceptableColumns[ nConsPos ] ) { ++nColumnsWithSomeOrNoneOnOneSide; if ( nConsPos == nGetStartConsensusIndex() || nConsPos == nGetEndConsensusIndex()) { ++nColumnsWithNoneOnOneSide; } else if ( !aSpeciesInAcceptableColumns[ nConsPos - 1] || !aSpeciesInAcceptableColumns[ nConsPos + 1] ) { ++nColumnsWithNoneOnOneSide; } } } fprintf( pAO, "%d usable real columns with none on one side out of %d acceptable\n", nColumnsWithNoneOnOneSide, nColumnsWithSomeOrNoneOnOneSide ); } static RWCString soGetTreeWithNoMutation( const RWCString soPresentDayBasesHumanInFront ) { RWCString soPresentDayBasesPPanInFront = soPresentDayBasesHumanInFront; // PPan soPresentDayBasesPPanInFront[0] = soPresentDayBasesHumanInFront[1]; // PTro soPresentDayBasesPPanInFront[1] = soPresentDayBasesHumanInFront[2]; // HSap soPresentDayBasesPPanInFront[2] = soPresentDayBasesHumanInFront[0]; // find base char cBase = 0; int n; for( n = 0; n < soPresentDayBasesPPanInFront.length(); ++n ) { if ( soPresentDayBasesPPanInFront[n] != 'n' ) { cBase = soPresentDayBasesPPanInFront[n]; break; } } assert( cBase ); // trees look like: // g(g(t(t(t(t,t),t),t),g?),g) RWCString soTreeLeft; RWCString soTreeRight; for( n = soPresentDayBasesPPanInFront.length() - 1; n >= 1; --n ) { soTreeLeft += RWCString( cBase ); soTreeLeft += "("; soTreeRight = "," + RWCString( cBase ) + ( soPresentDayBasesPPanInFront[n] == 'n' ? "?" : "" ) + ")" + soTreeRight; } // we have to handle PPan separately from the loop above because it // might have a "?" next to it RWCString soTree = soTreeLeft + RWCString( cBase ) + ( soPresentDayBasesPPanInFront[0] == 'n' ? "?" 
: "" ) + soTreeRight; return( soTree ); } void Contig :: adjustReadQualitiesAndRecalculateHighQualitySegments( autoReport* pAutoReport, const RWTPtrOrderedVector& aGoodReads ) { for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there is only any adjusting to be done if we have // both a forward and a reverse good-hit read if ( !pForwardRead || !pReverseRead ) continue; int nIntersectLeft; int nIntersectRight; if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) continue; for( int nConsPos = nIntersectLeft; nConsPos <= nIntersectRight; ++nConsPos ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { int nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nCombinedQuality = MIN( nCombinedQuality, 90 ); pForwardRead->setQualityAtConsPos( nConsPos, nCombinedQuality ); pReverseRead->setQualityAtConsPos( nConsPos, nCombinedQuality ); } } // now adjust the high quality segments pForwardRead->calculateHighQualitySegmentOfRead(); pReverseRead->calculateHighQualitySegmentOfRead(); } // for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); } void Contig :: applyFilters( autoReport* pAutoReport, RWTPtrVector& aCombinedReads, mbtValVector& aSpeciesInAcceptableColumns, RWTValVector& aArrayOfOneTreeAtEachConsPos, RWTValVector& aArrayOfBasesAtEachConsPos, RWTValVector& aArrayOfLenientlyFilteredBasesAtEachConsPos, RWTValVector& aDeaminationMutationsAtEachConsPos, RWTValVector& aAncestralCpGAtEachConsPos2, // aAncestralCpGAtEachConsPos has just // the left position marked for an // ancestral CpG // aAncestralCpGAtEachConsPos2 has both // positions marked for an ancestral CpG LocatedFragment*& pHumanRead ) { // this differs from printFlankedColumns3 in that it records the // trees and attempts to identify which mutations are deamination // mutations // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } RWTPtrOrderedVector aGoodReads( (size_t) 10 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName() == soLine ) { // the fake read for the 2nd alignment assert( !pLocFrag->soGetName().bContains( "b." 
) ); aGoodReads.insert( pLocFrag ); } } } fclose( pGoodReads ); // aGoodReads.resort(); inefficient--just check every one // find pHumanRead pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; break; } } assert( pHumanRead ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // set the single signal arrays (aListOfSingleSignalArrays) int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); } } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_... if ( pCP->bAutoReportDeleteColumnsOfPadsBeforeAdjustingReadQualityValues_ ) { deleteColumnsOfPads( true, // bAddQualityValuesOfStrands pAutoReport, pHumanRead, aGoodReads, aListOfReadsInSingleSignalArrays, aListOfSingleSignalArrays ); adjustReadQualitiesAndRecalculateHighQualitySegments( pAutoReport, aGoodReads ); } else { adjustReadQualitiesAndRecalculateHighQualitySegments( pAutoReport, aGoodReads ); deleteColumnsOfPads( false, // bAddQualityValuesOfStrands pAutoReport, pHumanRead, aGoodReads, aListOfReadsInSingleSignalArrays, aListOfSingleSignalArrays ); } // new padded length after deleting columns int nPaddedContigLength = nGetPaddedConsLength(); // create a combined read, one for each species // each combined read will be the length of the entire consensus for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } // Now create bit arrays, one for each species mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = 
aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads( nRead ); if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whiever reads are available for that // species and their single-signal arrays unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; bool bCombinedReadALittleOK; bool bCombinedReadOK = bGetCombinedReadAtBase3( false, // bAddQualityValuesOfStrands pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, bCombinedReadALittleOK); if ( bCombinedReadOK || bCombinedReadALittleOK ) { pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); // agreeing pads do not count as flanking bases. To be // sure, explicitly check for this. (Even though we deleted // columns of pads, the might be some columns not removed // that have a pad in human and some species but not others.) // Note that we have lowered the requirement for flanking // bases--they need not be single signal, nor above a quality // threshold nor in high quality segment. They must be // aligned and good-hit reads. if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } if ( bCombinedReadOK ) aMaskOfSpeciesMeetingCriteria[nConsPos] |= ucSpeciesMask; } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < ... 
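    // Note the asymmetry between the two masks set above:
    // aMaskOfSpeciesMeetingCriteria requires the strict combined-read test
    // (bCombinedReadOK), while aMaskOfSpeciesAgreeingWithHuman accepts the
    // relaxed test as well (bCombinedReadALittleOK) but additionally
    // requires the combined base to match the non-pad consensus base.  The
    // flanking columns chosen below therefore only need aligned, good-hit
    // reads that agree with human; the interior columns must meet the full
    // criteria.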
// now we have finished setting 3 key arrays: // aCombinedReads // aMaskOfSpeciesMeetingCriteria // aMaskOfSpeciesAgreeingWithHuman // with these, we can figure out the flanked columns and // print out the bases in those columns aSpeciesInAcceptableColumns.reshape( nGetPaddedConsLength(), 0 ); // initial value const int nWidthOfFlankedRegion = 3; int nConsPosOfStartingColumn; for( nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + nWidthOfFlankedRegion + 1, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( bAllNeededSpecies( aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfEndingColumn ] ) ) { // we have a pair of flanking columns. However, the // columns between the flanking columns may not // be ok. For example, it might be that the flanking // gorilla is single signal, but one of the internal // columns doesn't have single signal for gorilla. // I think in such a case, we could use the columns // that are ok, but just not use the column in which // gorilla is not ok. We should not use any species // that isn't supported by the flanking columns, though. unsigned char ucFlankingColumnSpecies = aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfStartingColumn ] & aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfEndingColumn ]; // now look at all columns between these flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not use the same column more than once if ( bAllNeededSpecies( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies ) ) { unsigned char ucFlankedSpeciesAtThisColumn = aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucFlankingColumnSpecies; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aSpeciesInAcceptableColumns[ nConsPos ] != 0 && ( ( aSpeciesInAcceptableColumns[ nConsPos ] & ucFlankedSpeciesAtThisColumn ) != aSpeciesInAcceptableColumns[ nConsPos ] ) ) continue; aSpeciesInAcceptableColumns[ nConsPos ] |= ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // } } // if ( bAllNeededSpecies( aMaskOfSpeciesAgreeingWithHuman[ nConsPosOfStartingColumn ] ) ) { } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); int nArraySize = nGetEndConsensusIndex() - nGetStartConsensusIndex() + 1; aArrayOfOneTreeAtEachConsPos.reshape( nArraySize + 1, "" ); // initial value // + 1so we can use nConsPos (1-based) directly as an index aArrayOfBasesAtEachConsPos.reshape( nArraySize + 1, // + 1so we can use nConsPos (1-based) directly as an index "" ); // initial value aArrayOfLenientlyFilteredBasesAtEachConsPos.reshape( nArraySize + 1, "" ); // initial value typedef RWTValVector typeArrayOfBool; RWTPtrVector aArrayOfChosenTreesAtEachConsPos( (size_t) nArraySize + 1, 0, // initial values "aArrayOfChosenTreesAtEachConsPos" ); RWTPtrVector aArrayOfTreesAtEachConsPos( (size_t) nArraySize + 1, 0, // initial values "aArrayofTreesAtEachConsPos" ); 
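// Worked example of the flanking rule applied above (column numbers are
// illustrative only): if every needed species agrees with human at
// columns 100 and 103, then for each internal column 101..102 the species
// recorded in aSpeciesInAcceptableColumns are those whose bits are set
// both at that column in aMaskOfSpeciesMeetingCriteria and at both
// flanking columns in aMaskOfSpeciesAgreeingWithHuman.  A column that was
// already recorded from an earlier pair of flanking columns is only
// overwritten when the new species set contains the old one, i.e. when
// ( ucOld & ucNew ) == ucOld.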
int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // Jan 18 changed so that bases are returned even for // columns that do not pass all filters // if ( ! aSpeciesInAcceptableColumns[ nConsPos ] ) continue; RWCString soColumnOfBases( (size_t) 6 ); RWCString soColumnOfBadBases( (size_t) 6 ); soColumnOfBases = ntGetCons( nConsPos ).cGetBase(); // start with human soColumnOfBadBases = soColumnOfBases; for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); char cBadBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); char cBase; if ( ucSpeciesMask & aSpeciesInAcceptableColumns[ nConsPos ] ) { // if reached here, soSpecies is a flanked species // which, at this columns, meets the criteria // so print out the base cBase = cBadBase; } else { // always the same number of bases on each line cBase = 'n'; } // soColumnOfBases in in order HSap PPan PTro ... if ( soSpecies != "HSap" ) { soColumnOfBases += cBase; soColumnOfBadBases += cBadBase; } } // for( int nSpecies = 0; nSpecies < assert( soColumnOfBadBases.length() == soColumnOfBases.length() ); aArrayOfBasesAtEachConsPos[ nConsPos ] = soColumnOfBases; aArrayOfLenientlyFilteredBasesAtEachConsPos[ nConsPos ] = soColumnOfBadBases; RWTValOrderedVector aTrees; findAncestralTrees( soColumnOfBases, aTrees ); arrayOfRWCString* pTrees = new RWTValOrderedVector( aTrees ); aArrayOfTreesAtEachConsPos[ nConsPos ] = pTrees; aArrayOfChosenTreesAtEachConsPos[ nConsPos ] = new RWTValVector( (size_t) aTrees.length(), true, // initial value "aArrayOfChosenTreesAtEachConsPos( " + RWCString( (long) nConsPos ) + ")" ); if ( aTrees.length() > 1 && pCP->bAutoReportChooseTreesUsingBadData_ ) { chooseTreesUsingBadData( soColumnOfBadBases, *pTrees, aArrayOfChosenTreesAtEachConsPos[ nConsPos ] ); } } // for( nConsPos = nGetStartConsensusIndex(); // if we are only interested in single parsimonious tree sites, delete // those that are not if ( pCP->bAutoReportIgnoreMultipleTrees_ ) { for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aArrayOfTreesAtEachConsPos[ nConsPos ] ) { if ( aArrayOfTreesAtEachConsPos[ nConsPos ]->length() > 1 ) { aSpeciesInAcceptableColumns[ nConsPos ] = 0; aArrayOfTreesAtEachConsPos[ nConsPos ] = 0; } } } } // choosen trees based on deamination mutations for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aArrayOfTreesAtEachConsPos[ nConsPos - 1 ] ) continue; if ( !aArrayOfTreesAtEachConsPos[ nConsPos ] ) continue; checkForDeaminationMutationsAmongMultipleTrees( aArrayOfTreesAtEachConsPos[ nConsPos - 1 ], aArrayOfTreesAtEachConsPos[ nConsPos ], nConsPos - 1, aArrayOfChosenTreesAtEachConsPos[ nConsPos - 1 ], aArrayOfChosenTreesAtEachConsPos[ nConsPos ] ); } if ( pCP->bAutoReportChooseTreesUsingKimura_ ) { // choose most likely tree at each position for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { arrayOfRWCString* pArrayOfTrees = aArrayOfTreesAtEachConsPos[ nConsPos ]; if ( pArrayOfTrees ) { RWTValVector* pChosenTrees = aArrayOfChosenTreesAtEachConsPos[ nConsPos ]; assert( pChosenTrees ); assert( pChosenTrees->length() == pArrayOfTrees->length() ); if ( pArrayOfTrees->length() == 1 ) { 
(*pChosenTrees)[0] = true; } else { float fHighestProbability = 0.0; int nIndexOfBestTree = -666; RWCString soBestTree; int nTree; for( nTree = 0; nTree < pArrayOfTrees->length(); ++nTree ) { if ( !(*pChosenTrees)[ nTree ] ) continue; RWCString soTree = (*pArrayOfTrees)[ nTree ]; float fProbOfOneTree = fGetProbabilityOfTree( soTree ); if ( fProbOfOneTree > fHighestProbability ) { fHighestProbability = fProbOfOneTree; soBestTree = soTree; nIndexOfBestTree = nTree; } } for( nTree = 0; nTree < pArrayOfTrees->length(); ++nTree ) { if ( nTree == nIndexOfBestTree ) (*pChosenTrees)[ nTree ] = true; else (*pChosenTrees)[ nTree ] = false; } } // if ( pArrayOfTrees->length() == 1 ) { else } // if ( pArrayOfTrees ) { } // for( nConsPos = nGetStartConsensusIndex(); } // if ( pCP->bAutoReportChooseTreesUsingKimura_ ) { // now shouldn't I only return the trees that are chosen? for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { RWTValVector* pChosenTrees = aArrayOfChosenTreesAtEachConsPos[ nConsPos ]; arrayOfRWCString* pArrayOfTrees = aArrayOfTreesAtEachConsPos[ nConsPos ]; if ( pArrayOfTrees ) { int nNumberOfChosenTrees = 0; for( int nTree = 0; nTree < pChosenTrees->length(); ++nTree ) { if ( (*pChosenTrees)[ nTree ] ) { ++nNumberOfChosenTrees; if ( nNumberOfChosenTrees == 1 ){ RWCString soTree = (*pArrayOfTrees)[ nTree ]; aArrayOfOneTreeAtEachConsPos[ nConsPos ] = soTree; } } } if ( nNumberOfChosenTrees != 1 ) { aArrayOfOneTreeAtEachConsPos[ nConsPos ] = ""; aSpeciesInAcceptableColumns[ nConsPos ] = 0; } } } // at this point, I need to go through the chosen trees and check // for deamination mutations (again), now that I have chosen the trees. // now look at mutations in which the ancestral sequence is a CpG // and it clearly is a deamination mutation. Mark them as such. 
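// Background for the CpG bookkeeping below: deamination of a methylated
// cytosine in a CpG dinucleotide produces thymine, so an ancestral "cg"
// at two adjacent positions typically decays to "tg" (C->T at the first
// position) or, seen from the opposite strand, to "ca" (G->A at the
// second position).  That is why the routines below always examine pairs
// of adjacent consensus positions ( nConsPos - 1, nConsPos ) and test the
// concatenated bases against "cg", "tg" and "ca".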
aDeaminationMutationsAtEachConsPos.reshape( nArraySize + 1, "" ); // if a CpG occurs, both // columns are marked, rather than just the first column aAncestralCpGAtEachConsPos2.reshape( nArraySize + 1, cNoCpG ); // initial value checkForDeaminationMutationsAmongUniqueTrees( aArrayOfOneTreeAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2 ); if ( pCP->bAutoReportOnlyAllowSitesThatAreBetweenAcceptableSites_ ) { RWTValVector aSitesBetweenTwoGoodSites( nGetPaddedConsLength() + 1, false, // initial value "aSitesBetweenTwoGoodSites" ); for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex() - 1; ++nConsPos ) { if ( bAllNeededSpecies( aSpeciesInAcceptableColumns[ nConsPos - 1 ] ) && bAllNeededSpecies( aSpeciesInAcceptableColumns[ nConsPos + 1 ] ) ) { aSitesBetweenTwoGoodSites[ nConsPos ] = true; } } // now use this to zero the sites that are not between two good sites for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aSitesBetweenTwoGoodSites[ nConsPos ] ) { aSpeciesInAcceptableColumns[ nConsPos ] = 0; aArrayOfOneTreeAtEachConsPos[ nConsPos ] = ""; } } } // if ( pCP->bAutoReportOnlyAllowSitesThatAreBetweenAcceptableSites_ ) { } // applyFilters void Contig :: checkForDeaminationMutationsAmongUniqueTrees( RWTValVector& aArrayOfOneTreeAtEachConsPos, RWTValVector& aDeaminationMutationsAtEachConsPos, RWTValVector& aAncestralCpGAtEachConsPos2 ) { for( int nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { RWCString soTreeA = aArrayOfOneTreeAtEachConsPos[ nConsPos - 1]; RWCString soTreeB = aArrayOfOneTreeAtEachConsPos[ nConsPos ]; if ( soTreeA.isNull() || soTreeB.isNull() ) continue; soTreeA.removeSingleCharacterAllOccurrences('?'); soTreeA.removeSingleCharacterAllOccurrences('('); soTreeA.removeSingleCharacterAllOccurrences(')'); soTreeA.removeSingleCharacterAllOccurrences(','); soTreeB.removeSingleCharacterAllOccurrences('?'); soTreeB.removeSingleCharacterAllOccurrences('('); soTreeB.removeSingleCharacterAllOccurrences(')'); soTreeB.removeSingleCharacterAllOccurrences(','); // abcdef // ^ top internal node, leading to PPyg and MMul 0 // ^ next internal node, leading to GGor 1 // ^ next internal node, leading to HSap, I hope! 2 // ^ next internal node, leading to PTro 3 // ^ PPan // ^ PTro // ^ GGor // etc. // so there are 4 internal nodes to be concerned about bool bSomeInternalNodeIsACpG = false; RWTValOrderedVector aDeaminationMutationsPositionA( (size_t) 3 ); RWTValOrderedVector aDeaminationMutationsPositionB( (size_t) 3 ); for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { RWCString soAncestralSequence = RWCString( soTreeA[ nInternalNode ] ) + RWCString( soTreeB[ nInternalNode ] ); // special case of MMul in which the // deamination mutation occurs on the short part of the // branch that goes down to the right to the top // internal node. In such a case, the MMul present day // base is actually older than the top internal node. // Thus we look for a CpG at the MMul present day bases // and some TpG or CpA at the top internal node. 
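// Concrete illustration of this special case (bases are hypothetical):
// if the MMul present-day pair (string position 9 of soTreeA and soTreeB)
// is "cg" while the top-internal-node pair is "tg", the C->T change lies
// on the short stretch of branch between the MMul base and internal node
// 0, and it is recorded against the left column (position A) via
// soGetBranchNameWithoutTree( -1, nDownToRight ); a node-0 pair of "ca"
// is the opposite-strand case and is recorded against the right column
// (position B).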
if ( nInternalNode == 0 ) { int nIndexOfPresentDayBases = 9; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "cg" ) { bSomeInternalNodeIsACpG = true; if ( soAncestralSequence == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithoutTree( -1, nDownToRight ) ); } else if ( soAncestralSequence == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithoutTree( -1, nDownToRight ) ); } } } // if ( nInternalNode == 0 ) { if ( soAncestralSequence != "cg" ) continue; bSomeInternalNodeIsACpG = true; // if reached here, ancestral sequence is a CpG // is there a mutation from this position? // check down-right to the corresponding present-day node // then check down-left to next internal node int nIndexOfPresentDayBases = 8 - nInternalNode; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithoutTree( nInternalNode, nDownToRight ) ); } else if ( soPresentDayBases == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithoutTree( nInternalNode, nDownToRight) ); } // case of top internal node which has the additional // external branch leading down to MMul if ( nInternalNode == 0 ) { nIndexOfPresentDayBases = nMMulPresentDayNode; soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithoutTree( -1, nDownToRight ) ); } else if ( soPresentDayBases == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithoutTree( -1, nDownToRight ) ); } } // if nInternalNode is 0-2 (PPyg, GGor, HSap), the next node // is the descendant internal node. If nInternalNode is 3, // the next node is PPan. 
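// Recap of the index arithmetic used throughout this loop: after the
// '?', '(', ')' and ',' characters are stripped, positions 0..3 of the
// tree string are the internal nodes from the top of the tree down, and
// positions 4..9 are the present-day bases in the order PPan PTro HSap
// GGor PPyg MMul.  Internal node n therefore pairs with present-day
// position 8 - n; node 0 additionally pairs with MMul at position 9, and
// the "descendant" of node 3 at position n + 1 is the present-day PPan at
// position 4.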
RWCString soDescendantInternalNode = RWCString( soTreeA[ nInternalNode + 1 ] ) + RWCString( soTreeB[ nInternalNode + 1 ] ); if ( soDescendantInternalNode == "ca" ) { aDeaminationMutationsPositionB.insert( soGetBranchNameWithoutTree( nInternalNode, nOnBackbone ) ); } else if ( soDescendantInternalNode == "tg" ) { aDeaminationMutationsPositionA.insert( soGetBranchNameWithoutTree( nInternalNode, nOnBackbone ) ); } } // for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; if ( bSomeInternalNodeIsACpG ) { aAncestralCpGAtEachConsPos2[ nConsPos - 1 ] = cDefinitelyCpG; aAncestralCpGAtEachConsPos2[ nConsPos ] = cDefinitelyCpG; } int nDeam; for( nDeam = 0; nDeam < aDeaminationMutationsPositionA.length(); ++nDeam ) { if ( !aDeaminationMutationsAtEachConsPos[ nConsPos - 1 ].isNull() ) aDeaminationMutationsAtEachConsPos[ nConsPos - 1 ] += " "; aDeaminationMutationsAtEachConsPos[ nConsPos - 1 ] += aDeaminationMutationsPositionA[ nDeam ]; } for( nDeam = 0; nDeam < aDeaminationMutationsPositionB.length(); ++nDeam ) { if ( !aDeaminationMutationsAtEachConsPos[nConsPos].isNull() ) aDeaminationMutationsAtEachConsPos[ nConsPos ] += " "; aDeaminationMutationsAtEachConsPos[ nConsPos ] += aDeaminationMutationsPositionB[ nDeam ]; } } // for( int nConsPos = nGetStartConsensusIndex() + 1; } float Contig :: fGetProbabilityOfTree( const RWCString& soTreeConst ) { RWCString soTree = soTreeConst; if ( !pBranchProbabilities_ ) { pBranchProbabilities_ = new kimuraBranchProb(); } soTree.removeSingleCharacterAllOccurrences('?'); soTree.removeSingleCharacterAllOccurrences('('); soTree.removeSingleCharacterAllOccurrences(')'); soTree.removeSingleCharacterAllOccurrences(','); // for each branch, figure out its probability // notice that this use of internal node labels them from top // of the tree down: 0 is about PPyg and MMul, 1 is above GGor, // 2 is above HSap, and 3 is above PPan and PTro float fTotalProb = 1.0; for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { char cAncestralBase = soTree[ nInternalNode ]; if ( nInternalNode == 0 ) { // special case of MMul, int nIndexOfPresentDayBase = 9; char cPresentDayBase = soTree[ nIndexOfPresentDayBase ]; fTotalProb *= pBranchProbabilities_->fGetBranchProbability( nInternalNode, nIndexOfPresentDayBase, cAncestralBase, cPresentDayBase ); } int nIndexOfPresentDayBase = 8 - nInternalNode; char cPresentDayBase = soTree[ nIndexOfPresentDayBase ]; fTotalProb *= pBranchProbabilities_->fGetBranchProbability( nInternalNode, nIndexOfPresentDayBase, cAncestralBase, cPresentDayBase ); // if nInternalNode is 0-3, the next node is the // descendant internal node. 
if nInternalNode is // 4, the next node is PPan char cDescendantInternalBase = soTree[ nInternalNode + 1 ]; fTotalProb *= pBranchProbabilities_->fGetBranchProbability( nInternalNode, nInternalNode + 1, cAncestralBase, cDescendantInternalBase ); } return( fTotalProb ); } void Contig :: printFlankedColumns4( autoReport* pAutoReport ) { RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); mbtValVector aSpeciesInAcceptableColumns( 1, // will be changed 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); RWTValVector aArrayOfOneTreeAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfOneTreeAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); RWTValVector aArrayOfLenientlyFilteredBasesAtEachConsPos( 1, // size--will be changed // + 1 so we can use nConsPos (1-based) directly as an index "", // initial value "aArrayOfLenientlyFilteredBasesAtEachConsPos" ); RWTValVector aDeaminationMutationsAtEachConsPos( 1, // size--will be changed "", // initial value "aDeaminationMutationsAtEachConsPos" ); // this is different than above in that if a CpG occurs, both // columns are marked, rather than just the first column RWTValVector aAncestralCpGAtEachConsPos2( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); LocatedFragment* pHumanRead; applyFilters( pAutoReport, aCombinedReads, aSpeciesInAcceptableColumns, aArrayOfOneTreeAtEachConsPos, aArrayOfBasesAtEachConsPos, aArrayOfLenientlyFilteredBasesAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2, pHumanRead ); if ( pCP->bAutoReportUseAnnotationFormat_ ) { // figure out chromosome RWCString soTemp = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); // this isn't quite the chromosome, rather it is something like: // hs1-10143913 // split on '-' and take the first token RWCTokenizer tok( soTemp ); soTemp = tok('-'); // now it is like: hs1 // replace the "hs" with "chr" assert( soTemp.bStartsWithAndRemove("hs") ); RWCString soChromosome = "chr" + soTemp; fprintf( pAO, "printCladeInformativeColumns {\n" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! 
aSpeciesInAcceptableColumns[ nConsPos ] ) continue; RWCString soColumnOfBases = aArrayOfBasesAtEachConsPos[ nConsPos ]; char cChimp = cGetAgreeingBasesNoNs( soColumnOfBases[nPPan], soColumnOfBases[nPTro] ); // check if they disagree if ( cChimp == 0 ) continue; if ( soColumnOfBases[nHSap] == cChimp && cChimp == soColumnOfBases[nGGor] ) continue; char cPPygMMul = cGetAgreeingBasesNoNs( soColumnOfBases[nPPyg], soColumnOfBases[nMMul ] ); // they disagree if ( cPPygMMul == 0 ) continue; // if reached here, PTro and PPan agree, MMul and PPyg agree char cHSap = soColumnOfBases[nHSap]; char cGGor = soColumnOfBases[nGGor]; RWCString soClade; char cNonHumanBase = ' '; if ( ( cHSap == cChimp ) && ( cGGor == cPPygMMul ) ) { soClade = soHCClade; cNonHumanBase = cGGor; } else if ( ( cHSap == cGGor ) && ( cChimp == cPPygMMul ) ) { soClade = soHGClade; cNonHumanBase = cChimp; } else if ( ( cChimp == cGGor ) && ( cHSap == cPPygMMul ) ) { soClade = soCGClade; cNonHumanBase = cChimp; } else { // this includes cases in which only one species is different // or there are 3 species that are all different continue; } RWCString soDeamination; if ( aDeaminationMutationsAtEachConsPos[ nConsPos ].isNull() ) { soDeamination = "nodeam"; } else { soDeamination = "deam "; } char cAncestralCpG = aAncestralCpGAtEachConsPos2[ nConsPos ]; fprintf( pAO, "clade: %2s hum: %c other: %c %s chr: %11s %5s unpad: %4d %s cpg: %c\n", soClade.data(), cHSap, cNonHumanBase, soDeamination.data(), soAddCommas( nUnpaddedIndexStartingAtUserDefinedPosition2( nConsPos ) ).data(), soChromosome.data(), nUnpaddedIndex( nConsPos ), soGetName().data(), // contig name cAncestralCpG ); } // for( nConsPos = nGetStartConsensusIndex(); fprintf( pAO, "} printCladeInformativeColumns\n" ); } else if ( pCP->bAutoReportPrintCpGMutations_ ) { fprintf( pAO, "CpG mutations {\n" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !aDeaminationMutationsAtEachConsPos[ nConsPos ].isNull() ) { fprintf( pAO, "%d %s\n", nUnpaddedIndex( nConsPos ), soGetName().data() ); } } fprintf( pAO, "} CpG mutations\n" ); } else if ( pCP->bAutoReportPrintAncestralCpGs_ ) { fprintf( pAO, "Ancestral CpGs {\n" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { fprintf( pAO, "%d %c %s\n", nUnpaddedIndex( nConsPos ), aAncestralCpGAtEachConsPos2[ nConsPos ], soGetName().data() ); } fprintf( pAO, "} Ancestral CpGs\n" ); } else if ( pCP->bAutoReportPrintPositionsForGraham_ ) { fprintf( pAO, "forGraham {\n" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucPTro ) ) continue; if ( ! ( aSpeciesInAcceptableColumns[ nConsPos ] & ucMMul ) ) continue; if ( aAncestralCpGAtEachConsPos2[ nConsPos ] != cNoCpG ) continue; fprintf( pAO, "%3d %s\n", nUnpaddedIndex( nConsPos ), aArrayOfBasesAtEachConsPos[ nConsPos ].data() ); } fprintf( pAO, "} forGraham\n" ); } else { // finally print everything out fprintf( pAO, "printTrees {\n" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ! aSpeciesInAcceptableColumns[ nConsPos ] ) continue; // now that we are less aggressive about deleting columns of pads, // there are some columns that consist of all pads (only some // of the pads are multiple signal because there is a multiple // signal base next to it, such as E01_hs1-10143913_PPan_050413.r // at 654. 
We don't want to print these since there is no information. // I still think we shouldn't delete them, since we can't be sure // there really isn't a base and thus by deleting them we would // be making columns flanked that might not really be. if ( bIsAllPadsOrNs( aArrayOfBasesAtEachConsPos[ nConsPos ] ) ) { continue; } fprintf( pAO, "%3d %s", nUnpaddedIndex( nConsPos ), aArrayOfBasesAtEachConsPos[ nConsPos ].data() ); if ( !aArrayOfOneTreeAtEachConsPos[ nConsPos ].isNull() ) { fprintf( pAO, " %28s", aArrayOfOneTreeAtEachConsPos[ nConsPos ].data() ); } fprintf( pAO, " %s", aDeaminationMutationsAtEachConsPos[ nConsPos ].data() ); // fprintf( pAO, " %c", // aAncestralCpGAtEachConsPos[ nConsPos ] ); // contig name fprintf( pAO, " %s\n", soGetName().data() ); } fprintf( pAO, "} printTrees\n" ); } } void Contig :: checkForDeaminationMutationsAmongMultipleTrees( RWTValOrderedVector* pTreesPositionA, RWTValOrderedVector* pTreesPositionB, const int nConsPos, RWTValVector* pChosenTreesPositionA, RWTValVector* pChosenTreesPositionB ) { // now the trees look like this: // 4 5 6 7 8 9 // c(c(c(c(c?,c),c),c),c,c?) // 0 1 2 3 PPan HSap PPyg // PTro GGor MMul // all numbers below are string positions: // so internal node n goes to present day species 8-n and internal node // n+1. 2 exceptions: internal node 0 goes to present day species // 8 and 9. Internal node 3 goes to present day species 4 and 5. int nPairsInWhichAncestralIsCpG = 0; int nPairsInWhichDeaminationMutation = 0; int nGreatestNumberOfDeaminationMutationsInAnyPair = 0; RWTValOrderedVector aPairsOfTreesWithGreatestNumberPositionA( (size_t) 2); RWTValOrderedVector aPairsOfTreesWithGreatestNumberPositionB( (size_t) 2); for( int nTreeA = 0; nTreeA < pTreesPositionA->length(); ++nTreeA ) { if ( !(*pChosenTreesPositionA)[ nTreeA ] ) continue; RWCString soTreeA = (*pTreesPositionA)[ nTreeA ]; // get rid of ? for now soTreeA.removeSingleCharacterAllOccurrences('?'); soTreeA.removeSingleCharacterAllOccurrences('('); soTreeA.removeSingleCharacterAllOccurrences(')'); soTreeA.removeSingleCharacterAllOccurrences(','); for( int nTreeB = 0; nTreeB < pTreesPositionB->length(); ++nTreeB ) { if ( !(*pChosenTreesPositionB)[ nTreeB ] ) continue; RWCString soTreeB = (*pTreesPositionB)[ nTreeB ]; soTreeB.removeSingleCharacterAllOccurrences( '?' ); soTreeB.removeSingleCharacterAllOccurrences( '(' ); soTreeB.removeSingleCharacterAllOccurrences( ')' ); soTreeB.removeSingleCharacterAllOccurrences( ',' ); // now we have a pair of trees, soTreeA and soTreeB // abcdef // ^ top internal node, leading to PPyg and MMul 0 // ^ next internal node, leading to GGor 1 // ^ next internal node, leading to HSap, I hope! 2 // ^ next internal node, leading to PTro 3 // ^ PPan // ^ PTro // ^ GGor // etc. // so there are 4 internal nodes to be concerned about bool bSomeInternalNodeIsACpG = false; bool bDeaminationMutationThisPairOfTrees = false; int nNumberOfDeaminationMutationsThisPairOfTrees = 0; for( int nInternalNode = 0; nInternalNode <= nMaxInternalNode; ++nInternalNode ) { RWCString soAncestralSequence = RWCString( soTreeA[ nInternalNode ] ) + RWCString( soTreeB[ nInternalNode ] ); // special case of MMul in which the // deamination mutation occurs on the short part of the // branch that goes down to the right to the top // internal node. In such a case, the MMul present day // base is actually older than the top internal node. 
// Thus we look for a CpG at the MMul present day bases // and some deamination mutation at the top internal node if ( nInternalNode == 0 ) { int nIndexOfPresentDayBases = 9; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "cg" ) { bSomeInternalNodeIsACpG = true; if ( soAncestralSequence == "ca" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } else if ( soAncestralSequence == "tg" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } } } if ( soAncestralSequence != "cg" ) continue; bSomeInternalNodeIsACpG = true; // if reached here, ancestral sequence is a CpG // is there a mutation from this position? // first, check if there is a mutation down-right to // the corresponding present-day node int nIndexOfPresentDayBases = 8 - nInternalNode; RWCString soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } else if ( soPresentDayBases == "tg" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } // case of top internal node if ( nInternalNode == 0 ) { // consider additionally deamination mutations on branch to // MMul nIndexOfPresentDayBases = 9; soPresentDayBases = RWCString( soTreeA[ nIndexOfPresentDayBases ] ) + RWCString( soTreeB[ nIndexOfPresentDayBases ] ); if ( soPresentDayBases == "ca" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } else if ( soPresentDayBases == "tg" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } } // if nInternalNode is 0-3, the next node is the // descendant internal node. 
if nInternalNode is // 4, the next node is PPan RWCString soDescendantInternalNode = RWCString( soTreeA[ nInternalNode + 1 ] ) + RWCString( soTreeB[ nInternalNode + 1 ] ); if ( soDescendantInternalNode == "ca" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } else if ( soDescendantInternalNode == "tg" ) { bDeaminationMutationThisPairOfTrees = true; ++nNumberOfDeaminationMutationsThisPairOfTrees; } } // for( int nInternalNode = 0; nInternalNode < 5; if ( bSomeInternalNodeIsACpG ) { ++nPairsInWhichAncestralIsCpG; } if ( bDeaminationMutationThisPairOfTrees ) { ++nPairsInWhichDeaminationMutation; } if ( nNumberOfDeaminationMutationsThisPairOfTrees >= nGreatestNumberOfDeaminationMutationsInAnyPair ) { if ( nNumberOfDeaminationMutationsThisPairOfTrees > nGreatestNumberOfDeaminationMutationsInAnyPair ) { nGreatestNumberOfDeaminationMutationsInAnyPair = nNumberOfDeaminationMutationsThisPairOfTrees; aPairsOfTreesWithGreatestNumberPositionA.clear(); aPairsOfTreesWithGreatestNumberPositionB.clear(); } aPairsOfTreesWithGreatestNumberPositionA.insert( nTreeA ); aPairsOfTreesWithGreatestNumberPositionB.insert( nTreeB ); } } // for( int nTreeB = 0; nTreeB < pTreesPositionB->length } // for( int nTreeA = 0; nTreeA < pTreesPositionA->length() if ( pCP->bAutoReportPrintCpGMutations_ ) { if ( nPairsInWhichAncestralIsCpG != 0 ) { fprintf( pAO, "cpg_mutations %d trees with ancestral CpG %d trees with deamination mutation %d pairs of trees %d %s\n", nPairsInWhichAncestralIsCpG, nPairsInWhichDeaminationMutation, pTreesPositionB->length() * pTreesPositionA->length(), nUnpaddedIndex( nConsPos ), soGetName().data() ); } if ( pTreesPositionA->length() * pTreesPositionB->length() > 1 ) { fprintf( pAO, "multiple trees %d equal CpG trees: %d\n", pTreesPositionA->length() * pTreesPositionB->length(), aPairsOfTreesWithGreatestNumberPositionA.length() ); } if ( nGreatestNumberOfDeaminationMutationsInAnyPair > 0 ) { fprintf( pAO, "trees with greatest number of deamination mutations %d at %d of %s is %d", nGreatestNumberOfDeaminationMutationsInAnyPair, nUnpaddedIndex( nConsPos ), soGetName().data(), aPairsOfTreesWithGreatestNumberPositionA.length() ); if ( aPairsOfTreesWithGreatestNumberPositionA.length() > 1 ) { // multiple pairs with same # of deamination mutations for( int nPair = 0; nPair < aPairsOfTreesWithGreatestNumberPositionA.length(); ++nPair ) { int nTreePositionA = aPairsOfTreesWithGreatestNumberPositionA[ nPair ]; int nTreePositionB = aPairsOfTreesWithGreatestNumberPositionB[ nPair ]; RWCString soTreePositionA = (*pTreesPositionA)[ nTreePositionA ]; RWCString soTreePositionB = (*pTreesPositionB)[ nTreePositionB ]; fprintf( pAO, " pair %s %s", soTreePositionA.data(), soTreePositionB.data() ); } } fprintf( pAO, "\n" ); } } // if ( pCP->bAutoReportPrintCpGMutations_ ) { if ( pCP->bAutoReportChooseTreesByCountingDeaminationMutations_ ) { int nNumberOfChosenTreesPositionA = 0; for( int nTreeA = 0; nTreeA < pChosenTreesPositionA->length(); ++nTreeA ) { if ( (*pChosenTreesPositionA)[ nTreeA ] ) { ++nNumberOfChosenTreesPositionA; } } int nNumberOfChosenTreesPositionB = 0; for( int nTreeB = 0; nTreeB < pChosenTreesPositionB->length(); ++nTreeB ) { if ( (*pChosenTreesPositionB)[ nTreeB ] ) { ++nNumberOfChosenTreesPositionB; } } if ( aPairsOfTreesWithGreatestNumberPositionA.length() < nNumberOfChosenTreesPositionA * nNumberOfChosenTreesPositionB ) { // we can reduce the number of chosen trees by counting // deamination mutations // But does it actually reduce the number of 
trees? Or just // the number of pairs of trees? RWTValVector aATrees( (size_t) pTreesPositionA->length(), false, "aATrees" ); int nPair; for( nPair = 0; nPair < aPairsOfTreesWithGreatestNumberPositionA.length(); ++nPair ) { int nAIndex = aPairsOfTreesWithGreatestNumberPositionA[ nPair ]; aATrees[nAIndex] = true; } // rather than counting, let's directly compare to // the pChosenTreesPositionA int nNumberOfTreesEliminatedPositionA = 0; for( int nTreeA = 0; nTreeA < aATrees.length(); ++nTreeA ) { if ( (*pChosenTreesPositionA)[ nTreeA ] && !aATrees[nTreeA] ) { ++nNumberOfTreesEliminatedPositionA; (*pChosenTreesPositionA)[ nTreeA ] = false; } } // do same with B RWTValVector aBTrees( (size_t) pTreesPositionB->length(), false, "aBTrees" ); for( nPair = 0; nPair < aPairsOfTreesWithGreatestNumberPositionB.length(); ++nPair ) { int nBIndex = aPairsOfTreesWithGreatestNumberPositionB[ nPair ]; aBTrees[ nBIndex ] = true; } int nNumberOfTreesEliminatedPositionB = 0; for( int nTreeB = 0; nTreeB < aBTrees.length(); ++nTreeB ) { if ( (*pChosenTreesPositionB)[ nTreeB ] && !aBTrees[nTreeB] ) { ++nNumberOfTreesEliminatedPositionB; (*pChosenTreesPositionB)[ nTreeB ] = false; } } } // if ( aPairsOfTreesWithGreatestNumberPositionA.length() < } // if ( pCP->bAutoReportChooseTreesByCountingDeaminationMutations_ ) { } // this differs from bGetCombinedReadAtBase in that it guesses at the // base when the data is bad. It also does NOT add quality values--it // assumes that is already done. bool Contig :: bGetCombinedReadAtBase3( const bool bAddQualityValuesOfStrands, LocatedFragment* pForwardRead, MBTValOrderedOffsetVector* pForwardReadSingleSignalArray, LocatedFragment* pReverseRead, MBTValOrderedOffsetVector* pReverseReadSingleSignalArray, const int nConsPos, char& cCombinedBase, int& nCombinedQuality, bool& bBaseALittleOK ) { bool bBaseOK = true; bBaseALittleOK = true; // will be changed if necessary bool bForwardReadBaseOK = ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) && ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ) ) ? true : false; bool bReverseReadBaseOK = ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) && ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ) ) ? true : false; if ( bForwardReadBaseOK && bReverseReadBaseOK ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); if ( bAddQualityValuesOfStrands ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } else { nCombinedQuality = MAX( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ); } } else { // reads disagree. Which one should we use? // will just use one read. Which is it? 
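// Resolution used below when both strands are usable but call different
// bases: if the two quality values are equal the position is abandoned
// ( bBaseOK = false ); otherwise the base and quality of the
// higher-quality strand are taken.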
if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points bBaseOK = false; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } } } // if ( bForwardReadBaseOK && bReverseReadBaseOK ) { else if ( bForwardReadBaseOK ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( bReverseReadBaseOK ) { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // neither read's base is ok bBaseOK = false; } if ( nCombinedQuality < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) bBaseOK = false; if ( !bBaseOK ) { // now go to work trying to guess correct base bool bForwardReadBaseOK2 = ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) ? true : false; bool bReverseReadBaseOK2 = ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) ? true : false; if ( pCP->bAutoReportFlankingBasesMustBeSingleSignal_ && pForwardReadSingleSignalArray ) bForwardReadBaseOK2 = bForwardReadBaseOK2 && ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); if ( pCP->bAutoReportFlankingBasesMustBeSingleSignal_ && pReverseReadSingleSignalArray ) bReverseReadBaseOK2 = bReverseReadBaseOK2 && ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); if ( pCP->bAutoReportFlankingBasesMustBeInHighQualitySegment_ && pForwardRead ) bForwardReadBaseOK2 = bForwardReadBaseOK2 && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ); if ( pCP->bAutoReportFlankingBasesMustBeInHighQualitySegment_ && pReverseRead ) bReverseReadBaseOK2 = bReverseReadBaseOK2 && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ); if ( bForwardReadBaseOK2 && bReverseReadBaseOK2 ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); if ( bAddQualityValuesOfStrands ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } else { nCombinedQuality = MAX( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ); } } else { // reads disagree. Which one should we use? // will just use one read. if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() >= // sic. 
Arbitrarily choose forward read if they disagree pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } else { cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } } } else if ( bForwardReadBaseOK2 ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } else if ( bReverseReadBaseOK2 ) { cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } else { // neither base is even aligned or exists bBaseALittleOK = false; } if ( nCombinedQuality < pCP->nAutoReportMinimumQualityOfFlankingBases_ ) { bBaseALittleOK = false; } } // if ( !bBaseOK ) { return bBaseOK; }
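// Summary of bGetCombinedReadAtBase3()'s contract, as exercised by the
// per-species loop earlier in this file: the return value reports whether
// the strict criteria were met (at least one strand whose base at this
// column is aligned, inside the read's high-quality segment and single
// signal, with disagreements between two such strands resolved by
// quality, and a combined quality of at least
// nQualityThresholdForFindingHighQualityDiscrepancies_).  bBaseALittleOK
// reports the relaxed flanking-base criteria: the base need only be
// aligned, plus whichever of the single-signal and high-quality-segment
// checks the consedParameters flags request, with quality at least
// nAutoReportMinimumQualityOfFlankingBases_.  A hedged caller sketch,
// with shortened placeholder names for the single-signal arrays:
//
//    char cBase = 0;
//    int  nQual = -666;
//    bool bALittleOK;
//    bool bOK = bGetCombinedReadAtBase3( false,   // quality values already added
//                                        pForwardRead, pForwardSS,
//                                        pReverseRead, pReverseSS,
//                                        nConsPos,
//                                        cBase, nQual, bALittleOK );
//    if ( bOK || bALittleOK ) { /* store cBase/nQual in the combined read */ }
//    if ( bOK )               { /* mark this species as meeting the criteria */ }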