/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "consedParameters.h" #include "locatedFragment.h" #include "bIntersect.h" #include "mbtValVectorOfBool.h" #include "mbtValVector.h" #include "rwtptrorderedvector.h" #include "ntide.h" #include "fileDefines.h" #include "soAddCommas.h" #include "rwctokenizer.h" #include "autoReport.h" #include "consed.h" #include "assembly.h" #include "nNumberFromBase.h" #include "setErrorRateFromQuality.h" #include "dErrorRateFromQuality.h" #include "singletInfo.h" #include "readPHDAgainForHighQualitySegment.h" #include #include #include #include "rwcregexp.h" #include "getAllSinglets.h" #include "mbt_val_ord_offset_vec.h" #include "soLine.h" #include "max.h" #include #include "findAncestralTrees.h" #include "primateSpecies.h" #include "ucGetMaskForSpecies.h" #include "bAllNeededSpecies.h" void Contig :: countColumnsForGroupsOfSpecies( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); const char cSingleSignalUndetermined = 'u'; const char cSingleSignal = 's'; const char cMultipleSignal = 'm'; RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); if ( n == 0 ) pForwardReadSingleSignalArray = pSingleSignalArray; else pReverseReadSingleSignalArray = pSingleSignalArray; } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < // remove columns in which all combined reads have a pad // first, do it to the single signal arrays. for( int nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; nRead < aListOfReadsInSingleSignalArrays.length(); int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( ! aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // let's save the assembly to see that everything went ok // no--it makes too many phd files when run over and over--DG 3/28/06 // ConsEd::pGetAssembly()->saveToFile( "temp_no_columns.ace", // false ); // bAceFormat1 // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); // Now create bit arrays, one for each species. RWTValVector aMaskOfSpecies( nPaddedContigLength + 1, // 1-based 0, // initial "aMaskOfSpecies" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned int uMaskSpecies = pow( (double) 2, (double) nSpecies ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; // check if this is a good read if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; // find single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; for( int nSingleSignalArray = 0; nSingleSignalArray < aListOfReadsInSingleSignalArrays.length(); ++nSingleSignalArray ) { if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pForwardRead ) { pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } else if ( aListOfReadsInSingleSignalArrays[ nSingleSignalArray ] == pReverseRead ) { pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nSingleSignalArray ]; } } if ( pForwardRead ) { assert( pForwardReadSingleSignalArray ); } if ( pReverseRead ) { assert( pReverseReadSingleSignalArray ); } // at this point, we have whichever reads are available for that // species and their single-signal arrays // now combine the reads at each base position for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; if ( !bGetCombinedReadAtBase( pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality ) ) continue; aMaskOfSpecies[nConsPos] |= uMaskSpecies; } // for( int nConsPos = nGetStartConsensusIndex(); } // for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); fprintf( pAO, "columnOfSpecies {\n" ); unsigned int uMaxMaskSpecies = pow( 2.0, (double) pAutoReport->aArrayOfSpecies_.length() ) - 1; for( unsigned int uMaskSpecies = 1; uMaskSpecies <= uMaxMaskSpecies; ++uMaskSpecies ) { int nTotalColumnsOfTheseSpecies = 0; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ( aMaskOfSpecies[nConsPos] & uMaskSpecies ) == uMaskSpecies ) { ++nTotalColumnsOfTheseSpecies; } } // now convert uMaskSpecies to words RWCString soAllSpecies; for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { unsigned int uMaskForOneSpecies = pow( 2.0, (double) nSpecies ); if ( uMaskSpecies & uMaskForOneSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; if ( soAllSpecies.isNull() ) soAllSpecies = soSpecies; else { soAllSpecies += " "; soAllSpecies += soSpecies; } } } fprintf( pAO, "%s %d\n", soAllSpecies.data(), nTotalColumnsOfTheseSpecies ); } fprintf( pAO, "} columnOfSpecies\n" ); } void Contig :: compareHQSWithLQS( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); const int nNumberOfQualityValues = 100; int n3DArray[nNumberOfQualityValues][2][2]; int nQuality; for( nQuality = 0; nQuality < nNumberOfQualityValues; ++nQuality ) { for( int nHQS = 0; nHQS <= 1; ++nHQS ) { for( int nAgree = 0; nAgree <= 1; ++nAgree ) { n3DArray[nQuality][nHQS][nAgree] = 0; } } } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains("HSap" ) ) { continue; } if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; mbtValVector* paRelativeAreasOfUncalledBases = NULL; assert( pLocFrag->bGetSingleSignalRelativeAreas( paRelativeAreasOfUncalledBases ) ); int nIncrement = 1; if ( pLocFrag->bComp() ) nIncrement = -1; int nUnpaddedOrientedReadPos = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( pLocFrag->nGetAlignClipStart() ) - nIncrement; for( int nConsPos = pLocFrag->nGetAlignClipStart(); nConsPos <= pLocFrag->nGetAlignClipEnd(); ++nConsPos ) { Ntide ntRead = pLocFrag->ntGetFragFromConsPos( nConsPos ); if ( !ntRead.bIsPad() ) { nUnpaddedOrientedReadPos += nIncrement; } if ( (*paRelativeAreasOfUncalledBases)[ nUnpaddedOrientedReadPos ] != -1.0 ) continue; int nAgree = ( ( ntRead.cGetBase() == ntGetCons( nConsPos ).cGetBase() ) ? 1 : 0 ); // if both are pads, ignore this column if ( ntRead.bIsPad() && ntGetCons( nConsPos ).bIsPad() ) continue; int nHQS = ( pLocFrag->bIsInHighQualitySegmentOfRead( nConsPos ) ? 1 : 0 ); int nQuality = ntRead.qualGetQuality(); ++n3DArray[nQuality][nHQS][nAgree]; } } fprintf( pAO, "quality low quality high quality\n" ); fprintf( pAO, " dis agree dis agree\n" ); fprintf( pAO, "compareHQSAndLQS {\n" ); for( nQuality = 0; nQuality < nNumberOfQualityValues; ++nQuality ) { fprintf(pAO, "%d %d %d %d %d\n", nQuality, n3DArray[nQuality][0][0], n3DArray[nQuality][0][1], n3DArray[nQuality][1][0], n3DArray[nQuality][1][1] ); } fprintf( pAO, "} compareHQSAndLQS\n" ); } void Contig :: lowQualityBasesInHQS( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains("HSap" ) ) { continue; } if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; mbtValVector* paRelativeAreasOfUncalledBases = NULL; assert( pLocFrag->bGetSingleSignalRelativeAreas( paRelativeAreasOfUncalledBases ) ); int nIntersectStart; int nIntersectEnd; if ( !bIntersect( pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), nIntersectStart, nIntersectEnd ) ) continue; int nIncrement = 1; if ( pLocFrag->bComp() ) nIncrement = -1; int nUnpaddedOrientedReadPos = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( nIntersectStart ) - nIncrement; for( int nConsPos = nIntersectStart; nConsPos <= nIntersectEnd; ++nConsPos ) { Ntide ntRead = pLocFrag->ntGetFragFromConsPos( nConsPos ); if ( !ntRead.bIsPad() ) { nUnpaddedOrientedReadPos += nIncrement; } if ( ntRead.qualGetQuality() > 6 ) continue; if ( (*paRelativeAreasOfUncalledBases)[ nUnpaddedOrientedReadPos ] != -1.0 ) continue; // ignore agreeing bases if ( ntRead.cGetBase() == ntGetCons( nConsPos ).cGetBase() ) continue; // if both are pads, ignore this column if ( ntRead.bIsPad() && ntGetCons( nConsPos ).bIsPad() ) continue; fprintf( pAO, "HQS low quality ss discrepant: %s %s %d %d\n", soGetName().data(), // contig pLocFrag->soGetName().data(), nUnpaddedIndex( nConsPos ), (int) ntRead.qualGetQuality() ); } } } void Contig :: singleSignalOrQuality( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; bool bSpeciesHasBadRead = false; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains("HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { cerr << "ignoring read " << pLocFrag->soGetName() << endl; continue; } if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) { bSpeciesHasBadRead = true; break; } if ( pLocFrag->soGetName().bEndsWith(".f" )) { pForwardRead = pLocFrag; } else if ( pLocFrag->soGetName().bEndsWith( ".r" ) ) { pReverseRead = pLocFrag; } else { assert( false ); } } if ( bSpeciesHasBadRead ) continue; // can't use species if all reads aren't there if ( !pForwardRead || !pReverseRead || !pHumanRead ) continue; // now get the relative area arrays for both reads mbtValVector* pRelativeAreaOfUncalledBasesForwardRead = NULL; mbtValVector* pRelativeAreaOfUncalledBasesReverseRead = NULL; assert( pForwardRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesForwardRead ) ); assert( pReverseRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesReverseRead ) ); if ( pForwardRead->bIsWholeReadUnaligned() || pReverseRead->bIsWholeReadUnaligned() ) continue; int nConsPosLeft; int nConsPosRight; // just fixed if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nConsPosLeft, nConsPosRight ) ) continue; // decided to only use high quality segment bases if ( !bIntersect( nConsPosLeft, nConsPosRight, pForwardRead->nGetHighQualityStart(), pForwardRead->nGetHighQualityEnd(), nConsPosLeft, nConsPosRight ) ) continue; if ( !bIntersect( nConsPosLeft, nConsPosRight, pReverseRead->nGetHighQualityStart(), pReverseRead->nGetHighQualityEnd(), nConsPosLeft, nConsPosRight ) ) continue; for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { // we are only interested in cases in which the reads do not agree if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) continue; // since they disagree with each other, at least 1 must disagree // with human. // since they disagree with each other, at least 1 must not be // a pad. // we are only interested in case in which higher quality read // has larger relative area int nQualityForward = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); int nQualityReverse = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQualityForward < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) continue; if ( nQualityReverse < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) continue; int nOrientedUnpaddedReadPosOfForwardRead = pForwardRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); int nOrientedUnpaddedReadPosOfReverseRead = pReverseRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); bool bForwardReadIsSingleSignal = ( (*pRelativeAreaOfUncalledBasesForwardRead)[ nOrientedUnpaddedReadPosOfForwardRead ] == -1 ) ? 1 : 0; bool bReverseReadIsSingleSignal = ( (*pRelativeAreaOfUncalledBasesReverseRead)[ nOrientedUnpaddedReadPosOfReverseRead ] == -1 ) ? 1 : 0; // 3 cases: forward agrees with human, reverse agrees with human, // neither agrees with human char cWhichAgreesWithHuman = ' '; const char cForward = 'f'; const char cReverse = 'r'; const char cNeither = 'n'; if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == ntGetCons( nConsPos ).cGetBase() ) cWhichAgreesWithHuman = cForward; else if ( pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == ntGetCons( nConsPos ).cGetBase() ) cWhichAgreesWithHuman = cReverse; else cWhichAgreesWithHuman = cNeither; int nQualityDifference = -666; bool bWantThisPair = false; bool bHighQualityBaseAgreesWithHuman; bool bLowQualityBaseAgreesWithHuman; if ( nQualityForward > nQualityReverse && !bForwardReadIsSingleSignal && bReverseReadIsSingleSignal ) { bWantThisPair = true; nQualityDifference = nQualityForward - nQualityReverse; bHighQualityBaseAgreesWithHuman = ( cWhichAgreesWithHuman == cForward ); bLowQualityBaseAgreesWithHuman = ( cWhichAgreesWithHuman == cReverse ); } else if ( nQualityForward < nQualityReverse && bForwardReadIsSingleSignal && !bReverseReadIsSingleSignal ) { bWantThisPair = true; nQualityDifference = nQualityReverse - nQualityForward; bHighQualityBaseAgreesWithHuman = ( cWhichAgreesWithHuman == cReverse ); bLowQualityBaseAgreesWithHuman = ( cWhichAgreesWithHuman == cForward ); } if ( bWantThisPair ) { fprintf( pAO, "ss/quality diff: %d high agrees: %s low agrees: %s unpadded: %d %s %s\n", nQualityDifference, szPrintBool( bHighQualityBaseAgreesWithHuman ), szPrintBool( bLowQualityBaseAgreesWithHuman ), nUnpaddedIndex( nConsPos ), soSpecies.data(), soGetName().data() ); } } // for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { } // for( int nSpecies = 0; nSpecies < } // void Contig :: compareTopAndBottomStrandsWithHuman( autoReport* pAutoReport ) { void Contig :: discrepancyRateInFlankedRegions( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } aGoodReads.resort(); fclose( pGoodReads ); // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); const char cSingleSignalUndetermined = 'u'; const char cSingleSignal = 's'; const char cMultipleSignal = 'm'; RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); LocatedFragment* pHumanRead = NULL; int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } // cases: use just forward // use just reverse // use both // use neither char cCase = ' '; LocatedFragment* pReadToUse = NULL; if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { // just so it isn't null: pReadToUse = pForwardRead; cCase = 'b'; // both } else { // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points cCase = 'n'; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { pReadToUse = pForwardRead; cCase = 'o'; } else { pReadToUse = pReverseRead; cCase = 'o'; } } } else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pForwardRead; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pReverseRead; } else { cCase = 'n'; } // if both the human and primate are pads, // skip this base if ( cCase != 'n' && !pReadToUse->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < // remove columns in which all combined reads have a pad // first, do it to the single signal arrays. for( int nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; nRead < aListOfReadsInSingleSignalArrays.length(); int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( ! aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // let's save the assembly to see that everything went ok ConsEd::pGetAssembly()->saveToFile( "temp_no_columns.ace", false ); // bAceFormat1 // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); mbtValVector aMinQualityInColumn( (size_t) nPaddedContigLength, 1, 666, "aMinQualityInColumn" ); mbtValVector aSpeciesInColumn( (size_t) nPaddedContigLength, 1, 0, "aSpeciesInColumn" ); mbtValVectorOfBool aDiscrepantPositions( nPaddedContigLength, 1, "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgree( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndAgree" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndHQ( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndHQ" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); // cases: use just forward // use just reverse // use both // use neither const char cOneRead = 'o'; const char cBothReads = 'b'; const char cNoRead = 'n'; char cCase = ' '; LocatedFragment* pReadToUse = NULL; bool bSingleSignalOfBase = false; bool bForwardIsSingleSignal = false; if ( pForwardReadSingleSignalArray ) { bForwardIsSingleSignal = ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseIsSingleSignal = false; if ( pReverseReadSingleSignalArray ) { bReverseIsSingleSignal = ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && bForwardIsSingleSignal && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; cCase = cBothReads; // both } else { // will just use one read. Which is it? // choose by which read is higher quality if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // no way to choose--don't use cCase = cNoRead; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } // if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { else } // if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos && bForwardIsSingleSignal ) ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = bForwardIsSingleSignal; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = bReverseIsSingleSignal; } else { cCase = cNoRead; } if ( cCase != cNoRead ) { if ( pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() != pHumanRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } aSpeciesInColumn[ nConsPos ] |= ucSpeciesMask; } // set aMinQualityInColumn if ( cCase == cOneRead ) { int nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } else if ( cCase == cBothReads ) { int nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality > 90 ) nQuality = 90; aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( int nSpecies = 0; nSpecies < const int nThresholdQuality = 20; int nNumberOfColumnsAboveThreshold = 0; int nNumberOfColumnsWithNeededSpecies = 0; int nNumberOfColumnsWithNeededSpeciesAndQuality = 0; int nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal = 0; unsigned char ucOKMask = ucMMul | ucPPyg | ucGGor; unsigned char ucChimpMask = ucPPan | ucPTro; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // need to check with Phil if deletion in human is // of interest in making trees. // if ( ntGetCons( nConsPos ).bIsPad() ) { // continue; // } bool bColumnAboveThreshold = false; if ( aMinQualityInColumn[ nConsPos ] != 666 && aMinQualityInColumn[ nConsPos ] >= nThresholdQuality ) { bColumnAboveThreshold = true; ++nNumberOfColumnsAboveThreshold; } bool bColumnWithNeededSpecies = false; if ( ( aSpeciesInColumn[ nConsPos ] & ucOKMask ) == ucOKMask ) { // now check that one of chimps is also present if ( aSpeciesInColumn[ nConsPos ] & ucChimpMask ) { bColumnWithNeededSpecies = true; ++nNumberOfColumnsWithNeededSpecies; // if there are no discrepancies in this column, then // it is an acceptable aligned column: Notice that there // are no quality conditions on the flanking bases. if ( !aDiscrepantPositions[ nConsPos ] ) { aNeededSpeciesAreAlignedAndAgree.setValue( nConsPos, true ); } } } if ( bColumnAboveThreshold && bColumnWithNeededSpecies ) { ++nNumberOfColumnsWithNeededSpeciesAndQuality; aNeededSpeciesAreAlignedAndHQ.setValue( nConsPos, true ); } } // now let's see about the quality of the alignment and see // how many places have a certain alignment // mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgreeSeveralColumns( // nPaddedContigLength, // 1, // "Contig::NeededSpeciesAreAlignedAndAgreeSeveralColumn" ); mbtValVectorOfBool aColumnsFlanked( nPaddedContigLength, 1, "Contig:aColumnsFlanked" ); // nConsPosDiscrepantRegionLeft for( int nConsPosDiscrepantRegionLeft = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; // e.g., if consensus is from 1 to 11, then 11 - 5 = 6 which is // the last position to check nConsPosDiscrepantRegionLeft <= ( nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - pCP->nAutoReportSizeOfDiscrepantRegion_ + 1 ); ++nConsPosDiscrepantRegionLeft ) { // check the left flank int nNumberOfFlankingBases = 0; bool bRegionOK = true; int nConsPos2; for( nConsPos2 = nConsPosDiscrepantRegionLeft - 1; bRegionOK && ( nConsPos2 >= nGetStartConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); --nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // a little check for the left edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; // check the right flank nNumberOfFlankingBases = 0; for( nConsPos2 = nConsPosDiscrepantRegionLeft + pCP->nAutoReportSizeOfDiscrepantRegion_; bRegionOK && ( nConsPos2 <= nGetEndConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); ++nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // another little check for the right edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; if ( bRegionOK ) { for( nConsPos2 = nConsPosDiscrepantRegionLeft; nConsPos2 <= nConsPosDiscrepantRegionLeft + pCP->nAutoReportSizeOfDiscrepantRegion_ - 1; ++nConsPos2 ) { aColumnsFlanked.setValue( nConsPos2, true ); fprintf( pAO, "flanked column (unpadded/padded): %d %d\n", nUnpaddedIndex( nConsPos2 ), nConsPos2 ); } } } RWTValVector aNumberOfComparisonsIndexedByQualityValue( 99, 0, "aNumberOfComparisonsIndexedByQualityValue" ); RWTValVector aNumberOfDiscrepanciesIndexedByQualityValue( 99, 0, "aNumberOfDiscrepanciesIndexedByQualityValue" ); for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if (!aColumnsFlanked[ nConsPos ] ) continue; // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } // cases: use just forward // use just reverse // use both // use neither const char cOneRead = 'o'; const char cBothReads = 'b'; const char cNoRead = 'n'; char cCase = ' '; LocatedFragment* pReadToUse = NULL; bool bSingleSignalOfBase = false; bool bForwardIsSingleSignal = false; if ( pForwardReadSingleSignalArray ) { bForwardIsSingleSignal = ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseIsSingleSignal = false; if ( pReverseReadSingleSignalArray ) { bReverseIsSingleSignal = ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && bForwardIsSingleSignal && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; cCase = cBothReads; // both } else { // will just use one read. Which is it? // choose by which read is higher quality if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // no way to choose--don't use cCase = cNoRead; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } // if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { else } // if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos && bForwardIsSingleSignal ) ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = bForwardIsSingleSignal; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = bReverseIsSingleSignal; } else { cCase = cNoRead; } // I don't know what to do when there is no read but this column // is flanked. I guess do not count it either as agreeing or // discrepant. if ( cCase == cNoRead ) { continue; } int nQuality; if ( cCase == cBothReads ) { nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality > 90 ) nQuality = 90; } else { assert( cCase == cOneRead ); nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } bool bAgree = ( pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() == ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; if ( !bAgree ) ++aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; } // for( nSpecies = 0; nSpecies < } // for( nConsPos = nGetStartConsensusIndex(); fprintf( pAO, "flanked bases {\n" ); for( int nQuality = 0; nQuality <= 90; ++nQuality ) { fprintf( pAO, "%d %d %d\n", nQuality, aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ], aNumberOfComparisonsIndexedByQualityValue[ nQuality ] ); } fprintf( pAO, "} flanked bases\n" ); } void Contig :: countReadsDueToBugInGoodReads( ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } // aGoodReads.resort(); // turn off safety: aGoodReads.bResortWasRun_ = true; fclose( pGoodReads ); int nGoodReadsWithBug = 0; int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; ++nGoodReadsWithBug; } int nGoodReadsWithoutBug = 0; aGoodReads.resort(); for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; ++nGoodReadsWithoutBug; } fprintf( pAO, "good reads with bug: %d without bug: %d\n", nGoodReadsWithBug, nGoodReadsWithoutBug ); } // this differs from discrepancyRateInFlankedRegions in that all // species must be present in the flanked region just as in the // flanking region void Contig :: discrepancyRateInFlankedRegions2( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } aGoodReads.resort(); fclose( pGoodReads ); // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); const char cSingleSignalUndetermined = 'u'; const char cSingleSignal = 's'; const char cMultipleSignal = 'm'; RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); LocatedFragment* pHumanRead = NULL; int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } // cases: use just forward // use just reverse // use both // use neither char cCase = ' '; LocatedFragment* pReadToUse = NULL; if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { // just so it isn't null: pReadToUse = pForwardRead; cCase = 'b'; // both } else { // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points cCase = 'n'; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { pReadToUse = pForwardRead; cCase = 'o'; } else { pReadToUse = pReverseRead; cCase = 'o'; } } } else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pForwardRead; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pReverseRead; } else { cCase = 'n'; } // if both the human and primate are pads, // skip this base if ( cCase != 'n' && !pReadToUse->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < // remove columns in which all combined reads have a pad // first, do it to the single signal arrays. for( int nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; nRead < aListOfReadsInSingleSignalArrays.length(); int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( ! aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // let's save the assembly to see that everything went ok ConsEd::pGetAssembly()->saveToFile( "temp_no_columns.ace", false ); // bAceFormat1 // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); mbtValVector aMinQualityInColumn( (size_t) nPaddedContigLength, 1, 666, "aMinQualityInColumn" ); const unsigned char ucPTro = 1; const unsigned char ucPPan = 2; const unsigned char ucGGor = 4; const unsigned char ucPPyg = 8; const unsigned char ucMMul = 16; mbtValVector aSpeciesInColumn( (size_t) nPaddedContigLength, 1, 0, "aSpeciesInColumn" ); mbtValVectorOfBool aDiscrepantPositions( nPaddedContigLength, 1, "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgree( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndAgree" ); mbtValVectorOfBool aNeededSpeciesAreAligned( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAligned" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndHQ( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndHQ" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); // cases: use just forward // use just reverse // use both // use neither const char cOneRead = 'o'; const char cBothReads = 'b'; const char cNoRead = 'n'; char cCase = ' '; LocatedFragment* pReadToUse = NULL; bool bSingleSignalOfBase = false; bool bForwardIsSingleSignal = false; if ( pForwardReadSingleSignalArray ) { bForwardIsSingleSignal = ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseIsSingleSignal = false; if ( pReverseReadSingleSignalArray ) { bReverseIsSingleSignal = ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && bForwardIsSingleSignal && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; cCase = cBothReads; // both } else { // will just use one read. Which is it? // choose by which read is higher quality if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // no way to choose--don't use cCase = cNoRead; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } // if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { else } // if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos && bForwardIsSingleSignal ) ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = bForwardIsSingleSignal; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = bReverseIsSingleSignal; } else { cCase = cNoRead; } if ( cCase != cNoRead ) { if ( pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() != pHumanRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } aSpeciesInColumn[ nConsPos ] |= ucSpeciesMask; } // set aMinQualityInColumn if ( cCase == cOneRead ) { int nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } else if ( cCase == cBothReads ) { int nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality > 90 ) nQuality = 90; aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( int nSpecies = 0; nSpecies < const int nThresholdQuality = 20; int nNumberOfColumnsAboveThreshold = 0; int nNumberOfColumnsWithNeededSpecies = 0; int nNumberOfColumnsWithNeededSpeciesAndQuality = 0; int nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal = 0; unsigned char ucOKMask = ucMMul | ucPPyg | ucGGor; unsigned char ucChimpMask = ucPPan | ucPTro; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // need to check with Phil if deletion in human is // of interest in making trees. // if ( ntGetCons( nConsPos ).bIsPad() ) { // continue; // } bool bColumnAboveThreshold = false; if ( aMinQualityInColumn[ nConsPos ] != 666 && aMinQualityInColumn[ nConsPos ] >= nThresholdQuality ) { bColumnAboveThreshold = true; ++nNumberOfColumnsAboveThreshold; } bool bColumnWithNeededSpecies = false; if ( ( aSpeciesInColumn[ nConsPos ] & ucOKMask ) == ucOKMask ) { // now check that one of chimps is also present if ( aSpeciesInColumn[ nConsPos ] & ucChimpMask ) { bColumnWithNeededSpecies = true; ++nNumberOfColumnsWithNeededSpecies; aNeededSpeciesAreAligned.setValue( nConsPos, true ); // if there are no discrepancies in this column, then // it is an acceptable aligned column: Notice that there // are no quality conditions on the flanking bases. if ( !aDiscrepantPositions[ nConsPos ] ) { aNeededSpeciesAreAlignedAndAgree.setValue( nConsPos, true ); } } } if ( bColumnAboveThreshold && bColumnWithNeededSpecies ) { ++nNumberOfColumnsWithNeededSpeciesAndQuality; aNeededSpeciesAreAlignedAndHQ.setValue( nConsPos, true ); } } // now let's see about the quality of the alignment and see // how many places have a certain alignment // mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgreeSeveralColumns( // nPaddedContigLength, // 1, // "Contig::NeededSpeciesAreAlignedAndAgreeSeveralColumn" ); mbtValVectorOfBool aColumnsFlanked( nPaddedContigLength, 1, "Contig:aColumnsFlanked" ); // nConsPosDiscrepantRegionLeft for( int nConsPosDiscrepantRegionLeft = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; // e.g., if consensus is from 1 to 11, then 11 - 5 = 6 which is // the last position to check nConsPosDiscrepantRegionLeft <= ( nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - pCP->nAutoReportSizeOfDiscrepantRegion_ + 1 ); ++nConsPosDiscrepantRegionLeft ) { // check the left flank int nNumberOfFlankingBases = 0; bool bRegionOK = true; int nConsPos2; for( nConsPos2 = nConsPosDiscrepantRegionLeft - 1; bRegionOK && ( nConsPos2 >= nGetStartConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); --nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // a little check for the left edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; // check the right flank nNumberOfFlankingBases = 0; for( nConsPos2 = nConsPosDiscrepantRegionLeft + pCP->nAutoReportSizeOfDiscrepantRegion_; bRegionOK && ( nConsPos2 <= nGetEndConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); ++nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // another little check for the right edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; // check in between that all needed species are present for( nConsPos2 = nConsPosDiscrepantRegionLeft; bRegionOK && ( nConsPos2 <= nGetEndConsensusIndex() ) && ( nConsPos2 <= nConsPosDiscrepantRegionLeft + pCP->nAutoReportSizeOfDiscrepantRegion_ - 1 ); ++nConsPos2 ) { bRegionOK = aNeededSpeciesAreAligned[ nConsPos2 ]; } if ( bRegionOK ) { for( nConsPos2 = nConsPosDiscrepantRegionLeft; nConsPos2 <= nConsPosDiscrepantRegionLeft + pCP->nAutoReportSizeOfDiscrepantRegion_ - 1; ++nConsPos2 ) { aColumnsFlanked.setValue( nConsPos2, true ); fprintf( pAO, "flanked column (unpadded/padded): %d %d\n", nUnpaddedIndex( nConsPos2 ), nConsPos2 ); } } } RWTValVector aNumberOfComparisonsIndexedByQualityValue( 99, 0, "aNumberOfComparisonsIndexedByQualityValue" ); RWTValVector aNumberOfDiscrepanciesIndexedByQualityValue( 99, 0, "aNumberOfDiscrepanciesIndexedByQualityValue" ); for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if (!aColumnsFlanked[ nConsPos ] ) continue; // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } // cases: use just forward // use just reverse // use both // use neither const char cOneRead = 'o'; const char cBothReads = 'b'; const char cNoRead = 'n'; char cCase = ' '; LocatedFragment* pReadToUse = NULL; bool bSingleSignalOfBase = false; bool bForwardIsSingleSignal = false; if ( pForwardReadSingleSignalArray ) { bForwardIsSingleSignal = ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseIsSingleSignal = false; if ( pReverseReadSingleSignalArray ) { bReverseIsSingleSignal = ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && bForwardIsSingleSignal && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; cCase = cBothReads; // both } else { // will just use one read. Which is it? // choose by which read is higher quality if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // no way to choose--don't use cCase = cNoRead; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } // if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { else } // if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos && bForwardIsSingleSignal ) ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = bForwardIsSingleSignal; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && bReverseIsSingleSignal ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = bReverseIsSingleSignal; } else { cCase = cNoRead; } // I don't know what to do when there is no read but this column // is flanked. I guess do not count it either as agreeing or // discrepant. if ( cCase == cNoRead ) { continue; } int nQuality; if ( cCase == cBothReads ) { nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality > 90 ) nQuality = 90; } else { assert( cCase == cOneRead ); nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } bool bAgree = ( pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() == ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; if ( !bAgree ) ++aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; } // for( nSpecies = 0; nSpecies < } // for( nConsPos = nGetStartConsensusIndex(); fprintf( pAO, "flanked bases {\n" ); for( int nQuality = 0; nQuality <= 90; ++nQuality ) { fprintf( pAO, "%d %d %d\n", nQuality, aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ], aNumberOfComparisonsIndexedByQualityValue[ nQuality ] ); } fprintf( pAO, "} flanked bases\n" ); } void Contig :: compareTopAndBottomStrandsNoHuman( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); aGoodReads.resort(); if ( nGetNumberOfFragsInContig() != 2 ) { fprintf( pAO, "something wrong: not 2 reads in contig\n" ); return; } LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) { fprintf( pAO, "%s is not a good read", pLocFrag->soGetName().data() ); return; } if ( pLocFrag->soGetName().bEndsWith(".f" )) { pForwardRead = pLocFrag; } else if ( pLocFrag->soGetName().bEndsWith( ".r" ) ) { pReverseRead = pLocFrag; } else { assert( false ); } } assert( pForwardRead && pReverseRead ); if ( pForwardRead->bComp() == pReverseRead->bComp() ) { fprintf( pAO, "something wrong: forward and reverse are in same orientation" ); return; } MBTValOrderedOffsetVector* pForwardSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseSingleSignalArray = NULL; pForwardRead->bGetSingleSignalBasesConsensusLength( pForwardSingleSignalArray ); pReverseRead->bGetSingleSignalBasesConsensusLength( pReverseSingleSignalArray ); if ( pForwardRead->bIsWholeReadUnaligned() || pReverseRead->bIsWholeReadUnaligned() ) return; int nConsPosLeft; int nConsPosRight; if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nConsPosLeft, nConsPosRight ) ) return; if ( !bIntersect( nConsPosLeft, nConsPosRight, pForwardRead->nGetHighQualityStart(), pForwardRead->nGetHighQualityEnd(), nConsPosLeft, nConsPosRight ) ) return; if ( !bIntersect( nConsPosLeft, nConsPosRight, pReverseRead->nGetHighQualityStart(), pReverseRead->nGetHighQualityEnd(), nConsPosLeft, nConsPosRight ) ) return; fprintf( pAO, "compareTopAndBottomStrandsNoHuman {\n" ); for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { if ( (*pForwardSingleSignalArray)[nConsPos] != cSingleSignal ) continue; if ( (*pReverseSingleSignalArray)[nConsPos] != cSingleSignal ) continue; // if reached here, both are single signal fprintf( pAO, "%s %d %d %d %s\n", ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ? "agree" : "disagree" ), (int ) pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), (int) pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), nUnpaddedIndex( nConsPos ), soGetName().data() // contig name ); } fprintf( pAO, "} compareTopAndBottomStrandsNoHuman\n" ); } void Contig :: printFlankedColumns( autoReport* pAutoReport ) { // create combined reads and record those that are: // single signal // above quality threshold // (only using high quality segment of reads) // aligned against human // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; // /me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); aGoodReads.resort(); mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpecies" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpecies" ); const unsigned char ucPTro = 1; const unsigned char ucPPan = 2; const unsigned char ucGGor = 4; const unsigned char ucPPyg = 8; const unsigned char ucMMul = 16; RWTPtrVector aCombinedReads( (size_t) 5, NULL, // initial value "aCombinedReads" ); int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } const unsigned char ucForward = 1; const unsigned char ucReverse = 2; const unsigned char ucBoth = ucForward | ucReverse; mbtValVector aWhichReadToUse( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpecies" ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; int nLeftConsPos; int nRightConsPos; if ( !pLocFrag->bGetAlignedHighQualitySegment( nLeftConsPos, nRightConsPos ) ) continue; unsigned char ucForwardOrReverse; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; ucForwardOrReverse = ucForward; } else { pReverseRead = pLocFrag; ucForwardOrReverse = ucReverse; } MBTValOrderedOffsetVector* pSingleSignalArray = NULL; pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ); for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { if ( (*pSingleSignalArray)[ nConsPos ] == cSingleSignal ) { aWhichReadToUse[ nConsPos ] |= ucForwardOrReverse; } } } if ( !pForwardRead && !pReverseRead ) continue; // create a new read which is the composite of the // existing reads RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig // this array allows me to get the combined read // by the species index aCombinedReads[ nSpecies ] = pCombinedRead; // don't know what to do about phd file // no chromat // let's fill it right now with N's from the beginning to the // end of the contig int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide('N', 0 ) ); } for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { int nQuality; LocatedFragment* pReadToUse = NULL; if ( aWhichReadToUse[ nConsPos ] == ucForward ) { nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); pReadToUse = pForwardRead; } else if ( aWhichReadToUse[ nConsPos ] == ucReverse ) { nQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); pReadToUse = pReverseRead; } else if ( aWhichReadToUse[ nConsPos ] == ucBoth ) { // can only add qualities if the reads agree if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nQuality = MIN( nQuality, 90 ); pReadToUse = pForwardRead; } else { // reads disagree if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } } else { // neither read can be used continue; } char cBase = pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase(); pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos(nConsPos), cBase); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nQuality ); if ( nQuality >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { aMaskOfSpeciesMeetingCriteria[ nConsPos ] |= ucSpeciesMask; } // note that I do not allow a column of pads to be considered // agreeing with human. Thus it cannot flank. (If there // are 10 columns of pads and 3 non-columns of pads between // 2 flanking columns, those 3 non-column of pads should be // considered flanked.) if ( ( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucSpeciesMask ) && pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() == ntGetCons( nConsPos ).cGetBase() && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } } // for( nSpecies = 0; nSpecies < aSpecies.length(); ++nSpecies ) { // now look for flanked columns mbtValVectorOfBool aColumnsAlreadyUsed( nGetPaddedConsLength(), 1, // starting position "aColumnsAlreadyUsed" ); unsigned char ucMaskOfAll = ucPTro | ucPPan | ucGGor | ucPPyg | ucMMul; fprintf( pAO, "printFlankedColumns {\n" ); fprintf( pAO, "order of species: HSap" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; fprintf( pAO, " %s", soSpecies.data() ); } fprintf( pAO, "\n" ); for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); nConsPosOfStartingColumn <= nGetEndConsensusIndex(); ++nConsPosOfStartingColumn ) { if ( ( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & ucMaskOfAll) == ucMaskOfAll ) { for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; nConsPosOfEndingColumn <= MIN( nConsPosOfStartingColumn + 4, nGetEndConsensusIndex() ); ++nConsPosOfEndingColumn ) { if ( ( aMaskOfSpeciesMeetingCriteria[ nConsPosOfEndingColumn ] & ucMaskOfAll ) == ucMaskOfAll ) { // found a pair of flanking columns for( int nConsPos = nConsPosOfStartingColumn + 1; nConsPos <= nConsPosOfEndingColumn - 1; ++nConsPos ) { // do not print out the same column more than once if ( aColumnsAlreadyUsed[ nConsPos ] ) continue; aColumnsAlreadyUsed.setValue( nConsPos, true ); fprintf( pAO, "%d %d %c", nUnpaddedIndex( nConsPos ), nConsPos, ntGetCons( nConsPos ).cGetBase() ); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } if ( pCombinedRead && ( aMaskOfSpeciesMeetingCriteria[ nConsPos ] & ucSpeciesMask ) ) { fprintf( pAO, " %c", pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ); } else { fprintf( pAO, " ?" ); } } // for( int nSpecies = 0; fprintf( pAO, " %s\n", soGetName().data() ); } // for( int nConsPos = nConsPosOfStartingColumn + 1; } // if ( aMaskOfSpeciesMeetingCriteria[ nConsPosOfEndingColumn ] & } // for( int nConsPosOfEndingColumn = nConsPosOfStartingColumn + 2; } // if ( aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPosOfStartingColumn ] & } // for( int nConsPosOfStartingColumn = nGetStartConsensusIndex(); fprintf( pAO, "} printFlankedColumns\n" ); } void Contig :: highQualitySegmentData() { int nAlignedBases = 0; int nAlignedHQSBases = 0; // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); aGoodReads.resort(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { continue; } if ( !aGoodReads.bContains( pLocFrag->soGetName() ) ) continue; int nConsPosStart = pLocFrag->nGetAlignClipStart(); int nConsPosEnd = pLocFrag->nGetAlignClipEnd(); for( int nConsPos = nConsPosStart; nConsPos <= nConsPosEnd; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; ++nAlignedBases; if ( pLocFrag->bIsInHighQualitySegmentOfRead( nConsPos ) ) { ++nAlignedHQSBases; } } } fprintf( pAO, "aligned bases: %d aligned hqs bases: %d\n", nAlignedBases, nAlignedHQSBases ); } static int nCmpLocatedFragmentAlphabetically( const LocatedFragment** ppLocFrag1, const LocatedFragment** ppLocFrag2 ) { if ( (*ppLocFrag1)->soSequenceName_ < (*ppLocFrag2)->soSequenceName_ ) return( -1 ); else if ( (*ppLocFrag2)->soSequenceName_ < (*ppLocFrag1)->soSequenceName_ ) return( 1 ); else return( 0 ); } void Contig :: printSingleSignalBases( const int nUnpadded ) { RWTPtrOrderedVector aReads; int nConsPos = nPaddedIndex( nUnpadded ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsInRead( nConsPos ) ) { aReads.insert( pLocFrag ); } } // sort reads alphabetically qsort( aReads.data(), aReads.length(), sizeof( LocatedFragment* ), ( (int(*) ( const void*, const void*) ) nCmpLocatedFragmentAlphabetically ) ); bool bSorted = true; for( nRead = 0; nRead <= aReads.length() - 2; ++nRead ) { if ( aReads[ nRead ]->soGetName() > aReads[ nRead + 1 ]->soGetName() ) { bSorted = false; cerr << "Reads " << nRead << " " << aReads[ nRead ]->soGetName() << " and " << nRead + 1 << " " << aReads[ nRead + 1 ]->soGetName() << " are out of order." << endl; cerr.flush(); } } if ( !bSorted ) assert( false ); RWTValOrderedVector aSingleSignal; RWTValOrderedVector aReadPos; for( nRead = 0; nRead < aReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads[ nRead ]; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; if ( !pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ) { cerr << "problem getting single signal info for " << pLocFrag->soGetName() << endl; } int nUnpaddedPosInDirectionOfSequencing = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); aSingleSignal.insert( (*pSingleSignalArray)[ nConsPos ] ); aReadPos.insert( nUnpaddedPosInDirectionOfSequencing ); } for( nRead = 0; nRead < aReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aReads[ nRead ]; cerr << pLocFrag->soGetName() << " " << aSingleSignal[ nRead ] << " " << aReadPos[ nRead ] << endl; } } bool Contig :: bGetCombinedReadAtBase( LocatedFragment* pForwardRead, MBTValOrderedOffsetVector* pForwardReadSingleSignalArray, LocatedFragment* pReverseRead, MBTValOrderedOffsetVector* pReverseReadSingleSignalArray, const int nConsPos, char& cCombinedBase, int& nCombinedQuality ) { bool bForwardReadBaseOK = ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) && ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ) ) ? true : false; bool bReverseReadBaseOK = ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) && ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ) ) ? true : false; if ( bForwardReadBaseOK && bReverseReadBaseOK ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nCombinedQuality = MIN( nCombinedQuality, 90 ); } else { // reads disagree. Which one should we use? // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points return false; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } } } // if ( bForwardReadBaseOK && bReverseReadBaseOK ) { else if ( bForwardReadBaseOK ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( bReverseReadBaseOK ) { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // neither read's base is ok return false; } if ( nCombinedQuality < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) return false; return true; } bool Contig :: bGetCombinedReadAtBaseAndReportProblem( const char cForFlankingOrNotForFlanking, LocatedFragment* pForwardRead, MBTValOrderedOffsetVector* pForwardReadSingleSignalArray, LocatedFragment* pReverseRead, MBTValOrderedOffsetVector* pReverseReadSingleSignalArray, const int nConsPos, char& cCombinedBase, int& nCombinedQuality, char& cProblem ) { nCombinedQuality = 0; // will be changed if read is ok bool bForwardReadBaseOK = ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) ? true : false; if ( ( cForFlankingOrNotForFlanking == cNotForFlanking ) || pCP->bAutoReportFlankingBasesMustBeSingleSignal_ ) { bForwardReadBaseOK = bForwardReadBaseOK && ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseReadBaseOK = ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) ? true : false; if ( ( cForFlankingOrNotForFlanking == cNotForFlanking ) || pCP->bAutoReportFlankingBasesMustBeSingleSignal_ ) { bReverseReadBaseOK = bReverseReadBaseOK && ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( bForwardReadBaseOK && bReverseReadBaseOK ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nCombinedQuality = MIN( nCombinedQuality, 90 ); } else { // reads disagree. Which one should we use? // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points cProblem = cProblemReadsDisagree; cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); return false; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } } } // if ( bForwardReadBaseOK && bReverseReadBaseOK ) { else if ( bForwardReadBaseOK ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( bReverseReadBaseOK ) { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { assert( !bForwardReadBaseOK ); assert( !bReverseReadBaseOK ); if ( pForwardRead && !pReverseRead) { if ( ! pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { cProblem = cProblemUnaligned; cCombinedBase = '?'; } else if ( ! pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) { cProblem = cProblemLowQualitySegment; cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( (*pForwardReadSingleSignalArray)[ nConsPos ] != cSingleSignal ) { cProblem = cProblemMultipleSignal; cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else assert( false ); } else if ( pReverseRead && !pForwardRead ) { assert( pReverseRead ); if ( ! pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { cProblem = cProblemUnaligned; cCombinedBase = '?'; } else if ( ! pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) { cProblem = cProblemLowQualitySegment; cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( (*pReverseReadSingleSignalArray)[ nConsPos ] != cSingleSignal ) { cProblem = cProblemMultipleSignal; cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else assert( false ); } else { assert( pForwardRead && pReverseRead ); if ( pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) { assert( (*pForwardReadSingleSignalArray)[ nConsPos ] != cSingleSignal ); cProblem = cProblemMultipleSignal; // not trying very hard to find the correct base. Just // taking the forward read's base arbitrarily cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) { assert( (*pReverseReadSingleSignalArray)[ nConsPos ] != cSingleSignal ); cProblem = cProblemMultipleSignal; cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { assert( !pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) ); cProblem = cProblemLowQualitySegment; cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { assert( !pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) ); cProblem = cProblemLowQualitySegment; cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { assert( !pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ); assert( !pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ); cProblem = cProblemUnaligned; cCombinedBase = '?'; } } // neither read's base is ok return false; } int nMinQuality; if ( cForFlankingOrNotForFlanking == cForFlanking ) { nMinQuality = pCP->nAutoReportMinimumQualityOfFlankingBases_; } else { nMinQuality = pCP->nQualityThresholdForFindingHighQualityDiscrepancies_; } if ( nCombinedQuality < nMinQuality ) { cProblem = cProblemLowQuality; return false; } cProblem = cProblemNone; return true; }