/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "consedParameters.h" #include "locatedFragment.h" #include "bIntersect.h" #include "mbtValVectorOfBool.h" #include "mbtValVector.h" #include "rwtptrorderedvector.h" #include "ntide.h" #include "fileDefines.h" #include "soAddCommas.h" #include "rwctokenizer.h" #include "autoReport.h" #include "consed.h" #include "assembly.h" #include "nNumberFromBase.h" #include "setErrorRateFromQuality.h" #include "dErrorRateFromQuality.h" #include "singletInfo.h" #include "readPHDAgainForHighQualitySegment.h" #include using namespace std; #include #include #include "rwcregexp.h" #include "getAllSinglets.h" #include "mbt_val_ord_offset_vec.h" #include "soLine.h" #include "max.h" #include #include "findAncestralTrees.h" #include "primateSpecies.h" #include "ucGetMaskForSpecies.h" #include "bAllNeededSpecies.h" #include "soGetRequiredBases.h" bool Contig :: bGetCombinedReadAtBase2( const char cForFlankingOrNotForFlanking, LocatedFragment* pForwardRead, MBTValOrderedOffsetVector* pForwardReadSingleSignalArray, LocatedFragment* pReverseRead, MBTValOrderedOffsetVector* pReverseReadSingleSignalArray, const int nConsPos, char& cCombinedBase, int& nCombinedQuality ) { cCombinedBase = '?'; nCombinedQuality = 0; // will be changed if read is ok bool bForwardReadBaseOK = ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pForwardRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) ? true : false; if ( ( cForFlankingOrNotForFlanking == cNotForFlanking ) || pCP->bAutoReportFlankingBasesMustBeSingleSignal_ ) { bForwardReadBaseOK = bForwardReadBaseOK && ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseReadBaseOK = ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead->bIsInHighQualitySegmentOfRead( nConsPos ) ) ? true : false; if ( ( cForFlankingOrNotForFlanking == cNotForFlanking ) || pCP->bAutoReportFlankingBasesMustBeSingleSignal_ ) { bReverseReadBaseOK = bReverseReadBaseOK && ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( bForwardReadBaseOK && bReverseReadBaseOK ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nCombinedQuality = MIN( nCombinedQuality, 90 ); } else { // reads disagree. Which one should we use? // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points return false; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } } } // if ( bForwardReadBaseOK && bReverseReadBaseOK ) { else if ( bForwardReadBaseOK ) { nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else if ( bReverseReadBaseOK ) { nCombinedQuality = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); cCombinedBase = pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // neither read's base is ok return false; } int nMinQuality; if ( cForFlankingOrNotForFlanking == cForFlanking ) { nMinQuality = pCP->nAutoReportMinimumQualityOfFlankingBases_; } else { nMinQuality = pCP->nQualityThresholdForFindingHighQualityDiscrepancies_; } if ( nCombinedQuality < nMinQuality ) { return false; } return true; } void Contig :: discrepancyRateInFlankedRegions4( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } RWTPtrOrderedVector aGoodReads( (size_t) 10 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName() == soLine ) { // the fake read for the 2nd alignment assert( !pLocFrag->soGetName().bContains( "b." ) ); aGoodReads.insert( pLocFrag ); } } } // aGoodReads.resort(); no need--we will check each one anyway fclose( pGoodReads ); // find pHumanRead LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; break; } } assert( pHumanRead ); RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); // get single signal arrays int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); } } // for( nSpecies = 0; nSpecies < deleteColumnsOfPads( true, // bAddQualityValuesOfStrands pAutoReport, pHumanRead, aGoodReads, aListOfReadsInSingleSignalArrays, aListOfSingleSignalArrays ); // new padded length after deleting columns int nPaddedContigLength = nGetPaddedConsLength(); // adjust read qualities for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } if ( pForwardRead && pReverseRead ) { int nIntersectLeft; int nIntersectRight; if ( bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) { for( int nConsPos = nIntersectLeft; nConsPos <= nIntersectRight; ++nConsPos ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { int nCombinedQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); nCombinedQuality = MIN( nCombinedQuality, 90 ); pForwardRead->setQualityAtConsPos( nConsPos, nCombinedQuality ); pReverseRead->setQualityAtConsPos( nConsPos, nCombinedQuality ); } } // now adjust the high quality segments pForwardRead->calculateHighQualitySegmentOfRead(); pReverseRead->calculateHighQualitySegmentOfRead(); } // if ( bIntersect( pForwardRead->nGetAlignClipStart(), } } // create a combined read, one for each species // each combined read will be the length of the entire consensus RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; RWCString soCombinedReadName = soSpecies + ".comb"; LocatedFragment* pCombinedRead = new LocatedFragment( soCombinedReadName, 1, // aligned position within contig false, // bComplemented this ); // contig aCombinedReads[ nSpecies ] = pCombinedRead; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { pCombinedRead->appendNtide( Ntide( 'n', 0 ) ); } } mbtValVector aMaskOfSpeciesMeetingCriteria( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesMeetingCriteria" ); mbtValVector aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman( nGetPaddedConsLength(), 1, // starts at 1 0, // initial value "Contig::aMaskOfSpeciesAndAgreeingWithHuman" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead->soGetName().bContains( soSpecies ) ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); char cCombinedBase = 0; int nCombinedQuality = -666; // new code to compare with discrepancyRateInFlankedRegions5 bool bCombinedReadALittleOK; bool bCombinedReadOK = bGetCombinedReadAtBase3( false, // bAddQualityValueOfStrands pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, bCombinedReadALittleOK ); if ( bCombinedReadOK ) { pCombinedRead->Sequence::setBaseAtPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), cCombinedBase ); pCombinedRead->Sequence::setQualityAtSeqPos( pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), nCombinedQuality ); aMaskOfSpeciesMeetingCriteria[ nConsPos ] |= ucSpeciesMask; } if ( bCombinedReadOK || bCombinedReadALittleOK ) { if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && !ntGetCons( nConsPos ).bIsPad() ) { aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] |= ucSpeciesMask; } } // old code // if ( bGetCombinedReadAtBase2( cNotForFlanking, // pForwardRead, // pForwardReadSingleSignalArray, // pReverseRead, // pReverseReadSingleSignalArray, // nConsPos, // cCombinedBase, // nCombinedQuality ) ) { // pCombinedRead->Sequence::setBaseAtPos( // pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), // cCombinedBase ); // pCombinedRead->Sequence::setQualityAtSeqPos( // pCombinedRead->nGetFragIndexFromConsPos( nConsPos ), // nCombinedQuality ); // aMaskOfSpeciesMeetingCriteria[ nConsPos ] |= ucSpeciesMask; // } // cCombinedBase = 0; // nCombinedQuality = -666; // if ( bGetCombinedReadAtBase2( cForFlanking, // pForwardRead, // pForwardReadSingleSignalArray, // pReverseRead, // pReverseReadSingleSignalArray, // nConsPos, // cCombinedBase, // nCombinedQuality ) ) { // // notice that a column of pads does not count as a flanking base // if ( ( cCombinedBase == ntGetCons( nConsPos ).cGetBase() ) && // !ntGetCons( nConsPos ).bIsPad() ) { // aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos ] // |= ucSpeciesMask; // } // } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( int nSpecies = 0; nSpecies < mbtValVector aSpeciesInAcceptableColumns( nGetPaddedConsLength(), 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); for( int nSizeOfDiscrepantRegion = 1; nSizeOfDiscrepantRegion <= pCP->nAutoReportSizeOfDiscrepantRegion_; ++nSizeOfDiscrepantRegion ) { // nConsPosDiscrepantRegionLeft for( int nConsPosDiscrepantRegionLeft = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; // e.g., if consensus is from 1 to 11, then 11 - 5 = 6 which is // the last position to check nConsPosDiscrepantRegionLeft <= ( nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - nSizeOfDiscrepantRegion + 1 ); ++nConsPosDiscrepantRegionLeft ) { // check the left flank int nNumberOfFlankingBases = 0; bool bRegionOK = true; int nConsPos2; unsigned char ucFlankingSpecies = ucPPan | ucPTro | ucGGor | ucPPyg | ucMMul; for( nConsPos2 = nConsPosDiscrepantRegionLeft - 1; bRegionOK && ( nConsPos2 >= nGetStartConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); --nConsPos2 ) { ucFlankingSpecies &= aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos2 ]; if ( bAllNeededSpecies( ucFlankingSpecies ) ) { ++nNumberOfFlankingBases; } else { bRegionOK = false; } } // a little check for the left edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; // check the right flank nNumberOfFlankingBases = 0; for( nConsPos2 = nConsPosDiscrepantRegionLeft + nSizeOfDiscrepantRegion; bRegionOK && ( nConsPos2 <= nGetEndConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); ++nConsPos2 ) { ucFlankingSpecies &= aMaskOfSpeciesMeetingCriteriaAndAgreeingWithHuman[ nConsPos2 ]; if ( bAllNeededSpecies( ucFlankingSpecies ) ) { ++nNumberOfFlankingBases; } else { bRegionOK = false; } } // another little check for the right edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; if ( bRegionOK ) { for( nConsPos2 = nConsPosDiscrepantRegionLeft; nConsPos2 <= nConsPosDiscrepantRegionLeft + nSizeOfDiscrepantRegion - 1; ++nConsPos2 ) { // if we require 0 flanking columns, this still checks // that there are the required # of species at the column // (in that case, ucFlankingSpecies will have all species) if ( bAllNeededSpecies( ucFlankingSpecies & aMaskOfSpeciesMeetingCriteria[ nConsPos2 ] ) ) { unsigned char ucFlankedSpeciesAtThisColumn = ucFlankingSpecies & aMaskOfSpeciesMeetingCriteria[ nConsPos2 ]; // use this new flanking of the column if: // 1) this column has not been used // or // 2) the new set of species is a proper superset // of the old one // the negation of the above is: // 1) the column has been used // and // 2) the new set of species is not a proper superset // of the old one if ( aSpeciesInAcceptableColumns[ nConsPos2 ] != 0 && ( ( aSpeciesInAcceptableColumns[ nConsPos2 ] & ucFlankedSpeciesAtThisColumn ) != aSpeciesInAcceptableColumns[ nConsPos2 ] ) ) continue; // why |= instead of = ? // I'm going to use "=" here: aSpeciesInAcceptableColumns[ nConsPos2] = ucFlankedSpeciesAtThisColumn; } // if ( bAllNeededSpecies( ucFlankingSpecies & } // for( nConsPos2 = nConsPosDiscrepantRegionLeft; } // if ( bRegionOK ) { } // for( int nConsPosDiscrepantRegionLeft } // for( int nSizeOfDiscrepantRegion = 1; // now go through the consensus again, and use the combined reads // for each acceptable species at each column to tabulate, at each // quality value, how many agreements and how many discrepancies there // are. RWCString soProject = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); RWTValVector aNumberOfComparisonsIndexedByQualityValue( 99, 0, "aNumberOfComparisonsIndexedByQualityValue" ); RWTValVector aNumberOfDiscrepanciesIndexedByQualityValue( 99, 0, "aNumberOfDiscrepanciesIndexedByQualityValue" ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !bAllNeededSpecies( aSpeciesInAcceptableColumns[ nConsPos ] )) continue; // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); bool bReportThisPosition = false; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); if ( ! (ucSpeciesMask & aSpeciesInAcceptableColumns[ nConsPos ] )) continue; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); char cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); int nQuality = pCombinedRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); bool bAgree = ( cBase == ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; if ( !bAgree ) { fprintf( pAO, "__discrep %d %s %c %c\n", nUnpaddedIndex( nConsPos ), soSpecies.data(), cBase, ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; } bReportThisPosition = true; } // for( nSpecies = 0; nSpecies < if ( bReportThisPosition ) { fprintf( pAO, "__passed %s %d\n", soProject.data(), nUnpaddedIndex( nConsPos ) ); } } // for( nConsPos = nGetStartConsensusIndex(); int nTotalDiscrepancies = 0; int nTotalComparisons = 0; fprintf( pAO, "discrepancyRateInFlankedRegions4 {\n" ); for( int nQuality = 0; nQuality <= 90; ++nQuality ) { fprintf( pAO, "%d %d %d\n", nQuality, aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ], aNumberOfComparisonsIndexedByQualityValue[ nQuality ] ); nTotalDiscrepancies += aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; nTotalComparisons += aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; } fprintf( pAO, "} discrepancyRateInFlankedRegions4\n" ); fprintf( pAO, "total discrep: %d comparisons: %d\n", nTotalDiscrepancies, nTotalComparisons ); } // void Contig :: discrepancyRateInFlankedRegions4( autoReport* pAutoReport ) void Contig :: discrepancyRateInFlankedRegions5( autoReport* pAutoReport ) { RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); mbtValVector aSpeciesInAcceptableColumns( 1, // will be changed 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); RWTValVector aArrayOfOneTreeAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfOneTreeAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); RWTValVector aArrayOfLenientlyFilteredBasesAtEachConsPos( 1, // size--will be changed // + 1 so we can use nConsPos (1-based) directly as an index "", // initial value "aArrayOfLenientlyFilteredBasesAtEachConsPos" ); RWTValVector aDeaminationMutationsAtEachConsPos( 1, // size--will be changed "", // initial value "aDeaminationMutationsAtEachConsPos" ); RWTValVector aDeaminationMutationsBoolAtEachConsPos( 1,// size--will be changed false, // initial value "aDeaminationMutationsBoolAtEachConsPos" ); RWTValVector aAncestralCpGAtEachConsPos( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); // this is different than above in that if a CpG occurs, both // columns are marked, rather than just the first column RWTValVector aAncestralCpGAtEachConsPos2( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); LocatedFragment* pHumanRead; applyFilters( pAutoReport, aCombinedReads, aSpeciesInAcceptableColumns, aArrayOfOneTreeAtEachConsPos, aArrayOfBasesAtEachConsPos, aArrayOfLenientlyFilteredBasesAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2, pHumanRead ); // now go through the consensus again, and use the combined reads // for each acceptable species at each column to tabulate, at each // quality value, how many agreements and how many discrepancies there // are. RWCString soProject = ConsEd::pGetAssembly()->filGetAceFileFullPathname().soGetProjectBeforeEditDir(); RWTValVector aNumberOfComparisonsIndexedByQualityValue( 99, 0, "aNumberOfComparisonsIndexedByQualityValue" ); RWTValVector aNumberOfDiscrepanciesIndexedByQualityValue( 99, 0, "aNumberOfDiscrepanciesIndexedByQualityValue" ); int nTotalNonPadComparisons = 0; int nTotalNonPadDiscrepancies = 0; int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !bAllNeededSpecies( aSpeciesInAcceptableColumns[ nConsPos ] )) continue; // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); bool bReportThisPosition = false; for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); if ( ! (ucSpeciesMask & aSpeciesInAcceptableColumns[ nConsPos ] )) continue; LocatedFragment* pCombinedRead = aCombinedReads[ nSpecies ]; assert( pCombinedRead ); char cBase = pCombinedRead->ntGetFragFromConsPos( nConsPos ).cGetBase(); int nQuality = pCombinedRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); bool bAgree = ( cBase == ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; if ( !bAgree ) { fprintf( pAO, "__discrep %d %s %c %c\n", nUnpaddedIndex( nConsPos ), soSpecies.data(), cBase, ntGetCons( nConsPos ).cGetBase() ); ++aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; } if ( cBase != '*' && !ntGetCons( nConsPos ).bIsPad() ) { ++nTotalNonPadComparisons; if ( !bAgree ) { ++nTotalNonPadDiscrepancies; } } bReportThisPosition = true; } // for( nSpecies = 0; nSpecies < if ( bReportThisPosition ) { fprintf( pAO, "__passed %s %d\n", soProject.data(), nUnpaddedIndex( nConsPos ) ); } } // for( nConsPos = nGetStartConsensusIndex(); int nTotalDiscrepancies = 0; int nTotalComparisons = 0; fprintf( pAO, "discrepancyRateInFlankedRegions5 {\n" ); for( int nQuality = 0; nQuality <= 90; ++nQuality ) { fprintf( pAO, "%d %d %d\n", nQuality, aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ], aNumberOfComparisonsIndexedByQualityValue[ nQuality ] ); nTotalDiscrepancies += aNumberOfDiscrepanciesIndexedByQualityValue[ nQuality ]; nTotalComparisons += aNumberOfComparisonsIndexedByQualityValue[ nQuality ]; } fprintf( pAO, "} discrepancyRateInFlankedRegions5\n" ); fprintf( pAO, "total discrep: %d comparisons: %d nonpad discrep: %d nonpad comp: %d\n", nTotalDiscrepancies, nTotalComparisons, nTotalNonPadDiscrepancies, nTotalNonPadComparisons ); } // void Contig :: discrepancyRateInFlankedRegions5( autoReport* pAutoReport ) void Contig :: deleteColumnsOfPads( const bool bAddQualityValuesOfStrands, autoReport* pAutoReport, LocatedFragment* pHumanRead, RWTPtrOrderedVector& aGoodReads, RWTPtrOrderedVector& aListOfReadsInSingleSignalArrays, RWTPtrOrderedVector& aListOfSingleSignalArrays ) { // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); // first, find locations in which human has a pad int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // now check all species for pads int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // single signal arrays MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; int nIndex = aListOfReadsInSingleSignalArrays.index( pLocFrag ); assert( nIndex != RW_NPOS ); if ( n == 0 ) pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; else pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( !pCP->bAutoReportUseOldCriteriaForDeletingColumnsOfPads_ ) { // new criteria for deleting columns of pads for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { bool bThisSpeciesMustBeAPad = false; if ( ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) || ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) ) { bThisSpeciesMustBeAPad = true; } if ( !bThisSpeciesMustBeAPad ) { continue; } char cCombinedBase = 0; int nCombinedQuality = -666; bool bBaseALittleOK; if ( bGetCombinedReadAtBase3( bAddQualityValuesOfStrands, pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, bBaseALittleOK ) ) { if ( cCombinedBase != '*' ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } else { // the idea here is that we really don't know if this // is a pad or not because the bases are uncertain, // so do not delete it aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( int nConsPos } else { // old criteria for deleting columns of pads for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } char cCombinedBase = 0; int nCombinedQuality = -666; bool bBaseALittleOK; if ( ! bGetCombinedReadAtBase3( bAddQualityValuesOfStrands, pForwardRead, pForwardReadSingleSignalArray, pReverseRead, pReverseReadSingleSignalArray, nConsPos, cCombinedBase, nCombinedQuality, bBaseALittleOK ) ) continue; if ( cCombinedBase != '*' ) aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } // for( int nConsPos } // if ( !pCP->bAutoReportUseOldCriteriaForDeletingColumnsOfPads_ ) { else } // for( nSpecies = 0; nSpecies < // remove columns in which all combined reads have a pad // first, do it to the single signal arrays. for( int nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; nRead < aListOfReadsInSingleSignalArrays.length(); for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( ! aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } } // deleteColumnsOfPads static bool bCheckThatFlankingBasesAgree( const RWCString& soFlankingBases, unsigned char ucAcceptableSpecies ) { char cHuman = soFlankingBases[nHSap]; bool bAgree = true; for( int nSpecies = 0; nSpecies < ConsEd::pGetAutoReport()->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = ConsEd::pGetAutoReport()->aArrayOfSpecies_[ nSpecies]; unsigned char ucSpeciesMask = ucGetMaskForSpecies( soSpecies ); if ( ucSpeciesMask & ucAcceptableSpecies ) { int nSpeciesIndex = nSpecies + 1; if ( soFlankingBases[nSpeciesIndex] != cHuman ) { bAgree = false; } } } if ( ! bAgree ) { fprintf( pAO, "required bases disagree %s acceptable species: %d\n", soFlankingBases.data(), (int) ucAcceptableSpecies ); cerr << "required bases disagree " << soFlankingBases << " acceptable species " << (int) ucAcceptableSpecies << endl; return( false ); } else { return( true ); } } void Contig :: printToCompareToReich( autoReport* pAutoReport ) { assert( !pCP->bAutoReportOnlyAllowSitesThatAreBetweenAcceptableSites_ ); RWTPtrVector aCombinedReads( (size_t) pAutoReport->aArrayOfSpecies_.length(), NULL, // initial value "aCombinedReads" ); mbtValVector aSpeciesInAcceptableColumns( 1, // will be changed 1, // starting position 0, // initial value "aSpeciesInAcceptableColumns" ); RWTValVector aArrayOfOneTreeAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfOneTreeAtEachConsPos" ); RWTValVector aArrayOfBasesAtEachConsPos( 1, // size--will be changed // + 1so we can use nConsPos (1-based) directly as an index "", "aArrayOfBasesAtEachConsPos" ); RWTValVector aArrayOfLenientlyFilteredBasesAtEachConsPos( 1, // size--will be changed // + 1 so we can use nConsPos (1-based) directly as an index "", // initial value "aArrayOfLenientlyFilteredBasesAtEachConsPos" ); RWTValVector aDeaminationMutationsAtEachConsPos( 1, // size--will be changed "", // initial value "aDeaminationMutationsAtEachConsPos" ); RWTValVector aDeaminationMutationsBoolAtEachConsPos( 1,// size--will be changed false, // initial value "aDeaminationMutationsBoolAtEachConsPos" ); RWTValVector aAncestralCpGAtEachConsPos( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); // this is different than above in that if a CpG occurs, both // columns are marked, rather than just the first column RWTValVector aAncestralCpGAtEachConsPos2( 1,// size--will be changed cNoCpG, // initial value "aAncestralCpGAtEachConsPos" ); LocatedFragment* pHumanRead; applyFilters( pAutoReport, aCombinedReads, aSpeciesInAcceptableColumns, aArrayOfOneTreeAtEachConsPos, aArrayOfBasesAtEachConsPos, aArrayOfLenientlyFilteredBasesAtEachConsPos, aDeaminationMutationsAtEachConsPos, aAncestralCpGAtEachConsPos2, pHumanRead ); RWCString soLocusName = ConsEd::pGetAssembly()->soGetLocusName(); // now find chromosome RWCTokenizer tok( soLocusName ); RWCString soChromosome = tok('-'); soChromosome.substitute( "^hs", "chr", false ); // bAll // notice that we require that nConsPos - 1 and nConsPos + 1 are // still on the consensus int nConsPos; for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= nGetEndConsensusIndex() - 1; ++nConsPos ) { if ( ! aSpeciesInAcceptableColumns[ nConsPos ] ) { continue; } assert( bAllNeededSpecies( aSpeciesInAcceptableColumns[ nConsPos ] ) ); RWCString soColumnOfBases = aArrayOfBasesAtEachConsPos[ nConsPos ]; // check the previous and subsequent columns as well // will these always have all species matching? RWCString soColumnOfBasesPrevious = aArrayOfLenientlyFilteredBasesAtEachConsPos[ nConsPos - 1]; RWCString soColumnOfBasesNext = aArrayOfLenientlyFilteredBasesAtEachConsPos[ nConsPos + 1 ]; if ( !bCheckThatFlankingBasesAgree( soColumnOfBasesPrevious, aSpeciesInAcceptableColumns[ nConsPos ] ) ) { cerr << "bad prev column " << nConsPos << " " << nUnpaddedIndex( nConsPos ) << endl; cerr << (int) aSpeciesInAcceptableColumns[ nConsPos ] << endl; cerr << endl << endl; assert( false ); } if ( !bCheckThatFlankingBasesAgree( soColumnOfBasesNext, aSpeciesInAcceptableColumns[ nConsPos ] ) ) { cerr << "bad next column " << nConsPos << " " << nUnpaddedIndex( nConsPos ) << endl; cerr << (int) aSpeciesInAcceptableColumns[ nConsPos ] << endl; cerr << endl << endl; assert( false ); } char cPreviousBase = soColumnOfBasesPrevious[ nHSap ]; char cNextBase = soColumnOfBasesNext[ nHSap ]; assert( cPreviousBase != 'n' && cPreviousBase != '*' ); assert( cNextBase != 'n' && cPreviousBase != '*' ); RWCString soHCGOM = RWCString( soColumnOfBases[nHSap] ) + RWCString( soColumnOfBases[nPTro] ) + RWCString( soColumnOfBases[nGGor] ) + RWCString( soColumnOfBases[nPPyg] ) + RWCString( soColumnOfBases[nMMul] ); RWCString soChrPos( (size_t) 20 ); if ( !pCP->bAutoReportUseCommasInBigNumbers_ ) { soChrPos.nCurrentLength_ = sprintf( soChrPos.data(), "%d", nUnpaddedIndexStartingAtUserDefinedPosition2( nConsPos ) ); } else { soChrPos = soAddCommas( nUnpaddedIndexStartingAtUserDefinedPosition2( nConsPos ) ); } fprintf( pAO, "%s %s UW %s %d %s %c %c\n", soChromosome.data(), soChrPos.data(), soLocusName.data(), nUnpaddedIndex( nConsPos ), soHCGOM.data(), cPreviousBase, cNextBase ); } }