/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "contig.h" #include "consedParameters.h" #include "locatedFragment.h" #include "bIntersect.h" #include "mbtValVectorOfBool.h" #include "mbtValVector.h" #include "rwtptrorderedvector.h" #include "ntide.h" #include "fileDefines.h" #include "soAddCommas.h" #include "rwctokenizer.h" #include "autoReport.h" #include "consed.h" #include "assembly.h" #include "nNumberFromBase.h" #include "setErrorRateFromQuality.h" #include "dErrorRateFromQuality.h" #include "singletInfo.h" #include "readPHDAgainForHighQualitySegment.h" #include #include #include #include "rwcregexp.h" #include "getAllSinglets.h" #include "mbt_val_ord_offset_vec.h" #include "soLine.h" #include "max.h" #include static bool bDebug = false; unsigned char cCpGMatrix[256][256]; static void setCpGMatrix() { for( int n1 = 0; n1 < 256; ++n1 ) { for( int n2 = 0; n2 < 256; ++n2 ) { cCpGMatrix[n1][n2] = 0; } } cCpGMatrix['c']['g'] = 1; cCpGMatrix['t']['g'] = 2; cCpGMatrix['c']['a'] = 4; } void Contig :: printDiscrepantRegions() { setCpGMatrix(); int nBases = nGetNumberOfBasesInContigNotBackbone( "HSap" ); fprintf( pAO, "\n\nAutoReportPrintAmbiguouslyAlignedRegions {\n" ); fprintf( pAO, "CONTIG: %s %d\n\n", soGetName().data(), nBases ); int nContigLengthPlusSentinels = nGetPaddedConsLength() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; int nContigLengthWithoutSentinels = nGetPaddedConsLength(); mbtValVectorOfBool aDiscrepantPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aDiscrepantPositionsHighQuality( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositionsHighQuality" ); mbtValVectorOfBool aPadPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aPadPositions" ); mbtValVector aLeftSideOfCpG( nContigLengthPlusSentinels, 1, // starts at 1 0 ); // initial value aLeftSideOfCpG.soName_ = "Contig::aLeftSideOfCpG"; mbtValVector aWorseQualityAtDiscrepantColumn( nContigLengthPlusSentinels, 1, // starts at 1 99 ); // initial value aWorseQualityAtDiscrepantColumn.soName_ = "Contig::aWorseQualityAtDiscrepantColumn"; mbtValVector aBestQuality( nContigLengthPlusSentinels, 1, // starts at 1 0 ); // initial value typedef RWTPtrOrderedVector arrayOfPointersToLocatedFragments; mbtPtrVector aArraysOfDiscrepantReads( nContigLengthPlusSentinels, 1); // starts at 1 aArraysOfDiscrepantReads.soName_ = "Contig::aArraysOfDiscrepantReads"; // int nRead; // for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { // LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // for( int n // for( int nConsPos = nGetStartConsensusIndex(); // nConsPos <= ( nGetEndConsensusIndex() - 1 ); ++nConsPos ) { // char cBaseA = ntGetCons( nConsPos ).cGetBase(); // char cBaseB = ntGetCons( nConsPos + 1 ).cGetBase(); // // if ( ( cBaseA == 'c' && cBaseB == 'g' ) || // // ( cBaseA == 'c' && cBaseB == 'a' ) || // // ( cBaseA == 't' && cBaseB == 'g' ) ) { // // aLeftSideOfCpG.setValue( nConsPos, true ); // // } // if ( cCpGMatrix[cBaseA][cBaseB] ) { // aLeftSideOfCpG |= cCpGMatrix[cBaseA][cBaseB]; // } // } mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aAlignedReadsBesidesHuman" ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() > aBestQuality[ nConsPos ] ) { aBestQuality[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99(); } } } // record CpG's (and mutant forms), discrepancies, and pads. // Do this for all reads, including human backbone, except // for specifically ignored reads for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); bool bIgnoreThisRead = false; for( int nPattern = 0; nPattern < pCP->aAutoReportPatternsOfReadsToIgnore_.length(); ++nPattern ) { if ( pLocFrag->soGetName().bContains( pCP->aAutoReportPatternsOfReadsToIgnore_[ nPattern ] ) ) { bIgnoreThisRead = true; break; } } if ( bIgnoreThisRead ) continue; // we are only interested in the part of the read that is // high quality segment, aligned, and on the consensus if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() || ntGetCons( nConsPos ).bIsPad() ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aPadPositions.setValue( nConsPos, true ); } } // check for CpG -- this needs to include human. // Note that this means that if there is a CpG in which only // one of the forms is above the quality threshold, it will // not be considered a CpG if ( nConsPos < nRightConsPos ) { char cBaseA = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); char cBaseB = pLocFrag->ntGetFragFromConsPos( nConsPos + 1 ).cGetBase(); if ( cCpGMatrix[cBaseA][cBaseB] ) { aLeftSideOfCpG[ nConsPos ] |= cCpGMatrix[cBaseA][cBaseB]; } } if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { aDiscrepantPositionsHighQuality.setValue( nConsPos, true ); } arrayOfPointersToLocatedFragments* pArrayOfDiscrepantReads = aArraysOfDiscrepantReads[ nConsPos ]; if ( !pArrayOfDiscrepantReads ) { RWCString soName = "RWTPtrOrderedVector at nConsPos = "; soName += RWCString( (long) nConsPos ); pArrayOfDiscrepantReads = new RWTPtrOrderedVector( 3, // initial capacity soName ); aArraysOfDiscrepantReads[ nConsPos ] = pArrayOfDiscrepantReads; } pArrayOfDiscrepantReads->insert( pLocFrag ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() < aWorseQualityAtDiscrepantColumn[ nConsPos ] ) { aWorseQualityAtDiscrepantColumn[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99(); } } } } // scan again looking for a discrepancy preceeded by at least // nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // mark all locations for which there are at least // pCP->nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // non-discrepant positions up to and including that base // start this out by being 1 wherever there are high quality segment // aligned reads (not the ones to ignore). Thus this // array will be 0 at the beginning and end of the consensus // where there are no reads other than the human backbone mbtValVectorOfBool aStretchesOfAgreeingBases( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aStretchesOfAgreeingBases" ); bool bInAgreeingRegion = false; int nPositionsThatAgree = 0; int nConsPos; for( nConsPos = 1; nConsPos <= aDiscrepantPositions.length(); ++nConsPos ) { // there must be aligned bases besides human that have // no discrepancies for it to be considered // aStretchesOfAgreeingBases true. Thus if we hit // a region of no such aligned reads, // this region is not aStretchesOfAgreeingBases. Furthermore, // further to the right where there are aligned reads, there must // be several columns of agreement before we consider it // a aStretchesOfAgreeingBases. Note that there is a potential // problem: // aaaaaaaaaaaaaaaaaaaaaaaaaaaa one read // daaaaaaaaaaaaaaaaaaaaaaa another read // ^ this would be considered an isolated discrepancy. // However, since there are no aligned agreeing bases in the // same read, we probably don't want it. This could be // fixed by not allowing discrepancies if they occur // immediately at the beginning or end of a read. if ( !aAlignedReadsBesidesHuman[ nConsPos ] ) { bInAgreeingRegion = false; continue; } if ( aDiscrepantPositions[ nConsPos ] ) { bInAgreeingRegion = false; } else { if ( bInAgreeingRegion ) { ++nPositionsThatAgree; } else { bInAgreeingRegion = true; nPositionsThatAgree = 1; } if ( nPositionsThatAgree >= pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { aStretchesOfAgreeingBases.setValue( nConsPos, true ); } } } // now let's calculate the number of "potential" sites for each // needed size of discrepant region // now print out minimal regions that are flanked by at // least pCP->nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ for( nConsPos = 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // -----DDDD----- // ^ must be true if ( !aStretchesOfAgreeingBases[nConsPos] ) continue; // now want to see a range of high quality discrepancies // -----DDDD----- // ^ must be true if ( !aDiscrepantPositionsHighQuality[ nConsPos + 1 ] ) continue; // find first position that doesn't have a hqd. It must // start a stretch of agreeing bases. bool bFoundEnd = false; int nFirstNonHQD; int nConsPos2; for( nConsPos2 = nConsPos + 2; nConsPos2 <= nGetEndConsensusIndex(); ++nConsPos2 ) { if ( !aDiscrepantPositionsHighQuality[ nConsPos2 ] ) { bFoundEnd = true; nFirstNonHQD = nConsPos2; break; } } // this would signal that high quality discrepancies went to the // end of the consensus--a condition we aren't interested in if ( !bFoundEnd ) continue; // -----DDDD----- // ^ must be true if ( !aStretchesOfAgreeingBases[ nFirstNonHQD + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1 ] ) continue; int nLeftDiscrepantRegion = nConsPos + 1; int nSizeOfDiscrepantRegion = nFirstNonHQD - nLeftDiscrepantRegion; int nRightDiscrepantRegion = nFirstNonHQD - 1; // if made it here, this is a high quality isolated discrepant // region. However, it might have pads or CpG mutations in it. char cPadsInDiscrepantRegion = 'b'; for( nConsPos2 = nLeftDiscrepantRegion; nConsPos2 <= nRightDiscrepantRegion; ++nConsPos2 ) { if ( aPadPositions[ nConsPos2 ] ) cPadsInDiscrepantRegion = '*'; } RWCString soCpG = "no-"; for( nConsPos2 = nLeftDiscrepantRegion - 1; nConsPos2 <= nRightDiscrepantRegion; ++nConsPos2 ) { // check with any 2 of CpG mutation's are found here // must have 2 or else all 3 if ( aLeftSideOfCpG[ nConsPos2 ] == 3 || aLeftSideOfCpG[ nConsPos2 ] == 6 || aLeftSideOfCpG[ nConsPos2 ] == 5 || aLeftSideOfCpG[ nConsPos2 ] == 7 ) { soCpG = "cpg"; } } // the 1 and 1 on the end are just for the perl script fprintf( pAO, "%s %d to %d (%d) %c %s 1 1\n", soGetName().data(), nUnpaddedIndexStartingAtUserDefinedPosition( nLeftDiscrepantRegion ), nUnpaddedIndexStartingAtUserDefinedPosition( nRightDiscrepantRegion ), nSizeOfDiscrepantRegion, cPadsInDiscrepantRegion, soCpG.data() ); } fprintf( pAO, "} AutoReportPrintAmbiguouslyAlignedRegions\n" ); fprintf( pAO, "\nPOTENTIAL SITES (size of site, number of sites) {\n" ); // now print the # of potential sites for discrepant regions // of various sizes for( int nSizeOfDiscrepantRegion = 1; nSizeOfDiscrepantRegion <= 5; ++nSizeOfDiscrepantRegion ) { int nNumberOfPotentialSitesNoPads = -666; // fprintf( pAO, "accurate:\n" ); int nNumberOfPotentialSites = nGetNumberOfPotentialDiscrepantRegions( nSizeOfDiscrepantRegion, aStretchesOfAgreeingBases, aBestQuality, aPadPositions, nNumberOfPotentialSitesNoPads ); int nRoughNumberOfPotentialSites = nGetNumberOfPotentialDiscrepantRegions2( nSizeOfDiscrepantRegion ); // fprintf( pAO, "not quite as rough:\n" ); int nPadsToo = nGetNumberOfPotentialDiscrepantRegions3( nSizeOfDiscrepantRegion ); int nQualityToo = nGetNumberOfPotentialDiscrepantRegions4( nSizeOfDiscrepantRegion ); fprintf( pAO, "%d %d %d %d %d %d\n", nSizeOfDiscrepantRegion, nNumberOfPotentialSites, nNumberOfPotentialSitesNoPads, nRoughNumberOfPotentialSites, nPadsToo, nQualityToo ); } fprintf( pAO, "} POTENTIAL SITES\n\n" ); } int Contig :: nGetNumberOfBasesInContigNotBackbone( const RWCString& soPatternInNameOfBackbone ) { int nContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aAlignedHQSPositions( nContigLength, 1, // starts at 1 "Contig::aAlignedHQSPositions" ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( soPatternInNameOfBackbone ) ) { continue; } if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; // we are only interested in the part of the read that is // high quality segment, aligned, and on the consensus if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; // will only count non-pads. There must be at least 1 hqs aligned // read that is not a pad to count it. for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aAlignedHQSPositions.setValue( nConsPos, true ); } } // for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { // if ( pCP->bAutoReportPrintDiscrepantRegionsButOnlyIfAboveQualityThreshold_ && // ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() < // pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) ) // continue; // if ( !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { // aAlignedHQSPositions.setValue( nConsPos, true ); // } // } } int nTotalAlignedHQSPositions = 0; for( int nConsPos = aAlignedHQSPositions.nGetStartIndex(); nConsPos <= aAlignedHQSPositions.nGetEndIndex(); ++nConsPos ) { if ( aAlignedHQSPositions[ nConsPos ] ) { ++nTotalAlignedHQSPositions; } } return( nTotalAlignedHQSPositions ); } int Contig :: nGetNumberOfPotentialDiscrepantRegions( const int nSizeOfPotentialDiscrepantRegion, const mbtValVectorOfBool& aStretchesOfAgreeingBases, const mbtValVector& aBestQuality, const mbtValVectorOfBool& aPadPositions, int& nPotentialSitesNoPads ) { // -----xxxxx----- // ^ this is pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // bases long // ^ this is nSizeOfPotentialDiscrepantRegion long // ^ this is pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // bases long again // so if at position nConsPos, there must be a stretch of // pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // agreeing bases starting at nConsPos // and then there must be another stretch of similar agreeing bases // at nConsPos + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + nSizeOfPotentialDiscrepantRegion int nLengthOfPattern = 2 * pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + nSizeOfPotentialDiscrepantRegion; // -----xxxxx----- // ^ here aStretchesOfAgreeingBases should be 1 // ^ here aStretchesOfAgreeingBases should be 1 int nOffset = nSizeOfPotentialDiscrepantRegion + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; // this allows nConsPos + nOffset to go to // aStretchesOfAgreeingBases.length() // which, since this is a 1-based array, is the maximum subscript int nLastConsPos = aStretchesOfAgreeingBases.length() - nOffset; int nPotentialSites = 0; nPotentialSitesNoPads = 0; for( int nConsPos = 1; nConsPos <= nLastConsPos; ++nConsPos ) { if ( aStretchesOfAgreeingBases[ nConsPos ] && aStretchesOfAgreeingBases[ nConsPos + nOffset ] ) { // so this region is flanked by aligned agreeing bases // are there any pads within the region? bool bPadsInDiscrepantRegion = false; for( int nConsPos2 = nConsPos + 1; nConsPos2 <= nConsPos + nSizeOfPotentialDiscrepantRegion; ++nConsPos2 ) { if ( aPadPositions[ nConsPos2 ] ) { bPadsInDiscrepantRegion = true; break; } } if ( pCP->bAutoReportPrintDiscrepantRegionsButOnlyIfAboveQualityThreshold_ ) { // now require that there be a high quality base at // each position within the potentially discrepant region // That is from nConsPos + 1 to // nConsPos + nSizeOfPotentialDiscrepantRegion // -----xxxx----- // ^ here to // ^ here bool bAllHighQuality = true; for( int nConsPos2 = nConsPos + 1; nConsPos2 <= nConsPos + nSizeOfPotentialDiscrepantRegion; ++nConsPos2 ) { if ( aBestQuality[ nConsPos2 ] < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { bAllHighQuality = false; break; } } if ( bAllHighQuality ) { ++nPotentialSites; if ( !bPadsInDiscrepantRegion ) { ++nPotentialSitesNoPads; } } } else { ++nPotentialSites; if ( !bPadsInDiscrepantRegion ) ++nPotentialSitesNoPads; } } // if ( aStretchesOfAgreeingBases[ nConsPos ] && } // for( int nConsPos = 1; nConsPos <= nLastConsPos; ++nConsPos ) return( nPotentialSites ); } int Contig :: nGetNumberOfPotentialDiscrepantRegions2(const int nSizeOfDiscrepantRegion ) { int nContigLengthPlusSentinels = nGetPaddedConsLength() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; mbtValVectorOfBool aDiscrepantPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLengthPlusSentinels, 1, "Contig::aAlignedReadsBesiesHuman" ); int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } } } // see if we have a 12-base region with no discrepancies, except maybe // bbbbbxxbbbbb // b, no discrepancy // x, possible discrepancy int nPotentialSites = 0; for( int nConsPos = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // for( int nConsPos = nStart; nConsPos <= nEnd; ++nConsPos ) { int nConsPos2; bool bRegionOK = true; for( nConsPos2 = nConsPos - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; bRegionOK && nConsPos2 <= nConsPos - 1; ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } for( nConsPos2 = nConsPos + nSizeOfDiscrepantRegion; bRegionOK && ( nConsPos2 <= ( nConsPos + nSizeOfDiscrepantRegion + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1 ) ); ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } if ( bRegionOK ) { ++nPotentialSites; } } return nPotentialSites; } // differs from nGetNumberOfPotentialDiscrepantRegions2 in that it // excludes pads from the "discrepant" region int Contig :: nGetNumberOfPotentialDiscrepantRegions3( const int nSizeOfDiscrepantRegion ) { int nContigLengthPlusSentinels = nGetPaddedConsLength() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; mbtValVectorOfBool aDiscrepantPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLengthPlusSentinels, 1, "Contig::aAlignedReadsBesiesHuman" ); mbtValVectorOfBool aPadPositions( nContigLengthPlusSentinels, 1, "Contig::aPadPositions" ); // set aPadPositions int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // all reads, including human //if ( pLocFrag->soGetName().bContains( "HSap" ) ) continue; if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() || ntGetCons( nConsPos ).bIsPad() ) { // notice that we have this additional check which makes // sure that there are not agreeing pads in all of the // reads that we are considering due to an insertion in a // read that is excluded. if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aPadPositions.setValue( nConsPos, true ); } } } } for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } } } // see if we have a 12-base region with no discrepancies, except maybe // bbbbbxxbbbbb // b, no discrepancy // x, possible discrepancy int nPotentialSites = 0; for( int nConsPos = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // for( int nConsPos = nStart; nConsPos <= nEnd; ++nConsPos ) { int nConsPos2; bool bRegionOK = true; for( nConsPos2 = nConsPos - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; bRegionOK && nConsPos2 <= nConsPos - 1; ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } for( nConsPos2 = nConsPos + nSizeOfDiscrepantRegion; bRegionOK && nConsPos2 <= nConsPos + nSizeOfDiscrepantRegion + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1; ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } for( nConsPos2 = nConsPos; bRegionOK && nConsPos2 <= nConsPos + nSizeOfDiscrepantRegion - 1; ++nConsPos2 ) { if ( aPadPositions[ nConsPos2 ] ) { bRegionOK = false; } } if ( bRegionOK ) { ++nPotentialSites; } } return nPotentialSites; } // differs from nGetNumberOfPotentialDiscrepantRegions3 in that it // uses quality to exclude discrepant regions int Contig :: nGetNumberOfPotentialDiscrepantRegions4( const int nSizeOfDiscrepantRegion ) { int nContigLengthPlusSentinels = nGetPaddedConsLength() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; mbtValVectorOfBool aDiscrepantPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLengthPlusSentinels, 1, "Contig::aAlignedReadsBesiesHuman" ); mbtValVectorOfBool aPadPositions( nContigLengthPlusSentinels, 1, "Contig::aPadPositions" ); mbtValVector aBestQuality( nContigLengthPlusSentinels, 1, // starts at 1 0 ); // initial value aBestQuality.soName_ = "Contig::aBestQuality"; // set aPadPositions int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // all reads, including human //if ( pLocFrag->soGetName().bContains( "HSap" ) ) continue; if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() || ntGetCons( nConsPos ).bIsPad() ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aPadPositions.setValue( nConsPos, true ); } } } } // all reads except human for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() > aBestQuality[ nConsPos ] ) { aBestQuality[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99(); } } } // see if we have a 12-base region with no discrepancies, except maybe // bbbbbxxbbbbb // b, no discrepancy // x, possible discrepancy int nPotentialSites = 0; for( int nConsPos = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // for( int nConsPos = nStart; nConsPos <= nEnd; ++nConsPos ) { int nConsPos2; bool bRegionOK = true; for( nConsPos2 = nConsPos - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; bRegionOK && nConsPos2 <= nConsPos - 1; ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } // e.g., if discrepant region is 2, since nConsPos is at the beginning // of the discrepant region, then nConsPos + 6 is the end of // the flanking region for( nConsPos2 = nConsPos + nSizeOfDiscrepantRegion; bRegionOK && nConsPos2 <= ( nConsPos + nSizeOfDiscrepantRegion + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1 ); ++nConsPos2 ) { if ( aDiscrepantPositions[ nConsPos2 ] || !aAlignedReadsBesidesHuman[ nConsPos2 ] ) { bRegionOK = false; } } for( nConsPos2 = nConsPos; bRegionOK && nConsPos2 <= nConsPos + nSizeOfDiscrepantRegion - 1; ++nConsPos2 ) { if ( aPadPositions[ nConsPos2 ] ) { bRegionOK = false; } } for( nConsPos2 = nConsPos; bRegionOK && nConsPos2 <= nConsPos + nSizeOfDiscrepantRegion - 1; ++nConsPos2 ) { if ( aBestQuality[ nConsPos2 ] < pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { bRegionOK = false; } } if ( bRegionOK ) { ++nPotentialSites; } } return nPotentialSites; } void Contig :: addToMinimumQualityHistogram( RWTValVector& aMinimumQualityHistogram ) { // make an array of this contig and an array of bool telling whether // a particular location is ok to use or not. Run through all reads // (not human) figuring out the minimum of each column. After all // reads have been read, then run through the quality array and // for each column (that should be used), add 1 to the corresponding // position in aMinimumQualityHistogram int nContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLength, 1, // starts at 1 "Contig::aAlignedReadsBesidesHuman" ); mbtValVector aWorstQuality( nContigLength, 1, // starts at 1 100 ); // initial value aWorstQuality.soName_ = "Contig::aWorstQuality"; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; // also do not include human read if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); // pads are not allowed to contribute to the minimum quality if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() < aWorstQuality[ nConsPos ] && !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aWorstQuality[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } } } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aAlignedReadsBesidesHuman[ nConsPos ] ) { ++aMinimumQualityHistogram[ aWorstQuality[ nConsPos ] ]; } } } class humanPrimatePad { public: int nConsPos_; bool bPrimatePad_; bool bHumanPad_; bool bPrimateAlignedBase_; Quality qual_; public: humanPrimatePad(const int nConsPos ) : nConsPos_( nConsPos ), bPrimatePad_( false ), bHumanPad_( false ), bPrimateAlignedBase_( false ), qual_( 0 ) {} humanPrimatePad() : nConsPos_( -666 ), bPrimatePad_( false ), bHumanPad_( false ), bPrimateAlignedBase_( false ), qual_( 0 ) {} bool operator==( const humanPrimatePad& hum ) const { return( ( nConsPos_ == hum.nConsPos_ ) && ( bPrimatePad_ == hum.bPrimatePad_ ) && ( bHumanPad_ == hum.bHumanPad_ ) && ( bPrimateAlignedBase_ == hum.bPrimateAlignedBase_ ) && ( qual_ == hum.qual_ ) ); } }; void Contig :: addNumberOfIsolatedPads( mbtValVector& aIsolatedPrimatePads, mbtValVector& aIsolatedHumanPads ) { int nContigLength = nGetPaddedConsLength(); RWTValOrderedVector aHumanPrimatePads( (size_t) nContigLength ); int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { aHumanPrimatePads.insert( humanPrimatePad( nConsPos ) ); } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( pCP->soAutoReportIsolatedPadsOfReadsWithThisPattern_ ) ) { if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { aHumanPrimatePads[ nConsPos ].bPrimateAlignedBase_ = true; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > aHumanPrimatePads[ nConsPos ].qual_ ) { aHumanPrimatePads[ nConsPos ].qual_ = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aHumanPrimatePads[ nConsPos ].bPrimatePad_ = true; } } } } // now go through again and look for human pads. for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( ntGetCons( nConsPos ).bIsPad() ) { aHumanPrimatePads[ nConsPos ].bHumanPad_ = true; } } // now go through and look for double pads and remove those columns. int nIndex; for( nIndex = aHumanPrimatePads.length() - 1; nIndex >= 0; --nIndex ) { if ( aHumanPrimatePads[ nIndex ].bPrimatePad_ && aHumanPrimatePads[ nIndex ].bHumanPad_ ) { aHumanPrimatePads.removeAt( nIndex ); } } // now look for isolated bases for( nIndex = 1; nIndex <= ( aHumanPrimatePads.length() - 2 ); ++nIndex ) { if ( !aHumanPrimatePads[ nIndex - 1 ].bHumanPad_ && aHumanPrimatePads[ nIndex ].bHumanPad_ && !aHumanPrimatePads[ nIndex + 1 ].bHumanPad_ && aHumanPrimatePads[ nIndex ].bPrimateAlignedBase_ ) { ++aIsolatedHumanPads[ aHumanPrimatePads[nIndex].qual_ ]; // this is just temporary so we can navigate by high // quality pads if ( aHumanPrimatePads[nIndex].qual_ >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { fprintf( pAO, "human pad %s %d %d\n", soGetName().data(), nUnpaddedIndex( aHumanPrimatePads[nIndex].nConsPos_ ), (int) aHumanPrimatePads[nIndex].qual_ ); } // end temp } if ( !aHumanPrimatePads[ nIndex - 1 ].bPrimatePad_ && aHumanPrimatePads[ nIndex ].bPrimatePad_ && !aHumanPrimatePads[ nIndex + 1 ].bPrimatePad_ && aHumanPrimatePads[ nIndex - 1 ].bPrimateAlignedBase_ && aHumanPrimatePads[ nIndex ].bPrimateAlignedBase_ && aHumanPrimatePads[ nIndex + 1 ].bPrimateAlignedBase_ ) { ++aIsolatedPrimatePads[ aHumanPrimatePads[nIndex].qual_ ]; // temporary so we can navigate by high quality pads if ( aHumanPrimatePads[nIndex].qual_ >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) { fprintf( pAO, "primate pad %s %d %d\n", soGetName().data(), nUnpaddedIndex( aHumanPrimatePads[nIndex].nConsPos_ ), (int) aHumanPrimatePads[nIndex].qual_ ); } // end temporary } } } typedef mbtValVector arrayOfQuality; void Contig :: countPadsForEachSpecies( const RWTValOrderedVector& aSpecies, RWTValOrderedVector& aNumberOfPadsInPrimateButNotInHuman, RWTValOrderedVector& aNumberOfPadsInHumanButNotInPrimate, int& nPossibleSites ) { RWTPtrOrderedVector aArrayForEachSpeciesWhetherAlignedHQS; RWTPtrOrderedVector aArrayForEachSpeciesOfHighestQuality; RWTPtrOrderedVector aArrayForEachSpeciesWhetherPad; int nSpecies; for( nSpecies = 0; nSpecies < aSpecies.length(); ++nSpecies ) { RWCString soName = "Contig::countPadsForEachSpecies "; soName += aSpecies[ nSpecies]; mbtValVectorOfBool* pArrayOfAlignedHQS = new mbtValVectorOfBool( nGetPaddedConsLength(), 1, soName + "pArrayOfAligned" ); arrayOfQuality* pArrayOfQuality = new mbtValVector( nGetPaddedConsLength(), 1, // offset 0 ); // initial quality mbtValVectorOfBool* pArrayOfWhetherPad = new mbtValVectorOfBool( nGetPaddedConsLength(), 1, soName + "pArrayOfWhetherPad" ); aArrayForEachSpeciesWhetherAlignedHQS.insert( pArrayOfAlignedHQS ); aArrayForEachSpeciesOfHighestQuality.insert( pArrayOfQuality ); aArrayForEachSpeciesWhetherPad.insert( pArrayOfWhetherPad ); } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nLeftConsPos = pLocFrag->nGetHighQualityStart(); int nRightConsPos = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nLeftConsPos, nRightConsPos ) ) continue; if ( !bIntersect( nLeftConsPos, nRightConsPos, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nLeftConsPos, nRightConsPos ) ) continue; // let's figure out the species of the read int nSpeciesOfRead = -1; for( int nSpecies = 0; nSpecies < aSpecies.length(); ++nSpecies ) { if ( pLocFrag->soGetName().bContains( aSpecies[ nSpecies ] ) ) { nSpeciesOfRead = nSpecies; break; } } if ( nSpeciesOfRead == -1 ) continue; mbtValVectorOfBool* pArrayOfAlignedHQS = aArrayForEachSpeciesWhetherAlignedHQS[ nSpeciesOfRead ]; arrayOfQuality* pArrayOfQuality = aArrayForEachSpeciesOfHighestQuality[ nSpeciesOfRead ]; mbtValVectorOfBool* pArrayOfWhetherPad = aArrayForEachSpeciesWhetherPad[ nSpeciesOfRead ]; for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { pArrayOfAlignedHQS->setValue( nConsPos, true ); if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > (*pArrayOfQuality)[ nConsPos ] ) { (*pArrayOfQuality)[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); // we are only interested if the highest quality read has // a pad bool bIsPad = pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad(); pArrayOfWhetherPad->setValue( nConsPos, bIsPad ); } } } // now let's see which positions have pads in all species mbtValVectorOfBool aArrayAllSpeciesAlignedHQS( nGetPaddedConsLength(), 1, "Contig::countPadsForEachSpecies aArrayForAllSpeciesAlignedHQS" ); // start out by setting this array true at each consensus // position. If any species is not aligned, set it false. int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { aArrayAllSpeciesAlignedHQS.setValue( nConsPos, true ); } mbtValVectorOfBool aArrayAnySpeciesPad( nGetPaddedConsLength(), 1, "Contig::countPadsForEachSpecies aArrayAnySpeciesPad" ); mbtValVectorOfBool aArrayAnySpeciesNotPad( nGetPaddedConsLength(), 1, "Contig::countPadsForEachSpecies aArrayAnySpeciesNotPad" ); mbtValVector aArrayOfWorstSpeciesQuality( nGetPaddedConsLength(), 1, 100 ); for( nSpecies = 0; nSpecies < aArrayForEachSpeciesWhetherAlignedHQS.length(); ++nSpecies ) { mbtValVectorOfBool* pArrayOfAlignedHQS = aArrayForEachSpeciesWhetherAlignedHQS[ nSpecies]; arrayOfQuality* pArrayOfQuality = aArrayForEachSpeciesOfHighestQuality[ nSpecies ]; mbtValVectorOfBool* pArrayOfWhetherPad = aArrayForEachSpeciesWhetherPad[ nSpecies ]; for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( !(*pArrayOfAlignedHQS)[nConsPos] ) { aArrayAllSpeciesAlignedHQS.setValue( nConsPos, false ); } // special case for human--quality is all 20, but // really should be considered 50 or so. if ( aSpecies[ nSpecies ] != "HSap" ) { if ( aArrayOfWorstSpeciesQuality[ nConsPos ] > (*pArrayOfQuality)[ nConsPos ] ) { aArrayOfWorstSpeciesQuality[ nConsPos ] = (*pArrayOfQuality)[ nConsPos ]; } } if ( (*pArrayOfWhetherPad)[ nConsPos ] ) { aArrayAnySpeciesPad.setValue( nConsPos, true ); } else { aArrayAnySpeciesNotPad.setValue( nConsPos, true ); } } } // find human pad array. Our rule is that human must have no // pad mbtValVectorOfBool* pHumanArrayOfWhetherPad = NULL; for( nSpecies = 0; nSpecies < aSpecies.length(); ++nSpecies ) { if ( aSpecies[ nSpecies ].bContains( "HSap" ) ) { pHumanArrayOfWhetherPad = aArrayForEachSpeciesWhetherPad[ nSpecies ]; break; } } assert( pHumanArrayOfWhetherPad ); // now look for a particular motif: // b-b where b is no pad, the - can be a pad or not a pad, but must be // of quality above a particular threshold. for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= ( nGetEndConsensusIndex() - 1); ++nConsPos ) { if ( aArrayAllSpeciesAlignedHQS[ nConsPos - 1 ] && aArrayAllSpeciesAlignedHQS[ nConsPos ] && aArrayAllSpeciesAlignedHQS[ nConsPos + 1 ] && !aArrayAnySpeciesPad[ nConsPos - 1 ] && // aArrayAnySpeciesPad[ nConsPos ] && !aArrayAnySpeciesPad[ nConsPos + 1 ] && // !(*pHumanArrayOfWhetherPad)[ nConsPos ] && ( aArrayOfWorstSpeciesQuality[ nConsPos ] >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) ) { ++nPossibleSites; } if ( aArrayAllSpeciesAlignedHQS[ nConsPos - 1 ] && aArrayAllSpeciesAlignedHQS[ nConsPos ] && aArrayAllSpeciesAlignedHQS[ nConsPos + 1 ] && !aArrayAnySpeciesPad[ nConsPos - 1 ] && aArrayAnySpeciesPad[ nConsPos ] && !aArrayAnySpeciesPad[ nConsPos + 1 ] && !(*pHumanArrayOfWhetherPad)[ nConsPos ] && ( aArrayOfWorstSpeciesQuality[ nConsPos ] >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) ) { // some species has a pad at this position. Figure out which // it is (may be more than one) and increment its counter. for( nSpecies = 0; nSpecies < aArrayForEachSpeciesWhetherPad.length(); ++nSpecies ) { mbtValVectorOfBool* pArrayOfWhetherPad = aArrayForEachSpeciesWhetherPad[ nSpecies ]; if ( (*pArrayOfWhetherPad)[ nConsPos ] ) { ++aNumberOfPadsInPrimateButNotInHuman[ nSpecies ]; // temporary--make a script looking just // for cases in which pad in human, not in chimp if ( aSpecies[ nSpecies ] == "PTro" ) { // help in testing fprintf( pAO, "pad %s %d\n", soGetName().data(), nUnpaddedIndex( nConsPos ) ); } } } } } // for( nConsPos for( nConsPos = nGetStartConsensusIndex() + 1; nConsPos <= ( nGetEndConsensusIndex() - 1 ); ++nConsPos ) { if ( aArrayAllSpeciesAlignedHQS[ nConsPos - 1 ] && aArrayAllSpeciesAlignedHQS[ nConsPos ] && aArrayAllSpeciesAlignedHQS[ nConsPos + 1 ] && !aArrayAnySpeciesPad[ nConsPos - 1 ] && !aArrayAnySpeciesPad[ nConsPos + 1 ] && (*pHumanArrayOfWhetherPad)[ nConsPos ] && aArrayAnySpeciesNotPad[ nConsPos ] && ( aArrayOfWorstSpeciesQuality[ nConsPos ] >= pCP->nQualityThresholdForFindingHighQualityDiscrepancies_ ) ) { // b*b human // bbb some primate // b*b (others might still have pads) // ^ all are good quality // all primates present with hqs and aligned for( nSpecies = 0; nSpecies < aSpecies.length(); ++nSpecies ) { mbtValVectorOfBool* pArrayOfWhetherPad = aArrayForEachSpeciesWhetherPad[ nSpecies ]; if ( !(*pArrayOfWhetherPad)[ nConsPos ] ) { ++aNumberOfPadsInHumanButNotInPrimate[ nSpecies ]; // temporary--make a script looking just // for cases in which pad in human, not in chimp if ( aSpecies[ nSpecies ] == "PTro" ) { // help in testing fprintf( pAO, "pad %s %d\n", soGetName().data(), nUnpaddedIndex( nConsPos ) ); } } } } } } void Contig :: printBasesInDiscrepantRegions( autoReport* pAutoReport ) { setCpGMatrix(); int nBases = nGetNumberOfBasesInContigNotBackbone( "HSap" ); fprintf( pAO, "\n\nAutoReportPrintDiscrepantRegionsAndBases {\n" ); fprintf( pAO, "CONTIG: %s %d\n\n", soGetName().data(), nBases ); int nContigLengthPlusSentinels = nGetPaddedConsLength() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; int nContigLengthWithoutSentinels = nGetPaddedConsLength(); mbtValVectorOfBool aDiscrepantPositions( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aDiscrepantPositionsNotNearEndsOfReads( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aDiscrepantPositionsNotNearEndsOfReads" ); // mbtValVectorOfBool aDiscrepantPositionsHighQuality( nContigLengthPlusSentinels, // 1, // starts at 1 // "Contig::aDiscrepantPositionsHighQuality" ); mbtValVector aLeftSideOfCpG( nContigLengthPlusSentinels, 1, // starts at 1 0 ); // initial value aLeftSideOfCpG.soName_ = "Contig::aLeftSideOfCpG"; typedef RWTPtrOrderedVector arrayOfPointersToLocatedFragments; mbtPtrVector aArraysOfDiscrepantReads( nContigLengthPlusSentinels, 1); // starts at 1 aArraysOfDiscrepantReads.soName_ = "Contig::aArraysOfDiscrepantReads"; mbtValVectorOfBool aAlignedReadsBesidesHuman( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aAlignedReadsBesidesHuman" ); // this loop takes the non-human reads and set aAlignedReadsBesidesHuman // and aBestQuality int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; if ( pLocFrag->soGetName().bContains( pCP->soAutoReportBackboneReadHasThisStringInIt_ ) ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nHighQualityAlignedConsPosLeft = pLocFrag->nGetHighQualityStart(); int nHighQualityAlignedConsPosRight = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight ) ) continue; if ( !bIntersect( nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight ) ) continue; for( int nConsPos = nHighQualityAlignedConsPosLeft; nConsPos <= nHighQualityAlignedConsPosRight; ++nConsPos ) { aAlignedReadsBesidesHuman.setValue( nConsPos, true ); } } // record CpG's (and mutant forms), discrepancies, and pads. // Do this for all reads, including human backbone, except // for specifically ignored reads for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pCP->aAutoReportPatternsOfReadsToIgnore_.bIsContainedBy( pLocFrag->soGetName() ) ) continue; // we are only interested in the part of the read that is // high quality segment, aligned, and on the consensus if ( pLocFrag->bIsWholeReadLowQuality() ) continue; int nHighQualityAlignedConsPosLeft = pLocFrag->nGetHighQualityStart(); int nHighQualityAlignedConsPosRight = pLocFrag->nGetHighQualityEnd(); if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( !bIntersect( nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight ) ) continue; if ( !bIntersect( nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight, nGetStartConsensusIndex(), nGetEndConsensusIndex(), nHighQualityAlignedConsPosLeft, nHighQualityAlignedConsPosRight ) ) continue; // <-------------------------> // high quality and aligned // ^nHighQualityAlignedConsPosLeft ^nHighQualityAlignedConsPosRight // ^nHighQualityAlignedNotNearEndConsPosLeft // ^ // nHighQualityAlignedNotNearEndConsPosRight int nHighQualityAlignedNotNearEndConsPosLeft = nHighQualityAlignedConsPosLeft + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; int nHighQualityAlignedNotNearEndConsPosRight = nHighQualityAlignedConsPosRight - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; for( int nConsPos = nHighQualityAlignedConsPosLeft; nConsPos <= nHighQualityAlignedConsPosRight; ++nConsPos ) { // check for CpG -- this needs to include human. // Note that this means that if there is a CpG in which only // one of the forms is above the quality threshold, it will // not be considered a CpG if ( nConsPos < nHighQualityAlignedConsPosRight ) { char cBaseA = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); char cBaseB = pLocFrag->ntGetFragFromConsPos( nConsPos + 1 ).cGetBase(); if ( cCpGMatrix[cBaseA][cBaseB] ) { aLeftSideOfCpG[ nConsPos ] |= cCpGMatrix[cBaseA][cBaseB]; } } if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != ntGetCons( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); if ( nHighQualityAlignedNotNearEndConsPosLeft <= nConsPos && nConsPos <= nHighQualityAlignedNotNearEndConsPosRight ) aDiscrepantPositionsNotNearEndsOfReads.setValue( nConsPos, true ); arrayOfPointersToLocatedFragments* pArrayOfDiscrepantReads = aArraysOfDiscrepantReads[ nConsPos ]; if ( !pArrayOfDiscrepantReads ) { RWCString soName = "RWTPtrOrderedVector at nConsPos = "; soName += RWCString( (long) nConsPos ); pArrayOfDiscrepantReads = new RWTPtrOrderedVector( 3, // initial capacity soName ); aArraysOfDiscrepantReads[ nConsPos ] = pArrayOfDiscrepantReads; } pArrayOfDiscrepantReads->insert( pLocFrag ); } } } // scan again looking for a discrepancy preceeded by at least // nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // mark all locations for which there are at least // pCP->nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // non-discrepant positions up to and including that base // start this out by being 1 wherever there are high quality segment // aligned reads (not the ones to ignore). Thus this // array will be 0 at the beginning and end of the consensus // where there are no reads other than the human backbone mbtValVectorOfBool aStretchesOfAgreeingBases( nContigLengthPlusSentinels, 1, // starts at 1 "Contig::aStretchesOfAgreeingBases" ); bool bInAgreeingRegion = false; int nPositionsThatAgree = 0; int nConsPos; for( nConsPos = 1; nConsPos <= aDiscrepantPositions.length(); ++nConsPos ) { // there must be aligned bases besides human that have // no discrepancies for it to be considered // aStretchesOfAgreeingBases true. Thus if we hit // a region of no such aligned reads, // this region is not aStretchesOfAgreeingBases. Furthermore, // further to the right where there are aligned reads, there must // be several columns of agreement before we consider it // a aStretchesOfAgreeingBases. Note that there is a potential // problem: // aaaaaaaaaaaaaaaaaaaaaaaaaaaa one read // daaaaaaaaaaaaaaaaaaaaaaa another read // ^ this would be considered an isolated discrepancy. // However, since there are no aligned agreeing bases in the // same read, we probably don't want it. This could be // fixed by not allowing discrepancies if they occur // immediately at the beginning or end of a read. if ( !aAlignedReadsBesidesHuman[ nConsPos ] ) { bInAgreeingRegion = false; continue; } if ( aDiscrepantPositions[ nConsPos ] ) { bInAgreeingRegion = false; } else { if ( bInAgreeingRegion ) { ++nPositionsThatAgree; } else { bInAgreeingRegion = true; nPositionsThatAgree = 1; } if ( nPositionsThatAgree >= pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { aStretchesOfAgreeingBases.setValue( nConsPos, true ); } } } // now let's calculate the number of "potential" sites for each // needed size of discrepant region // now print out minimal regions that are flanked by at // least pCP->nAutoReportNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ for( nConsPos = 1; nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // -----DDDD----- // ^ must be true (aStretchesOfAgreeingBases) if ( !aStretchesOfAgreeingBases[nConsPos] ) continue; // now want to see a range of high quality discrepancies // -----DDDD----- // ^ must be true (aDiscrepantPositions) // I used to just look at the discrepant positions. However, // there were cases like this: // aaaaaaaaaa // DDaaaaaaaa where the DD was at the beginning of the // high quality aligned portion of the read if ( !aDiscrepantPositionsNotNearEndsOfReads[ nConsPos + 1 ] ) continue; // now look for another stretch of of agreeing bases on the right side // -----D----- // ^nConsPos // ^nFirstConsPosToCheck // -----DD----- // -----DDD----- // -----DDDD----- // ^nLastConsPosToCheck int nFirstConsPosToCheck = nConsPos + 1 + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; if ( nFirstConsPosToCheck > nGetEndConsensusIndex() ) continue; int nLastConsPosToCheck = MIN( nGetEndConsensusIndex(), nConsPos + pCP->nAutoReportMaxSizeOfDiscrepantRegion_ + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); int nLeftDiscrepantRegion = nConsPos + 1; int nRightDiscrepantRegion; int nSizeOfDiscrepantRegion; bool bFoundStretchOfAgreeingBases = false; for( int nConsPos2 = nFirstConsPosToCheck; nConsPos2 <= nLastConsPosToCheck && !bFoundStretchOfAgreeingBases; ++nConsPos2 ) { if ( aStretchesOfAgreeingBases[ nConsPos2 ] ) { bFoundStretchOfAgreeingBases = true; nSizeOfDiscrepantRegion = nConsPos2 - nFirstConsPosToCheck + 1; nRightDiscrepantRegion = nLeftDiscrepantRegion + nSizeOfDiscrepantRegion - 1; } } if ( !bFoundStretchOfAgreeingBases ) continue; // if made it here, this is an isolated discrepant // region. However, it might have CpG mutations in it. bool bCpG = false; for( int nConsPos4 = nLeftDiscrepantRegion - 1; nConsPos4 <= nRightDiscrepantRegion; ++nConsPos4 ) { // check with any 2 of CpG mutation's are found here // must have 2 or else all 3 if ( aLeftSideOfCpG[ nConsPos4 ] == 3 || aLeftSideOfCpG[ nConsPos4 ] == 6 || aLeftSideOfCpG[ nConsPos4 ] == 5 || aLeftSideOfCpG[ nConsPos4 ] == 7 ) { bCpG = true; } } if ( bCpG ) continue; // We must make sure that the agreeing region has at least // pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // non-pads. So far we have just counted that many padded positions. int nFlankingRegionLeft; int nNumberOfNonPads = 0; bool bFoundEnoughNonPads = false; for( int nConsPos5 = nLeftDiscrepantRegion - 1; nConsPos5 >= nGetStartConsensusIndex() && !bFoundEnoughNonPads; --nConsPos5 ) { if ( !ntGetCons( nConsPos5 ).bIsPad() ) { ++nNumberOfNonPads; if ( nNumberOfNonPads >= pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { nFlankingRegionLeft = nConsPos5; bFoundEnoughNonPads = true; } } } // sorry--can't be sure of the alignment if ( !bFoundEnoughNonPads ) continue; // now look at the right flanking region int nFlankingRegionRight; nNumberOfNonPads = 0; bFoundEnoughNonPads = false; for( int nConsPos6 = nRightDiscrepantRegion + 1; nConsPos6 <= nGetEndConsensusIndex() && !bFoundEnoughNonPads; ++nConsPos6 ) { if ( !ntGetCons( nConsPos6 ).bIsPad() ) { ++nNumberOfNonPads; if ( nNumberOfNonPads >= pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { nFlankingRegionRight = nConsPos6; bFoundEnoughNonPads = true; } } } if ( !bFoundEnoughNonPads ) continue; // now check that these regions are agreeing. the left flanking // region goes from nFlankingRegionLeft to nLeftDiscrepantRegion - 1. // Since aStretchOfAgreeingBases is indexed by the right // base of the stretch of // pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ // bases, we need to check this range: bool bOK = true; for( int nConsPos7 = nFlankingRegionLeft + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1; nConsPos7 <= nLeftDiscrepantRegion - 1; ++nConsPos7 ) { if ( !aStretchesOfAgreeingBases[ nConsPos7 ] ) { bOK = false; break; } } if ( !bOK ) { continue; } // check the right region that it agrees // the right region is from nRightDiscrepantRegion + 1 to // nFlankingRegionRight for( int nConsPos8 = nRightDiscrepantRegion + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_; nConsPos8 <= nFlankingRegionRight; ++nConsPos8 ) { if ( !aStretchesOfAgreeingBases[ nConsPos8 ] ) { bOK = false; break; } } if ( !bOK ) { continue; } // now we want to print out the bases for all species. // If there is no read for a species, print out a "?" int nLowestQuality = 200; int nHighestQuality = -1; for( int nConsPos3 = nLeftDiscrepantRegion; nConsPos3 <= nRightDiscrepantRegion; ++nConsPos3 ) { // this can happen as follows: // aaaaaaDaDaaaaaaaa // ^ in middle of discrepant region if ( !aDiscrepantPositions[ nConsPos3 ] ) continue; int nNumberOfSpecies = pAutoReport->aArrayOfSpecies_.length(); RWTPtrVector aBestReadForSpecies( (size_t) nNumberOfSpecies, 0 ); // initialize to 0 RWTValVector aBaseOfBestReadForSpecies( (size_t) nNumberOfSpecies, '?' ); // initialize to ? for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( ! ( pLocFrag->nGetHighQualityStart() <= nFlankingRegionLeft && nFlankingRegionRight <= pLocFrag->nGetHighQualityEnd() ) ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( ! ( pLocFrag->nGetAlignClipStart() <= nFlankingRegionLeft && nFlankingRegionRight <= pLocFrag->nGetAlignClipEnd() ) ) continue; // figure out what species this is. int nSpecies = pAutoReport->aArrayOfSpecies_.nFindIndexOfFirstContainedBy( pLocFrag->soGetName() ); if ( nSpecies == RW_NPOS ) { cerr << "couldn't find species for: " << pLocFrag->soGetName() << endl; } assert( nSpecies != RW_NPOS ); if ( !aBestReadForSpecies[nSpecies] ) { aBestReadForSpecies[nSpecies] = pLocFrag; aBaseOfBestReadForSpecies[ nSpecies ] = pLocFrag->ntGetFragFromConsPos( nConsPos3 ).cGetBase(); } else { // there already is a read for this species. Let's see // which is better LocatedFragment* pOldRead = aBestReadForSpecies[nSpecies]; const int nWindowSize = 11; float fOldReadQuality = pOldRead->fGetAverageQualityInWindow( nConsPos3, nWindowSize ); float fNewReadQuality = pLocFrag->fGetAverageQualityInWindow( nConsPos3, nWindowSize ); if ( fNewReadQuality > fOldReadQuality ) { aBestReadForSpecies[nSpecies] = pLocFrag; aBaseOfBestReadForSpecies[ nSpecies ] = pLocFrag->ntGetFragFromConsPos( nConsPos3 ).cGetBase(); } } } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); // at this point, we've collected info on all the species at this // nConsPos3. So let's print them out. for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { putc( aBaseOfBestReadForSpecies[ nSpecies ], pAO ); putc( ' ', pAO ); } // let's add a little information in case we want to find this // location to study it further, such as look at the traces fprintf( pAO, "%s %d %s\n", soGetName().data(), // contig name nUnpaddedIndex( nConsPos3 ), ConsEd::pGetAssembly()->soGetAceFileName().data() ); // do we really want to write this so many times? // START--looking for informative trees. At least 2 species // for each of 2 different bases. bool bFoundTreeWith2Groups = false; for( int nQualityCutoff = 70; nQualityCutoff >= 5 && !bFoundTreeWith2Groups; nQualityCutoff -= 5 ) { int nNumberOfSpeciesForBase[256]; for( int n = 0; n < 256; ++n ) nNumberOfSpeciesForBase[n] = 0; bool bMMulAboveCutoff = false; bool bPPygAboveCutoff = false; char cMMul = '?'; char cPPyg = '!'; for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; int nQuality = -1; if ( soSpecies == "HSap" ) nQuality = 50; else if ( aBestReadForSpecies[ nSpecies ] ) { nQuality = aBestReadForSpecies[ nSpecies ]->ntGetFragFromConsPos( nConsPos3 ).qualGetQuality(); } if ( nQuality >= nQualityCutoff ) { ++nNumberOfSpeciesForBase[ aBestReadForSpecies[ nSpecies ]->ntGetFragFromConsPos( nConsPos3 ).cGetBase() ]; if ( soSpecies == "MMul" ) { bMMulAboveCutoff = true; cMMul = aBestReadForSpecies[ nSpecies ]->ntGetFragFromConsPos( nConsPos3 ).cGetBase(); } else if ( soSpecies == "PPyg" ) { bPPygAboveCutoff = true; cPPyg = aBestReadForSpecies[ nSpecies ]->ntGetFragFromConsPos( nConsPos3 ).cGetBase(); } } } if ( bMMulAboveCutoff && bPPygAboveCutoff ) { // check that there are at least 2 groups of species // and each group has at least 2 species in it. int nNumberOfGroupsWithAtLeast2Species = 0; for( int n = 0; n < 256; ++n ) { if ( nNumberOfSpeciesForBase[n] >= 2 ) { ++nNumberOfGroupsWithAtLeast2Species; } } if ( nNumberOfGroupsWithAtLeast2Species >= 2 ) { // we have found a location with at least 2 bases. // Each of these 2 bases is in common with at least // 2 different species, both with at least quality nQualityCutoff // Furthermore, MMul and PPyg are both above the cutoff. RWCString soPossible = "possible"; if ( cMMul != cPPyg && nNumberOfSpeciesForBase[cMMul] >= 2 && nNumberOfSpeciesForBase[cPPyg] >= 2 ) { soPossible = "impossible"; } fprintf( pAO, "not necessarily problem tree: " ); fprintf( pAO, "%s %d %s %d %s\n", soGetName().data(), // contig name nUnpaddedIndex( nConsPos3 ), ConsEd::pGetAssembly()->soGetAceFileName().data(), nQualityCutoff, soPossible.data() ); bFoundTreeWith2Groups = true; } } } // END--looking for informative trees } // for( int nConsPos3 = nLeftDiscrepantRegion; int nQualityDifference = nHighestQuality - nLowestQuality; fprintf( pAO, "qual difference: %d high: %d low: %d size: %d %d to %d %s\n", nQualityDifference, nHighestQuality, nLowestQuality, nSizeOfDiscrepantRegion, nUnpaddedIndex( nLeftDiscrepantRegion ), nUnpaddedIndex( nRightDiscrepantRegion ), soGetName().data() ); // START--code looking for ambiguously aligned regions // look for ambiguous alignments by seeing if a shift can // be done bool bAmbiguousOnLeft = false; bool bAmbiguousOnRight = false; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( ! ( pLocFrag->nGetHighQualityStart() <= nFlankingRegionLeft && nFlankingRegionRight <= pLocFrag->nGetHighQualityEnd() ) ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( ! ( pLocFrag->nGetAlignClipStart() <= nFlankingRegionLeft && nFlankingRegionRight <= pLocFrag->nGetAlignClipEnd() ) ) continue; RWCString soBases( (size_t) pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); for( int nConsPos4 = nFlankingRegionLeft; nConsPos4 <= ( nLeftDiscrepantRegion - 1); ++nConsPos4 ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos4 ).bIsPad() ) { soBases += pLocFrag->ntGetFragFromConsPos( nConsPos4 ).cGetBase(); } } // now see if there is any ambiguity of the alignment if ( soBases.length() > 2 ) { int n1Shorter = soBases.length() - 1; int n2Shorter = soBases.length() - 2; if ( soBases.soGetRestOfString( 1 ) == soBases( 0, n1Shorter ) ) { // problem--match like this: // aaaaaxxxaaaaa // aaaaaxxxaaaaa bAmbiguousOnLeft = true; } else if ( soBases.soGetRestOfString( 2 ) == soBases( 0, n2Shorter ) ) { // problem. Match like this: // aaaaaxxxaaaaa // aaaaaxxxaaaaa bAmbiguousOnLeft = true; } } soBases = ""; for( int nConsPos9 = nRightDiscrepantRegion + 1; nConsPos9 <= nFlankingRegionRight; ++nConsPos9 ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos9 ).bIsPad() ) { soBases += pLocFrag->ntGetFragFromConsPos( nConsPos9 ).cGetBase(); } } if ( soBases.length() > 2 ) { int n1Shorter = soBases.length() - 1; int n2Shorter = soBases.length() - 2; if ( soBases.soGetRestOfString( 1 ) == soBases( 0, n1Shorter ) ) { bAmbiguousOnRight = true; } else if ( soBases.soGetRestOfString( 2 ) == soBases( 0, n2Shorter ) ) { bAmbiguousOnRight = true; } } }// for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); if ( bAmbiguousOnLeft && bAmbiguousOnRight ) { fprintf( pAO, "ambiguous alignment left/right unpadded %d to %d %s\n", nUnpaddedIndex( nFlankingRegionLeft ), nUnpaddedIndex( nFlankingRegionRight ), soGetName().data() ); } // END--code looking for ambiguously aligned regions } // for( nConsPos = 1; nConsPos <= nGetEndConsensusIndex(); fprintf( pAO, "} AutoReportPrintDiscrepantRegionsAndBases\n" ); } static bool bIsCpG( const char cBaseA1, const char cBaseA2, const char cBaseA3, const char cBaseB1, const char cBaseB2, const char cBaseB3 ) { int nCpGALeft = cCpGMatrix[ cBaseA1 ][ cBaseA2 ]; int nCpGBLeft = cCpGMatrix[ cBaseB1 ][ cBaseB2 ]; if ( nCpGALeft && nCpGBLeft && ( nCpGALeft != nCpGBLeft ) ) return true; int nCpGARight = cCpGMatrix[ cBaseA2 ][ cBaseA3 ]; int nCpGBRight = cCpGMatrix[ cBaseB2 ][ cBaseB3 ]; if ( nCpGARight && nCpGBRight && ( nCpGARight != nCpGBRight ) ) return true; return false; } const int nWindowLow = 10; const int nWindowHigh = 15; void Contig :: printAgreeDisagreeBetweenPairsOfSpecies( autoReport* pAutoReport ) { double dErrors = 0; setErrorRateFromQuality(); int nDiscrepancies = 0; setCpGMatrix(); int nContigLengthPlusSentinel = nGetPaddedConsLength() + 1; RWTValVector aBaseA( (size_t) nContigLengthPlusSentinel, '?', "aBasesA" ); RWTValVector aBaseB( (size_t) nContigLengthPlusSentinel, '?', "aBasesB" ); RWTPtrVector aReadA( (size_t) nContigLengthPlusSentinel, NULL, "aReadA" ); RWTPtrVector aReadB( (size_t) nContigLengthPlusSentinel, NULL, "aReadB" ); RWTValVector aQualityA( (size_t) nContigLengthPlusSentinel, 0, "aQualityA" ); RWTValVector aQualityB( (size_t) nContigLengthPlusSentinel, 0, "aQualityB" ); for( int nSpeciesA = 0; nSpeciesA < pAutoReport->aArrayOfSpecies_.length() - 1; ++nSpeciesA ) { RWCString soSpeciesA = pAutoReport->aArrayOfSpecies_[ nSpeciesA ]; for( int nSpeciesB = nSpeciesA + 1; nSpeciesB < pAutoReport->aArrayOfSpecies_.length(); ++nSpeciesB ) { RWCString soSpeciesB = pAutoReport->aArrayOfSpecies_[ nSpeciesB ]; // clear arrays for( int n = 0; n < aBaseA.length(); ++n ) { aBaseA[n] = '?'; aBaseB[n] = '?'; aReadA[n] = NULL; aReadB[n] = NULL; aQualityA[n] = 0; aQualityB[n] = 0; } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // is this one of the species we are interested in? if ( !pLocFrag->soGetName().bContains( soSpeciesA ) && !pLocFrag->soGetName().bContains( soSpeciesB ) ) continue; // yes. // get range of high quality aligned sequence if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedHighQualityLeft = nGetStartConsensusIndex(); int nAlignedHighQualityRight = nGetEndConsensusIndex(); if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; RWTValVector* pQualityArray = 0; RWTPtrVector* pReadArray = 0; RWTValVector* pBaseArray = 0; if ( pLocFrag->soGetName().bContains( soSpeciesA ) ) { pQualityArray = &aQualityA; pReadArray = &aReadA; pBaseArray = &aBaseA; } else { pQualityArray = &aQualityB; pReadArray = &aReadB; pBaseArray = &aBaseB; } for( int nConsPos = nAlignedHighQualityLeft; nConsPos <= nAlignedHighQualityRight; ++nConsPos ) { if ( ! ( (*pReadArray)[ nConsPos ] ) ) { (*pReadArray)[ nConsPos ] = pLocFrag; (*pQualityArray)[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); (*pBaseArray)[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // already a read from the other strand if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != (*pBaseArray)[ nConsPos ] ) { // what to do? Let's pretend there was never a base // here on either strand--can't use the data (*pReadArray)[ nConsPos ] = 0; (*pQualityArray)[ nConsPos ] = 0; (*pBaseArray)[ nConsPos ] = '?'; } else { // so the bases agree. Since they are on opposite // strands, let's add the quality values. (*pQualityArray)[ nConsPos ] += pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); } } } // for( int nConsPos = nAlignedHighQualityLeft; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); // now we can do the analysis const int nMaxQualityPlus1 = 100; int nFrequencyOfAgreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; int nFrequencyOfDisagreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; int nFrequencyOfCpGDisagreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; // zero the arrays for( int n1 = 0; n1 < nMaxQualityPlus1; ++n1 ) { for( int n2 = 0; n2 < nMaxQualityPlus1; ++n2 ) { nFrequencyOfAgreeingBases[n1][n2] = 0; nFrequencyOfDisagreeingBases[n1][n2] = 0; nFrequencyOfCpGDisagreeingBases[n1][n2] = 0; } } bool bHaveAtLeastOneValue = false; // Let's look for a motif like this: // aaaaa?aaaaa // (5 agreeing bases, a base pair not pad, 5 agreeing bases) for( int nConsPos = nGetStartConsensusIndex() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1 ; nConsPos <= nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; ++nConsPos ) { bool bAgreeSoFar = true; // look to the left of the pair of aligned bases int nConsPos2 = nConsPos; // first -- will make it start // to the left int nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { --nConsPos2; if ( nConsPos2 < nGetStartConsensusIndex() ) { bAgreeSoFar = false; break; } // in the following case: // * // b // (a pad aligned against a base) // this is counted the same as a disagreeing pair of non-pads if ( ! ( aReadA[ nConsPos2 ] && aReadB[ nConsPos2 ] && ( aBaseA[ nConsPos2 ] == aBaseB[ nConsPos2 ] ) ) ) { bAgreeSoFar = false; break; } // if made it here, there are reads from both species // and they agree if ( aBaseA[ nConsPos2 ] != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // now look to the right of the pair of aligned bases nConsPos2 = nConsPos; // first ++ will make it start // to the right nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { ++nConsPos2; if ( nConsPos2 > nGetEndConsensusIndex() ) { bAgreeSoFar = false; break; } if ( ! ( aReadA[ nConsPos2 ] && aReadB[ nConsPos2 ] && ( aBaseA[ nConsPos2 ] == aBaseB[ nConsPos2 ] ) ) ) { bAgreeSoFar = false; break; } // if made it here, there are reads from both species // and they agree if ( aBaseA[ nConsPos2] != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // we have found such a motif. // how could this fail( below) ? (4/29/05 DG) // i.e. how could there a read from species A and B at nConsPos? // It could fail if there are 2 reads from species A and // they disagree with each other and there is no way to // reconcile them. // should also add that neither is a pad. // We are not interested in either of these cases: // bbbbb*bbbbb // bbbbbbbbbbb // ^ // We aren't interested in this case because such an indel // has a different probability (hence branch length) than // a substitution mutation. // // bbbbb*bbbbb // bbbbb*bbbbb // ^ // We aren't interested in this case because this isn't // either a mutation or a non-mutation. It is simply an // artifact of our aligning. if ( aReadA[ nConsPos ] && aReadB[ nConsPos ] ) { const bool bDisallowIndels = true; if ( bDisallowIndels ) { if ( aBaseA[ nConsPos ] == '*' || aBaseB[ nConsPos ] == '*' ) continue; } else { if ( aBaseA[ nConsPos ] == '*' && aBaseB[ nConsPos ] == '*' ) continue; } bHaveAtLeastOneValue = true; if ( aBaseA[ nConsPos ] == aBaseB[ nConsPos ] ) { ++nFrequencyOfAgreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; } else { ++nFrequencyOfDisagreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; if ( bIsCpG( aBaseA[ nConsPos - 1 ], aBaseA[ nConsPos ], aBaseA[ nConsPos + 1 ], aBaseB[ nConsPos - 1 ], aBaseB[ nConsPos ], aBaseB[ nConsPos + 1 ] ) ) { ++nFrequencyOfCpGDisagreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; } } // testing if ( soSpeciesA == "PPan" && soSpeciesB == "PTro" && nWindowLow <= aQualityA[ nConsPos ] && nWindowLow <= aQualityB[ nConsPos ] && ! ( nWindowHigh <= aQualityA[ nConsPos ] && nWindowHigh <= aQualityB[ nConsPos ] ) ) { dErrors += ( dErrorRateFromQuality[ aQualityA[ nConsPos ] ] * (1.0 - dErrorRateFromQuality[ aQualityB[ nConsPos ] ] )); dErrors += ( dErrorRateFromQuality[ aQualityB[ nConsPos ] ] * ( 1.0 - dErrorRateFromQuality[ aQualityA[ nConsPos ]] )); dErrors += 2.0 / 3.0 * dErrorRateFromQuality[ aQualityA[ nConsPos ] ] * dErrorRateFromQuality[ aQualityB[ nConsPos ] ]; if ( aBaseA[ nConsPos ] != aBaseB[ nConsPos ] ) ++nDiscrepancies; fprintf( pAO, "nav %s %s %d %c %c %d %d %.4g %d %s\n", soSpeciesA.data(), soSpeciesB.data(), nUnpaddedIndex( nConsPos ), aBaseA[nConsPos], aBaseB[nConsPos], aQualityA[ nConsPos ], aQualityB[ nConsPos ], dErrors, nDiscrepancies, soGetName().data() ); } // end testing } } if ( bHaveAtLeastOneValue ) { // now print out these arrays fprintf( pAO, "Agreeing and Disagreeing Bases %s %s {\n", soSpeciesA.data(), soSpeciesB.data() ); for( int nQualityA = 0; nQualityA < 100; ++nQualityA ) { for( int nQualityB = 0; nQualityB < 100; ++nQualityB ) { if ( nFrequencyOfAgreeingBases[nQualityA][nQualityB] || nFrequencyOfDisagreeingBases[nQualityA][nQualityB] ) { fprintf( pAO, "%d %d %d %d %d\n", nQualityA, nQualityB, nFrequencyOfAgreeingBases[nQualityA][nQualityB], nFrequencyOfDisagreeingBases[nQualityA][nQualityB], nFrequencyOfCpGDisagreeingBases[nQualityA][nQualityB]); } } } fprintf( pAO, "} Agreeing and Disagreeing Bases %s %s\n", soSpeciesA.data(), soSpeciesB.data() ); } // bHaveAtLeastOneValue } // for( int nSpeciesB = nSpeciesA + 1; } // for( int nSpeciesA = 0; nSpeciesA < } void Contig :: calculateErrorProbabilitiesByComparingPTroPPan( autoReport* pAutoReport ) { setErrorRateFromQuality(); int nContigLengthPlusOne = nGetPaddedConsLength() + 1; RWTValVector aBaseOfHighQualityRead( (size_t) nContigLengthPlusOne, '?', "aBaseOfHighQualityReads" ); RWTPtrVector aHighQualityRead( (size_t) nContigLengthPlusOne, NULL, "aHighQualityRead" ); RWTValVector aQualityOfHighQualityRead( (size_t) nContigLengthPlusOne, 0, "aQualityOfHighQualityRead" ); RWTValVector aAmbiguousBase( (size_t) nContigLengthPlusOne, false, "aAmbiguousBase" ); int nTotalDiscrepancies = 0; int nTotalBasesFlanked = 0; double dTotalPredictedMeanErrors = 0.0; const int nQualityCutoff = 45; int nRead; for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // is this one of the species we are interested in? if ( !pLocFrag->soGetName().bContains( "PPan" ) && !pLocFrag->soGetName().bContains( "PTro" ) ) continue; // get range of high quality aligned sequence if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedHighQualityLeft = nGetStartConsensusIndex(); int nAlignedHighQualityRight = nGetEndConsensusIndex(); if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; for( int nConsPos = nAlignedHighQualityLeft; nConsPos <= nAlignedHighQualityRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() >= nQualityCutoff ) { if ( !aHighQualityRead[ nConsPos ] ) { aHighQualityRead[nConsPos ] = pLocFrag; aQualityOfHighQualityRead[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); aBaseOfHighQualityRead[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { // there is already a base there. What is it? if ( aBaseOfHighQualityRead[ nConsPos ] != pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { aAmbiguousBase[ nConsPos ] = true; } } } } // for( int nConsPos = nAlignedHighQualityLeft; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig() for( nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // is this one of the species we are interested in? if ( !pLocFrag->soGetName().bContains( "PPan" ) && !pLocFrag->soGetName().bContains( "PTro" ) ) continue; // get range of high quality aligned sequence if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedHighQualityLeft = nGetStartConsensusIndex(); int nAlignedHighQualityRight = nGetEndConsensusIndex(); if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; for( int nConsPos = nAlignedHighQualityLeft; nConsPos <= nAlignedHighQualityRight; ++nConsPos ) { if ( !aHighQualityRead[ nConsPos ] ) continue; if ( aHighQualityRead[ nConsPos ] == pLocFrag ) continue; if ( aAmbiguousBase[ nConsPos ] ) continue; if ( aBaseOfHighQualityRead[ nConsPos ] == '*' ) continue; // testing--restrict to low quality reads in the nWindowLow to // nWindowHigh - 1 // window if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() < nWindowLow ) continue; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() >= nWindowHigh && aQualityOfHighQualityRead[ nConsPos ] >= nWindowHigh ) continue; // end testing // now look left and right and see if there are flanking agreeing bases // from the 2 reads LocatedFragment* pHighQualityRead = aHighQualityRead[ nConsPos ]; if ( pHighQualityRead->bIsWholeReadLowQuality() ) continue; if ( pHighQualityRead->bIsWholeReadUnaligned() ) continue; int nAlignedHighQualityLeft2 = nGetStartConsensusIndex(); int nAlignedHighQualityRight2 = nGetEndConsensusIndex(); if ( !bIntersect( nAlignedHighQualityLeft2, nAlignedHighQualityRight2, pHighQualityRead->nGetAlignClipStart(), pHighQualityRead->nGetAlignClipEnd(), nAlignedHighQualityLeft2, nAlignedHighQualityRight2 ) ) continue; if ( !bIntersect( nAlignedHighQualityLeft2, nAlignedHighQualityRight2, pHighQualityRead->nGetHighQualityStart(), pHighQualityRead->nGetHighQualityEnd(), nAlignedHighQualityLeft2, nAlignedHighQualityRight2 ) ) continue; int nAlignedHighQualityLeftBoth; int nAlignedHighQualityRightBoth; assert( bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, nAlignedHighQualityLeft2, nAlignedHighQualityRight2, nAlignedHighQualityLeftBoth, nAlignedHighQualityRightBoth ) ); bool bAgreeSoFar = true; int nConsPos2 = nConsPos; // first -- will make it start to the left int nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { --nConsPos2; if ( nConsPos2 < nAlignedHighQualityLeftBoth ) { bAgreeSoFar = false; break; } // the following case: // * // b // (a pad aligned against a base) // this is counted the same as a disagreeing pair of non-pads char cBaseOfHighQualityRead = aBaseOfHighQualityRead[ nConsPos2 ]; char cBase2 = pLocFrag->ntGetFragFromConsPos( nConsPos2 ).cGetBase(); if ( cBaseOfHighQualityRead != cBase2 ) { bAgreeSoFar = false; break; } // since the bases are equal, this checks that neither is a pad if ( cBaseOfHighQualityRead != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // now look to the right nConsPos2 = nConsPos; // first ++ will make it start to the right nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { ++nConsPos2; if ( nConsPos2 > nAlignedHighQualityRightBoth ) { // like this: // B---- (end of hqs aligned part of read) // B------- // ^ base considered // Thus not flanked on right by enough agreeing bases. bAgreeSoFar = false; break; } char cBaseOfHighQualityRead = aBaseOfHighQualityRead[ nConsPos2 ]; char cBase2 = pLocFrag->ntGetFragFromConsPos( nConsPos2 ).cGetBase(); if ( cBaseOfHighQualityRead != cBase2 ) { bAgreeSoFar = false; break; } if ( cBaseOfHighQualityRead != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // so we've found a base that lies within a motif like this: // -----B----- // -----B----- where the '-----' are agreeing bases // the reads are pLocFrag and pHighQualityRead ++nTotalBasesFlanked; if ( aBaseOfHighQualityRead[ nConsPos ] != pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { ++nTotalDiscrepancies; } // calculate error probabilities double dErr1 = dErrorRateFromQuality[ pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ]; double dErr2 = dErrorRateFromQuality[ pHighQualityRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ]; double dProbabilityOfADiscrepancy = dErr1 * ( 1.0 - dErr2 ) + dErr2 * ( 1.0 - dErr1 ) + 2.0/3.0 * dErr1 * dErr2; dTotalPredictedMeanErrors += dProbabilityOfADiscrepancy; fprintf( pAO, "nav: %s %d %s %d %s %d %d %d\n", soGetName().data(), nUnpaddedIndex( nConsPos ), pHighQualityRead->soGetName().data(), pHighQualityRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), pLocFrag->soGetName().data(), pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), nTotalDiscrepancies, nTotalBasesFlanked ); } // for( int nConsPos = nAlignedHighQualityLeft; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig() fprintf( pAO, "bases: %d discrepancies: %d predicted: %.6f\n", nTotalBasesFlanked, nTotalDiscrepancies, dTotalPredictedMeanErrors ); } void Contig :: printAgreeDisagreeBetweenPairsOfSpecies2( autoReport* pAutoReport ) { double dErrors = 0; setErrorRateFromQuality(); int nDiscrepancies = 0; setCpGMatrix(); int nContigLengthPlusSentinel = nGetPaddedConsLength() + 1; RWTValVector aBaseA( (size_t) nContigLengthPlusSentinel, '?', "aBasesA" ); RWTValVector aBaseB( (size_t) nContigLengthPlusSentinel, '?', "aBasesB" ); RWTPtrVector aReadA( (size_t) nContigLengthPlusSentinel, NULL, "aReadA" ); RWTPtrVector aReadB( (size_t) nContigLengthPlusSentinel, NULL, "aReadB" ); RWTValVector aQualityA( (size_t) nContigLengthPlusSentinel, 0, "aQualityA" ); RWTValVector aQualityB( (size_t) nContigLengthPlusSentinel, 0, "aQualityB" ); for( int nSpeciesA = 0; nSpeciesA < pAutoReport->aArrayOfSpecies_.length() - 1; ++nSpeciesA ) { RWCString soSpeciesA = pAutoReport->aArrayOfSpecies_[ nSpeciesA ]; for( int nSpeciesB = nSpeciesA + 1; nSpeciesB < pAutoReport->aArrayOfSpecies_.length(); ++nSpeciesB ) { RWCString soSpeciesB = pAutoReport->aArrayOfSpecies_[ nSpeciesB ]; // now we can do the analysis const int nMaxQualityPlus1 = 100; int nFrequencyOfAgreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; int nFrequencyOfDisagreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; int nFrequencyOfCpGDisagreeingBases[nMaxQualityPlus1][nMaxQualityPlus1]; // zero the arrays for( int n1 = 0; n1 < nMaxQualityPlus1; ++n1 ) { for( int n2 = 0; n2 < nMaxQualityPlus1; ++n2 ) { nFrequencyOfAgreeingBases[n1][n2] = 0; nFrequencyOfDisagreeingBases[n1][n2] = 0; nFrequencyOfCpGDisagreeingBases[n1][n2] = 0; } } bool bHaveAtLeastOneValue = false; for( int nBothStrandsAreWhichStrand = 0; nBothStrandsAreWhichStrand <= 1; ++nBothStrandsAreWhichStrand ) { bool bTopStrandNotBottomStrand = true; if ( nBothStrandsAreWhichStrand == 1 ) bTopStrandNotBottomStrand = false; // clear arrays for( int n = 0; n < aBaseA.length(); ++n ) { aBaseA[n] = '?'; aBaseB[n] = '?'; aReadA[n] = NULL; aReadB[n] = NULL; aQualityA[n] = 0; aQualityB[n] = 0; } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->bComp() == bTopStrandNotBottomStrand ) continue; // is this one of the species we are interested in? if ( !pLocFrag->soGetName().bContains( soSpeciesA ) && !pLocFrag->soGetName().bContains( soSpeciesB ) ) continue; // yes. // get range of high quality aligned sequence if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; int nAlignedHighQualityLeft = nGetStartConsensusIndex(); int nAlignedHighQualityRight = nGetEndConsensusIndex(); if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; if ( !bIntersect( nAlignedHighQualityLeft, nAlignedHighQualityRight, pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), nAlignedHighQualityLeft, nAlignedHighQualityRight ) ) continue; RWTValVector* pQualityArray = 0; RWTPtrVector* pReadArray = 0; RWTValVector* pBaseArray = 0; if ( pLocFrag->soGetName().bContains( soSpeciesA ) ) { pQualityArray = &aQualityA; pReadArray = &aReadA; pBaseArray = &aBaseA; } else { pQualityArray = &aQualityB; pReadArray = &aReadB; pBaseArray = &aBaseB; } for( int nConsPos = nAlignedHighQualityLeft; nConsPos <= nAlignedHighQualityRight; ++nConsPos ) { if ( ! ( (*pReadArray)[ nConsPos ] ) ) { (*pReadArray)[ nConsPos ] = pLocFrag; (*pQualityArray)[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); (*pBaseArray)[ nConsPos ] = pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(); } else { cerr << "existing read is " << (*pReadArray)[ nConsPos ]->soGetName() << " and other read is " << pLocFrag->soGetName() << endl; assert( false ); // already a read from the other strand if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() != (*pBaseArray)[ nConsPos ] ) { // what to do? Let's pretend there was never a base // here on either strand--can't use the data (*pReadArray)[ nConsPos ] = 0; (*pQualityArray)[ nConsPos ] = 0; (*pBaseArray)[ nConsPos ] = '?'; } else { // so the bases agree. Since they are on opposite // strands, let's add the quality values. (*pQualityArray)[ nConsPos ] += pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); assert( false ); // this shouldn't happen // since different reads, same species, are // on different strands } } } // for( int nConsPos = nAlignedHighQualityLeft; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); // Let's look for a motif like this: // aaaaa?aaaaa // (5 agreeing bases, a base pair not pad, 5 agreeing bases) for( int nConsPos = nGetStartConsensusIndex() + pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - 1 ; nConsPos <= nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; ++nConsPos ) { bool bAgreeSoFar = true; // look to the left of the pair of aligned bases int nConsPos2 = nConsPos; // first -- will make it start // to the left int nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { --nConsPos2; if ( nConsPos2 < nGetStartConsensusIndex() ) { bAgreeSoFar = false; break; } // in the following case: // * // b // (a pad aligned against a base) // this is counted the same as a disagreeing pair of non-pads if ( ! ( aReadA[ nConsPos2 ] && aReadB[ nConsPos2 ] && ( aBaseA[ nConsPos2 ] == aBaseB[ nConsPos2 ] ) ) ) { bAgreeSoFar = false; break; } // if made it here, there are reads from both species // and they agree if ( aBaseA[ nConsPos2 ] != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // now look to the right of the pair of aligned bases nConsPos2 = nConsPos; // first ++ will make it start // to the right nNumberOfNonPadAgreeingBases = 0; while( nNumberOfNonPadAgreeingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) { ++nConsPos2; if ( nConsPos2 > nGetEndConsensusIndex() ) { bAgreeSoFar = false; break; } if ( ! ( aReadA[ nConsPos2 ] && aReadB[ nConsPos2 ] && ( aBaseA[ nConsPos2 ] == aBaseB[ nConsPos2 ] ) ) ) { bAgreeSoFar = false; break; } // if made it here, there are reads from both species // and they agree if ( aBaseA[ nConsPos2] != '*' ) { ++nNumberOfNonPadAgreeingBases; } } if ( !bAgreeSoFar ) continue; // we have found such a motif. // how could this fail( below) ? (4/29/05 DG) // i.e. how could there a read from species A and B at nConsPos? // It could fail if there are 2 reads from species A and // they disagree with each other and there is no way to // reconcile them. // should also add that neither is a pad. // We are not interested in either of these cases: // bbbbb*bbbbb // bbbbbbbbbbb // ^ // We aren't interested in this case because such an indel // has a different probability (hence branch length) than // a substitution mutation. // // bbbbb*bbbbb // bbbbb*bbbbb // ^ // We aren't interested in this case because this isn't // either a mutation or a non-mutation. It is simply an // artifact of our aligning. if ( aReadA[ nConsPos ] && aReadB[ nConsPos ] ) { const bool bDisallowIndels = true; if ( bDisallowIndels ) { if ( aBaseA[ nConsPos ] == '*' || aBaseB[ nConsPos ] == '*' ) continue; } else { if ( aBaseA[ nConsPos ] == '*' && aBaseB[ nConsPos ] == '*' ) continue; } bHaveAtLeastOneValue = true; if ( aBaseA[ nConsPos ] == aBaseB[ nConsPos ] ) { ++nFrequencyOfAgreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; } else { ++nFrequencyOfDisagreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; if ( bIsCpG( aBaseA[ nConsPos - 1 ], aBaseA[ nConsPos ], aBaseA[ nConsPos + 1 ], aBaseB[ nConsPos - 1 ], aBaseB[ nConsPos ], aBaseB[ nConsPos + 1 ] ) ) { ++nFrequencyOfCpGDisagreeingBases[ aQualityA[ nConsPos ] ][ aQualityB[ nConsPos ] ]; } } const int n45 = 45; if ( soSpeciesA == "PPan" && soSpeciesB == "PTro" && pCP->nAutoReportQualityWindowLow_ <= aQualityA[ nConsPos ] && pCP->nAutoReportQualityWindowLow_ <= aQualityB[ nConsPos ] && ( ! ( pCP->nAutoReportQualityWindowHigh_ <= aQualityA[ nConsPos ] && pCP->nAutoReportQualityWindowHigh_ <= aQualityB[ nConsPos ] ) ) && ( n45 <= aQualityA[ nConsPos ] || n45 <= aQualityB[ nConsPos ] ) ) { dErrors += ( dErrorRateFromQuality[ aQualityA[ nConsPos ] ] * (1.0 - dErrorRateFromQuality[ aQualityB[ nConsPos ] ] )); dErrors += ( dErrorRateFromQuality[ aQualityB[ nConsPos ] ] * ( 1.0 - dErrorRateFromQuality[ aQualityA[ nConsPos ]] )); dErrors += 2.0 / 3.0 * dErrorRateFromQuality[ aQualityA[ nConsPos ] ] * dErrorRateFromQuality[ aQualityB[ nConsPos ] ]; if ( aBaseA[ nConsPos ] != aBaseB[ nConsPos ] ) ++nDiscrepancies; fprintf( pAO, "nav %s %s %d %c %c %d %d %.4g %d %s\n", soSpeciesA.data(), soSpeciesB.data(), nUnpaddedIndex( nConsPos ), aBaseA[nConsPos], aBaseB[nConsPos], aQualityA[ nConsPos ], aQualityB[ nConsPos ], dErrors, nDiscrepancies, soGetName().data() ); } } // if ( aReadA[ nConsPos ] && aReadB[ nConsPos } // for( int nConsPos = nGetStartConsensusIndex() } // for( int nBothStrandsAreWhichStrand = 0; nBothStrandsAreWhichStrand <= 1; if ( bHaveAtLeastOneValue ) { // now print out these arrays fprintf( pAO, "Agreeing and Disagreeing Bases %s %s {\n", soSpeciesA.data(), soSpeciesB.data() ); for( int nQualityA = 0; nQualityA < 100; ++nQualityA ) { for( int nQualityB = 0; nQualityB < 100; ++nQualityB ) { if ( nFrequencyOfAgreeingBases[nQualityA][nQualityB] || nFrequencyOfDisagreeingBases[nQualityA][nQualityB] ) { fprintf( pAO, "%d %d %d %d %d\n", nQualityA, nQualityB, nFrequencyOfAgreeingBases[nQualityA][nQualityB], nFrequencyOfDisagreeingBases[nQualityA][nQualityB], nFrequencyOfCpGDisagreeingBases[nQualityA][nQualityB]); } } } fprintf( pAO, "} Agreeing and Disagreeing Bases %s %s\n", soSpeciesA.data(), soSpeciesB.data() ); } // bHaveAtLeastOneValue } // for( int nSpeciesB = nSpeciesA + 1; } // for( int nSpeciesA = 0; nSpeciesA < } void Contig :: areReadsCorrectlyAligned( autoReport* pAutoReport ) { for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) continue; if ( pLocFrag->soGetName().bEndsWith( "f" ) ) { if ( pLocFrag->bComp() ) { fprintf( pAO, "Wrong strand: %s is on bottom strand\n", pLocFrag->soGetName().data() ); } else { fprintf( pAO, "Right strand: %s is on top strand\n", pLocFrag->soGetName().data() ); } } else if ( pLocFrag->soGetName().bEndsWith( "r" ) ) { if ( !pLocFrag->bComp() ) { fprintf( pAO, "Wrong strand: %s is on top strand\n", pLocFrag->soGetName().data() ); } else { fprintf( pAO, "Right strand: %s is on bottom strand\n", pLocFrag->soGetName().data() ); } } else { assert( false ); } } } void Contig :: printLengthsOfUnalignedHighQualitySegments() { for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) continue; gotoList myGotoList; pLocFrag->addUnalignedHighQualityRegionsToList( &myGotoList, true ); // bIncludeTotallyUnalignedReads if ( myGotoList.nGetNumGotoItems() > 0 ) { // I can't see how the difference of 2 line segments could // have any more than 2 line segments assert( myGotoList.nGetNumGotoItems() <= 2 ); int nLengthOfUnalignedHighQualityRegion = 0; for( int nGotoItem = 0; nGotoItem < myGotoList.nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = myGotoList.pGetGotoItem( nGotoItem ); int nUnpaddedStart = pLocFrag->nConsPosToUnpaddedFragPos( pGotoItem->nGotoItemStart_ ); int nUnpaddedEnd = pLocFrag->nConsPosToUnpaddedFragPos( pGotoItem->nGotoItemEnd_ ); // we don't know which is larger, nUnpaddedEnd or nUnpaddedStart int nLengthOfOneRegion = ABS( nUnpaddedEnd - nUnpaddedStart ) + 1; nLengthOfUnalignedHighQualityRegion += nLengthOfOneRegion; // print this region fprintf( pAO, "unaligned_hqs_bases{\n" ); fprintf( pAO, ">%s cons pos %d-%d\n", pLocFrag->soGetName().data(), pGotoItem->nUnpaddedGotoItemStart_, pGotoItem->nUnpaddedGotoItemEnd_ ); int nBasesOnLine = 0; for( int nConsPos = pGotoItem->nGotoItemStart_; nConsPos <= pGotoItem->nGotoItemEnd_; ++nConsPos ) { if ( !pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { if ( nBasesOnLine == 50 ) { putc( '\n', pAO ); nBasesOnLine = 0; } putc( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase(), pAO ); ++nBasesOnLine; } } fprintf( pAO, "\n}unaligned_hqs_bases\n" ); } // for( int nGotoItem = 0; fprintf( pAO, "unaligned hqs bases: %d of read %s\n", nLengthOfUnalignedHighQualityRegion, pLocFrag->soGetName().data() ); } else { fprintf( pAO, "unaligned hqs bases: 0 of read %s\n", pLocFrag->soGetName().data() ); } } // we also need to look at reads that are totally unaligned // how can we even know about them? We must read the ../phd_dir DIR *pDirFile; struct dirent *pDirEntry; if ( ( pDirFile = opendir( ConsEd::pGetConsEd()->filGetPHDDir() )) == NULL ) { THROW_FILE_ERROR( ConsEd::pGetConsEd()->filGetPHDDir() ); } RWCRegexp regPattern( "\\.phd\\.[0-9]+$" ); RWTValOrderedVector aSingletPhdFiles; RWTValOrderedVector aSingletReads; while( ( pDirEntry = readdir( pDirFile ) ) != NULL ) { FileName filPHDFile( pDirEntry->d_name ); // could be "." or ".." if ( !filPHDFile.bContains( regPattern ) ) continue; RWCString soReadName = filPHDFile; // chop off the .phd.# soReadName( regPattern ) = ""; // is this read already in the assembly? if ( ConsEd::pGetAssembly()->pGetLocatedFragmentByName( soReadName ) ) continue; // if reached here, the read is not in the assembly aSingletPhdFiles.insert( filPHDFile ); aSingletReads.insert( soReadName ); } for( int nSinglet = 0; nSinglet < aSingletPhdFiles.length(); ++nSinglet ) { FileName filSinglet = aSingletPhdFiles[ nSinglet ]; FileName filPHDFullPath = ConsEd::pGetConsEd()->filGetPHDDir() + "/" + filSinglet; FILE* pFil = fopen( ( char*) filPHDFullPath.data(), "r" ); if ( !pFil ) { THROW_FILE_ERROR( filPHDFullPath ); } singletInfo mySingletInfo; readPHDAgainForHighQualitySegment( &mySingletInfo, pFil ); int nNumberOfHighQualityBases = mySingletInfo.nHighQualitySegmentEnd_ - mySingletInfo.nHighQualitySegmentStart_ + 1; if ( nNumberOfHighQualityBases == 1 ) nNumberOfHighQualityBases = 0; RWCString soReadName = aSingletReads[ nSinglet ]; fprintf( pAO, "unaligned hqs bases: %d of read %s (singlet)\n", nNumberOfHighQualityBases, soReadName.data() ); } } void Contig :: printLengthsOfAlignedSegments() { RWTValOrderedVector aSingletPhdFiles; RWTValOrderedVector aSingletReads; getAllSinglets( aSingletPhdFiles, aSingletReads ); for( int nSinglet = 0; nSinglet < aSingletReads.length(); ++nSinglet ) { RWCString soReadName = aSingletReads[ nSinglet ]; fprintf( pAO, "%s 0\n", soReadName.data() ); } for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) { fprintf( pAO, "%s 0\n", pLocFrag->soGetName().data() ); } else { int nConsPosLeft = pLocFrag->nGetAlignClipStart(); int nConsPosRight = pLocFrag->nGetAlignClipEnd(); int nUnpaddedReadLeft = pLocFrag->nConsPosToUnpaddedFragPos( nConsPosLeft ); int nUnpaddedReadRight = pLocFrag->nConsPosToUnpaddedFragPos( nConsPosRight ); int nAlignedLength = nUnpaddedReadRight - nUnpaddedReadLeft + 1; fprintf( pAO, "%s %d\n", pLocFrag->soGetName().data(), nAlignedLength ); } } } void Contig :: compareTopAndBottomStrands( autoReport* pAutoReport ) { // this will change when we delete columns int nPaddedContigLength = nGetPaddedConsLength(); mbtValVectorOfBool aSomeCombinedReadHasNoPad( nPaddedContigLength, 1, "Contig::aSomeCombinedReadHasNoPad" ); const char cSingleSignalUndetermined = 'u'; const char cSingleSignal = 's'; const char cMultipleSignal = 'm'; RWTPtrOrderedVector aListOfReadsInSingleSignalArrays( (size_t) 10 ); typedef MBTValOrderedOffsetVector typeArrayOfChar; RWTPtrOrderedVector aListOfSingleSignalArrays( (size_t) 10 ); int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } // the fake read for the 2nd alignment if ( pLocFrag->soGetName().bContains( "b." ) ) { continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; // save single signal arrays before deleting column of pads for( int n = 0; n <= 1; ++n ) { LocatedFragment* pLocFrag = NULL; if ( n == 0 ) pLocFrag = pForwardRead; else pLocFrag = pReverseRead; if ( !pLocFrag ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ); aListOfReadsInSingleSignalArrays.insert( pLocFrag ); aListOfSingleSignalArrays.insert( pSingleSignalArray ); } for( int nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); if ( !pHumanRead->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } // cases: use just forward // use just reverse // use both // use neither char cCase = ' '; LocatedFragment* pReadToUse = NULL; if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { // just so it isn't null: pReadToUse = pForwardRead; cCase = 'b'; // both } else { // will just use one read. Which is it? if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // I don't know what to do in this case so let's ignore // such data points cCase = 'n'; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { pReadToUse = pForwardRead; cCase = 'o'; } else { pReadToUse = pReverseRead; cCase = 'o'; } } } else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pForwardRead; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = 'o'; pReadToUse = pReverseRead; } else { cCase = 'n'; } // if both the human and primate are pads, // skip this base if ( cCase != 'n' && !pReadToUse->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { aSomeCombinedReadHasNoPad.setValue( nConsPos, true ); } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( nSpecies = 0; nSpecies < // remove columns in which all combined reads have a pad // first, do it to the single signal arrays. for( int nRead = 0; nRead < aListOfSingleSignalArrays.length(); ++nRead ) { MBTValOrderedOffsetVector* pSingleSignalArray = aListOfSingleSignalArrays[ nRead ]; for( int nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( !aSomeCombinedReadHasNoPad[ nConsPos ] ) { pSingleSignalArray->removeAt( nConsPos ); } } } // for( int nRead = 0; nRead < aListOfReadsInSingleSignalArrays.length(); int nConsPos; for( nConsPos = nGetEndConsensusIndex(); nConsPos >= nGetStartConsensusIndex(); --nConsPos ) { if ( ! aSomeCombinedReadHasNoPad[ nConsPos ] ) { deleteColumn( nConsPos, false ); } } // let's save the assembly to see that everything went ok ConsEd::pGetAssembly()->saveToFile( "temp_no_columns.ace", false ); // bAceFormat1 // new padded length after deleting columns nPaddedContigLength = nGetPaddedConsLength(); mbtValVector aMinQualityInColumn( (size_t) nPaddedContigLength, 1, 666, "aMinQualityInColumn" ); const unsigned char ucPTro = 1; const unsigned char ucPPan = 2; const unsigned char ucGGor = 4; const unsigned char ucPPyg = 8; const unsigned char ucMMul = 16; mbtValVector aSpeciesInColumn( (size_t) nPaddedContigLength, 1, 0, "aSpeciesInColumn" ); mbtValVectorOfBool aDiscrepantPositions( nPaddedContigLength, 1, "Contig::aDiscrepantPositions" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgree( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndAgree" ); mbtValVectorOfBool aNeededSpeciesAreAlignedAndHQ( nPaddedContigLength, 1, "Contig::aNeededSpeciesAreAlignedAndHQ" ); mbtValVectorOfBool aSomeCombinedReadIsMultipleSignal( nPaddedContigLength, 1, "Contig::aSomeCombinedReadIsMultipleSignal" ); for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; unsigned char ucSpeciesMask; if ( soSpecies == "PTro" ) { ucSpeciesMask = ucPTro; } else if ( soSpecies == "PPan" ) { ucSpeciesMask = ucPPan; } else if ( soSpecies == "GGor" ) { ucSpeciesMask = ucGGor; } else if ( soSpecies == "PPyg" ) { ucSpeciesMask = ucPPyg; } else if ( soSpecies == "MMul" ) { ucSpeciesMask = ucMMul; } else { ucSpeciesMask = 0; } LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } // there should always be a human read. No, perhaps we are in // a contig that doesn't have one. In that case, ignore this contig. if (! pHumanRead ) return; MBTValOrderedOffsetVector* pForwardReadSingleSignalArray = NULL; MBTValOrderedOffsetVector* pReverseReadSingleSignalArray = NULL; if ( pForwardRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pForwardRead ); assert( nIndex != RW_NPOS ); pForwardReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } if ( pReverseRead ) { int nIndex = aListOfReadsInSingleSignalArrays.indexByPointers( pReverseRead ); assert( nIndex != RW_NPOS ); pReverseReadSingleSignalArray = aListOfSingleSignalArrays[ nIndex ]; } int nConsPos; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // the consensus and human read should be equal assert( pHumanRead->bIsInAlignedPartOfRead( nConsPos ) ); // cases: use just forward // use just reverse // use both // use neither const char cOneRead = 'o'; const char cBothReads = 'b'; const char cNoRead = 'n'; char cCase = ' '; LocatedFragment* pReadToUse = NULL; bool bSingleSignalOfBase = false; bool bForwardIsSingleSignal = false; if ( pForwardReadSingleSignalArray ) { bForwardIsSingleSignal = ( (*pForwardReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } bool bReverseIsSingleSignal = false; if ( pReverseReadSingleSignalArray ) { bReverseIsSingleSignal = ( (*pReverseReadSingleSignalArray)[ nConsPos ] == cSingleSignal ); } if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { bSingleSignalOfBase = bForwardIsSingleSignal || bReverseIsSingleSignal; if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) pReadToUse = pForwardRead; else pReadToUse = pReverseRead; cCase = cBothReads; // both } else { // will just use one read. Which is it? if ( bForwardIsSingleSignal == bReverseIsSingleSignal ) { bSingleSignalOfBase = bForwardIsSingleSignal; // choose by which read is higher quality if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() == pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { // no way to choose--don't use cCase = cNoRead; } else if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() > pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } else { // complicated case in which one is single signal // and one is multiple signal int nHigherQualityIsForward = (int) pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() - (int) pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nHigherQualityIsForward >= 0 && bForwardIsSingleSignal ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = true; } else if ( nHigherQualityIsForward < 0 && bReverseIsSingleSignal ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = true; } else { // conflict between quality and single signal // also conflict between base calls. This is the // most ambiguous case. if ( abs( nHigherQualityIsForward ) <= 5 ) { // qualities are close, so use whichever // is single signal if ( bForwardIsSingleSignal ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = true; } else { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = true; } } else { // qualities are not close, so use // whichever is higher quality fprintf( pAO, "conflict: %s %s and %s %s at conspos %d\n", pForwardRead->soGetName().data(), szPrintBool( bForwardIsSingleSignal ), pReverseRead->soGetName().data(), szPrintBool( bReverseIsSingleSignal ), nUnpaddedIndex( nConsPos ) ); bSingleSignalOfBase = false; if ( nHigherQualityIsForward > 0 ) { cCase = cOneRead; pReadToUse = pForwardRead; } else { cCase = cOneRead; pReadToUse = pReverseRead; } } } } // if ( bForwardIsSingleSignal == bReverseIsSingleSignal )else } // if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { else } // if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) && pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { else if ( pForwardRead && pForwardRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = cOneRead; pReadToUse = pForwardRead; bSingleSignalOfBase = bForwardIsSingleSignal; } else if ( pReverseRead && pReverseRead->bIsInAlignedPartOfRead( nConsPos ) ) { cCase = cOneRead; pReadToUse = pReverseRead; bSingleSignalOfBase = bReverseIsSingleSignal; } else { cCase = cNoRead; } if ( cCase != cNoRead ) { if ( pReadToUse->ntGetFragFromConsPos( nConsPos ).cGetBase() != pHumanRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { aDiscrepantPositions.setValue( nConsPos, true ); } aSpeciesInColumn[ nConsPos ] |= ucSpeciesMask; if ( !bSingleSignalOfBase ) aSomeCombinedReadIsMultipleSignal.setValue( nConsPos, true ); } // set aMinQualityInColumn if ( cCase == cOneRead ) { int nQuality = pReadToUse->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } else if ( cCase == cBothReads ) { int nQuality = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality() + pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality > 90 ) nQuality = 90; aMinQualityInColumn[ nConsPos ] = MIN( aMinQualityInColumn[ nConsPos ], nQuality ); } if ( bDebug ) { cerr << soSpecies << " case: " << cCase << " species " << (unsigned int) aSpeciesInColumn[ nConsPos ] << endl; } } // for( int nConsPos = nGetStartConsensusIndex(); } // for( int nSpecies = 0; nSpecies < const int nThresholdQuality = 20; int nNumberOfColumnsAboveThreshold = 0; int nNumberOfColumnsWithNeededSpecies = 0; int nNumberOfColumnsWithNeededSpeciesAndQuality = 0; int nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal = 0; unsigned char ucOKMask = ucMMul | ucPPyg | ucGGor; unsigned char ucChimpMask = ucPPan | ucPTro; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { // need to check with Phil if deletion in human is // of interest in making trees. // if ( ntGetCons( nConsPos ).bIsPad() ) { // continue; // } bool bColumnAboveThreshold = false; if ( aMinQualityInColumn[ nConsPos ] != 666 && aMinQualityInColumn[ nConsPos ] >= nThresholdQuality ) { bColumnAboveThreshold = true; ++nNumberOfColumnsAboveThreshold; } bool bColumnWithNeededSpecies = false; if ( ( aSpeciesInColumn[ nConsPos ] & ucOKMask ) == ucOKMask ) { // now check that one of chimps is also present if ( aSpeciesInColumn[ nConsPos ] & ucChimpMask ) { bColumnWithNeededSpecies = true; ++nNumberOfColumnsWithNeededSpecies; // if there are no discrepancies in this column, then // it is an acceptable aligned column: Notice that there // are no quality conditions on the flanking bases. if ( !aDiscrepantPositions[ nConsPos ] ) { aNeededSpeciesAreAlignedAndAgree.setValue( nConsPos, true ); } } } if ( bColumnAboveThreshold && bColumnWithNeededSpecies ) { ++nNumberOfColumnsWithNeededSpeciesAndQuality; if ( !aSomeCombinedReadIsMultipleSignal[ nConsPos ] ) { ++nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal; } aNeededSpeciesAreAlignedAndHQ.setValue( nConsPos, true ); } } // now let's see about the quality of the alignment and see // how many places have a certain alignment // mbtValVectorOfBool aNeededSpeciesAreAlignedAndAgreeSeveralColumns( // nPaddedContigLength, // 1, // "Contig::NeededSpeciesAreAlignedAndAgreeSeveralColumn" ); mbtValVectorOfBool aColumnsFlanked( nPaddedContigLength, 1, "Contig:aColumnsFlanked" ); for( int nSizeOfDiscrepantRegion = 1; nSizeOfDiscrepantRegion <= 3; ++nSizeOfDiscrepantRegion ) { // nConsPosDiscrepantRegionLeft for( int nConsPosDiscrepantRegionLeft = pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ + 1; // e.g., if consensus is from 1 to 11, then 11 - 5 = 6 which is // the last position to check nConsPosDiscrepantRegionLeft <= ( nGetEndConsensusIndex() - pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ - nSizeOfDiscrepantRegion + 1); ++nConsPosDiscrepantRegionLeft ) { // check the left flank int nNumberOfFlankingBases = 0; bool bRegionOK = true; int nConsPos2; for( nConsPos2 = nConsPosDiscrepantRegionLeft - 1; bRegionOK && ( nConsPos2 >= nGetStartConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); --nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // a little check for the left edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; // check the right flank nNumberOfFlankingBases = 0; for( nConsPos2 = nConsPosDiscrepantRegionLeft + nSizeOfDiscrepantRegion; bRegionOK && ( nConsPos2 <= nGetEndConsensusIndex() ) && ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ); ++nConsPos2 ) { bRegionOK = aNeededSpeciesAreAlignedAndAgree[ nConsPos2 ]; ++nNumberOfFlankingBases; } // another little check for the right edge of the consensus if ( nNumberOfFlankingBases < pCP->nAutoReportMinNumberOfPerfectlyAlignedBasesBeforeDiscrepancy_ ) bRegionOK = false; if ( bRegionOK ) { for( nConsPos2 = nConsPosDiscrepantRegionLeft; nConsPos2 <= nConsPosDiscrepantRegionLeft + nSizeOfDiscrepantRegion - 1; ++nConsPos2 ) { aColumnsFlanked.setValue( nConsPos2, true ); } } } } // now count how many columns there are that passed the alignment check: int nNumberOfColumnsFlanked = 0; int nNumberOfColumnsFlankedAndHQ = 0; int nNumberOfColumnsFlankedHQSingleSignal = 0; int nNumberOfColumnsSingleSignalAllSpeciesHQ = 0; for( nConsPos = nGetStartConsensusIndex(); nConsPos <= nGetEndConsensusIndex(); ++nConsPos ) { if ( aNeededSpeciesAreAlignedAndHQ[ nConsPos ] && !aSomeCombinedReadIsMultipleSignal[ nConsPos ] ) { ++nNumberOfColumnsSingleSignalAllSpeciesHQ; } if ( aColumnsFlanked[ nConsPos ] ) { ++nNumberOfColumnsFlanked; if ( aNeededSpeciesAreAlignedAndHQ[ nConsPos ] ) { ++nNumberOfColumnsFlankedAndHQ; if ( !aSomeCombinedReadIsMultipleSignal[ nConsPos ] ) { ++nNumberOfColumnsFlankedHQSingleSignal; } } } } fprintf( pAO, "totals: quality threshold: %d columns above quality: %d columns with species: %d columns with both: %d also single: %d flanked: %d flanked and hq: %d flanked_hq_single: %d\n", nThresholdQuality, nNumberOfColumnsAboveThreshold, nNumberOfColumnsWithNeededSpecies, nNumberOfColumnsWithNeededSpeciesAndQuality, nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal, nNumberOfColumnsFlanked, nNumberOfColumnsFlankedAndHQ, nNumberOfColumnsFlankedHQSingleSignal ); assert( nNumberOfColumnsSingleSignalAllSpeciesHQ == nNumberOfColumnsWithNeededSpeciesAndQualityAndSingleSignal ); } void Contig :: compareTopAndBottomStrandsWithHuman( autoReport* pAutoReport ) { // get 2-D arrays of quality and relative area bin giving # of bases // 2 of these 2-D arrays--one for bases agreeing with human and 1 for // bases disagreeing with human const int nNumberOfQualityValues = 99; const int nNumberOfRelativeAreaBins = 13; const int nTopRelativeAreaBin = nNumberOfRelativeAreaBins - 1; typedef RWTValVector arrayOfInt; RWTPtrVector ppArrayOfRelativeAreasAgreeWithHuman( (size_t) ( nNumberOfQualityValues + 1 ), NULL, "ppArrayOfRelativeAreasAgreeWithHuman" ); RWTPtrVector ppArrayOfRelativeAreasDisagreeWithHuman( (size_t) ( nNumberOfQualityValues + 1 ), NULL, "ppArrayOfRelativeAreasDisagreeWithHuman" ); int nQuality; for( nQuality = 0; nQuality <= nNumberOfQualityValues; ++nQuality ) { RWCString soNameAgree = "ppArrayOfRelativeAreasAgreeWithHuman["; soNameAgree += RWCString( (long) nQuality ); soNameAgree += "]"; ppArrayOfRelativeAreasAgreeWithHuman[ nQuality ] = new RWTValVector( (size_t) nNumberOfRelativeAreaBins, 0, soNameAgree ); RWCString soNameDisagree = "ppArrayOfRelativeAreasDisagreeWithHuman["; soNameDisagree += RWCString( (long) nQuality ); soNameDisagree += "]"; ppArrayOfRelativeAreasDisagreeWithHuman[ nQuality ] = new RWTValVector( (size_t) nNumberOfRelativeAreaBins, 0, soNameDisagree ); } // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; bool bSpeciesHasBadRead = false; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains("HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { cerr << "ignoring read " << pLocFrag->soGetName() << endl; continue; } if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) { bSpeciesHasBadRead = true; break; } if ( pLocFrag->soGetName().bEndsWith(".f" )) { pForwardRead = pLocFrag; } else if ( pLocFrag->soGetName().bEndsWith( ".r" ) ) { pReverseRead = pLocFrag; } else { assert( false ); } } if ( bSpeciesHasBadRead ) continue; // can't use species if all reads aren't there if ( !pForwardRead || !pReverseRead || !pHumanRead ) continue; // now get the relative area arrays for both reads mbtValVector* pRelativeAreaOfUncalledBasesForwardRead = NULL; mbtValVector* pRelativeAreaOfUncalledBasesReverseRead = NULL; assert( pForwardRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesForwardRead ) ); assert( pReverseRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesReverseRead ) ); if ( pForwardRead->bIsWholeReadUnaligned() || pReverseRead->bIsWholeReadUnaligned() ) continue; int nConsPosLeft; int nConsPosRight; // just fixed if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nConsPosLeft, nConsPosRight ) ) continue; for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { // we are only interested in cases in which the reads do not agree if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) continue; // since they disagree with each other, at least 1 must disagree // with human. // since they disagree with each other, at least 1 must not be // a pad. // we are only interested in case in which higher quality read // has larger relative area int nQualityForward = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); int nQualityReverse = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); int nOrientedUnpaddedReadPosOfForwardRead = pForwardRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); int nOrientedUnpaddedReadPosOfReverseRead = pReverseRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); float fForwardRelativeArea = (*pRelativeAreaOfUncalledBasesForwardRead)[ nOrientedUnpaddedReadPosOfForwardRead ]; float fReverseRelativeArea = (*pRelativeAreaOfUncalledBasesReverseRead)[ nOrientedUnpaddedReadPosOfReverseRead ]; if ( nQualityForward >= nQualityReverse && fForwardRelativeArea < fReverseRelativeArea ) continue; if ( nQualityForward <= nQualityReverse && fForwardRelativeArea > fReverseRelativeArea ) continue; // we are left with the ambiguous case where quality and // relative area conflict // which bin will each fall into? int nForwardReadRelativeAreaIndex = -1; if ( fForwardRelativeArea == -1.0 ) nForwardReadRelativeAreaIndex = 0; else { for( int nBin = 1; nBin <= ( nTopRelativeAreaBin - 1 ); ++nBin ) { float fRelativeAreaBottomOfBin = ( nBin - 1 ) / 10.0; float fRelativeAreaTopOfBin = nBin / 10.0; if ( fRelativeAreaBottomOfBin <= fForwardRelativeArea && fForwardRelativeArea < fRelativeAreaTopOfBin ) { nForwardReadRelativeAreaIndex = nBin; } } if ( nForwardReadRelativeAreaIndex == -1 ) nForwardReadRelativeAreaIndex = nTopRelativeAreaBin; } // which bin will each fall into? int nReverseReadRelativeAreaIndex = -1; if ( fReverseRelativeArea == -1.0 ) nReverseReadRelativeAreaIndex = 0; else { for( int nBin = 1; nBin <= ( nTopRelativeAreaBin - 1 ); ++nBin ) { float fRelativeAreaBottomOfBin = ( nBin - 1 ) / 10.0; float fRelativeAreaTopOfBin = nBin / 10.0; if ( fRelativeAreaBottomOfBin <= fReverseRelativeArea && fReverseRelativeArea < fRelativeAreaTopOfBin ) { nReverseReadRelativeAreaIndex = nBin; } } if ( nReverseReadRelativeAreaIndex == -1 ) nReverseReadRelativeAreaIndex = nTopRelativeAreaBin; } for( int nForwardOrReverse = 0; nForwardOrReverse <= 1; ++nForwardOrReverse ) { int nQuality = -1; int nRelativeAreaIndex = -1; LocatedFragment* pLocFrag2 = NULL; if ( nForwardOrReverse == 0 ) { nQuality = nQualityForward; nRelativeAreaIndex = nForwardReadRelativeAreaIndex; pLocFrag2 = pForwardRead; } else { nQuality = nQualityReverse; nRelativeAreaIndex = nReverseReadRelativeAreaIndex; pLocFrag2 = pReverseRead; } RWTValVector* pArrayOfRelativeAreas = NULL; if ( ntGetCons( nConsPos ).cGetBase() == pLocFrag2->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { // agree with human case pArrayOfRelativeAreas = ppArrayOfRelativeAreasAgreeWithHuman[ nQuality ]; } else { // disagree with human case pArrayOfRelativeAreas = ppArrayOfRelativeAreasDisagreeWithHuman[ nQuality ]; } ++( (*pArrayOfRelativeAreas)[ nRelativeAreaIndex ] ); } // for( int nForwardOrReverse = 0; } // for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { delete pRelativeAreaOfUncalledBasesForwardRead; delete pRelativeAreaOfUncalledBasesReverseRead; } // for( nSpecies = 0; nSpecies < fprintf( pAO, "Relative Areas {\n" ); for( nQuality = 0; nQuality <= nNumberOfQualityValues; ++nQuality ) { fprintf( pAO, "%2d", nQuality ); RWTValVector* pArrayOfAgreeWithHumanRelativeAreas = ppArrayOfRelativeAreasAgreeWithHuman[ nQuality ]; RWTValVector* pArrayOfDisagreeWithHumanRelativeAreas = ppArrayOfRelativeAreasDisagreeWithHuman[ nQuality ]; for( int nRelativeAreaBin = 0; nRelativeAreaBin <= nTopRelativeAreaBin; ++nRelativeAreaBin ) { fprintf( pAO, " %d %d", (*pArrayOfAgreeWithHumanRelativeAreas)[ nRelativeAreaBin ], (*pArrayOfDisagreeWithHumanRelativeAreas)[ nRelativeAreaBin ] ); } fprintf( pAO, "\n" ); } fprintf( pAO, "} Relative Areas\n" ); } // void Contig :: compareTopAndBottomStrandsWithHuman( autoReport* pAutoReport ) { void Contig :: compareTopAndBottomStrands2( autoReport* pAutoReport ) { const int nNumberOfQualityValues = 99; const int nNumberOfRelativeAreaBins = 13; const int nTopRelativeAreaBin = nNumberOfRelativeAreaBins - 1; // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); fprintf( pAO, "Relative Areas {\n" ); for( int nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies ]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; LocatedFragment* pHumanRead = NULL; bool bSpeciesHasBadRead = false; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains("HSap" ) ) { pHumanRead = pLocFrag; continue; } if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { cerr << "ignoring read " << pLocFrag->soGetName() << endl; continue; } if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) { bSpeciesHasBadRead = true; break; } if ( pLocFrag->soGetName().bEndsWith(".f" )) { pForwardRead = pLocFrag; } else if ( pLocFrag->soGetName().bEndsWith( ".r" ) ) { pReverseRead = pLocFrag; } else { assert( false ); } } if ( bSpeciesHasBadRead ) continue; // can't use species if all reads aren't there if ( !pForwardRead || !pReverseRead || !pHumanRead ) continue; // now get the relative area arrays for both reads mbtValVector* pRelativeAreaOfUncalledBasesForwardRead = NULL; mbtValVector* pRelativeAreaOfUncalledBasesReverseRead = NULL; assert( pForwardRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesForwardRead ) ); assert( pReverseRead->bGetSingleSignalRelativeAreas( pRelativeAreaOfUncalledBasesReverseRead ) ); if ( pForwardRead->bIsWholeReadUnaligned() || pReverseRead->bIsWholeReadUnaligned() ) continue; int nConsPosLeft; int nConsPosRight; if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nConsPosLeft, nConsPosRight ) ) continue; for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { // we are only interested in cases in which the reads do not agree if ( pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ) continue; // since they disagree with each other, at least 1 must disagree // with human. // since they disagree with each other, at least 1 must not be // a pad. // we are only interested in case in which higher quality read // has larger relative area int nQualityForward = pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); int nQualityReverse = pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); int nOrientedUnpaddedReadPosOfForwardRead = pForwardRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); int nOrientedUnpaddedReadPosOfReverseRead = pReverseRead->nOrientedUnpaddedFragPosFromConsPos( nConsPos ); float fForwardRelativeArea = (*pRelativeAreaOfUncalledBasesForwardRead)[ nOrientedUnpaddedReadPosOfForwardRead ]; float fReverseRelativeArea = (*pRelativeAreaOfUncalledBasesReverseRead)[ nOrientedUnpaddedReadPosOfReverseRead ]; if ( nQualityForward >= nQualityReverse && fForwardRelativeArea < fReverseRelativeArea ) continue; if ( nQualityForward <= nQualityReverse && fForwardRelativeArea > fReverseRelativeArea ) continue; char cHumanBase = ntGetCons( nConsPos ).cGetBase(); bool bForwardReadAgreesWithHuman = ( cHumanBase == pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ); bool bReverseReadAgreesWithHuman = ( cHumanBase == pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase() ); int nQualityOfHighQualityRead = -1; int nQualityOfLowwQualityRead = -1; float fRelativeAreaOfHighQualityRead = -666.0; float fRelativeAreaOfLowwQualityRead = -666.0; bool bHighQualityReadAgreesWithHuman; bool bLowwQualityReadAgreesWithHuman; if ( nQualityForward >= nQualityReverse ) { nQualityOfHighQualityRead = nQualityForward; nQualityOfLowwQualityRead = nQualityReverse; fRelativeAreaOfHighQualityRead = fForwardRelativeArea; fRelativeAreaOfLowwQualityRead = fReverseRelativeArea; bHighQualityReadAgreesWithHuman = bForwardReadAgreesWithHuman; bLowwQualityReadAgreesWithHuman = bReverseReadAgreesWithHuman; } else { nQualityOfHighQualityRead = nQualityReverse; nQualityOfLowwQualityRead = nQualityForward; fRelativeAreaOfHighQualityRead = fReverseRelativeArea; fRelativeAreaOfLowwQualityRead = fForwardRelativeArea; bHighQualityReadAgreesWithHuman = bReverseReadAgreesWithHuman; bLowwQualityReadAgreesWithHuman = bForwardReadAgreesWithHuman; } fprintf( pAO, "%d %d %.1f %.1f %d %d %d %s %s\n", nQualityOfHighQualityRead, nQualityOfLowwQualityRead, fRelativeAreaOfHighQualityRead, fRelativeAreaOfLowwQualityRead, ( bHighQualityReadAgreesWithHuman ? 1 : 0 ), ( bLowwQualityReadAgreesWithHuman ? 1 : 0 ), nUnpaddedIndex( nConsPos ), soSpecies.data(), soGetName().data() ); } // for( int nConsPos = nConsPosLeft; nConsPos <= nConsPosRight; ++nConsPos ) { delete pRelativeAreaOfUncalledBasesForwardRead; delete pRelativeAreaOfUncalledBasesReverseRead; } // for( nSpecies = 0; nSpecies < fprintf( pAO, "} Relative Areas\n" ); } // void Contig :: compareTopAndBottomStrandsWithHuman2( autoReport* pAutoReport ) { // void Contig :: singleSignalInfo( autoReport* pAutoReport ) { // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { // LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); // if ( pLocFrag->soGetName().bContains( "HSap" ) ) { // continue; // } // // fake read that has no chromat // if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { // continue; // } // mbtValVector* paSingleSignalArray = NULL; // pLocFrag->getSingleSignalBasesTemp( paSingleSignalArray ); // for( int n1ReadPos = paSingleSignalArray->nGetStartIndex(); // n1ReadPos <= paSingleSignalArray->nGetEndIndex(); // ++n1ReadPos ) { // if ( (*paSingleSignalArray)[ n1ReadPos ] ) { // // see if this base is quality 20 or better // int n1PaddedReadPos = // pLocFrag->nUnorientedPaddedFromOrientedUnpadded( n1ReadPos ); // if ( pLocFrag->ntideGet( n1PaddedReadPos ).qualGetQuality() >= // 20 ) { // // for now, also check if discrepant // int nConsPos = // pLocFrag->nGetConsPosFromFragIndex( n1PaddedReadPos ); // if ( ntGetCons( nConsPos ).cGetBase() != // pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ) { // fprintf( pAO, "%s %d\n", // pLocFrag->soGetName().data(), // n1ReadPos ); // } // } // } // } // } // } typedef RWTValVector arrayOfInt; void Contig :: singleSignalInfo( autoReport* pAutoReport ) { const int nNumberOfQualityValues = 99; const int nNumberOfRelativeAreaBins = 13; const int nTopRelativeAreaBin = nNumberOfRelativeAreaBins - 1; RWTPtrVector ppArrayOfRelativeAreasAgree( (size_t) ( nNumberOfQualityValues + 1), NULL, "ppArrayOfRelativeAreasAgree" ); RWTPtrVector ppArrayOfRelativeAreasDisagree( (size_t) ( nNumberOfQualityValues + 1), NULL, "ppArrayOfRelativeAreasDisagree" ); int nQuality; for( nQuality = 0; nQuality <= nNumberOfQualityValues; ++nQuality ) { RWCString soNameAgree = "ppArrayOfRelativeAreasAgree["; soNameAgree += RWCString( (long) nQuality ); soNameAgree += "]"; ppArrayOfRelativeAreasAgree[nQuality] = new RWTValVector( (size_t) nNumberOfRelativeAreaBins, 0, soNameAgree ); RWCString soNameDisagree = "ppArrayOfRelativeAreasDisagree["; soNameDisagree += RWCString( (long) nQuality ); soNameDisagree += "]"; ppArrayOfRelativeAreasDisagree[nQuality] = new RWTValVector( (size_t) nNumberOfRelativeAreaBins, 0, soNameDisagree ); } // read all of the good reads RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/goodReadsBothBatches.out"; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { continue; } // fake read that has no chromat if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { continue; } if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; mbtValVector* pArrayOfRelativeAreaOfUncalledBases = NULL; assert( pLocFrag->bGetSingleSignalRelativeAreas( pArrayOfRelativeAreaOfUncalledBases ) ); int nUnpaddedOrientedReadPosStart = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( pLocFrag->nGetStartUnclippedConsPos() ); int nUnpaddedOrientedReadPosEnd = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( pLocFrag->nGetEndUnclippedConsPos() ); int nIncrement = 1; if ( pLocFrag->bComp() ) nIncrement = -1; // get ready for first += nIncrement int nUnpaddedOrientedReadPos = nUnpaddedOrientedReadPosStart - nIncrement; for( int nConsPos = pLocFrag->nGetStartUnclippedConsPos(); nConsPos <= pLocFrag->nGetEndUnclippedConsPos(); ++nConsPos ) { // what is the oriented unpadded read pos? if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; nUnpaddedOrientedReadPos += nIncrement; int nQuality = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); // what is the relative area at this base? float fRelativeArea = (*pArrayOfRelativeAreaOfUncalledBases)[ nUnpaddedOrientedReadPos ]; // convert this to an index int nFoundIndex = -1; for( int nBin = 0; nBin <= ( nTopRelativeAreaBin - 1 ); ++nBin ) { float fRelativeAreaBottomOfBin = ( nBin - 1 ) / 10.0; float fRelativeAreaTopOfBin = nBin / 10.0; if ( fRelativeArea == -1 ) { nFoundIndex = 0; break; } else if ( fRelativeAreaBottomOfBin <= fRelativeArea && fRelativeArea < fRelativeAreaTopOfBin ) { nFoundIndex = nBin; break; } } if ( nFoundIndex == -1 ) { nFoundIndex = nTopRelativeAreaBin; } bool bAgree = ( ntGetCons( nConsPos ).cGetBase() == pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() ); RWTValVector* pArrayOfRelativeAreas = NULL; if ( bAgree ) { // agree case pArrayOfRelativeAreas = ppArrayOfRelativeAreasAgree[ nQuality ]; } else { // disagree case pArrayOfRelativeAreas = ppArrayOfRelativeAreasDisagree[ nQuality ]; } ++( (*pArrayOfRelativeAreas)[ nFoundIndex ] ); } delete pArrayOfRelativeAreaOfUncalledBases; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) fprintf( pAO, "Relative Areas {\n" ); for( nQuality = 0; nQuality <= 99; ++nQuality ) { fprintf( pAO, "%2d", nQuality ); RWTValVector* pArrayOfAgreeRelativeAreas = ppArrayOfRelativeAreasAgree[ nQuality ]; RWTValVector* pArrayOfDisagreeRelativeAreas = ppArrayOfRelativeAreasDisagree[ nQuality ]; for( int nRelativeAreaBin = 0; nRelativeAreaBin <= nTopRelativeAreaBin; ++nRelativeAreaBin ) { fprintf( pAO, " %d %d", (*pArrayOfAgreeRelativeAreas)[ nRelativeAreaBin ], (*pArrayOfDisagreeRelativeAreas)[ nRelativeAreaBin ] ); } fprintf( pAO, "\n" ); } fprintf( pAO, "} Relative Areas\n" ); } void Contig :: singleSignalInfo2() { RWTValVector aArrayOfSingleSignalBasesByQuality( ucQualityMax + 1, 0, "aArrayOfSingleSignalByQuality" ); RWTValVector aArrayOfMultipleSignalBasesByQuality( ucQualityMax + 1, 0, "aArrayOfMultipleSignalByQuality" ); // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; // RWCString soGoodReads = "/me2/gordon/primates/checkHumanGenome/tempreads.txt"; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } mbtValOrderedVectorOfRWCString aGoodReads( (size_t) 1000 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aGoodReads.insert( soLine ); } fclose( pGoodReads ); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bContains( "HSap" ) ) { continue; } // fake read that has no chromat if ( pLocFrag->soGetName().bEndsWith( "b.r" ) ) { continue; } if ( pLocFrag->bIsWholeReadUnaligned() ) continue; if ( pLocFrag->bIsWholeReadLowQuality() ) continue; if ( aGoodReads.index( pLocFrag->soGetName() ) == RW_NPOS ) continue; MBTValOrderedOffsetVector* pSingleSignalArray = NULL; assert( pLocFrag->bGetSingleSignalBasesConsensusLength( pSingleSignalArray ) ); // only interested in high quality segment int nIntersectLeft; int nIntersectRight; if ( !bIntersect( pLocFrag->nGetHighQualityStart(), pLocFrag->nGetHighQualityEnd(), pLocFrag->nGetAlignClipStart(), pLocFrag->nGetAlignClipEnd(), nIntersectLeft, nIntersectRight ) ) continue; for( int nConsPos = nIntersectLeft; nConsPos <= nIntersectRight; ++nConsPos ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; int nQuality = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( (*pSingleSignalArray)[ nConsPos ] == cSingleSignal ) { ++aArrayOfSingleSignalBasesByQuality[ nQuality ]; } else { ++aArrayOfMultipleSignalBasesByQuality[ nQuality ]; } } delete pSingleSignalArray; } // for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) fprintf( pAO, "SingleSignalInfo2 {\n" ); for( int nQuality = 0; nQuality <= 99; ++nQuality ) { fprintf( pAO, "%2d %d %d\n", nQuality, (aArrayOfSingleSignalBasesByQuality)[ nQuality ], (aArrayOfMultipleSignalBasesByQuality)[ nQuality ] ); } fprintf( pAO, "} SingleSignalInfo2\n" ); } void Contig :: compareTopAndBottomStrands3( autoReport* pAutoReport ) { // read all of the good reads RWCString soGoodReads = pCP->filAutoReportGoodHitReads_; FILE* pGoodReads = fopen( soGoodReads.data(), "r" ); if ( !pGoodReads ) { RWCString soError = "could not open "; soError += soGoodReads; THROW_ERROR( soError ); } RWTPtrOrderedVector aGoodReads( (size_t) 10 ); while( fgets( soLine, nMaxLineSize, pGoodReads ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName() == soLine ) { // the fake read for the 2nd alignment assert( !pLocFrag->soGetName().bContains( "b." ) ); aGoodReads.insert( pLocFrag ); } } } fclose( pGoodReads ); int nSpecies; for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { RWCString soSpecies = pAutoReport->aArrayOfSpecies_[ nSpecies]; LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < aGoodReads.length(); ++nRead ) { LocatedFragment* pLocFrag = aGoodReads[ nRead ]; if ( !pLocFrag->soGetName().bContains( soSpecies ) ) continue; if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } if ( !pForwardRead || !pReverseRead ) { continue; } // if reached here, we have both a .f and .r read for this species if ( pForwardRead->bIsWholeReadUnaligned() || pReverseRead->bIsWholeReadUnaligned() ) continue; int nConsPosLeft; int nConsPosRight; if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nConsPosLeft, nConsPosRight ) ) continue; int nMiddleOfCommonAlignedRegion = ( nConsPosLeft + nConsPosRight ) / 2; // convert this to unpadded read position for each read int nForwardUnpadded = pForwardRead->nOrientedUnpaddedFragPosFromConsPos( nMiddleOfCommonAlignedRegion ); int nReverseUnpadded = pReverseRead->nOrientedUnpaddedFragPosFromConsPos( nMiddleOfCommonAlignedRegion ); fprintf( pAO, "FORWARD_REVERSE_ALIGNMENT %s .f: %s %d .r: %s %d\n", soSpecies.data(), pForwardRead->soGetName().data(), nForwardUnpadded, pReverseRead->soGetName().data(), nReverseUnpadded ); } // for( nSpecies = 0; nSpecies < pAutoReport->aArrayOfSpecies_.length(); ++nSpecies ) { } // void Contig :: compareTopAndBottomStrands3( autoReport* pAutoReport ) { void Contig :: compareTopAndBottomStrands4( autoReport* pAutoReport ) { fprintf( pAO, "compareTopAndBottomStrands4 {\n" ); LocatedFragment* pForwardRead = NULL; LocatedFragment* pReverseRead = NULL; for( int nRead = 0; nRead < nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pLocatedFragmentGet( nRead ); if ( pLocFrag->soGetName().bEndsWith( ".f" ) ) { pForwardRead = pLocFrag; } else { pReverseRead = pLocFrag; } } assert( pForwardRead && pReverseRead ); // check if the alignment matches (or is similar) to that against // human int nConsPosForward = pForwardRead->nConsPosFromOrientedUnpaddedFragPos( pCP->nAutoReportTopStrandPinnedPosition_ ); int nConsPosReverse = pReverseRead->nConsPosFromOrientedUnpaddedFragPos( pCP->nAutoReportBottomStrandPinnedPosition_ ); fprintf( pAO, "pinned positions: %d %d %d\n", nConsPosForward, nConsPosReverse, abs( nConsPosForward - nConsPosReverse ) ); // save single signal arrays before deleting column of pads MBTValOrderedOffsetVector* pSingleSignalArrayForward; MBTValOrderedOffsetVector* pSingleSignalArrayReverse; pForwardRead->bGetSingleSignalBasesConsensusLength( pSingleSignalArrayForward ); pReverseRead->bGetSingleSignalBasesConsensusLength( pSingleSignalArrayReverse ); int nIntersectConsPosStart; int nIntersectConsPosEnd; if ( !bIntersect( pForwardRead->nGetAlignClipStart(), pForwardRead->nGetAlignClipEnd(), pReverseRead->nGetAlignClipStart(), pReverseRead->nGetAlignClipEnd(), nIntersectConsPosStart, nIntersectConsPosEnd ) ) return; for( int nConsPos = nIntersectConsPosStart; nConsPos <= nIntersectConsPosEnd; ++nConsPos ) { if ( (*pSingleSignalArrayForward)[ nConsPos ] == cSingleSignal && (*pSingleSignalArrayReverse)[ nConsPos ] == cSingleSignal ) { fprintf( pAO, "%d %d %c %c %d %s\n", pForwardRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), pReverseRead->ntGetFragFromConsPos( nConsPos ).qualGetQuality(), pForwardRead->ntGetFragFromConsPos( nConsPos ).cGetBase(), pReverseRead->ntGetFragFromConsPos( nConsPos ).cGetBase(), nUnpaddedIndex( nConsPos ), soGetName().data() ); } } fprintf( pAO, "} compareTopAndBottomStrands4\n" ); }