/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "phaster2PhdBall.h" #include #include #include "mbt_exception.h" #include "soDecodePhasterBasesToRead.h" #include "soDecodePhasterBasesToGenomic.h" #include "mbtValOrderedVectorOfInt.h" #include "consedParameters.h" #include "soGetDateTime.h" #include "szGetTime.h" #include "complement_so.h" #include "rwtvalvector.h" #include "nAmbiguityCodeToNumber2.h" #include "rwcregexp.h" #include "bIsNumeric.h" #include "getAllTokens.h" #include "rwctokenizer.h" #include "max.h" #include "bIntervalsIntersectLong.h" #include "soAddCommas.h" #include "bIntersectLong.h" void phaster2PhdBall :: doIt() { setAmbiguityCodeTables(); FILE* pPhasterFOF = fopen( filPhasterFOF_.data(), "r" ); if ( !pPhasterFOF ) { THROW_FILE_ERROR( filPhasterFOF_ ); } pPhdBallFOF_ = fopen( filPhdBallFOF_.data(), "w" ); if ( !pPhdBallFOF_ ) { THROW_FILE_ERROR( filPhdBallFOF_ ); } if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) { FileName filNewLocations = filPhasterLocations_ + ".new"; pNewLocations_ = fopen( filNewLocations.data(), "w" ); if ( !pNewLocations_ ) { THROW_FILE_ERROR( filNewLocations ); } } parseLocationsFile(); openPhdBalls(); soDateTimeForTimestamp_ = szGetTime(); soDateTimeForWRItems_ = soGetDateTime( nColonInMiddle ); FileName filPhasterFile( (size_t) 1000 ); while( fgets( filPhasterFile.data(), nMAXLINESIZEB, pPhasterFOF ) != NULL ) { filPhasterFile.nCurrentLength_ = strlen( filPhasterFile.data() ); filPhasterFile.stripTrailingWhitespaceFast(); searchOnePhasterFile( filPhasterFile ); } if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) { writeNewLocationsFile(); fclose( pNewLocations_ ); } fclose( pPhasterFOF ); closePhdBalls(); fclose( pPhdBallFOF_ ); } static long* pGenomicLocations; static int nCompareGenomicLocations( const int* pIndexA, const int* pIndexB ) { if ( pGenomicLocations[*pIndexA] < pGenomicLocations[*pIndexB] ) return -1; else if ( pGenomicLocations[*pIndexA] > pGenomicLocations[*pIndexB] ) return 1; else return 0; } void phaster2PhdBall :: searchOnePhasterFile( const FileName& filPhasterFile ) { cerr << "searching phaster file " << filPhasterFile << endl; int nHitsThisPhasterFile = 0; int nSavedReadPairsThisPhasterFile = 0; filCurrentPhasterFile_ = filPhasterFile; FILE* pPhasterFile = fopen( filPhasterFile.data(), "r" ); if ( !pPhasterFile ) { THROW_FILE_ERROR( filPhasterFile ); } #define nSZLINE_SIZE 10000 char szLine[ nSZLINE_SIZE ]; int nMaxReadLength; bool bFoundReadLength = false; while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) { // looks like: // Sequence file: s_5_78_export.txt 337002 entries, max used read length 76, calf_type 1 if ( szLine[0] == 'S' ) { char* szMax = strstr( szLine, "max used read length" ); if ( szMax != NULL ) { int nTokens = sscanf( szMax, "max used read length %d,", &nMaxReadLength ); if ( nTokens == 1 ) { bFoundReadLength = true; break; } } } } if ( !bFoundReadLength ) { THROW_ERROR2( "couldn't find \"max used read length\" in " + filPhasterFile ); } // now look for the cref header // first look for .cref file // then look for # Run date:time // looks like this: // .cref file: /net/grc/vol3/np_testing/dgordon/phast_combineSnps1000Genomes/hg18_cref_type2_no_ins.cref, type: 2 (bases per byte: 2), Header: // # /wt1/gordon/next_phred/100409/make_phast -cref_type:2 -ref_genome:hg18_cref_type2_no_ins snp_chr1.fa snp_chr2.fa snp_chr3.fa snp_chr4.fa snp_chr5.fa snp_chr6.fa snp_chr7.fa snp_chr8.fa snp_chr9.fa snp_chr10.fa snp_chr11.fa snp_chr12.fa snp_chr13.fa snp_chr14.fa snp_chr15.fa snp_chr16.fa snp_chr17.fa snp_chr18.fa snp_chr19.fa snp_chr20.fa snp_chr21.fa snp_chr22.fa snp_chrM.fa snp_chrX.fa snp_chrY.fa snp_chr1_random.fa snp_chr2_random.fa snp_chr3_random.fa snp_chr4_random.fa snp_chr5_random.fa snp_chr6_random.fa snp_chr7_random.fa snp_chr8_random.fa snp_chr9_random.fa snp_chr10_random.fa snp_chr11_random.fa snp_chr13_random.fa snp_chr15_random.fa snp_chr16_random.fa snp_chr17_random.fa snp_chr18_random.fa snp_chr19_random.fa snp_chr21_random.fa snp_chr22_random.fa snp_chrX_random.fa // # VERSION 1.100409 // # Run date:time 100524:093415 // >chr1 COORDS:1-247249719 // >chr2 COORDS:247249720-490200868 // >chr3 COORDS:490200869-689702695 // >chr4 COORDS:689702696-880975758 // >chr5 COORDS:880975759-1061833624 // >chr6 COORDS:1061833625-1232733616 bool bFoundCrefHeader = false; while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) { // looks like // .cref file: /net/grc/vol3/np_testing/dgordon/phast_combineSnps1000Genomes/hg18_cref_type2_no_ins.cref, type: 2 (bases per byte: 2), Header: if ( szLine[0] == '.' ) { if ( memcmp( szLine, ".cref file:", 11 ) == 0 ) { bFoundCrefHeader = true; break; } } } if ( !bFoundCrefHeader ) { THROW_ERROR2( "couldn't find \".cref file:\" in " + filPhasterFile ); } // now looking for: // bool bFoundRunDateTime = false; while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) { // looks like: // # Run date:time 100524:093415\ if ( szLine[0] == '#' ) { if ( memcmp( szLine, "# Run date:time ", 16 ) == 0 ) { bFoundRunDateTime = true; break; } } } if ( !bFoundRunDateTime ) { THROW_ERROR2( "couldn't find \"# Run date:time \" in " + filPhasterFile ); } // now load up the conversion table long long llStart; long long llEnd; int nConversionIndex = -1; // ready for ++ int nReturnStatus; RWCString soChromosome( (size_t) 1000 ); while( ( nReturnStatus = fscanf( pPhasterFile, ">%s COORDS:%lld-%lld\n", soChromosome.data(), &llStart, &llEnd ) ) == 3 ) { soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); if ( !bConversionTableSet_ ) { aConvertChromosomes_.append( soChromosome ); aConvertStarts_.append( llStart ); aConvertEnds_.append( llEnd ); } else { ++nConversionIndex; if ( nConversionIndex >= aConvertChromosomes_.length() ) { THROW_ERROR2( "phaster output file " + filPhasterFile + " uses a different cref file than the first phaster output file" ); } if ( ! ( ( aConvertChromosomes_[ nConversionIndex ] == soChromosome ) && ( aConvertStarts_[ nConversionIndex ] == llStart ) && ( aConvertEnds_[ nConversionIndex ] == llEnd ) ) ) { THROW_ERROR2( "phaster file " + filPhasterFile + " uses a different cref file than the first phaster output file" ); } } } if ( !bConversionTableSet_ ) { setGenomicLocationsOfDesiredLocations(); if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) { setReadDepthArrays(); } } aDesiredLocations_.resort(); // if reached here, we've set aConvertChromosomes_ etc. bConversionTableSet_ = true; bool bFoundStartAlignments = false; while( fgets( szLine, nSZLINE_SIZE, pPhasterFile ) != NULL ) { if ( szLine[0] == 'S' && szLine[1] == 'T' ) { if ( memcmp( szLine, "START_ALIGNMENTS ", 17 ) == 0 ) { bFoundStartAlignments = true; break; } } } assert( bFoundStartAlignments ); // looks like: // START_ALIGNMENTS 259303 // ^pos 0 ^pos 17 int nNumberOfAlignments = atoi( szLine + 17 ); if ( nNumberOfAlignments == 0 ) { fclose( pPhasterFile ); return; } desiredLocation dummyDL(); soLine_.increaseButNotDecreaseMaxLength( 10000 ); while( fgets( soLine_.data(), nMAXLINESIZEB, pPhasterFile ) != NULL ) { soLine_.nCurrentLength_ = strlen( soLine_.data() ); bCurrentPhasterLineIsFinelyParsed_ = false; if ( soLine_[0] == 'E' ) { if ( soLine_.bStartsWith( "END_ALIGNMENTS" ) ) { break; } } parsePhasterLineCrudely(); bool bSavedReadPair = false; considerEachSnpForThisReadPair( bSavedReadPair ); if ( bSavedReadPair ) ++nHitsThisPhasterFile; } fclose( pPhasterFile ); cerr << nHitsThisPhasterFile << " hits this phaster file" << endl; } void phaster2PhdBall :: parsePhasterLineCrudely() { lGenomicLeftRead1_ = 0; lGenomicLeftRead2_ = 0; int nTokens = sscanf( soLine_.data(), "%*s %*s %*s %*s %ld %*s %*s %*s %*s %*s %ld ", &lGenomicLeftRead1_, &lGenomicLeftRead2_ ); assert( nTokens > 0 ); bTwoReads_ = ( nTokens == 2 ? true : false ); const int nMoreThanMaxAlignmentLength = 1000; lGenomicFarRightRead1_ = lGenomicLeftRead1_ + nMoreThanMaxAlignmentLength; lGenomicFarRightRead2_ = lGenomicLeftRead2_ + nMoreThanMaxAlignmentLength; lMaxRight_ = 0; if ( lGenomicLeftRead1_ != 0 ) lMaxRight_ = MAX( lMaxRight_, lGenomicFarRightRead1_ ); if ( lGenomicLeftRead2_ != 0 ) lMaxRight_ = MAX( lMaxRight_, lGenomicFarRightRead2_ ); } void phaster2PhdBall :: parsePhasterLineFinely() { // since there is a reasonable chance that we will save this read, // let's get all the fields we need all at once // looks like this: // 2_48 (1) read name // 264 (2) mapping quality score // 76 (3) read starts or ends // 1 (4) read starts or ends // 1098731449 (5) genomic position of left end of alignment // --#!'#--)2--3-333'3--3-3---!!--!-'3-'!''3!''0'''-''!'3-'33-33'3''!3'''33---3 (6) encoded read and genomic bases // 1*&&)%)($('.'%34,#.//%%%54.04.>5)/1:(*A5*%9;$EE6:990776A:<2:5<<9;@8-2@0<9?*1 (7) encoded read qualities // 289 (8) 2nd read mapping quality score // 77 (9) 2nd read start or end cycle # // 152 (10) 2nd read start ora end cycle # // 1098731401 (11) genomic position of left end of alignment // !!!'!3-3'!---!!!33-'3'''-!!33!-!-!'!-!'!--'!-!'!--!!)"--'3--3-333'3--2-3---" (12) encoded read and genomic bases // -227;2?606A329877=?>;->@?29:;7C;A;57=;-,?A13@@/49A4>/$=(.4:@24>88%3@$(.'61)' (13) encoded qualities // just to reassure that these are large enough soReadName_.increaseButNotDecreaseMaxLength( 1000 ); soEncodedBasesRead1_.increaseButNotDecreaseMaxLength( 1000 ); soEncodedBasesRead2_.increaseButNotDecreaseMaxLength( 1000 ); soEncodedQualitiesRead1_.increaseButNotDecreaseMaxLength( 1000 ); soEncodedQualitiesRead2_.increaseButNotDecreaseMaxLength( 1000 ); int nTokens = sscanf( soLine_.data(), "%s %*s %d %d %lld %s %s %*s %d %d %lld %s %s\n", soReadName_.data(), &nRead1Left_, &nRead1Right_, &lGenomicLeftRead1_, soEncodedBasesRead1_.data(), soEncodedQualitiesRead1_.data(), &nRead2Left_, &nRead2Right_, &lGenomicLeftRead2_, soEncodedBasesRead2_.data(), soEncodedQualitiesRead2_.data() ); if ( nTokens == 6 ) bTwoReads_ = false; else if ( nTokens == 11 ) bTwoReads_ = true; else { THROW_ERROR2( "there should have been 6 or 11 tokens but there were " + RWCString( (long) nTokens ) ); } soReadName_.nCurrentLength_ = strlen( soReadName_.data() ); soEncodedBasesRead1_.nCurrentLength_ = strlen( soEncodedBasesRead1_.data() ); soEncodedBasesRead2_.nCurrentLength_ = strlen( soEncodedBasesRead2_.data() ); soEncodedQualitiesRead1_.nCurrentLength_ = strlen( soEncodedQualitiesRead1_.data() ); soEncodedQualitiesRead2_.nCurrentLength_ = strlen( soEncodedQualitiesRead2_.data() ); const int nMoreThanMaxAlignmentLength = 1000; lGenomicFarRightRead1_ = lGenomicLeftRead1_ + nMoreThanMaxAlignmentLength; lGenomicFarRightRead2_ = lGenomicLeftRead2_ + nMoreThanMaxAlignmentLength; soDecodedBasesRead1_ = soDecodePhasterBasesToRead( soEncodedBasesRead1_ ); soDecodedBasesRead2_ = soDecodePhasterBasesToRead( soEncodedBasesRead2_ ); soDecodedGenomicBasesRead1_ = soDecodePhasterBasesToGenomic( soEncodedBasesRead1_ ); soDecodedGenomicBasesRead2_ = soDecodePhasterBasesToGenomic( soEncodedBasesRead2_ ); int n; int nNonGapsRead1 = 0; int nNonGapsRead2 = 0; for( n = 0; n < soDecodedGenomicBasesRead1_.length(); ++n ) { if ( soDecodedGenomicBasesRead1_[n] != '-' ) ++nNonGapsRead1; } for( n = 0; n < soDecodedGenomicBasesRead2_.length(); ++n ) { if ( soDecodedGenomicBasesRead2_[n] != '-' ) ++nNonGapsRead2; } lGenomicRightRead1_ = lGenomicLeftRead1_ + nNonGapsRead1 - 1; lGenomicRightRead2_ = lGenomicLeftRead2_ + nNonGapsRead2 - 1; // int nIndex; // if ( ( nIndex = soDecodedGenomicBasesRead1_.index( "-" ) ) != RW_NPOS ) { // if ( ( soDecodedGenomicBasesRead1_.length() > ( nIndex + 1 ) ) && // ( soDecodedGenomicBasesRead1_[ nIndex + 1 ] != '-' ) ) { // RWCString soChromosome; // int n1ChrPos; // convertFromGenomic( lGenomicLeftRead1_, soChromosome, n1ChrPos ); // printf( "convertFromGenomic %s %d %ld\n", soChromosome.data(), n1ChrPos, lGenomicLeftRead1_ ); // printf( "read 1 of %s has a gap in the genomic at %d after gap: %ld\n", // soReadName_.data(), // nIndex, // lGenomicLeftRead1_ + nIndex ); // int nGaps = soDecodedGenomicBasesRead1_.nGetNumberOfCopiesOfChar( '-' ); // printf( "left end = %s unpadded right end: %s\n", // soAddCommas( lGenomicLeftRead1_ ).data(), // soAddCommas( lGenomicLeftRead1_ + // soDecodedGenomicBasesRead1_.length() - 1 - // nGaps ).data() ); // int n1ChrPosLeft; // int n1ChrPosRight; // RWCString soChromosomeTemp; // convertFromGenomic( lGenomicLeftRead1_, soChromosomeTemp, n1ChrPosLeft ); // assert( soChromosomeTemp == soChromosome ); // convertFromGenomic( lGenomicLeftRead1_ + // soDecodedGenomicBasesRead1_.length() - 1 - // nGaps, // soChromosomeTemp, // n1ChrPosRight ); // assert( soChromosomeTemp == soChromosome ); // printf( "in 1chr: left end = %s right = %s\n", // soAddCommas( n1ChrPosLeft ).data(), // soAddCommas( n1ChrPosRight ).data() ); // printf( "%s\n", soDecodedGenomicBasesRead1_.data() ); // printf( "%s\n", soDecodedBasesRead1_.data() ); // } // } // if ( ( nIndex = soDecodedGenomicBasesRead2_.index( "-" ) ) != RW_NPOS ) { // if ( ( soDecodedGenomicBasesRead2_.length() > ( nIndex + 1 ) ) && // ( soDecodedGenomicBasesRead2_[ nIndex + 1 ] != '-' ) ) { // RWCString soChromosome; // int n1ChrPos; // convertFromGenomic( lGenomicLeftRead2_, soChromosome, n1ChrPos ); // printf( "%s %d :\n", soChromosome.data(), n1ChrPos ); // printf( "read 2 of %s has a gap in the genomic at %d after gap: %ld\n", // soReadName_.data(), // nIndex, // lGenomicLeftRead2_ + nIndex ); // printf( "%s\n", soDecodedGenomicBasesRead2_.data() ); // printf( "%s\n", soDecodedBasesRead2_.data() ); // } // } bCurrentPhasterLineIsFinelyParsed_ = true; } bool phaster2PhdBall :: bAlleleOK( desiredLocation* pDL, const char cWhichRead, const char cWhichSite ) { if ( !bCurrentPhasterLineIsFinelyParsed_ ) { parsePhasterLineFinely(); } RWCString* pDecodedReadBases; RWCString* pDecodedGenomicBases; if ( cWhichRead == cREAD1 ) { pDecodedReadBases = &soDecodedBasesRead1_; pDecodedGenomicBases = &soDecodedGenomicBasesRead1_; } else { pDecodedReadBases = &soDecodedBasesRead2_; pDecodedGenomicBases = &soDecodedGenomicBasesRead2_; } int nAllowedAlleles; if ( cWhichSite == cSITEA ) { nAllowedAlleles = nAmbiguityCodeToNumber2[ pDL->cAllowedAllelesA_ ]; } else { nAllowedAlleles = nAmbiguityCodeToNumber2[ pDL->cAllowedAllelesB_ ]; } // now need to find the read base at the snp position long lGenomicOfSnp = ( cWhichSite == cSITEA ? pDL->lGenomicLocationA_ : pDL->lGenomicLocationB_ ); long lGenomicOfRead = ( cWhichRead == cREAD1 ? lGenomicLeftRead1_ : lGenomicLeftRead2_ ); // ----------------- // ^ ^ snp is here // read starts here // but alignment has pads in the genomic: // -----****------------ // ^ ^ snp is here // read starts here // so the snp position is moved over int nNumberOfUnpaddedGenomicBasesToSnp = lGenomicOfSnp - lGenomicOfRead + 1; char cReadBaseAtSnpPosition = 0; int nNumberOfUnpaddedGenomicBasesSoFar = 0; int n0UnpaddedBasePosition = -1; // 0 will be first unpadded base for( int n0PaddedBasePos = 0; n0PaddedBasePos < pDecodedGenomicBases->length(); ++n0PaddedBasePos ) { if ( (*pDecodedGenomicBases)[ n0PaddedBasePos ] != '-' ) { ++nNumberOfUnpaddedGenomicBasesSoFar; if ( nNumberOfUnpaddedGenomicBasesSoFar == nNumberOfUnpaddedGenomicBasesToSnp ) { cReadBaseAtSnpPosition = (*pDecodedReadBases)[ n0PaddedBasePos ]; break; } } } // case in which the alignment does not extend as far as the snp if ( !cReadBaseAtSnpPosition ) return false; int nReadBaseAtSnpPosition = nAmbiguityCodeToNumber2[ cReadBaseAtSnpPosition ]; if ( ( nReadBaseAtSnpPosition & nAllowedAlleles ) == 0 ) { return false; } else { return true; } } static desiredLocation dummyDL; void phaster2PhdBall :: considerEachSnpForThisReadPair( bool& bSavedReadPair ) { bSavedReadPair = false; long lLeftMostGenomic; bool bOnly1AlignedRead = false; if ( bTwoReads_ ) { if ( ( lGenomicLeftRead1_ == 0 ) && ( lGenomicLeftRead2_ == 0 ) ) return; else if ( lGenomicLeftRead1_ == 0 ) { lLeftMostGenomic = lGenomicLeftRead2_; bOnly1AlignedRead = true; } else if ( lGenomicLeftRead2_ == 0 ) { lLeftMostGenomic = lGenomicLeftRead1_; bOnly1AlignedRead = true; } else { lLeftMostGenomic = MIN( lGenomicLeftRead1_, lGenomicLeftRead2_ ); } } else { if ( lGenomicLeftRead1_ == 0 ) return; lLeftMostGenomic = lGenomicLeftRead1_; bOnly1AlignedRead = true; } // used to do this: // <-----bbbbbbbbbbb-----> // ^ lGenomicLeft1 // bbbb is the read bases // ---- is part of the range // look for snps anywhere in this window // ^ or after // ^ but not here or after // due to a bug (100916), I'm just going to check all of the snps int nIndexOfFoundSnp = -666; bool bWantThisReadPair = false; for( int nIndexOfSnp = 0; ( nIndexOfSnp < aDesiredLocations_.length() ) && !bWantThisReadPair; ++nIndexOfSnp ) { desiredLocation* pDL = aDesiredLocations_[ nIndexOfSnp ]; if ( pDL->bMateUnmapped_ && ( lGenomicLeftRead1_ != 0 ) && ( lGenomicLeftRead2_ != 0 ) ) { // this location line won't work since it requires // an unmapped mate, but both reads are mapped continue; } if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchOneSite ) { if (( lGenomicLeftRead1_ != 0 ) && ( lGenomicLeftRead1_ <= pDL->lGenomicLocationA_ ) && ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead1_ ) ) { // need to check allele if ( bAlleleOK( pDL, cREAD1, cSITEA ) ) { nIndexOfFoundSnp = nIndexOfSnp; bWantThisReadPair = true; break; } } // even if we've tried the 1st read, try the second read if ( bTwoReads_ && ( lGenomicLeftRead2_ != 0 ) && ( lGenomicLeftRead2_ <= pDL->lGenomicLocationA_ ) && ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead2_ ) ) { // need to check allele if ( bAlleleOK( pDL, cREAD2, cSITEA ) ) { nIndexOfFoundSnp = nIndexOfSnp; bWantThisReadPair = true; break; } } // if ( bTwoReads_ ... } // if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchOneSite ) { else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchBothSites ) { // there are a variety of ways this can work: // siteA in read1, siteA in read2 // siteB in read1, siteB in read2 // all 4 combinations of these: // siteA in read1, siteB in read1 // siteA in read1, siteB in read2 // siteA in read2, siteB in read1 // siteA in read2, siteB in read2 bool bSiteAMatches = false; if ( lGenomicLeftRead1_ != 0 ) { if ( ( lGenomicLeftRead1_ <= pDL->lGenomicLocationA_ ) && ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead1_ ) ) { if ( bAlleleOK( pDL, cREAD1, cSITEA ) ) { bSiteAMatches = true; } } } if ( !bSiteAMatches ) { // let's try the other read if ( bTwoReads_ && ( lGenomicLeftRead2_ != 0 ) ) { if ( ( lGenomicLeftRead2_ <= pDL->lGenomicLocationA_ ) && ( pDL->lGenomicLocationA_ <= lGenomicFarRightRead2_ ) ) { if ( bAlleleOK( pDL, cREAD2, cSITEA ) ) { bSiteAMatches = true; } } } } // if neither read matches site A of this snp, no point // in looking at site B of this snp since this snp requires // both sites to match. So go on to a different snp. if ( bSiteAMatches ) { bool bSiteBMatches = false; if ( lGenomicLeftRead1_ != 0 ) { if ( ( lGenomicLeftRead1_ <= pDL->lGenomicLocationB_ ) && ( pDL->lGenomicLocationB_ <= lGenomicFarRightRead1_ ) ) { if ( bAlleleOK( pDL, cREAD1, cSITEB ) ) { bSiteBMatches = true; } } } if ( !bSiteBMatches ) { // let's try the other read if ( bTwoReads_ && ( lGenomicLeftRead2_ != 0 ) && ( lGenomicLeftRead2_ <= pDL->lGenomicLocationB_ ) && ( pDL->lGenomicLocationB_ <= lGenomicFarRightRead2_ ) ) { if ( bAlleleOK( pDL, cREAD2, cSITEB ) ) { bSiteBMatches = true; } } } if ( bSiteAMatches && bSiteBMatches ) { nIndexOfFoundSnp = nIndexOfSnp; bWantThisReadPair = true; } } } // else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchBothSites ) { else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchWindow ) { if ( lGenomicLeftRead1_ != 0 ) { if ( bIntervalsIntersectLong( lGenomicLeftRead1_, lGenomicFarRightRead1_, pDL->lGenomicLocationA_, pDL->lGenomicLocationB_ ) ) { if ( !bCurrentPhasterLineIsFinelyParsed_ ) { parsePhasterLineFinely(); } if ( bIntervalsIntersectLong( lGenomicLeftRead1_, lGenomicRightRead1_, pDL->lGenomicLocationA_, pDL->lGenomicLocationB_ ) ) { nIndexOfFoundSnp = nIndexOfSnp; bWantThisReadPair = true; } } } if ( !bWantThisReadPair ) { // try the other read if ( bTwoReads_ && ( lGenomicLeftRead2_ != 0 ) && ( bIntervalsIntersectLong( lGenomicLeftRead2_, lGenomicFarRightRead2_, pDL->lGenomicLocationA_, pDL->lGenomicLocationB_ ) ) ) { if ( !bCurrentPhasterLineIsFinelyParsed_ ) { parsePhasterLineFinely(); } if ( bIntervalsIntersectLong( lGenomicLeftRead2_, lGenomicRightRead2_, pDL->lGenomicLocationA_, pDL->lGenomicLocationB_ ) ) { nIndexOfFoundSnp = nIndexOfSnp; bWantThisReadPair = true; } } } } // else if ( pDL->cTypeOfMatch_ == desiredLocation::cMatchWindow ) { } // while( nIndexOfSnp < aGenomicLocations_.length() ) { if ( bWantThisReadPair ) { saveReadPair( nIndexOfFoundSnp ); bSavedReadPair = true; ++nHits_; } } // void phaster2PhdBall :: considerEachSnpForThisReadPair() { void phaster2PhdBall :: saveReadPair( const int nIndexOfLocation ) { if ( pCP->bPhaster2PhdBallCalculateNewLocationsFile_ ) { desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ]; long lIntersectLeft; long lIntersectRight; long lReadLeft; long lReadRight; if ( lGenomicLeftRead1_ != 0 ) { lReadLeft = lGenomicLeftRead1_; lReadRight = lGenomicRightRead1_; } else { lReadLeft = lGenomicLeftRead2_; lReadRight = lGenomicRightRead2_; } assert( bIntersectLong( pDL->lGenomicLocationA_, pDL->lGenomicLocationB_, lReadLeft, lReadRight, lIntersectLeft, lIntersectRight ) ); ++(*(pDL->paReadDepth_))[lIntersectLeft - pDL->lGenomicLocationA_ ]; --(*(pDL->paReadDepth_))[lIntersectRight + 1 - pDL->lGenomicLocationA_ ]; } if ( pCP->bPhaster2PhdBallSaveInPhasterFormat_ ) { desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ]; fprintf( pDL->pPhdBall_, "%s", soLine_.data() ); return; } RWTValOrderedVector aQualitiesRead1( soDecodedBasesRead1_.length(), "aQualitiesRead1" ); RWTValOrderedVector aQualitiesRead2( soDecodedBasesRead2_.length(), "aQualitiesRead2" ); convertQualities( soEncodedQualitiesRead1_, aQualitiesRead1 ); if (bTwoReads_ ) convertQualities( soEncodedQualitiesRead2_, aQualitiesRead2 ); assert( soDecodedBasesRead1_.length() == aQualitiesRead1.length() ); if (bTwoReads_ ) assert( soDecodedBasesRead2_.length() == aQualitiesRead2.length() ); // eliminate any gap characters RWCString soBasesWithoutGapsRead1( soDecodedBasesRead1_ ); RWCString soBasesWithoutGapsRead2( soDecodedBasesRead2_ ); int nBase; for( nBase = soBasesWithoutGapsRead1.length() - 1; nBase >= 0; --nBase ) { if ( soBasesWithoutGapsRead1[ nBase ] == '-' ) { soBasesWithoutGapsRead1.remove( nBase, 1 ); aQualitiesRead1.removeAt( nBase ); } } if ( bTwoReads_ ) { for( nBase = soBasesWithoutGapsRead2.length() - 1; nBase >= 0; --nBase ) { if ( soBasesWithoutGapsRead2[ nBase ] == '-' ) { soBasesWithoutGapsRead2.remove( nBase, 1 ); aQualitiesRead2.removeAt( nBase ); } } } // reverse complement bottom strand reads if ( nRead1Left_ > nRead1Right_ ) { soBasesWithoutGapsRead1 = soComplementSO( soBasesWithoutGapsRead1 ); aQualitiesRead1.reverseElementOrder(); } if ( bTwoReads_ && (nRead2Left_ > nRead2Right_ ) ) { soBasesWithoutGapsRead2 = soComplementSO( soBasesWithoutGapsRead2 ); aQualitiesRead2.reverseElementOrder(); } // fix read names so they have the lane and tile as well // phaster phout file looks like this: // /codon/gordon/for_phil/100205_GRC5_0002_614VMAAXX/run_100524/lane4/4_62.uncalibrated.clusters.phout RWCString soPhasterPhoutWithoutDirectory = filCurrentPhasterFile_.soGetBasename(); int nContainsDot = soPhasterPhoutWithoutDirectory.first( '.' ); RWCString soLaneAndTile; if ( nContainsDot == RW_NPOS ) { soLaneAndTile = soPhasterPhoutWithoutDirectory; } else { soLaneAndTile = soPhasterPhoutWithoutDirectory( 0, nContainsDot ); } RWCString soReadName1 = soLaneAndTile + "_" + soReadName_; RWCString soReadName2 = soLaneAndTile + "_" + soReadName_ + ".2"; if ( ( lGenomicLeftRead1_ == 0 ) || ( pCP->soPhaster2PhdBallSaveWhichMate_ == "both" ) ) { writeReadToPhdBall( nIndexOfLocation, soReadName1, soBasesWithoutGapsRead1, aQualitiesRead1, cREAD1 ); } if ( bTwoReads_ ) { if ( ( lGenomicLeftRead2_ == 0 ) || ( pCP->soPhaster2PhdBallSaveWhichMate_ == "both" ) ) { writeReadToPhdBall( nIndexOfLocation, soReadName2, soBasesWithoutGapsRead2, aQualitiesRead2, cREAD2 ); } } } void phaster2PhdBall :: writeReadToPhdBall( const int nIndexOfLocation, const RWCString& soReadName, const RWCString& soBases, const RWTValOrderedVector& aQualities, const char cFirstOrSecondRead ) { desiredLocation* pDL = aDesiredLocations_[ nIndexOfLocation ]; fprintf( pDL->pPhdBall_, "BEGIN_SEQUENCE %s 1\nBEGIN_COMMENT\n", soReadName.data() ); // no CHROMAT_FILE, ABI_THUMBPRINT, PHRED_VERSION, CALL_METHOD, or // QUALITY_VALUES for solexa reads fprintf( pDL->pPhdBall_, "TIME: %s\n", soDateTimeForTimestamp_.data() ); fprintf( pDL->pPhdBall_, "CHEM: solexa\n" ); fprintf( pDL->pPhdBall_, "END_COMMENT\n" ); fprintf( pDL->pPhdBall_, "BEGIN_DNA\n" ); assert( soBases.length() == aQualities.length() ); for( int nBase = 0; nBase < soBases.length(); ++nBase ) { fprintf( pDL->pPhdBall_, "%c %d\n", soBases[ nBase ], aQualities[ nBase ] ); } fprintf( pDL->pPhdBall_, "END_DNA\n" ); fprintf( pDL->pPhdBall_, "END_SEQUENCE\n\n" ); fprintf( pDL->pPhdBall_, "WR{\n" ); fprintf( pDL->pPhdBall_, "template phaster2PhdBall %s\n", soDateTimeForWRItems_.data() ); RWCString soTemplateName( soReadName ); soTemplateName.bEndsWithAndRemove( ".2" ); fprintf( pDL->pPhdBall_, "name: %s\n", soTemplateName.data() ); fprintf( pDL->pPhdBall_, "}\n\n" ); fprintf( pDL->pPhdBall_, "WR{\n" ); fprintf( pDL->pPhdBall_, "primer phaster2PhdBall %s\n", soDateTimeForWRItems_.data() ); fprintf( pDL->pPhdBall_, "type: univ %s\n", ( cFirstOrSecondRead == cREAD1 ? "fwd" : "rev" ) ); fprintf( pDL->pPhdBall_, "}\n\n" ); if ( ( ( cFirstOrSecondRead == cREAD1 ) && ( lGenomicLeftRead1_ == 0 ) ) || ( ( cFirstOrSecondRead == cREAD2 ) && ( lGenomicLeftRead2_ == 0 ) ) ) { fprintf( pDL->pPhdBall_, "WR{\n" ); fprintf( pDL->pPhdBall_, "unmapped phaster2PhdBall %s\n", soDateTimeForWRItems_.data() ); fprintf( pDL->pPhdBall_, "}\n\n" ); } // just so I can see something at the beginning if ( nHits_ < 200 ) { fflush( pDL->pPhdBall_ ); } } void phaster2PhdBall :: convertQualities( const RWCString& soEncodedQualities, RWTValOrderedVector& aQualities ) { for( int n = 0; n < soEncodedQualities.length(); ++n ) { int nQuality = (int) soEncodedQualities[n] - 33; aQualities.append( nQuality ); } } long phaster2PhdBall :: lConvertToGenomic( const RWCString& soChromosome, const long l1ChrPos ) { if ( soChromosome == "genomic" ) return l1ChrPos; int nChromosomeIndex; if ( soLastChromosome_ == soChromosome ) { nChromosomeIndex = nLastChromosomeIndex_; } else { nChromosomeIndex = aConvertChromosomes_.index( soChromosome ); if ( nChromosomeIndex == RW_NPOS ) { cerr << "dump of chromosomes: " << endl; for( int n = 0; n < aConvertChromosomes_.length(); ++n ) { cerr << aConvertChromosomes_[n] << endl; } THROW_ERROR2( "cannot find chromosome " + soChromosome + " in cref list" ); } soLastChromosome_ = soChromosome; nLastChromosomeIndex_ = nChromosomeIndex; } // if reached here, have set nChromosomeIndex long l1GenomicPos = aConvertStarts_[ nChromosomeIndex ] + l1ChrPos - 1; if ( l1GenomicPos > aConvertEnds_[ nChromosomeIndex ] ) { THROW_ERROR2( soChromosome + " cannot go to pos " + RWCString( (long) l1ChrPos ) + " largest is " + RWCString( (long) ( aConvertEnds_[ nChromosomeIndex ] - aConvertStarts_[ nChromosomeIndex ] + 1 ) ) ); } return l1GenomicPos; } // for testing void phaster2PhdBall :: convertFromGenomic( const long lGenomic, RWCString& soChromosome, int& n1ChrPos ) { for( int nChr = 0; nChr < aConvertStarts_.length(); ++nChr ) { if ( ( aConvertStarts_[ nChr ] <= lGenomic ) && ( lGenomic < aConvertEnds_[ nChr ] ) ) { // found the right chromosome soChromosome = aConvertChromosomes_[ nChr ]; n1ChrPos = lGenomic - aConvertStarts_[ nChr ] + 1; return; } } soChromosome = ""; return; } // - is not a special character so doesn't need \\ in front of it RWCRegexp regWindow( "^[0-9,]+-[0-9,]+$" ); void phaster2PhdBall :: parseLocationsFile() { FILE* pPhasterLocations = fopen( filPhasterLocations_, "r" ); if ( !pPhasterLocations ) { THROW_FILE_ERROR( filPhasterLocations_ ); } bool bFirstTime = true; while( fgets( soLine_.data(), nMAXLINESIZEB, pPhasterLocations ) != NULL ) { soLine_.nCurrentLength_ = strlen( soLine_.data() ); soLine_.stripTrailingWhitespaceFast(); if ( soLine_.bIsNull() ) continue; if ( soLine_[0] == '#' ) continue; // looks like: // chrX 1234 // or // chrX 1234 M // or // chrX 1234 1334 // or // chrX 1234 M 1334 A // chrX 1234-2589 // "chrX" in each case above can be replace by "genomic" // "unmapped" can follow any of these in which case the pair // will only be accepted if one of the mates is unmapped mbtValOrderedVectorOfRWCString aWords; getAllTokens( soLine_, aWords ); // join tokens into a file name FileName filPhdBall; for( int nToken = 0; nToken < aWords.length(); ++nToken ) { filPhdBall += aWords[nToken]; if ( nToken != ( aWords.length() - 1 ) ) filPhdBall += "_"; } filPhdBall += ".phdball"; if ( aWords.length() < 2 ) { RWCString soError = "not enough tokens in location line: " + soLine_; THROW_ERROR2( soError ); } bool bMateUnmapped = false; int nIndexOfUnmapped = aWords.index( "unmapped" ); if ( nIndexOfUnmapped != RW_NPOS ) { bMateUnmapped = true; aWords.removeAt( nIndexOfUnmapped ); } desiredLocation* pDL = NULL; if ( aWords.length() == 2 ) { if ( ( aWords[1] ).bContains( regWindow ) ) { // case of // charX 1234-5678 RWCTokenizer tok( aWords[1] ); RWCString soStartWindow = tok('-' ); RWCString soEndWindow = tok('-' ); soStartWindow.removeAllCommas(); soEndWindow.removeAllCommas(); long l1ChrPosLeft; long l1ChrPosRight; l1ChrPosLeft = atol( soStartWindow.data() ); l1ChrPosRight = atol( soEndWindow.data() ); pDL = new desiredLocation( bMateUnmapped, aWords[0], l1ChrPosLeft, desiredLocation::cMatchBothSites, filPhdBall ); pDL->l1ChrPosB_ = l1ChrPosRight; // pDL->cAllowedAlleles_ = 'n'; pDL->cTypeOfMatch_ = desiredLocation::cMatchWindow; } else { // case of // chrX 1234 aWords[1].removeAllCommas(); if ( !bIsNumeric( aWords[1] ) ) { RWCString soError = "2nd token is not numeric in " + soLine_; THROW_ERROR( soError ); } long l1ChrPos = atol( aWords[1].data() ); pDL = new desiredLocation( bMateUnmapped, aWords[0], l1ChrPos, desiredLocation::cMatchOneSite, filPhdBall ); } } else if ( aWords.length() == 3 ) { if ( ( aWords[2].length() == 1 ) && ( isalpha( (aWords[2] )[0] ) || ( aWords[2] == "-" ) ) ) { // case of // chrX 1234 M if ( aWords[2].length() != 1 ) { RWCString soError = "3rd token (ambiguity code) is not of length 1: " + soLine_; THROW_ERROR2( soError ); } aWords[1].removeAllCommas(); if ( !bIsNumeric( aWords[1] ) ) { RWCString soError = "2nd token is not numeric in " + soLine_; THROW_ERROR( soError ); } long l1ChrPos = atol( aWords[1].data() ); pDL = new desiredLocation( bMateUnmapped, aWords[0], l1ChrPos, desiredLocation::cMatchOneSite, filPhdBall ); pDL->cAllowedAllelesA_ = (aWords[2])[0]; } else { // case of // chrX 1234 1334 aWords[1].removeAllCommas(); aWords[2].removeAllCommas(); if ( !bIsNumeric( aWords[1] ) ) { RWCString soError = "unrecognized format A: " + soLine_; THROW_ERROR2( soError ); } if ( !bIsNumeric( aWords[2] ) ) { RWCString soError = "unrecognized format B: " + soLine_; THROW_ERROR2( soError ); } long l1ChrPosA = atol( aWords[1].data() ); long l1ChrPosB = atol( aWords[2].data() ); if ( l1ChrPosA > l1ChrPosB ) { long lTemp = l1ChrPosA; l1ChrPosA = l1ChrPosB; l1ChrPosB = lTemp; } pDL = new desiredLocation( bMateUnmapped, aWords[0], l1ChrPosA, desiredLocation::cMatchBothSites, filPhdBall ); pDL->l1ChrPosB_ = l1ChrPosB; } } else if ( aWords.length() == 5 ) { // chrX 1234 M 1334 A aWords[1].removeAllCommas(); aWords[3].removeAllCommas(); if ( ! ( bIsNumeric( aWords[1] ) && bIsNumeric( aWords[3] ) && ( aWords[2].length() == 1 ) && ( aWords[2].length() == 1 ) ) ) { RWCString soError = "5 tokens but unrecognized format a: " + soLine_; THROW_ERROR2( soError ); } if ( !isalpha( (aWords[2])[0] ) || !isalpha( (aWords[4])[0] ) ) { RWCString soError = "5 tokens but unrecognized format b: " + soLine_; THROW_ERROR2( soError ); } int l1ChrPosA = atol( aWords[1].data() ); int l1ChrPosB = atol( aWords[3].data() ); if ( l1ChrPosA > l1ChrPosB ) { long lTemp = l1ChrPosA; l1ChrPosA = l1ChrPosB; l1ChrPosB = lTemp; } pDL = new desiredLocation( bMateUnmapped, aWords[0], l1ChrPosA, desiredLocation::cMatchBothSites, filPhdBall ); pDL->l1ChrPosB_ = l1ChrPosB; pDL->cAllowedAllelesA_ = (aWords[2])[0]; pDL->cAllowedAllelesB_ = (aWords[4])[0]; } else { RWCString soError = "unrecognized format--wrong # of tokens: " + soLine_; THROW_ERROR2( soError ); } aDesiredLocations_.append( pDL ); } fclose( pPhasterLocations ); } //void phaster2PhdBall :: parseLocationsFile() { void phaster2PhdBall :: setGenomicLocationsOfDesiredLocations() { // set nGenomicLocationA_ and nGenomicLocationB_ for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) { desiredLocation* pDL = aDesiredLocations_[nLoc]; pDL->lGenomicLocationA_ = lConvertToGenomic( pDL->soChromosome_, pDL->l1ChrPosA_ ); pDL->lGenomicLocationB_ = lConvertToGenomic( pDL->soChromosome_, pDL->l1ChrPosB_ ); cerr << "desired location: " << pDL->soChromosome_ << " " << pDL->l1ChrPosA_ << " " << pDL->lGenomicLocationA_ << endl; } } void phaster2PhdBall :: openPhdBalls() { for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) { desiredLocation* pDL = aDesiredLocations_[ nLoc ]; pDL->pPhdBall_ = fopen( pDL->filPhdBall_.data(), "w" ); if ( !pDL->pPhdBall_ ) { THROW_FILE_ERROR( pDL->filPhdBall_ ); } fprintf( pDL->pPhdBall_, "# %s\n", pDL->filPhdBall_.data() ); fprintf( pPhdBallFOF_, "%s\n", pDL->filPhdBall_.data() ); } } void phaster2PhdBall :: closePhdBalls() { for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) { desiredLocation* pDL = aDesiredLocations_[ nLoc ]; fclose( pDL->pPhdBall_ ); } } void phaster2PhdBall :: setReadDepthArrays() { for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) { desiredLocation* pDL = aDesiredLocations_[ nLoc ]; pDL->paReadDepth_ = new RWTValVector( pDL->lGenomicLocationB_ - pDL->lGenomicLocationA_ + 2, 0, "paReadDepth" ); } } void phaster2PhdBall :: writeNewLocationsFile() { const char cNotInRegion = 'n'; const char cInARegion = 'i'; for( int nLoc = 0; nLoc < aDesiredLocations_.length(); ++nLoc ) { desiredLocation* pDL = aDesiredLocations_[ nLoc ]; int n; for( n = 1; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) { (*(pDL->paReadDepth_))[n] += (*(pDL->paReadDepth_))[n-1]; } // this check makes sure that we always finish the last // region if ( (*(pDL->paReadDepth_))[ pDL->paReadDepth_->nGetEndIndex() ] != 0 ) { cerr << "read depth at end should be zero but was " << (*(pDL->paReadDepth_))[ pDL->paReadDepth_->nGetEndIndex() ] << endl; cerr << "nLoc = " << nLoc << " end index = " << pDL->paReadDepth_->nGetEndIndex() << endl; assert( false ); } long lRegionLeft; long lRegionRight; char cState = cNotInRegion; int nMaxDepthOfCoverage = 0; for( n = 0; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) { bool bHasReads = ( (*(pDL->paReadDepth_))[n] > 0 ); if ( bHasReads && (cState == cNotInRegion ) ) { lRegionLeft = n + pDL->lGenomicLocationA_; nMaxDepthOfCoverage = (*(pDL->paReadDepth_))[n]; cState = cInARegion; } else if ( bHasReads && ( cState == cInARegion ) ) { nMaxDepthOfCoverage = MAX( nMaxDepthOfCoverage, (*(pDL->paReadDepth_))[n] ); } else if ( !bHasReads && ( cState == cInARegion ) ) { lRegionRight = n - 1 + pDL->lGenomicLocationA_; fprintf( pNewLocations_, "genomic %ld-%ld unmapped %d\n", lRegionLeft, lRegionRight, nMaxDepthOfCoverage ); cState = cNotInRegion; } } // for( n = 0; n <= pDL->paReadDepth_->nGetEndIndex(); ++n ) { } }