/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "sff2PhdBall.h" #include "rwcstring.h" #include "soLine.h" #include "seqJustBases.h" #include "basesAndQualitiesAndPeaks.h" #include "matchLinker.h" #include "rwtptrorderedvector.h" #include "rwtvalorderedvector.h" #include "readOneSffFile.h" #include "consedParameters.h" #include "findQueryWithinSubject.h" #include "complement_so.h" #include "getNextToken.h" #include "rwtvalvector.h" #include "soAddCommas.h" #include "bIntersect.h" #include "szGetTime.h" #include "soGetDateTime.h" #include "soLine.h" #include "rwctokenizer.h" #include #include #include "rwtvalorderedvector.h" #include "mbtValOrderedVectorOfRWCString.h" // Jim Knight's rules allow at worse case indels + mismatches*2 <= 5 // so there can be no more than 5 indels. However, the score can be // worse than L (linker length) - 5 since you can also have 2 mismatches and // 1 insertion in the subject giving a L-3 for the # of matches so score of // L-3 - 2*2 -1*1 = L-6 If we are allowing this to be the worst score, then // this worst score can also be found by 6 insertions in the subject. Thus // we need to allow 6 indels. // I have found that Jim Knight's rules have a problem: since mismatches // count 2 and indels as 1, you get alignments like this: //gttggaaccgaaagggtttgaattca*aaccctttcggttccaac // x x //gttggaaccgaaagggtttgaattcaga*ccctttcggttccaac // // in which there clearly is a single base error in the read //gttggaaccgaaagggtttgaattcaaaccctttcggttccaac // x //gttggaaccgaaagggtttgaattcagaccctttcggttccaac // but the scoring system is allowing 2 indels to be used instead. // Thus I'm going to use the following: // match: 1 // mismatch: 3 // indel: 2 // In this way, the algorithm will prefer a mismatch over 2 indels. // I will still use Jim Knight's rule of allowing 5 indels or 2 mismatches and 1 indel: // penalty of 5*2 = 10 // penalty of 2*3+1*2 = 8 // Thus the worst score for a linker will be L - 10 or // L - 2 - 8 (since if 2 bases are mismatching they can't be matching // so L-2 is the number of matches and -8 is the penalty for the 2 mismatches and 1 indel const int nMaxIndels = 5; const int nMinimumReadLength = 15; void sff2PhdBall :: doIt() { soPrefixForTemporaryFiles_ = filSffFullPath_.soGetBasename() + "." + soGetDateTime( nDotInMiddle ); pPhdBall_ = fopen( filNewPhdBallFullPath_.data(), "w" ); if ( !pPhdBall_ ) { THROW_FILE_ERROR( filNewPhdBallFullPath_ ); } if ( bUseOnlyCertainReads_ ) { FILE* pFOF = fopen( filFOFOfReadsToUse_.data(), "r" ); if ( !pFOF ) { THROW_FILE_ERROR( filFOFOfReadsToUse_ ); } while( fgets( soLine.data(), nMaxLineSize, pFOF ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aReadsToUse_.insert( soLine ); } fclose( pFOF ); } // get linker sequences RWTPtrOrderedVector aArrayOfLinkerSequences; readLinkerSequencesAndComplement( aArrayOfLinkerSequences ); RWTValOrderedVector aMinScoreForLinker( (size_t) aArrayOfLinkerSequences.length() ); setMinScoreForLinkers( aArrayOfLinkerSequences, aMinScoreForLinker ); aRawSffReads_.soName_ = "aRawSffReads_"; if ( bUseOnlyCertainReads_ ) { cerr << "extracting reads from sff file " << filSffFullPath_ << " that are listed in file " << filFOFOfReadsToUse_ << endl; } else { cerr << "extracting all reads from sff file " << filSffFullPath_ << endl; } readOneSffFile( filSffFullPath_.data(), aRawSffReads_, bUseOnlyCertainReads_, aReadsToUse_ ); cerr << "found " << aRawSffReads_.length() << " reads" << endl; cerr << "now looking for matches with linkers..." << endl; RWTValOrderedVector aMatches( aRawSffReads_.length(), "aMatches" ); // they are already initialized so that nNumberOfMatches_ = 0 for // each match object aMatches.nCurrentLength_ = aRawSffReads_.length(); // now it has become clear that the Ntide method of holding // the sequence is not a good one. Better is fastq with // 2 RWCString arrays int nMatchesSoFar = 0; for( int nRead = 0; nRead < aRawSffReads_.length(); ++nRead ) { if ( nRead % 10000 == 0 ) { int nPerCentDone = ( nRead + 1 ) * 100 / ( aRawSffReads_.length() + 1 ); cerr << nPerCentDone << "% done. Matches so far: " << soAddCommas( nMatchesSoFar ) << " out of " << soAddCommas( nRead + 1 ) << " or " << nMatchesSoFar * 100 / ( nRead + 1 ) << "%" << endl; } basesAndQualitiesAndPeaks* pSeqRead = aRawSffReads_[ nRead ]; for( int nLinker = 0; nLinker < aArrayOfLinkerSequences.length(); ++nLinker ) { seqJustBases* pSeqLinker = aArrayOfLinkerSequences[ nLinker ]; bool bFoundMatch = false; int n0SubjectStart; int n0SubjectEnd; int nScore; findQueryWithinSubject( pSeqLinker->soBases_, pSeqRead->soBases_, pCP->n454LinkerAlignmentIndelScore_, pCP->n454LinkerAlignmentMatchScore_, pCP->n454LinkerAlignmentMismatchScore_, aMinScoreForLinker[ nLinker ], nMaxIndels, bFoundMatch, n0SubjectStart, n0SubjectEnd, nScore ); if ( bFoundMatch ) { ++nMatchesSoFar; matchLinker& myMatch = aMatches[ nRead ]; if ( myMatch.nNumberOfMatches_ == 0 || nScore > myMatch.nScore_ ) { ++myMatch.nNumberOfMatches_; myMatch.nScore_ = nScore; myMatch.nLinkerOfMatch_ = nLinker; myMatch.n0SubjectStart_ = n0SubjectStart; myMatch.n0SubjectEnd_ = n0SubjectEnd; } else if ( myMatch.nNumberOfMatches_ != 0 && nScore <= myMatch.nScore_ ) { ++myMatch.nNumberOfMatches_; } // make matching region in read x's and try again. If // the sequence matches in 2 different places, then // we will reject the read. RWCString soCopyOfBases = pSeqRead->soBases_; for( int n0 = n0SubjectStart; n0 <= n0SubjectEnd; ++n0 ) { soCopyOfBases[n0] = 'x'; } bFoundMatch = false; findQueryWithinSubject( pSeqLinker->soBases_, soCopyOfBases, pCP->n454LinkerAlignmentIndelScore_, pCP->n454LinkerAlignmentMatchScore_, pCP->n454LinkerAlignmentMismatchScore_, aMinScoreForLinker[ nLinker ], nMaxIndels, bFoundMatch, n0SubjectStart, n0SubjectEnd, nScore ); if ( bFoundMatch ) { // bad--found twice ++myMatch.nNumberOfMatches_; } } // if ( bFoundMatch ) { } } int nReadsWithMatchesToLinkers = 0; int nMatch; for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) { if ( aMatches[ nMatch ].nNumberOfMatches_ > 0 ) ++nReadsWithMatchesToLinkers; } bool bPairedReadDataset = false; if ( nReadsWithMatchesToLinkers > aRawSffReads_.length() / 4 ) { bPairedReadDataset = true; } // this section below is just for information int nNumberWithNoMatches = 0; RWTValOrderedVector aLinkerHistogram( 4, "aLinkerHistogram" ); aLinkerHistogram.increaseCurrentLength( 4, 0 ); RWTValVector aScoreHistogram( 100, 0, "aScoreHistogram" ); for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) { matchLinker& myMatch = aMatches[ nMatch ]; if ( myMatch.nNumberOfMatches_ == 0 ) { ++nNumberWithNoMatches; } else { // there might be more than 4 values of myMatch.nLinkerOfMatch_ // if the users added any if ( myMatch.nLinkerOfMatch_ >= aLinkerHistogram.length() ) { aLinkerHistogram.increaseCurrentLength( myMatch.nLinkerOfMatch_ + 1, 0 ); } ++aLinkerHistogram[ myMatch.nLinkerOfMatch_ ]; ++aScoreHistogram[ myMatch.nScore_ ]; /* printf( "nNumberOfMatches: %d score: %d linker: %d %d - %d\n", myMatch.nNumberOfMatches_, myMatch.nScore_, myMatch.nLinkerOfMatch_, myMatch.n0SubjectStart_, myMatch.n0SubjectEnd_ ); */ } } printf( "number with matches: %d number without matches: %d\n", aMatches.length() - nNumberWithNoMatches, nNumberWithNoMatches ); printf( "scores:\n" ); for( int nScore = 0; nScore <= aScoreHistogram.nGetEndIndex(); ++nScore ) { if ( aScoreHistogram[ nScore ] != 0 ) { printf( "%d: %d\n", nScore, aScoreHistogram[ nScore ] ); } } printf( "linkers:\n" ); for( int nLinker = 0; nLinker < aLinkerHistogram.length(); ++nLinker ) { printf( "%d: %d\n", nLinker, aLinkerHistogram[ nLinker ] ); } printf( "number of deletions in linker: %s\n", soAddCommas( pCP->nDebugging_ ).data() ); printf( "number of deletions in reads: %s\n", soAddCommas( pCP->nDebugging2_ ).data() ); printf( "number of mismatches: %s\n", soAddCommas( pCP->nDebugging3_ ).data() ); // end information output const int nLEFT = 0; RWTValVector aLeftOrRight( 2, "" ); aLeftOrRight[0] = pCP->soPaired454LeftReadExtension_; aLeftOrRight[1] = pCP->soPaired454RightReadExtension_; // this will set it to be big enough for all the reads to be paired reads aDesiredReads_.soName_ = "aDesiredReads_"; aDesiredReads_.resize( aRawSffReads_.length() * 2 ); assert( aMatches.length() == aRawSffReads_.length() ); // I'm indexing this by the same integer as the index of aRawSffReads_ // instead of pointers just to save some space aNonPairedReads_.soName_ = "aNonPairedReads_"; aNonPairedReads_.resize( aRawSffReads_.length() * 2 ); for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) { matchLinker& myMatch = aMatches[ nMatch ]; // aRawSffReads_ is indexed the same as aMatches basesAndQualitiesAndPeaks* pSeqRead = aRawSffReads_[ nMatch ]; if ( myMatch.nNumberOfMatches_ == 0 ) { // such a read is a non-paired read aNonPairedReads_.insert( nMatch ); } else if ( myMatch.nNumberOfMatches_ == 1 ) { // first check if either the left or the right read is too small bool bTooSmall = false; int nLeftOrRight; for( nLeftOrRight = 0; nLeftOrRight <= 1; ++nLeftOrRight ) { int n0Left; int n0Right; if ( nLeftOrRight == nLEFT ) { n0Left = 0; n0Right = myMatch.n0SubjectStart_ - 1; // the reason for the -1 is that n0SubjectStart_ // is the beginning of the linker } else { n0Left = myMatch.n0SubjectEnd_ + 1; // the reason for the +1 is that n0SubjectEnd_ // is the end of the linker n0Right = pSeqRead->soBases_.length() - 1; } if ( !bIntersect( n0Left, n0Right, pSeqRead->n1ClipLeft_ - 1, pSeqRead->n1ClipRight_ - 1, n0Left, n0Right ) ) { // cerr << "warning: read " << pSeqRead->soSequenceName_ << // " has no untrimmed sequence for read on " << // aLeftOrRight[ nLeftOrRight ] << endl; bTooSmall = true; } else { int nReadLength = n0Right - n0Left + 1; if ( nReadLength < nMinimumReadLength ) { bTooSmall = true; } } } if ( bTooSmall ) { aNonPairedReads_.insert( nMatch ); continue; } for( nLeftOrRight = 0; nLeftOrRight <= 1; ++nLeftOrRight ) { int n0Left; int n0Right; if ( nLeftOrRight == nLEFT ) { n0Left = 0; n0Right = myMatch.n0SubjectStart_ - 1; // the reason for the -1 is that n0SubjectStart_ // is the beginning of the linker } else { n0Left = myMatch.n0SubjectEnd_ + 1; // the reason for the +1 is that n0SubjectEnd_ // is the end of the linker n0Right = pSeqRead->soBases_.length() - 1; } if ( !bIntersect( n0Left, n0Right, pSeqRead->n1ClipLeft_ - 1, pSeqRead->n1ClipRight_ - 1, n0Left, n0Right ) ) { cerr << "warning: read " << pSeqRead->soSequenceName_ << " has no untrimmed sequence for read on " << aLeftOrRight[ nLeftOrRight ] << endl; continue; } basesAndQualitiesAndPeaks* pNewRead = new basesAndQualitiesAndPeaks(); pNewRead->soSequenceName_ = pSeqRead->soSequenceName_ + aLeftOrRight[ nLeftOrRight ]; pNewRead->soBases_ = pSeqRead->soBases_( n0Left, n0Right - n0Left + 1 ); // the clipping positions are just the beginning and end of the // read pNewRead->n1ClipLeft_ = 1; pNewRead->n1ClipRight_ = pNewRead->soBases_.length(); pNewRead->aQualities_.substring( pSeqRead->aQualities_, n0Left, n0Right - n0Left + 1 ); pNewRead->setPointPosArraySubstring( pSeqRead, n0Left, n0Right - n0Left + 1 ); pNewRead->nTraceArrayMaxIndex_ = pSeqRead->nTraceArrayMaxIndex_; if ( nLeftOrRight == nLEFT ) { pNewRead->reverseComplementThyself(); } aDesiredReads_.insert( pNewRead ); } // } // else if ( myMatch.nNumberOfMatches_ == 1 ) { } // for( nMatch = 0; nMatch < aMatches.length(); ++nMatch ) { filterNonPairedReadsAgainstPuc19(); writePhdBall( aDesiredReads_ ); } // void sff2PhdBall :: doIt() { void sff2PhdBall :: readLinkerSequencesAndComplement( RWTPtrOrderedVector& aArrayOfLinkerSequences ) { FILE* pLinkers = fopen( pCP->fil454LinkerSequences_.data(), "r" ); if ( !pLinkers ) { THROW_FILE_ERROR( pCP->fil454LinkerSequences_ ); } bool bStart = true; seqJustBases* pSeq; while( fgets( soLine.data(), nMaxLineSize, pLinkers ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); if ( bStart && ( soLine[0] != '>' ) ) { RWCString soErrorMessage = pCP->fil454LinkerSequences_ + " did not start with > so it must not be in fasta format"; THROW_ERROR( soErrorMessage ); } if ( soLine[0] == '>' ) { if ( bStart ) { bStart = false; } else { // finish last sequence--I think there is nothing to finish } pSeq = new seqJustBases(); int nTokenStart; int nTokenEnd; bool bError = false; getNextToken( soLine.data(), 1, // start here nTokenStart, nTokenEnd, bError ); // added June 2009 since Torsten Thomas t.thomas@unsw.edu.au // had this problem if ( bError ) { RWCString soError = pCP->fil454LinkerSequences_ + " (linker sequence) has nothing following the > but there should be a sequence name"; THROW_ERROR( bError ); } // assert( !bError ); pSeq->soSequenceName_ = soLine( nTokenStart, nTokenEnd - nTokenStart + 1 ); aArrayOfLinkerSequences.insert( pSeq ); } else { soLine.stripTrailingWhitespaceFast(); pSeq->soBases_ += soLine.returnToLower(); } } // while( fgets( soLine.data(), nMaxLineSize, pLinkers ) != NULL ) { fclose( pLinkers ); // now add complements to these int nNumberOfUncomplementedSequences = aArrayOfLinkerSequences.length(); for( int nSeq = 0; nSeq < nNumberOfUncomplementedSequences; ++nSeq ) { seqJustBases* pSeq = aArrayOfLinkerSequences[ nSeq ]; // check if the sequence is palindromic. If it is, then // no need to check for its complement. RWCString soReverseComplement = soComplementSO( pSeq->soBases_ ); if ( soReverseComplement == pSeq->soBases_ ) { cerr << "not complementing " << pSeq->soSequenceName_ << " because it is palindromic" << endl; continue; } cerr << "complementing " << pSeq->soSequenceName_ << " because it is not palindromic" << endl; seqJustBases* pSeqComp = new seqJustBases(); pSeqComp->soSequenceName_ = pSeq->soSequenceName_ + ".comp"; pSeqComp->soBases_ = soComplementSO( pSeq->soBases_ ); aArrayOfLinkerSequences.insert( pSeqComp ); } } void sff2PhdBall :: setMinScoreForLinkers( RWTPtrOrderedVector aArrayOfLinkerSequences, RWTValOrderedVector& aMinScoreForLinker ) { for( int nLinker = 0; nLinker < aArrayOfLinkerSequences.length(); ++nLinker ) { int nLength = aArrayOfLinkerSequences[ nLinker ]->soBases_.length(); aMinScoreForLinker.insert( nLength - nMaxIndels* pCP->n454LinkerAlignmentIndelScore_ ); } } void sff2PhdBall :: writePhdBall( const RWTPtrOrderedVector aReads ) { cerr << "writing phd ball" << endl; RWCString soSffFileWithoutPath = filSffFullPath_.soGetBasename(); char* szTimeStamp = szGetTime(); RWCString soDateTime = soGetDateTime( nNoSlashes ); for( int nRead = 0; nRead < aReads.length(); ++nRead ) { if ( nRead % 10000 == 0 ) { cerr << "."; cerr.flush(); } basesAndQualitiesAndPeaks* pSeqRead = aReads[ nRead ]; fprintf( pPhdBall_, "BEGIN_SEQUENCE %s 1\n\n", pSeqRead->soSequenceName_.data() ); fprintf( pPhdBall_, "BEGIN_COMMENT\n\n" ); RWCString soReadNameWithoutLeftOrRight = pSeqRead->soSequenceName_; bool bLeft = false; bool bRight = false; if ( soReadNameWithoutLeftOrRight.bEndsWithAndRemove( pCP->soPaired454LeftReadExtension_ ) ) { bLeft = true; } else if ( soReadNameWithoutLeftOrRight.bEndsWithAndRemove( pCP->soPaired454RightReadExtension_ ) ) { bRight = true; } fprintf( pPhdBall_, "CHROMAT_FILE: sff:%s%s:%s\n", ( bLeft ? "-f:" : "" ), soSffFileWithoutPath.data(), soReadNameWithoutLeftOrRight.data() ); fprintf( pPhdBall_, "TIME: %s\n", szTimeStamp ); fprintf( pPhdBall_, "CHEM: 454\n" ); fprintf( pPhdBall_, "TRACE_ARRAY_MIN_INDEX: 0\n" ); fprintf( pPhdBall_, "TRACE_ARRAY_MAX_INDEX: %d\n", pSeqRead->nTraceArrayMaxIndex_ ); fprintf( pPhdBall_, "\nEND_COMMENT\n\nBEGIN_DNA\n" ); assert( 0 <= pSeqRead->n1ClipLeft_ - 1 ); assert( pSeqRead->n1ClipRight_ <= pSeqRead->soBases_.length() ); for( int n0Base = pSeqRead->n1ClipLeft_ - 1; n0Base <= pSeqRead->n1ClipRight_ - 1; ++n0Base ) { fprintf( pPhdBall_, "%c %d %d\n", tolower( pSeqRead->soBases_[ n0Base ] ), pSeqRead->aQualities_[ n0Base ], pSeqRead->nGetPointPos( n0Base ) ); } fprintf( pPhdBall_, "END_DNA\n\nEND_SEQUENCE\n\n" ); if ( bLeft || bRight ) { // should write WR items so assembly view will know the // reads are part of a pair // newbler write like this: // WR{ // template newbler 000727:123348 // name: FGXQUUK01DBP5M // lib: 454PairedRead // } fprintf( pPhdBall_, "WR{\n" ); fprintf( pPhdBall_, "template consed-sff2PhdBall %s\n", soDateTime.data() ); fprintf( pPhdBall_, "name: %s\n", soReadNameWithoutLeftOrRight.data() ); fprintf( pPhdBall_, "lib: 454PairedRead\n" ); fprintf( pPhdBall_, "}\n\n" ); // WR{ // primer newbler 000727:123348 // type: univ fwd // } fprintf( pPhdBall_, "WR{\n" ); fprintf( pPhdBall_, "primer consed-sff2PhdBall %s\n", soDateTime.data() ); if ( bLeft ) { fprintf( pPhdBall_, "type: univ fwd\n" ); } else { fprintf( pPhdBall_, "type: univ rev\n" ); } fprintf( pPhdBall_, "}\n\n" ); } } fclose( pPhdBall_ ); cerr << endl; cerr << "see " << filNewPhdBallFullPath_ << endl; } void sff2PhdBall :: filterNonPairedReadsAgainstPuc19() { printf("number of non-paired reads: %d\n", aNonPairedReads_.length() ); // fix for Cliff Han (LANL) bug in which all reads are paired if ( aNonPairedReads_.length() == 0 ) return; // write fasta file for use for searching non-paired reads against puc19 openFastaForCrossMatch(); for( int n = 0; n < aNonPairedReads_.length(); ++n ) { int nIndex = aNonPairedReads_[n]; basesAndQualitiesAndPeaks* pRead = aRawSffReads_[ nIndex ]; writeToFastaForCrossMatch( pRead ); } fclose( pFasta_ ); runCrossMatch(); readCrossMatchOutputAndAddToDesiredReads(); } void sff2PhdBall :: openFastaForCrossMatch() { filFastaToCrossMatch_ = soPrefixForTemporaryFiles_ + ".fa"; pFasta_ = fopen( filFastaToCrossMatch_.data(), "w" ); if ( !pFasta_ ) { THROW_FILE_ERROR( filFastaToCrossMatch_ + " for screening against puc19" ); } } void sff2PhdBall :: writeToFastaForCrossMatch( basesAndQualitiesAndPeaks* pRead ) { fprintf( pFasta_, ">%s\n", pRead->soSequenceName_.data() ); const int nMaxBasesOnLine = 50; for( int nBase = 0; nBase < pRead->soBases_.length(); nBase += nMaxBasesOnLine ) { int nNumberOfBasesOnLine = nMaxBasesOnLine; if ( nBase + nNumberOfBasesOnLine > pRead->soBases_.length() ) { // check: if length() = 20, and nBase = 19, then we should // write 1 base and 20 - 19 = 1 nNumberOfBasesOnLine = pRead->soBases_.length() - nBase; } fprintf( pFasta_, "%.*s\n", nNumberOfBasesOnLine, pRead->soBases_.data() + nBase ); } } void sff2PhdBall :: runCrossMatch() { filCrossMatchLinkerOutput_ = soPrefixForTemporaryFiles_ + ".linker.cross"; filCrossMatchPucOutput_ = soPrefixForTemporaryFiles_ + ".puc.cross"; RWCString soCommand = pCP->filFullPathnameOfFilter454ReadsScript_ + " " + filFastaToCrossMatch_ + " " + pCP->fil454LinkerSequences_ + " " + pCP->filFilter454ReadsAgainstThis_ + " " + filCrossMatchLinkerOutput_ + " " + filCrossMatchPucOutput_; int nRetStat = system( soCommand.data() ); if ( nRetStat != 0 ) { RWCString soError = "Something went wrong running "; soError += soCommand; soError += " Gave error code "; soError += soGetErrno(); soError += "\nSee xterm for more detail on what went wrong or trying running the command directly from the command line."; THROW_ERROR( soError ); } } void sff2PhdBall :: readCrossMatchOutputAndAddToDesiredReads() { FILE* pPucCross = fopen( filCrossMatchPucOutput_.data(), "r" ); if ( !pPucCross ) { THROW_FILE_ERROR( filCrossMatchPucOutput_ + " cross_match output file" ); } cerr << "reading " << filCrossMatchPucOutput_ << endl; int nNumberOfMatchingReads = 0; while( fgets( soLine.data(), nMaxLineSize, pPucCross ) ) { if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) == 0 ) { ++nNumberOfMatchingReads; } } mbtValOrderedVectorOfRWCString aMatchingReads( (size_t) nNumberOfMatchingReads ); rewind( pPucCross ); while( fgets( soLine.data(), nMaxLineSize, pPucCross ) ) { if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) != 0 ) continue; soLine.nCurrentLength_ = strlen( soLine.data() ); // looks like this: // // ALIGNMENT 25 3.57 0.00 0.00 s_1_1_886_8 1 28 (0) C chr5 (854045) 180003821 180003794 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // ALIGNMENT 27 0.00 0.00 0.00 s_1_1_300_628 1 28 (0) C chr5 (126239857) 54618009 54617982 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // ALIGNMENT 148 0.00 0.00 0.00 puc19.454 1 153 (2533) FGXQUUK01D4S1E 122 274 (0 // 0 1 2 3 4 5 6 7 8 9 10 11 12 // really all we need is whether the read matches or not RWCTokenizer tok( soLine ); for( int n = 0; n < 9; ++n ) { tok(); } RWCString soRead = tok(); if ( soRead == "C" ) { soRead = tok(); } aMatchingReads.insert( soRead ); } fclose( pPucCross ); cerr << "start sorting" << endl; aMatchingReads.resort(); cerr << "done sorting" << endl; // now make a pass through the nonpaired reads, only making it a aDesired // read if it has no match to puc19. int nReadsMatchingPuc19 = 0; int nReadsNotMatchingPuc19 = 0; aNonPairedReadsHasPuc_.reshape( aNonPairedReads_.length(), false ); for( int n = 0; n < aNonPairedReads_.length(); ++n ) { int nIndex = aNonPairedReads_[n]; RWCString soReadName = aRawSffReads_[ nIndex ]->soSequenceName_; if ( aMatchingReads.bContains( soReadName ) ) { aNonPairedReadsHasPuc_[n] = true; ++nReadsMatchingPuc19; } else { ++nReadsNotMatchingPuc19; } } cerr << nReadsNotMatchingPuc19 << " unpaired reads not matching puc19" << endl; cerr << nReadsMatchingPuc19 << " unpaired reads matching puc19" << endl; cerr << "done passing through the full list of nonpaired reads" << endl; // now look at linker matches to see if the clipping positions // need to be changed FILE* pLinkerCross = fopen( filCrossMatchLinkerOutput_.data(), "r" ); if ( !pLinkerCross ) { THROW_FILE_ERROR( filCrossMatchLinkerOutput_ + " cross_match output file" ); } cerr << "reading " << filCrossMatchLinkerOutput_ << endl; int nNumberOfUnpairedReadsMatchingLinker = 0; int nNonPairedReadsIndex = -1; // will first ++ RWCString soLastRead = ""; int nLine = 0; while( fgets( soLine.data(), nMaxLineSize, pLinkerCross ) ) { ++nLine; if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) != 0 ) continue; ++nNumberOfUnpairedReadsMatchingLinker; soLine.nCurrentLength_ = strlen( soLine.data() ); // looks like this: // // ALIGNMENT 24 0.00 0.00 0.00 FGXQUUK01AXTGF 27 51 (239) linker 20 44 (0) // 0 1 2 3 4 5 6 7 // ALIGNMENT 22 0.00 0.00 9.76 FGXQUUK01EDYEW 195 235 (34) C linker (7) 37 1 RWCTokenizer tok( soLine ); for( int n = 0; n <= 4; ++n ) { tok(); } RWCString soRead = tok(); if ( soRead == soLastRead ) { cerr << "duplicated read " << soRead << " line " << nLine << endl; continue; } RWCString soReadStart = tok(); RWCString soReadEnd = tok(); RWCString soLeftover = tok(); int n1ReadStart = atoi( soReadStart.data() ); int n1ReadEnd = atoi( soReadEnd.data() ); // find this read in the list of aNonPairedReads_ basesAndQualitiesAndPeaks* pSeq; bool bFoundRead = false; while( !bFoundRead ) { ++nNonPairedReadsIndex; if ( nNonPairedReadsIndex >= aNonPairedReads_.length() ) { RWCString soError = "could not find read "; soError += soRead; soError += " in list aNonPairedReads_. This might be due to someone editing the perl script filter454Reads.perl--the list of reads must come before the linker sequence. "; soError += RWCString( (long) nLine ); THROW_ERROR( soError ); } int nRawSffReadsIndex = aNonPairedReads_[ nNonPairedReadsIndex ]; pSeq = aRawSffReads_[ nRawSffReadsIndex ]; if ( pSeq->soSequenceName_ == soRead ) { bFoundRead = true; } else { // this read has no linker sequence in it. // Thus if it has no puc, we want it: if ( !aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) { // add it as is (no additional clipping) to the desired reads aDesiredReads_.insert( pSeq ); } } } // set up for next time: soLastRead = soRead; // check if this read is already eliminated due to puc if ( aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) continue; // see which is larger // -------llllll------- // a b c d int n1A; int n1B; int nLeftLength = 0; if ( bIntersect( pSeq->n1ClipLeft_, pSeq->n1ClipRight_, 1, n1ReadStart - 1, n1A, n1B ) ) { nLeftLength = n1B - n1A + 1; } int n1C; int n1D; int nRightLength = 0; if ( bIntersect( pSeq->n1ClipLeft_, pSeq->n1ClipRight_, n1ReadEnd + 1, pSeq->soBases_.length(), n1C, n1D ) ) { nRightLength = n1D - n1C + 1; } if ( nLeftLength > nRightLength ) { pSeq->n1ClipLeft_ = n1A; pSeq->n1ClipRight_ = n1B; } else { pSeq->n1ClipLeft_ = n1C; pSeq->n1ClipRight_ = n1D; } if ( pSeq->n1ClipRight_ - pSeq->n1ClipLeft_ + 1 >= nMinimumReadLength ) aDesiredReads_.insert( pSeq ); } // while( fgets( soLine.data(), nMaxLineSize, pLinkerCross ) ) { // there may be some nonpaired reads leftover after looking through all // the linkers. Many of these (the ones without puc19) should be made into // unpaired reads for( ++nNonPairedReadsIndex; nNonPairedReadsIndex < aNonPairedReads_.length(); ++nNonPairedReadsIndex ) { // this read has no linker sequence in it. // Thus if it has no puc, we want it: if ( !aNonPairedReadsHasPuc_[ nNonPairedReadsIndex ] ) { int nRawSffReadsIndex = aNonPairedReads_[ nNonPairedReadsIndex ]; basesAndQualitiesAndPeaks* pSeq = aRawSffReads_[ nRawSffReadsIndex ]; // add it as is (no additional clipping) to the desired reads aDesiredReads_.insert( pSeq ); } } if ( pCP->bFilter454ReadsDeleteCrossMatchOutput_ ) { int nRet; nRet = unlink( filCrossMatchLinkerOutput_.data() ); if ( nRet != 0 ) { cerr << "Warning: could not delete cross_match output file " << filCrossMatchLinkerOutput_ << endl; cerr << soGetErrno() << endl; } nRet = unlink( filCrossMatchPucOutput_.data() ); if ( nRet != 0 ) { cerr << "Warning: could not delete cross_match output file " << filCrossMatchPucOutput_ << endl; cerr << soGetErrno() << endl; } FileName filLogFile = filFastaToCrossMatch_ + ".log"; nRet = unlink( filLogFile.data() ); if ( nRet != 0 ) { cerr << "Warning: could not delete cross_match log file " << filLogFile << endl; cerr << soGetErrno() << endl; } } }