/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "solexa2PhdBall.h" #include using namespace std; #include #include "szGetTime.h" #include "consedParameters.h" #include "nNumberFromBase2.h" #include "rwctokenizer.h" #include #include "nGetNumberOfTokens.h" #include "getNextToken.h" #include "bIsNumeric.h" #include "nStripTrailingWhitespace.h" #include "soGetDateTime.h" #include "consed_version.h" const char cFASTQ = 'f'; const char cBUSTARD = 'b'; const char cSOLEXA = 'o'; const char cSANGER = 'a'; const char cFORWARD_READ = 'f'; const char cREVERSE_READ = 'r'; void solexa2PhdBall :: writeReadHeaderToPhdBall( const RWCString& soReadName ) { fprintf( pNewPhdBall_, "BEGIN_SEQUENCE %s 1\nBEGIN_COMMENT\n", soReadName.data() ); // no CHROMAT_FILE, ABI_THUMBPRINT, PHRED_VERSION, CALL_METHOD, or // QUALITY_VALUES for solexa reads fprintf( pNewPhdBall_, "TIME: %s\n", szDateTime_ ); fprintf( pNewPhdBall_, "CHEM: solexa\n" ); fprintf( pNewPhdBall_, "END_COMMENT\n" ); } void solexa2PhdBall :: openNextPhdBall() { FileName filPhdBall = pCP->filPhdBallDirectory_ + "/" + pCP->filFastStartupFile_; FileName filNewPhdBall = filPhdBall.filFindOneHigherThanHighestVersion3(); pNewPhdBall_ = fopen( filNewPhdBall.data(), "w" ); if ( !pNewPhdBall_ ) { THROW_FILE_ERROR( filNewPhdBall ); } fprintf( pNewPhdBallFOF_, "%s\n", filNewPhdBall.data() ); } void solexa2PhdBall :: openPhdBallFOF() { pNewPhdBallFOF_ = fopen( filNewPhdBallFOF_.data(), "w" ); if ( !pNewPhdBallFOF_ ) { THROW_FILE_ERROR( filNewPhdBallFOF_ ); } } const int n256 = 256; static int nPhredFromFastq[n256]; static char cFastqTypeAlreadySet = ' '; static void setPhredQualityFromFastqTable( const char cFastqType ) { if ( cFastqTypeAlreadySet == cFastqType ) return; cFastqTypeAlreadySet = cFastqType; int nSubtractFromChar; if ( cFastqType == cSOLEXA ) nSubtractFromChar = 64; else if ( cFastqType == cSANGER ) nSubtractFromChar = 33; else assert( false ); if ( pCP->bSolexaFastqFilesArePhredQualityNotSolexaQuality_ ) { for( int nFastqChar = 0; nFastqChar < n256; ++nFastqChar ) { int nPhredQuality = nFastqChar - nSubtractFromChar; if ( nPhredQuality <= 0 ) { nPhredFromFastq[ nFastqChar ] = 0; } else { nPhredFromFastq[ nFastqChar ] = nPhredQuality; } } } else { for( int nFastqChar = 0; nFastqChar < n256; ++nFastqChar ) { int nSolexaQuality = ( (int) nFastqChar ) - nSubtractFromChar; if ( nSolexaQuality <= 0 ) { nPhredFromFastq[ nFastqChar ] = 0; } else { float fPhredQuality = 10.0 * log10( 1.0 + pow( 10.0, nSolexaQuality / 10.0 ) ); int nPhredQuality = fPhredQuality + 0.499; nPhredFromFastq[ nFastqChar ] = nPhredQuality; } } } } const int nSeqLineSize = 1000; static char szSeqLine[ nSeqLineSize ]; const int nQualityLineSize = 1000; static char szQualityLine[ nQualityLineSize ]; const int nThrowAwayLineSize = 1000; static char szThrowAwayLine[ nThrowAwayLineSize ]; void solexa2PhdBall :: doIt() { cerr << "consed version: " << szConsedVersion << endl; // get a single date/time for all of the reads soDateTime_ = soGetDateTime( nColonInMiddle ); strcpy( szDateTime_, szGetTime() ); openPhdBallFOF(); openNextPhdBall(); bWrittenPhdBallHeader_ = false; nNumberOfReadsThisPhdBall_ = 0; const int nFOFLineSize = 1000; RWCString soFOFLine( (size_t) nFOFLineSize ); FILE* pFOF = fopen( filFOFOfSolexaSeqFiles_.data(), "r" ); if ( !pFOF ) { THROW_FILE_ERROR( filFOFOfSolexaSeqFiles_ ); } bool bFiguredOutFastqOrBustard = false; char cFastqOrBustard = ' '; char cSolexaOrSangerFastq = ' '; while( fgets( soFOFLine.data(), nFOFLineSize, pFOF ) ) { soFOFLine.nCurrentLength_ = strlen( soFOFLine.data() ); soFOFLine.stripTrailingWhitespaceFast(); if ( soFOFLine.bIsNull() ) continue; // I want this to mainly apply to bustard files. In the case of // fastq files, the check is made with each sequence. The // reason for the difference is that Bustard files are small so // it is ok to finish each one before changing to a new phd // file. But solexa fastq files can have more than 4 million // reads which corresponds to a phd ball of over a gigabyte so // it is better to make a new phd ball when the number of reads // in the current phd ball exceeds a million. // So why does it apply at all to fastq files? Because I do want, // when starting a new fastq file, to start a new phd ball even // if the previous one isn't filled up. if ( bFiguredOutFastqOrBustard && ( ( ( cFastqOrBustard == cBUSTARD) && ( nNumberOfReadsThisPhdBall_ >= pCP->nMaxNumberOfReadsPerPhdBall_ ) ) || ( ( cFastqOrBustard == cFASTQ ) && ( nNumberOfReadsThisPhdBall_ > 0 ) ) ) ) { fclose( pNewPhdBall_ ); openNextPhdBall(); nNumberOfReadsThisPhdBall_ = 0; bWrittenPhdBallHeader_ = false; } RWCTokenizer tok( soFOFLine ); FileName filSolexaSeqFile1 = (FileName ) tok(); RWCString filSolexaSeqFileOrBustardReadPrefix = tok(); // If this is a fastq file, it is fine to have no prefix // if ( soReadNamePrefix.bIsNull() ) { // RWCString soError = "each line of file " + filFOFOfSolexaSeqFiles_ + " must have (seq file) (prefix) where (prefix) is something to prepend to each read name so that reads from different flow cells that have the same lane, tile, X and Y are given different names. However, this line is just " + soFOFLine; // THROW_ERROR( soError ); // } if ( !filSolexaSeqFile1.bStartsWith( ".." ) && !filSolexaSeqFile1.bStartsWith( "/" ) ) { filSolexaSeqFile1 = pCP->filSolexaFilesAreAssumedToBeHere_ + "/" + filSolexaSeqFile1; } if ( !bWrittenPhdBallHeader_ ) { fprintf( pNewPhdBall_, "\n# solexa file %s (beginning)\n\n", filSolexaSeqFile1.data() ); bWrittenPhdBallHeader_ = true; } FILE* pSolexaSeqFile1 = fopen( filSolexaSeqFile1, "r" ); if ( !pSolexaSeqFile1 ) { THROW_FILE_ERROR( filSolexaSeqFile1 ); } nLine_ = 0; // I'm going to check every single file that comes through--it is // only 1 check every million reads... checkIfFastqOrBustard( pSolexaSeqFile1, filSolexaSeqFile1, cFastqOrBustard, cSolexaOrSangerFastq ); if ( cFastqOrBustard == cFASTQ ) setPhredQualityFromFastqTable( cSolexaOrSangerFastq ); bFiguredOutFastqOrBustard = true; if ( cFastqOrBustard == cFASTQ ) { // check if there is a 2nd file bool bPairedSolexaReads = false; FileName filSolexaSeqFile2 = (FileName) filSolexaSeqFileOrBustardReadPrefix; FILE* pSolexaSeqFile2 = NULL; if ( !filSolexaSeqFile2.bIsNull() ) { if ( !filSolexaSeqFile2.bStartsWith( ".." ) && !filSolexaSeqFile2.bStartsWith( "/" ) ) { filSolexaSeqFile2 = pCP->filSolexaFilesAreAssumedToBeHere_ + "/" + filSolexaSeqFile2; } pSolexaSeqFile2 = fopen( filSolexaSeqFile2, "r" ); if ( !pSolexaSeqFile2 ) { THROW_FILE_ERROR( filSolexaSeqFile2 ); } // ok, this 2nd file damn well had better be a fastq file char cFastqOrBustard2 = ' '; char cSolexaOrSangerFastq2 = ' '; checkIfFastqOrBustard( pSolexaSeqFile2, filSolexaSeqFile2, cFastqOrBustard2, cSolexaOrSangerFastq2 ); if ( cFastqOrBustard2 != cFastqOrBustard ) { RWCString soError = filSolexaSeqFile1 + " is type " + ( cFastqOrBustard == cFASTQ ? "fastq" : "bustard" ) + " but " + filSolexaSeqFile2 + " is type " + ( cFastqOrBustard == cFASTQ ? "fastq" : "bustard" ); THROW_ERROR( soError ); } if ( cFastqOrBustard2 == cFASTQ ) { if ( cSolexaOrSangerFastq2 != cSolexaOrSangerFastq ) { RWCString soError = filSolexaSeqFile1 + " is fastq of type " + ( cSolexaOrSangerFastq == cSOLEXA ? "solexa" : "sanger" ) + " but file " + filSolexaSeqFile2 + " is fastq of type " + ( cSolexaOrSangerFastq2 == cSOLEXA ? "solexa" : "sanger" ); THROW_ERROR( soError ); } } bPairedSolexaReads = true; } // if ( !filSolexaSeqFile2.bIsNull() ) { RWCString soTemplate1; RWCString soTemplate2; nLine_ = 0; // go around this loop once per sequence while( true ) { bool bEOF = false; // what are the arguments for soReadName1 and bPairedSolexaReads // (I assume cFORWARD_READ is for writing the WR primer item processReadInFastq( pSolexaSeqFile1, filSolexaSeqFile1, bPairedSolexaReads, cFORWARD_READ, soTemplate1, bEOF ); if ( bEOF ) break; if ( bPairedSolexaReads ) { processReadInFastq( pSolexaSeqFile2, filSolexaSeqFile2, true, // bPairedSolexaReads, cREVERSE_READ, soTemplate2, bEOF ); assert( !bEOF ); // check that both reads represent mate pairs of the // same template/cluster: if ( soTemplate1 != soTemplate2 ) { THROW_ERROR( "templates " + soTemplate1 + " and " + soTemplate2 + " should have been the same are not. files " + filSolexaSeqFile1 + " and " + filSolexaSeqFile2 + " line in each file: " + RWCString( (long) nLine_ ) ); } } // if ( bPairedSolexaReads ) { } // while( true ); fclose( pSolexaSeqFile1 ); if ( bPairedSolexaReads ) fclose( pSolexaSeqFile2 ); } // if ( cFastqOrBustard == cFASTQ ) { else { RWCString soReadNamePrefix = filSolexaSeqFileOrBustardReadPrefix; if ( soReadNamePrefix.bIsNull() ) { RWCString soError = "each line of file " + filFOFOfSolexaSeqFiles_ + " must have (seq file) (prefix) where (prefix) is something to prepend to each read name so that reads from different flow cells that have the same lane, tile, X and Y are given different names. However, this line is just " + soFOFLine; THROW_ERROR( soError ); } FILE* pSolexaPrbFile = NULL; FileName filSolexaPrbFile = filFindSolexaPrbFileFromSeqFile( filSolexaSeqFile1 ); pSolexaPrbFile = fopen( filSolexaPrbFile, "r" ); if ( !pSolexaPrbFile ) { RWCString soErrorMessage = "solexa prb file " + filSolexaPrbFile + " which should be associated with solexa seq file " + filSolexaSeqFile1 + " as specified in " + filFOFOfSolexaSeqFiles_; THROW_FILE_ERROR( soErrorMessage ); } char szChannel[ 100 ]; char szTile[ 100 ]; char szX[ 100]; char szY[ 100]; char szSequence[ 200 ]; const int nPrbLineSize = 10000; char szPrbLine[ nPrbLineSize ]; int nLine_ = 0; // go around this loop once per sequence while( fgets( szSeqLine, nSeqLineSize, pSolexaSeqFile1 ) ) { ++nLine_; if ( fgets( szPrbLine, nPrbLineSize, pSolexaPrbFile ) == NULL ) { RWCString soError = "solexa seq file " + filSolexaSeqFile1 + " has more lines than solexa prb file " + filSolexaPrbFile; THROW_ERROR( soError ); } int nTokens; if ( ( nTokens = sscanf( szSeqLine, "%s %s %s %s %s\n", szChannel, szTile, szX, szY, szSequence ) ) != 5 ) { RWCString soError = filSolexaSeqFile1 + " at line # " + RWCString( (long) nLine_ ) + " does not have 5 tokens (channel) (tile) (X) (Y) (sequence) but only has " + RWCString( (long) nTokens) + " tokens. Line: " + szSeqLine; THROW_ERROR( soError ); } // solexa uses . for n int n = 0; // could do this with lookup table for efficiency while( szSequence[n] != 0 ) { if ( szSequence[n] == '.' ) { szSequence[n] = 'n'; } ++n; } int nSeqLength = n; RWCString soReadName = soReadNamePrefix + "_" + szChannel + "_" + szTile + "_" + szX + "_" + szY; writeReadHeaderToPhdBall( soReadName ); fprintf( pNewPhdBall_, "BEGIN_DNA\n" ); // now need to get quality values for these bases char* sz4Probs; for( int n0Base = 0; n0Base < nSeqLength; ++n0Base ) { if ( n0Base == 0 ) sz4Probs = strtok( szPrbLine, "\t" ); else sz4Probs = strtok( NULL, "\t\n" ); // the "\t\n" above is because sz4Probs may end in a new line if ( !sz4Probs ) { RWCString soError = "Error: in seq file " + filSolexaSeqFile1 + " read " + soReadName + " has length " + RWCString( (long) nSeqLength ) + " but prb file " + filSolexaPrbFile + " has length only " + RWCString( (long) n0Base ); THROW_ERROR( soError ); } char cBase = szSequence[ n0Base ]; int nACGTInNumbers = nNumberFromBase2[ cBase ]; int nQual; if ( nACGTInNumbers == 4 ) { // this means the base wasn't acgt nQual = 0; } else { // go over nACGTInNumbers commas int nPos = 0; int nSpaceDigitsSoFar = 0; while( nSpaceDigitsSoFar <= nACGTInNumbers ) { // find another space, then a digit if ( isspace( sz4Probs[ nPos ] ) && ( sz4Probs[ nPos + 1 ] == '-' || isdigit( sz4Probs[ nPos + 1 ] ) ) ) { ++nSpaceDigitsSoFar; } // check we haven't increased nPose forever looking for // the correct # of nSpaceDigitsSoFar const int nBigNumberCheckingForDesiredSpaceDigits = 500; assert( nPos < nBigNumberCheckingForDesiredSpaceDigits ); ++nPos; } // now we are pointing at a digit. The field may end // with either a space or a null. (the null used to be // a tab or carriage return, but strtok replaced it by a null). // phred quality is: -10 * log10( $e) // solexa quality is: -10 * log10( $3 / ( 1- $e )) // see http://maq.sourceforge.net/qual.shtml float fSolexaQuality = atof( sz4Probs + nPos ); float fPhredQuality = 10.0 * log10( 1.0 + pow( 10.0, fSolexaQuality / 10.0 ) ); nQual = fPhredQuality + 0.499; } // got the base and the phred quality value. Can now add a // line to the phdball fprintf( pNewPhdBall_, "%c %d\n", cBase, nQual ); } // for( int n0Base = 0; n0Base < nSeqLength; ++n0Base ) { fprintf( pNewPhdBall_, "END_DNA\n" ); fprintf( pNewPhdBall_, "END_SEQUENCE\n\n" ); ++nNumberOfReadsThisPhdBall_; } // while( fgets( szSeqLine, nSeqLineSize, pSolexaSeqFile1 ) ) { fclose( pSolexaPrbFile ); fclose( pSolexaSeqFile1 ); } // } // if ( cFastqOrBustard == cFASTQ ) { else } // while( fgets( soFOFLine.data(), nFOFLineSize, pFOF ) ) { fclose( pNewPhdBall_ ); fclose( pFOF ); fclose( pNewPhdBallFOF_ ); } // void solexa2PhdBall :: doIt() { FileName solexa2PhdBall :: filFindSolexaPrbFileFromSeqFile( const FileName& filSolexaSeqFile ) { FileName filSolexaPrbFile( filSolexaSeqFile ); assert( filSolexaPrbFile.bEndsWithAndRemove( "_seq.txt" ) ); filSolexaPrbFile += "_prb.txt"; return( filSolexaPrbFile ); } #define THROW_SOLEXA2PHDBALL_ERROR( szErrorMessage ) { \ RWCString soError = "Fatal error: "; \ soError += filFastq; \ soError += " line "; \ soError += RWCString( (long) nLine_ ); \ soError += " "; \ soError += szErrorMessage; \ soError += " called from "; \ soError += __FILE__; \ soError += " "; \ soError += RWCString( (long) __LINE__ ); \ InputDataError ide( soError ); \ throw ide; \ } void solexa2PhdBall :: processReadInFastq( FILE* pSolexaFastqFile, const FileName& filFastq, const bool bPairedRead, const char cForwardOrReverse, RWCString& soTemplateName, bool& bEOF ) { // The FILE* pSeqLine2 (or whatever) will need to be high enough scope to allow it used. Don't change phd balls in between a mate pair. if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFastqFile ) ) { bEOF = true; return; } // don't increment line number for both files. They should be in synch. if ( cForwardOrReverse == cFORWARD_READ ) ++nLine_; if ( nNumberOfReadsThisPhdBall_ >= pCP->nMaxNumberOfReadsPerPhdBall_ ) { fclose( pNewPhdBall_ ); openNextPhdBall(); nNumberOfReadsThisPhdBall_ = 0; fprintf( pNewPhdBall_, "\n# solexa file %s (continued)\n\n", filFastq.data() ); bWrittenPhdBallHeader_ = true; } // looks like fastq which looks like: // @EAS54_6_R1_2_1_413_324 // CCCTTCTTGTCTTCAGCGTTTCTCC // + // ;;3;;;;;;;;;;;;7;;;;;;;88 // this contains the header if ( szSeqLine[0] != '@' ) { THROW_SOLEXA2PHDBALL_ERROR( " should start with @ but doesn't" ); } RWCString soReadName( &szSeqLine[1] ); // there is a newline at the end--remove it soReadName.stripTrailingWhitespaceFast(); writeReadHeaderToPhdBall( soReadName ); fprintf( pNewPhdBall_, "BEGIN_DNA\n" ); // get seq line if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFastqFile ) ) { THROW_SOLEXA2PHDBALL_ERROR( "fgets error" ); } // don't increment line number for both files. They should be in synch. if ( cForwardOrReverse == cFORWARD_READ ) ++nLine_; int nNumberOfBases = nStripTrailingWhitespace( szSeqLine ); // get + line if ( !fgets( szThrowAwayLine, nThrowAwayLineSize, pSolexaFastqFile ) ) { THROW_SOLEXA2PHDBALL_ERROR( "fgets error" ); } // don't increment line number for both files. They should be in synch. if ( cForwardOrReverse == cFORWARD_READ ) ++nLine_; if ( szThrowAwayLine[0] != '+' ) { THROW_ERROR( "Fatal: " + filFastq + " is in fastq format so line " + RWCString( (long) nLine_ ) + " should have started with + but instead is " + szThrowAwayLine ); } // get quality line if ( !fgets( szQualityLine, nQualityLineSize, pSolexaFastqFile ) ) { THROW_SOLEXA2PHDBALL_ERROR( "fgets error" ); } // don't increment line number for both files. They should be in synch. if ( cForwardOrReverse == cFORWARD_READ ) ++nLine_; int nNumberOfQualities = nStripTrailingWhitespace( szQualityLine ); if ( nNumberOfBases != nNumberOfQualities ) { THROW_SOLEXA2PHDBALL_ERROR( "Fatal: different number of bases and qualities line " + RWCString( (long) nLine_ ) + " qualities: " + RWCString( (long) nNumberOfQualities ) + " bases: " + RWCString( (long) nNumberOfBases ) ); } for( int n0Base = 0; n0Base < nNumberOfQualities; ++n0Base ) { int nPhredQuality = nPhredFromFastq[ szQualityLine[n0Base] ]; fprintf( pNewPhdBall_, "%c %d\n", tolower( szSeqLine[ n0Base ] ), nPhredQuality ); } fprintf( pNewPhdBall_, "END_DNA\n" ); fprintf( pNewPhdBall_, "END_SEQUENCE\n\n" ); ++nNumberOfReadsThisPhdBall_; if ( bPairedRead ) { // figure out template name--chop off the /1 or /2 from the read // name soTemplateName = soReadName.soRemoveFinalBases( 2 ); // looks like: // WR{ // template determineReadTypes 011116:153932 // name: djs736a2_fp04q210 // lib: djs736a2 <---not including this // } // WR{ // primer determineReadTypes 011116:153932 // type: univ fwd // } fprintf( pNewPhdBall_, "WR{\n" ); fprintf( pNewPhdBall_, "template solexa2PhdBall %s\n", soDateTime_.data() ); fprintf( pNewPhdBall_, "name: %s\n", soTemplateName.data() ); fprintf( pNewPhdBall_, "}\n\n" ); fprintf( pNewPhdBall_, "WR{\n" ); fprintf( pNewPhdBall_, "primer solexa2PhdBall %s\n", soDateTime_.data() ); fprintf( pNewPhdBall_, "type: univ %s\n", ( cForwardOrReverse == cFORWARD_READ ? "fwd" : "rev" ) ); fprintf( pNewPhdBall_, "}\n\n" ); } // if ( bPairedRead ) { else soTemplateName = ""; } // void solexa2PhdBall :: processReadInFastq( FILE* pSolexaFastqFile, void solexa2PhdBall :: checkIfFastqOrBustard( FILE* pSolexaFile, const FileName& filSolexaFile, char& cFastqOrBustard, char& cSolexaOrSangerFastq ) { if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { THROW_FILE_ERROR( filSolexaFile + " fgets error" ); } int nCountTokens = nGetNumberOfTokens( szSeqLine ); if ( nCountTokens == 1 ) { if ( szSeqLine[0] != '@' ) { THROW_ERROR( "can't figure out whether the format is fastq or bustard format for " + filSolexaFile + " since first line is " + szSeqLine + " which has 1 token unlike bustard files but does does start with @ unlike fastq files" ); } // looks like fastq which looks like: // @EAS54_6_R1_2_1_413_324 // CCCTTCTTGTCTTCAGCGTTTCTCC // + // ;;3;;;;;;;;;;;;7;;;;;;;88 // @EAS54_6_R1_2_1_540_792 // TTGGCAGGCCAAGGCCGATGGATCA // + // ;;;;;;;;;;;7;;;;;-;;;3;83 // @EAS54_6_R1_2_1_443_348 // GTTGCTTCTGGCGTGGGTGGGGGGG // +EAS54_6_R1_2_1_443_348 // ;;;;;;;;;;;9;7;;.7;393333 // let's check that 2 lines further starts with '+': if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { THROW_FILE_ERROR( filSolexaFile + " fgets error" ); } if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { THROW_FILE_ERROR( filSolexaFile + " fgets error" ); } if ( szSeqLine[0] != '+' ) { THROW_ERROR( filSolexaFile + " does not have fastq or bustard format since 1st line has 1 token but 3rd line does not start with + but instead is " + szSeqLine ); } // ok--it is fastq, but is is Solexa fastq or Sanger fastq? // let's see if ( pCP->soSolexa64FastqOrSanger33Fastq_ == "solexa64" ) { cSolexaOrSangerFastq = cSOLEXA; } else if ( pCP->soSolexa64FastqOrSanger33Fastq_ == "sanger33" ) { cSolexaOrSangerFastq = cSANGER; } else { int nNumberOfQualities = 0; int nSumOfQualities = 0; int nMinimumQualityChar = 256; int nMaximumQualityChar = -1; const int nNumberOfQualityLines = 10; for( int nQualityLines = 0; nQualityLines < nNumberOfQualityLines; ++nQualityLines ) { if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { THROW_FILE_ERROR( filSolexaFile + " fgets error" ); } // get rid of trailing CR (and any spaces someone may have // introduced with an editor) nStripTrailingWhitespace( szSeqLine ); cerr << "looking at " << szSeqLine << endl; for( int nCharPos = 0; szSeqLine[ nCharPos ]; ++nCharPos ) { ++nNumberOfQualities; nSumOfQualities += (int) szSeqLine[ nCharPos ]; if ( szSeqLine[ nCharPos ] < nMinimumQualityChar ) nMinimumQualityChar = szSeqLine[ nCharPos ]; if ( nMaximumQualityChar < szSeqLine[ nCharPos ] ) nMaximumQualityChar = szSeqLine[ nCharPos ]; } // header line if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { break; } // bases line if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { break; } // quality header line if ( !fgets( szSeqLine, nSeqLineSize, pSolexaFile ) ) { break; } } float fAverageChar = (float) nSumOfQualities / nNumberOfQualities; cerr << "average quality char = " << fAverageChar << " nMinimumQualityChar = " << nMinimumQualityChar << " nMaximumQualityChar = " << nMaximumQualityChar << endl; cSolexaOrSangerFastq = ' '; const int nSubtractIfSangerFastq = 33; const int nSubtractIfSolexaFastq = 64; const int nMidwayBetweenSangerAndSolexaFastq = ( nSubtractIfSolexaFastq + nSubtractIfSangerFastq ) / 2; if ( nMinimumQualityChar <= nMidwayBetweenSangerAndSolexaFastq ) { cSolexaOrSangerFastq = cSANGER; cerr << "deciding this is sanger fastq format--will subtract " << nSubtractIfSangerFastq << endl; } else if ( nMidwayBetweenSangerAndSolexaFastq < nMinimumQualityChar ) { cSolexaOrSangerFastq = cSOLEXA; cerr << "deciding this is solexa fastq format--will subtract 64" << endl; } else { THROW_ERROR( "Warning--cannot figure out whether fastq is Sanger or Solexa" ); } } // if ( pCP->soSolexa64FastqOrSanger33Fastq_ == "solexa64" ) { rewind( pSolexaFile ); nLine_ = 0; cFastqOrBustard = cFASTQ; } else if ( nCountTokens == 5 ) { // looks like bustard... // check that the 1st 4 tokens are numeric // 1 169 361 287 AGTACTAGAATTCTAGGTATGCATCACCATGCTGGGCCAGG int nTokenStart; int nTokenEnd; bool bError; getNextToken( szSeqLine, 0, // nStartHere nTokenStart, nTokenEnd, bError ); assert( !bError ); if ( !bIsNumeric( RWCString( szSeqLine + nTokenStart, nTokenEnd - nTokenStart + 1 ) ) ) { THROW_ERROR( filSolexaFile + " 1st line " + szSeqLine + " is not fastq but also can't be bustard because 1st token isn't numeric" ); } getNextToken( szSeqLine, nTokenEnd + 1, // nStartHere nTokenStart, nTokenEnd, bError ); assert( !bError ); if ( !bIsNumeric( RWCString( szSeqLine + nTokenStart, nTokenEnd - nTokenStart + 1 ) ) ) { THROW_ERROR( filSolexaFile + " 1st line " + szSeqLine + " is not fastq but also can't be bustard because 2nd token isn't numeric" ); } getNextToken( szSeqLine, nTokenEnd + 1, // nStartHere nTokenStart, nTokenEnd, bError ); if ( !bIsNumeric( RWCString( szSeqLine + nTokenStart, nTokenEnd - nTokenStart + 1 ) ) ) { THROW_ERROR( filSolexaFile + " 1st line " + szSeqLine + " is not fastq but also can't be bustard because 3rd token isn't numeric" ); } getNextToken( szSeqLine, nTokenEnd + 1, // nStartHere nTokenStart, nTokenEnd, bError ); if ( !bIsNumeric( RWCString( szSeqLine + nTokenStart, nTokenEnd - nTokenStart + 1 ) )) { THROW_ERROR( filSolexaFile + " 1st line " + szSeqLine + " is not fastq but also can't be bustard because 4th token isn't numeric" ); } rewind( pSolexaFile ); cFastqOrBustard = cBUSTARD; } else { THROW_ERROR( filSolexaFile + " 1st line " + szSeqLine + " doesn't appear to be either fastq or bustard format" ); } } // void solexa2PhdBall :: checkIfFastqOrBustard( FILE* pSolexaFile,