/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "phdBall2Fasta.h" #include using namespace std; #include "mbt_exception.h" #include "rwcstring.h" #include #include "rwtvalorderedvector.h" #include "consedParameters.h" void phdBall2Fasta :: doIt() { FILE* pPhdBall = fopen( filPhdBall_.data(), "r" ); if ( !pPhdBall ) { THROW_FILE_ERROR( filPhdBall_ ); } nLine_ = 0; FILE* pFasta; FILE* pQualities; FILE* pFastq; if ( !filFasta_.bIsNull() ) { pFasta = fopen( filFasta_.data(), "w" ); if ( !pFasta ) { THROW_FILE_ERROR( filFasta_ ); } filQualities_ = filFasta_ + ".qual"; pQualities = fopen( filQualities_.data(), "w" ); if ( !pQualities ) { THROW_FILE_ERROR( filQualities_ ); } } if ( !filFastq_.bIsNull() ) { pFastq = fopen( filFastq_.data(), "w" ); if ( !pFastq ) { THROW_FILE_ERROR( filFastq_ ); } } const int nPrettyLargeSequence = 2000; RWCString soBases( (size_t) nPrettyLargeSequence ); RWTValOrderedVector aQualities( (size_t) nPrettyLargeSequence, "phdBall2Fasta::aQualities" ); const int nPrettyLargeHeader = 2000; RWCString soFastaHeader( (size_t) nPrettyLargeHeader ); RWCString soFastaQualHeader( (size_t) nPrettyLargeHeader ); bool bEndOfFile = false; readFirstBeginSequenceLine( pPhdBall, bEndOfFile ); if ( bEndOfFile ) { RWCString soError = "file "; soError += filPhdBall_; soError += " does not have a BEGIN_SEQUENCE line in it.\n"; soError += "Is it a phd ball???"; THROW_ERROR( soError ); } while( !bEndOfFile ) { readFileOfPhdFilesForPhdBall2Fasta( pPhdBall, soFastaHeader, soFastaQualHeader, soBases, aQualities, bEndOfFile ); if ( pCP->bPhdBall2FastaIgnoreLowQualityReads_ ) { float fMeanQuality = fGetMeanQuality( aQualities ); if ( fMeanQuality < pCP->nPhdBall2FastaLowestAverageQuality_ ) { cerr << "ignoring read " << soFastaHeader << " because mean quality is " << fMeanQuality << " which is too low" << endl; continue; } } // we are guaranteed to have found a phd block even if bEndOfFile is true // since that just means there isn't another phd block if ( !filFasta_.bIsNull() ) { fprintf( pFasta, ">%.*s\n", soFastaHeader.length(), soFastaHeader.data() ); fprintf( pQualities, ">%.*s\n", soFastaQualHeader.length(), soFastaQualHeader.data() ); const int nMaxBasesOnLine = 50; for( int n0Pos = 0; n0Pos < soBases.length(); n0Pos += nMaxBasesOnLine ) { int nNumberOfBasesToWrite = nMaxBasesOnLine; if ( n0Pos + nNumberOfBasesToWrite > soBases.length() ) { nNumberOfBasesToWrite = soBases.length() - n0Pos; } fprintf( pFasta, "%.*s\n", nNumberOfBasesToWrite, &soBases[ n0Pos ] ); for( int nBase = 0; nBase < nNumberOfBasesToWrite; ++nBase ) { fprintf( pQualities, " %d", aQualities[ nBase + n0Pos ] ); } fputc( '\n', pQualities ); } // for( int n0Pos; ... } // if ( !filFasta_.bIsNull() ) { if ( !filFastq_.bIsNull() ) { fprintf( pFastq, "@%.*s\n", soFastaQualHeader.length(), soFastaQualHeader.data() ); fprintf( pFastq, "%.*s\n", soBases.length(), soBases.data() ); RWCString soFastqQualities( (size_t) aQualities.length() ); soFastqQualities.nCurrentLength_ = aQualities.length(); for( int nBase = 0; nBase < aQualities.length(); ++nBase ) { char cQuality = aQualities[ nBase ] + '!'; soFastqQualities.setBaseUnsafe( nBase, cQuality ); } fprintf( pFastq, "+\n%.*s\n", soFastqQualities.length(), soFastqQualities.data() ); } // if ( !filFastq_.bIsNull() ) { } // while( !bEndOfFile ) { fclose( pPhdBall ); if ( !filFasta_.bIsNull() ) { fclose( pFasta ); fclose( pQualities ); } if ( !filFastq_.bIsNull() ) { fclose( pFastq ); } } float phdBall2Fasta :: fGetMeanQuality( const RWTValOrderedVector& aQualities ) { float fSum = 0.0; for( int n = 0; n < aQualities.length(); ++n ) { fSum += aQualities[ n ]; } if ( aQualities.length() == 0 ) return 0.0; else return fSum / aQualities.length(); }