/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include #include #include "singletInfo.h" #include "readPHDAgainForHighQualitySegment.h" #include #include "filename.h" #include "mbt_exception.h" #include "assert.h" // This differs from readPHD in that it doesn't bother to read the // bases or point positions--just the qualities. It doesn't even save // the qualities, but just uses them to calculate the high quality // segment--the only thing it returns. It is only used to get the // high quality segment for singlets. // quality 13 has an error rate of 0.050118723 so // 0.05 - (error rate of quality 13 base) = neg number // Hence quality 13 bases on the end of a high quality segment will // not be included in it. static double dAmountToSubtractFrom = 0.05; #define PARSE_PANIC( szMessage ) \ { ostrstream ost; \ ost << "phd file error detected from source file " \ << __FILE__ << " at " << __LINE__ <filPHD_ << " at line " << \ nLine << " Line:\n" \ << szLine << "\n" << szMessage << endl \ << ends; \ InputDataError ide(ost.str()); \ throw ide; } #define PARSE_PANIC_1_ARG( szMessage, arg1 ) \ { \ char szTemp[500]; \ sprintf( szTemp, szMessage, arg1 ); \ PARSE_PANIC( szTemp ); \ } const int nMaxLineSize = 300; static char szLine[ nMaxLineSize + 1]; static int nLine; void readPHDAgainForHighQualitySegment( singletInfo* pSingletFromAutoFinish, FILE* pFil ) { rewind( pFil ); // look for BEGIN_DNA bool bFoundBEGIN_DNA = false; while( !bFoundBEGIN_DNA ) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_DNA" ); } else { ++nLine; } if ( memcmp( szLine, "BEGIN_DNA", 9 ) == 0 ) bFoundBEGIN_DNA = true; } double dSumFromBeginning = 0.0; // these values are set so that, if the first base is quality 14 or more, // the high quality segment can start from base 1 // if there is a negative scoring segment starting at the beginning, // that segment will replace this least sum of 0. This is usually the // case. double dLeastSumFromBeginning = 0.0; int nLocationOfLeastSumFromBeginning = 0; double dBestSumSegment = 0.0; int nBeginningOfBestSumSegment = -1; int nEndOfBestSumSegment = -1; // anticipating ++nLine being the first base line int nLineOfFirstBaseLine = nLine + 1; // look for END_DNA bool bContinueLookingForEndDNA = true; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } ++nLine; if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) bContinueLookingForEndDNA = false; else { double dQuality; assert( szLine[1] == ' ' && '0' <= szLine[2] && szLine[2] <= '9' ); int nQuality = ( szLine[2] - '0' ); int n = 3; while( szLine[n] != ' ' ) { assert( '0' <= szLine[n] && szLine[n] <= '9' ); nQuality = 10*nQuality + ( szLine[n] - '0' ); ++n; } double dAdjustedError = dAmountToSubtractFrom - pow( 10.0, ( (double) -nQuality) / 10.0 ); dSumFromBeginning += dAdjustedError; if ( dSumFromBeginning < dLeastSumFromBeginning ) { dLeastSumFromBeginning = dSumFromBeginning; nLocationOfLeastSumFromBeginning = // nLine - nLineOfFirstBaseLine + 1 // the reason for the +1 is that we need to make the number // be 1-based rather than 0-based nLine - nLineOfFirstBaseLine + 1; } if ( dSumFromBeginning - dLeastSumFromBeginning > dBestSumSegment ) { dBestSumSegment = dSumFromBeginning - dLeastSumFromBeginning; // the high quality segment starts the base *after* the // base that makes the least sum base nBeginningOfBestSumSegment = nLocationOfLeastSumFromBeginning + 1; nEndOfBestSumSegment = nLine - nLineOfFirstBaseLine + 1; } } } while ( bContinueLookingForEndDNA ); // nLine is END_DNA line // e.g., // BEGIN_DNA // a 2 12 <-nLineOfFirstBaseLine // g 2 15 // END_DNA <-nLine pSingletFromAutoFinish->nNumberOfBases_ = nLine - nLineOfFirstBaseLine; pSingletFromAutoFinish->nHighQualitySegmentStart_ = nBeginningOfBestSumSegment; pSingletFromAutoFinish->nHighQualitySegmentEnd_ = nEndOfBestSumSegment; }