/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. 
# It may take months for you to notice these other side-effects # which may not seem connected at all. It is not feasible for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "readPHDOfSinglet.h" #include "readPHDAgainForHighQualitySegment.h" #include "filename.h" #include "singletInfo.h" #include using namespace std; #include "mbt_exception.h" #include #include "consed.h" #include "nGetReadTypeFromReadName.h" // singlets are not read for the bases or qualities. But we need the // experiment id for Autofinish. The chemistry, read type, and high // quality segment are also read. bIsThisUniversalPrimerReadInSinglets is // probably used so that Autofinish doesn't repeatedly suggest the same // universal primer read. The high quality segment is used by the // evaluation stage of Autofinish for determining how well Autofinish's // suggestions worked. 
// ---------------------------------------------------------------------------
// PHD parsing for a singlet read: PARSE_PANIC macro + body of the reader.
//
// NOTE(review): the text below appears to have lost a span between
// "__LINE__ <" and "filGetPHDDir()".  The remainder of PARSE_PANIC, the
// definitions of PARSE_PANIC_1_ARG and FGETS_OR_PARSE_PANIC, the function
// signature, and the declarations of szLine, nMaxLineSize, nLine and
// filPHDFullPath are not visible here -- confirm against the real file.
//
// PARSE_PANIC( szMessage ), as far as visible, starts an ostringstream
// message with "phd file error detected from source file ", __FILE__ and
// __LINE__.  What it does with that message is not visible.
//
// What the visible function body does, in order:
//  1. Builds the phd path as <phd dir> + "/" + pSingletInfo->filPHD_ and
//     opens it with fopen( ..., "r" ).  If that fails it prints
//     "could not open singlet phd file ...", sets bFoundExpID to false and
//     returns.
//  2. Resets nLine to 0, pSingletInfo->nReadType_ to 0 and
//     pSingletInfo->soChemistry_ to "".
//  3. Reads lines until one starts with BEGIN_COMMENT
//     (PARSE_PANIC if fgets returns NULL first).
//  4. Reads lines until one starts with END_COMMENT or with "CHEM:".  For
//     CHEM:, leading whitespace after the colon is skipped and characters
//     are appended to soChemistry_ up to the next space, newline or end of
//     string.  (PARSE_PANIC if fgets returns NULL first.)
//  5. Reads lines until one starts with END_DNA
//     (PARSE_PANIC if fgets returns NULL first).
//  6. Reads the remaining lines until fgets returns NULL, looking for lines
//     that start with "WR{" followed by whitespace.  The line after "WR{"
//     selects the item:
//       expid    - the next line must consist only of digits up to the
//                  newline; it is parsed as decimal into
//                  pSingletInfo->nExpID_ and bFoundExpID is set to true.
//                  Any other character triggers PARSE_PANIC_1_ARG.
//       template - until a line starting with "}" + whitespace, "name:" and
//                  "type:" lines are stripped of surrounding whitespace and
//                  stored in soTemplate_ and soTemplateType_.
//       primer   - until a line starting with "}" + whitespace, a "type:"
//                  line must be "univ fwd", "univ rev", "walk" or "pcr end"
//                  and sets nReadType_ to nUniversalForward,
//                  nUniversalReverse, nWalk or nPCREnd; anything else is a
//                  PARSE_PANIC.
//  7. If bFoundExpID is true, calls
//     readPHDAgainForHighQualitySegment( pSingletInfo, pFil ); otherwise
//     sets nHighQualitySegmentStart_, nHighQualitySegmentEnd_ and nExpID_
//     to -1.  Finally closes pFil.
//
// NOTE(review): in the text visible here, the only exit from the while(1)
// loop of step 6 is the fgets()==NULL branch, which sets
// bFoundExpID = false.  That would make the "if ( bFoundExpID )" branch of
// step 7 unreachable and reset nExpID_ to -1 even when an expid WR item was
// parsed -- confirm whether this is intended.
//
// NOTE(review): if PARSE_PANIC / FGETS_OR_PARSE_PANIC throw (their bodies
// are not visible; mbt_exception.h is included), pFil is not closed on
// those paths -- confirm.
// ---------------------------------------------------------------------------
#define PARSE_PANIC( szMessage ) \ { ostringstream ost; \ ost << "phd file error detected from source file " \ << __FILE__ << " at " << __LINE__ <filGetPHDDir() + "/" + pSingletInfo->filPHD_; FILE* pFil = fopen( (char*) filPHDFullPath.data(), "r" ); if ( pFil == NULL ) { printf( "could not open singlet phd file %s\n", (char*) filPHDFullPath.data() ); bFoundExpID = false; return; } nLine = 0; // will be set by read name if not found in phd file pSingletInfo->nReadType_ = 0; pSingletInfo->soChemistry_ = ""; // look for BEGIN_COMMENT bool bLookingForBEGIN_COMMENT = true; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for BEGIN_COMMENT" ); } ++nLine; if ( szLine[0] == 'B' && szLine[1] == 'E' && szLine[2] == 'G' && szLine[3] == 'I' && szLine[4] == 'N' && szLine[5] == '_' && szLine[6] == 'C' && szLine[7] == 'O' && szLine[8] == 'M' && szLine[9] == 'M' && szLine[10] == 'E' && szLine[11] == 'N' && szLine[12] == 'T' ) { bLookingForBEGIN_COMMENT = false; } } while( bLookingForBEGIN_COMMENT ); // look for CHEM: bool bContinueLookingForChemistry = true; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_COMMENT" ); } ++nLine; if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'C' && szLine[5] == 'O' && szLine[6] == 'M' && szLine[7] == 'M' && szLine[8] == 'E' && szLine[9] == 'N' && szLine[10] == 'T' ) { bContinueLookingForChemistry = false; } else { if ( szLine[0] == 'C' && szLine[1] == 'H' && szLine[2] == 'E' && szLine[3] == 'M' && szLine[4] == ':' ) { pSingletInfo->soChemistry_ = ""; // jump over whitespace int n = 5; while( isspace( szLine[n] ) ) ++n; // so szLine[n] is the first character of the chemistry while( szLine[n] != '\n' && szLine[n] != '\0' && szLine[n] != ' ' ) { pSingletInfo->soChemistry_.append( szLine[n] ); ++n; } bContinueLookingForChemistry = false; } } } while( 
bContinueLookingForChemistry ); // look for END_DNA bool bContinueLookingForEndDNA = true; do { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { PARSE_PANIC( "premature end of file while looking for END_DNA" ); } ++nLine; if ( szLine[0] == 'E' && szLine[1] == 'N' && szLine[2] == 'D' && szLine[3] == '_' && szLine[4] == 'D' && szLine[5] == 'N' && szLine[6] == 'A' ) bContinueLookingForEndDNA = false; } while ( bContinueLookingForEndDNA ); // try to find expid WR item while(1) { if ( fgets( szLine, nMaxLineSize, pFil ) == NULL ) { bFoundExpID = false; break; } ++nLine; if ( szLine[0] == 'W' && szLine[1] == 'R' && szLine[2] == '{' && isspace( szLine[3] ) ) { FGETS_OR_PARSE_PANIC( "premature end of file while in WR item" ); if ( szLine[0] == 'e' && szLine[1] == 'x' && szLine[2] == 'p' && szLine[3] == 'i' && szLine[4] == 'd' && isspace( szLine[5] ) ) { // found expid whole read item // get the expid FGETS_OR_PARSE_PANIC( "premature end of file while in expid WR item" ); int n = 0; int nExpID = 0; while( szLine[n] != '\n' ) { if ( '0' <= szLine[n] && szLine[n] <= '9' ) { nExpID = 10*nExpID + ( szLine[n] - '0' ); ++n; } else { PARSE_PANIC_1_ARG( "line should have been just a number but found char %c\n", szLine[n] ); } } bFoundExpID = true; pSingletInfo->nExpID_ = nExpID; } // if ( szLine[0] == 'e' && ... 
else if ( szLine[0] == 't' && szLine[1] == 'e' && szLine[2] == 'm' && szLine[3] == 'p' && szLine[4] == 'l' && szLine[5] == 'a' && szLine[6] == 't' && szLine[7] == 'e' && isspace( szLine[8] ) ) { // found the template WR item // now get the template name bool bEndOfWRItem = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while in template WR item" ); RWCString soDataLine( szLine ); if ( soDataLine.length() >= 2 && soDataLine[0] == '}' && isspace( soDataLine[1] ) ) bEndOfWRItem = true; else if ( soDataLine.length() >= 5 && soDataLine( 0, 5 ) == "name:" ) { if ( soDataLine.length() < 6 ) { PARSE_PANIC( "In template WR item which should be of form name: (template name) but (template name) is missing" ); } soDataLine = soDataLine( 5, soDataLine.length() - 5 ); // strip off leading and trailing whitespace soDataLine = soDataLine.stripWhitespace( RWCString::BOTH ); pSingletInfo->soTemplate_ = soDataLine; } else if ( soDataLine.length() >= 5 && soDataLine( 0, 5 ) == "type:" ) { if ( soDataLine.length() < 6 ) PARSE_PANIC( "In template WR item, type line which should be of the form type: (bac, cos, puc, pbc, pcr, etc.)" ); soDataLine = soDataLine( 5, soDataLine.length() - 5 ); soDataLine = soDataLine.stripWhitespace( RWCString::BOTH ); pSingletInfo->soTemplateType_ = soDataLine; } // size isn't yet implemented } while( !bEndOfWRItem ); } // if ( szLine[0] == 't' ... 
template else if ( szLine[0] == 'p' && szLine[1] == 'r' && szLine[2] == 'i' && szLine[3] == 'm' && szLine[4] == 'e' && szLine[5] == 'r' && isspace( szLine[6] ) ) { // found the primer WR item // now get the read type bool bEndOfWRItem = false; do { FGETS_OR_PARSE_PANIC( "premature end of file while in primer WR item" ); RWCString soDataLine( szLine ); if ( soDataLine.length() >= 2 && soDataLine[0] == '}' && isspace( soDataLine[1] ) ) bEndOfWRItem = true; else if ( soDataLine.length() >= 5 && soDataLine( 0, 5 ) == "type:" ) { if ( soDataLine.length() < 6 ) PARSE_PANIC( "In primer WR item, type line which should be of the form type: univ fwd, univ rev, pcr end, or walk" ); soDataLine = soDataLine( 5, soDataLine.length() - 5 ); soDataLine = soDataLine.stripWhitespace( RWCString::BOTH ); if ( soDataLine == "univ fwd" ) pSingletInfo->nReadType_ = nUniversalForward; else if ( soDataLine == "univ rev" ) pSingletInfo->nReadType_ = nUniversalReverse; else if ( soDataLine == "walk" ) pSingletInfo->nReadType_ = nWalk; else if ( soDataLine == "pcr end" ) pSingletInfo->nReadType_ = nPCREnd; else { PARSE_PANIC( "after type: there must be one of univ fwd, univ rev, pcr end, or walk" ) } } // ...soDataLine( 0, 5 ) == "type:" } while( !bEndOfWRItem ); } } // if ( szLine[0] == 'W' ... } // while( 1 ) if ( bFoundExpID ) { readPHDAgainForHighQualitySegment( pSingletInfo, pFil ); } else { pSingletInfo->nHighQualitySegmentStart_ = -1; pSingletInfo->nHighQualitySegmentEnd_ = -1; pSingletInfo->nExpID_ = -1; } fclose( pFil ); }