/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include #include #include #include #include "rwcstring.h" #include "rwtvalvector.h" #include "rwctokenizer.h" #include // for chmod #include #include #include "sysdepend.h" #include "locatedFragment.h" #include "contig.h" #include "assembly.h" #include "basesegment.h" #include "mbt_exception.h" #include "filename.h" #include "listOfFragments.h" #include "listOfFragmentsAndEndPositions.h" #include "guiapp.h" #include "consed.h" #include "deleteFileIfItExists.h" #include "consedParameters.h" #include "guiTopWindow.h" #include "popupErrorMessage.h" #include "subcloneTTemplate.h" #include "textbox.h" #include "autoFinishExpTag.h" #include "universalPrimerAutoFinishExpTagsSortedByTemplateName.h" #include #include "please_wait.h" #include "evalExpCluster.h" #include "expidAndLocatedFragment.h" #include "singletInfo.h" #include "readPHDOfSinglet.h" #include "mbt_errors.h" #include "rwcregexp.h" #include "oligoTag.h" #include "fileDefines.h" #include #include "mbt_val_ord_offset_vec.h" #include "dErrorRateFromQuality.h" #include "nGetReadTypeFromReadName.h" #include "readFileOfPhdFiles.h" #include "dGetErrorsFixedAtBase.h" #include "listOfLibraries.h" #include "templatesSortedByLibrary.h" #include "abs.h" #include "guiMultiContigNavigator.h" #include "findRepeat.h" #include "lowconsqual.h" #include "contigEndPair.h" #include "templateForMinilibrary.h" #include "restrictionFragment.h" #include "popupInfoOrErrorMessageNoFormat.h" #include "clScaffold.h" #include "fwdRevPair.h" #include "highQualityDiscrepancyGotoList.h" #include "copyFiles.h" #include "renameFiles.h" #include "bIsNumeric.h" #include "soGetErrno.h" #include "bIsNumericMaybeWithWhitespace.h" #include "hp_exception_kludge.h" #include "popupErrorMessage2.h" #include "soAddCommas.h" #include "ntide.h" #include "printTime.h" #include "popupInfoMessage.h" #define PARSE_PANIC( file, nLine, soMessage, soLine ) \ { ostrstream ost; \ ost << "Corrupted ace file detected by consed source file " \ << __FILE__ << " at " << __LINE__ <bReadOnly_ ) { RWCString soErrorMessage = "Sorry, you can't save the assembly since you started consed -read_only"; InputDataError ide( soErrorMessage ); throw ide; } // get filename FileName soAssemblyDirName( soGetAceFileName().soGetDirectory()); FileName soDirMask = soAssemblyDirName + "*.ace*"; FileName soTryOutFileName = soGetAceFileName().filGetNextVersion(); // bump version number until stat says file doesn't exist while (soTryOutFileName.bFileByThisNameExists()) { soTryOutFileName = soTryOutFileName.filGetNextVersion(); } FileName soUserOutFileName = GuiApp::popupFileSelector("Save assembly to file", soDirMask, soTryOutFileName); // if user cancelled or did not supply name, bail now if (soUserOutFileName.isNull()) { bUserPushedCancel = true; return; } bool bGoAhead = true; if (soUserOutFileName.bFileByThisNameExists()) { bGoAhead = GuiApp::popupDecisionMessage("File %s exists. Save anyway?", (const char*)soUserOutFileName); } if (! bGoAhead) { bUserPushedCancel = true; return; } PleaseWait* pPleaseWait = NULL; try { // the "if" is to fix a Linux bug: Linux didn't like // fclose( NULL ) but Solaris could handle it if ( pfilEditHistory_ ) { fclose( pfilEditHistory_ ); pfilEditHistory_ = NULL; } bool bAceFormat1 = (consedParameters::pGetConsedParameters()->nWriteThisAceFormat_ == 1)? true : false; if ( widOfCallingWindow == 0 ) widOfCallingWindow = GuiApp::pGetGuiApp()->widGetTopLevel(); pPleaseWait = new PleaseWait( widOfCallingWindow ); // this can take a long time saveToFile(soUserOutFileName, bAceFormat1 ); delete pPleaseWait; deleteFileIfItExists( soUserOutFileName + ".wrk" ); // update label in gui contig win to reflect new version ConsEd::pGetConsEd()->setAssemblyNameOnAllContigwins( soUserOutFileName.soGetBasename() ); GuiApp::pGetGuiApp()->pGetGuiTopWindow()->setAssemblyName( soUserOutFileName.soGetBasename() ); clearAllChangedFlags(); ConsEd::pGetConsEd()->disallowUndos(); bUserPushedCancel = false; } catch (SysRequestFailed srf) { GuiApp::popupErrorMessage(srf.szGetDesc()); // this is a method of keeping consed from exiting if there // was a problem writing the file if ( pPleaseWait ) delete pPleaseWait; bUserPushedCancel = true; } } void Assembly :: clearAllChangedFlags() { setUnchanged(); // loop through all contigs and fragments in assembly for ( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->setUnchanged(); int nNumFrags = pContig->nGetNumberOfFragsInContig(); for (int nFrag = 0; nFrag < nNumFrags; nFrag++) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet(nFrag); pLocFrag->setUnchanged(); } } } // returns NULL if contig is not found Contig* Assembly :: pGetContigByName( const RWCString& soContigName ) { Contig* pContig; int nContig = 0; bool bFound = false; while( !bFound && nContig < nNumContigs() ) { pContig = pGetContig( nContig ); if (pContig->soGetName() == soContigName ) bFound = true; else ++nContig; } if (!bFound) pContig = 0; return( pContig ); } Contig* Assembly :: pGetContigByNameCaseInsensitive( const RWCString& soContigName ) { RWCString soContigLower( soContigName ); soContigLower.toLower(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); if ( pContig->soGetName().returnToLower() == soContigLower ) { return pContig; } } return NULL; } void Assembly :: writeEditToEditHistoryFile( char* szFormat, ... ) { char szLineToWrite[10000]; va_list args; va_start(args, szFormat); vsprintf( szLineToWrite, szFormat, args ); va_end(args); // shouldn't ever get here, but handle it anyway if ( pCP->bReadOnly_ ) { return; } if ( pfilEditHistory_ == NULL ) { // this must be the first edit--open the edit history file FileName filEditHistory = soGetAceFileName() + ".wrk"; if (bOpenEditHistoryForAppendFlag() ) { pfilEditHistory_ = fopen( (char*)filEditHistory.data(), "a" ); } else { // open the output file pfilEditHistory_ = fopen( (char*)filEditHistory.data(), "w" ); } if ( pfilEditHistory_ == NULL ) { // throw an exception with a detailed explanation ostrstream ost; ost << "Unable to open file " << filEditHistory << " for writing" << endl << ends; SysRequestFailed srf(ost.str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } } // now the file is open, one way or the other. so write the edit-- // that's what we're here for strcat( szLineToWrite, "\n" ); if ( fputs( szLineToWrite, pfilEditHistory_ ) == EOF ) { // throw an exception with a detailed explanation ostrstream ost; ost << "Unable to write to .wrk file" << endl << ends; SysRequestFailed srf(ost.str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } if (fflush( pfilEditHistory_ ) != 0 ) { ostrstream ost; ost << "Unable to flush .wrk file" << endl << ends; SysRequestFailed srf(ost.str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } } // called repeatedly by saveToFile() static void writeContigToAceFile(Contig& rContig, ofstream& ofsAceFile, const bool bAceFormat1 ) { Ntide::setGetBaseUpperOrLowerLookupTableIfNecessary(); rContig.sortReadsByPosition(); if (bAceFormat1) { // write consensus sequence ofsAceFile << endl << "DNA " << rContig.soGetName() << endl; } else { // write consensus sequence ofsAceFile << endl << "CO " << rContig.soGetName() << " " << rContig.nGetPaddedSeqLength() << " " << rContig.nGetNumberOfFragsInContig() << " " << rContig.baseSegArray_.nGetNumSegments() << " " << (rContig.bIsComplementedFromWayPhrapCreated() ? "C" : "U" ) << endl; } // loop through entire sequence int nCharsOnLine = 0; int nPos; for ( nPos = rContig.nGetStartConsensusIndex(); nPos <= rContig.nGetEndConsensusIndex(); nPos++) { Ntide nt = rContig.ntGetCons(nPos); char cBase = nt.cGetBaseUpperOrLowerUnsafe(); ofsAceFile << cBase; nCharsOnLine++; if (nCharsOnLine == nAceMaxDnaLineLen) { ofsAceFile << endl; nCharsOnLine = 0; } } if (nCharsOnLine) ofsAceFile << endl; ofsAceFile << endl; // write the BaseQuality lines if (bAceFormat1 ) ofsAceFile << endl << "BaseQuality " << rContig.soGetName() << endl; else ofsAceFile << endl << "BQ" << endl; int nNumbersOnLine = 0; for( nPos = rContig.nGetStartConsensusIndex(); nPos <= rContig.nGetEndConsensusIndex(); nPos++) { Ntide nt = rContig.ntGetCons( nPos ); // quality information is only output for non-pads if (!nt.bIsPad() ) { ofsAceFile << " " << (int) nt.qualGetQuality(); ++nNumbersOnLine; static const int nAceMaxBaseQualityLineLen = 50; if (nNumbersOnLine >= nAceMaxBaseQualityLineLen) { ofsAceFile << endl; nNumbersOnLine = 0; } } } if (nNumbersOnLine) ofsAceFile << endl; ofsAceFile << endl; if (bAceFormat1) // the "sequence" section contains Assembled_from* and BaseSegment* lines ofsAceFile << "Sequence " << rContig.soGetName() << endl; // put out "AF" lines int nNumFrags = rContig.nGetNumberOfFragsInContig(); int nFrag; for ( nFrag = 0; nFrag < nNumFrags; nFrag++) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = rContig.pLocatedFragmentGet(nFrag); if (bAceFormat1) { // when reading the .ace file, we peeled '.comp' off of // the fragment name. replace it here RWCString soFragName = pLocFrag->soGetFragmentName(); if (pLocFrag->bComp()) soFragName += ".comp"; ofsAceFile << "Assembled_from* " << soFragName << " " << pLocFrag->nGetAlignStart() << " " << pLocFrag->nGetAlignEnd() << endl; } else { ofsAceFile << "AF " << pLocFrag->soGetFragmentName(); if (pLocFrag->bComp() ) ofsAceFile << " C "; else ofsAceFile << " U "; ofsAceFile << pLocFrag->nGetAlignStart() << endl; } } // put out BS lines int nNumBaseSegs = rContig.baseSegArray_.nGetNumSegments(); for (int nSeg = 0; nSeg < nNumBaseSegs; nSeg++) { BaseSegment* pBs = rContig.baseSegArray_[nSeg]; LocatedFragment* pLf = pBs->pGetLocFrag(); if (bAceFormat1) { RWCString soFragName = pLf->soGetFragmentName(); if (pLf->bComp()) soFragName += ".comp"; ofsAceFile << "Base_segment* " << pBs->nGetStartInConsensus() << " " << pBs->nGetEndInConsensus() << " " << soFragName << " " << pLf->nGetFragIndexFromConsPos( pBs->nGetStartInConsensus() ) << " "<< pLf->nGetFragIndexFromConsPos( pBs->nGetEndInConsensus() ) << endl; } else { ofsAceFile << "BS " << pBs->nGetStartInConsensus() << " " << pBs->nGetEndInConsensus() << " " << pLf->soGetFragmentName() << endl; } } if (!bAceFormat1) ofsAceFile << endl; // only used in bAceFormat1 RWCString soFragName; // put out fragment sequence and clipping lines for (nFrag = 0; nFrag < nNumFrags; nFrag++) { LocatedFragment* pLf = rContig.pLocatedFragmentGet(nFrag); if (bAceFormat1) { soFragName = pLf->soGetFragmentName(); if (pLf->bComp()) soFragName += ".comp"; ofsAceFile << endl << "DNA " << soFragName << endl; } else { int nNumberOfReadTags = pLf->nGetNumberOfReadTagsToWriteToAceFile(); ofsAceFile << "RD " << pLf->soGetName() << " " << pLf->nGetPaddedSeqLength() << " " << "0" // number of whole read info items << " " << nNumberOfReadTags << endl; } // loop through entire sequence int nCharsOnLine = 0; for (int nPos = pLf->nGetStartIndex(); // fragment, not consensus indices nPos <= pLf->nGetEndIndex(); // from Sequence class member funs nPos++) { Ntide nt = pLf->ntideGet(nPos); // from Sequence class char cBase = nt.cGetBaseUpperOrLowerUnsafe(); ofsAceFile << cBase; nCharsOnLine++; if (nCharsOnLine == nAceMaxDnaLineLen) { ofsAceFile << endl; nCharsOnLine = 0; } } if (nCharsOnLine) ofsAceFile << endl; ofsAceFile << endl; // clipping line if (bAceFormat1) { ofsAceFile << "Sequence " << soFragName << endl; ofsAceFile << "Clipping* " << pLf->nGetStartUnclippedFragPos() << ' ' << pLf->nGetEndUnclippedFragPos() << endl; } else { ofsAceFile << "QA "; if ( pLf->bIsWholeReadLowQuality() ) ofsAceFile << "-1 -1 "; else { // check to make sure a corrupt ace file isn't being // written: int nStartHighQuality = pLf->nGetFragIndexFromConsPos( pLf->nGetStartUnclippedConsPos() ); int nEndHighQuality = pLf->nGetFragIndexFromConsPos( pLf->nGetEndUnclippedConsPos() ); if ( ! ( 1 <= nStartHighQuality && nStartHighQuality <= nEndHighQuality && nEndHighQuality <= pLf->nGetEndIndex() ) ) { RWCString soError = "The ace file you just wrote is corrupted and will not be able to be read by Consed. The QA (quality) line for read "; soError += pLf->soGetName(); soError += " contains out-of-bounds numbers. ("; soError += RWCString( (long) nStartHighQuality ); soError += ", "; soError += RWCString( (long) nEndHighQuality ); soError += ", "; soError += RWCString( (long) pLf->nGetEndIndex() ); soError += "). If you can reproduce what you have done so you can get this message again, and you can supply the data set to David Gordon with instructions so that he can also reproduce it, then he can fix this bug."; popupErrorMessage( soError ); } ofsAceFile << pLf->nGetFragIndexFromConsPos( pLf->nGetStartUnclippedConsPos() ) << " " << pLf->nGetFragIndexFromConsPos( pLf->nGetEndUnclippedConsPos() ) << " "; } if ( pLf->bIsWholeReadUnaligned() ) ofsAceFile << "-1 -1 " << endl; else { // check to make sure a corrupt ace file isn't being written int nStartAlignedRegion = pLf->nGetFragIndexFromConsPos( pLf->nGetAlignClipStart() ); int nEndAlignedRegion = pLf->nGetFragIndexFromConsPos( pLf->nGetAlignClipEnd() ); if ( !( 1 <= nStartAlignedRegion && nStartAlignedRegion <= nEndAlignedRegion && nEndAlignedRegion <= pLf->nGetEndIndex() ) ) { RWCString soError = "The ace file you just wrote is corrupted and Consed will not be able to read it. The QA (alignment) line for read "; soError += pLf->soGetName(); soError += " contains out-of-bounds numbers. ( "; soError += RWCString( (long) nStartAlignedRegion ); soError += ", "; soError += RWCString( (long) nEndAlignedRegion ); soError += "). nGetEndIndex = "; soError += RWCString( (long) pLf->nGetEndIndex() ); soError += " If you can reproduce what you have done so you get this message again, and you can supply the data set to David Gordon with instructions so he can also reproduce this message, then he can fix this bug."; popupErrorMessage( soError ); } ofsAceFile << pLf->nGetFragIndexFromConsPos( pLf->nGetAlignClipStart() ) << " " << pLf->nGetFragIndexFromConsPos( pLf->nGetAlignClipEnd() ) << endl; } } // the PHD file and timestamp should have just been changed (above) // for all of the reads that were edited since the last save if (bAceFormat1) { ofsAceFile << "Description " << "CHROMAT_FILE: " << (pLf->filGetChromatFilename() ) << " PHD_FILE: " << (pLf->filGetPHDFilename() ) << " TIME: " << pLf->soGetTimestamp() << endl; } else { RWCString soDSLine( (size_t) 1000 ); soDSLine = "DS"; if ( !pLf->filGetChromatFilename().bIsNull() && pLf->filGetChromatFilename() != "none" ) { soDSLine += " CHROMAT_FILE: "; soDSLine += pLf->filGetChromatFilename(); } if ( !pLf->filGetPHDFilename().bIsNull() ) { soDSLine += " PHD_FILE: "; soDSLine += pLf->filGetPHDFilename(); } // now VERSION // the intent is to not write version for older files if ( pLf->filGetPHDFilename().bIsNull() ) { soDSLine += " VERSION: "; char szTemp[10]; sprintf( szTemp, "%d", pLf->nVersion_ ); soDSLine += szTemp; } // time soDSLine += " TIME: "; soDSLine += pLf->soGetTimestamp(); // chemistry. Historically, CHEM has not been written // for prim and term (Sanger) reads. However, phrap // historically *does* write them. When in doubt, don't // change it. if ( pLf->soChemistry_ != "prim" && pLf->soChemistry_ != "term" && !pLf->soChemistry_.bIsNull() ) { soDSLine += " CHEM: "; soDSLine += pLf->soChemistry_; } // why is DYE not here? DG 8/15/08 ofsAceFile << soDSLine << endl; pLf->writeReadTagsToAceFile( ofsAceFile ); } } // for (nFrag = 0; nFrag < nNumFrags; nFrag++) { } FileName Assembly :: filSaveAssemblyToUserSpecifiedOrNextAvailableVersionOfAceFile() { FileName filToSave; if ( !pCP->filUserWantsToSaveToThisAceFile_.bIsNull() ) filToSave = pCP->filUserWantsToSaveToThisAceFile_; else { filToSave = filGetAceFileFullPathname().filFindOneHigherThanHighestVersion3(); } bool bAceFormat1 = (consedParameters::pGetConsedParameters()->nWriteThisAceFormat_ == 1)? true : false; saveToFile( filToSave, bAceFormat1 ); return filToSave; } void Assembly :: saveAssemblyToUserSpecifiedOrNextAvailableVersionOfAceFile( const FileName& filToSave ) { FileName filToSave2( filToSave ); if ( filToSave2.bIsNull() ) filToSave2 = filGetAceFileFullPathname().filFindOneHigherThanHighestVersion3(); bool bAceFormat1 = (consedParameters::pGetConsedParameters()->nWriteThisAceFormat_ == 1)? true : false; saveToFile( filToSave2, bAceFormat1 ); } // passed a file name, saves a new version of the // .ace file (but only with '*' i.e. padded lines) // overwrites existing file, throws exception if // it cannot open. void Assembly :: saveToFile(const char* szOutFilePath, const bool bAceFormat1 ) { int nContig; // loop through all fragments in assembly // look for any that have changed and write new PHD files for them for (nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); int nNumFrags = pContig->nGetNumberOfFragsInContig(); for (int nFrag = 0; nFrag < nNumFrags; nFrag++) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet(nFrag); if (pLocFrag->bChanged() ) pLocFrag->writePHDAndMore(); } } // make sure we got a name assert (szOutFilePath); cerr << "writing " << szOutFilePath << endl; // open the output file #ifdef OFSTREAM_OPEN_WITHOUT_PERMISSIONS ofstream ofsAceFile(szOutFilePath, ios::out ); #else ofstream ofsAceFile(szOutFilePath, ios::out, 0666 ); #endif if (! ofsAceFile) { // throw an exception with a detailed explanation ostrstream ost; ost << "Unable to open file " << szOutFilePath << " for writing" << endl << ends; SysRequestFailed srf(ost.str()); // what the hell, show 'em the errno srf.includeErrnoDescription(); throw srf; } if (!bAceFormat1) { setNumberOfReadsInAssemblyAccurate(); ofsAceFile << "AS " << nNumContigs() << " " << nGetNumberOfReadsInAssembly() << endl; } // loop through all contigs in assembly for ( nContig = 0; nContig < nNumContigs(); nContig++) { writeContigToAceFile((*this)[nContig], ofsAceFile, bAceFormat1 ); } // write all consensus tags in assembly for ( nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->writeConsensusTagsToAceFile( ofsAceFile ); } // write all wholeAssemblyItems if ( !bAceFormat1 ) { for( int nWA = 0; nWA < aWholeAssemblyItems_.length(); ++nWA ) { aWholeAssemblyItems_[nWA]->writeWholeAssemblyItemToAceFile( ofsAceFile ); } } ofsAceFile.close(); #ifdef OFSTREAM_OPEN_WITHOUT_PERMISSIONS int nError = chmod( szOutFilePath, 0666 ); #endif // update the file name soFileName_ = szOutFilePath; } // Searches the entire assembly for a fragment of name soFragmentName // If found, returns its LocatedFragment pointer. // If not found, returns null. LocatedFragment* Assembly :: pGetLocatedFragmentByName( const RWCString& soFragmentName ) { if ( bReadsSortedByReadNameNeedsFixing_ ) setUpReadsSortedByReadName(); return( pReadsSortedByReadName_->pFindReadByName( soFragmentName ) ); } void Assembly :: readPHDBallsAndPHDFilesAndSetQualitiesAndPeakPositions() { int nContig; int nFragsSoFar = 0; pCP->nPhdFilesRead_ = 0; // check if the ace file refers to any phd balls for( int nWA = 0; nWA < aWholeAssemblyItems_.length(); ++nWA ) { wholeAssemblyItem* pWA = aWholeAssemblyItems_[nWA]; if ( pWA->soType_ == "phdBall" ) { FileName filPhdBall = pWA->soText_; // I guess this is so this file isn't attempted to be // read twice if ( filPhdBall == pCP->filFileOfPhdFiles_ ) continue; FileName filPhdBallFullPath = pWA->soText_; if ( !filPhdBallFullPath.bFileByThisNameExists() ) { filPhdBallFullPath = pCP->filPhdBallDirectory_ + "/" + pWA->soText_; if ( !filPhdBallFullPath.bFileByThisNameExists() ) { RWCString soError = pWA->soText_ + " and " + filPhdBallFullPath + " do not exist even though it is specified in a WA item in the ace file"; THROW_ERROR( soError ); } } readFileOfPhdFiles( filPhdBallFullPath ); } } // now (for backwards compatibility) check for the phd.ball // in the current directory // command line overrides .consedrc // But if commandline does not have a -fileOfPhdFiles, then use // the .consedrc // command line -fileOfPhdFiles is put into pCP->filFileOfPhdFiles_ // // pCP->filFastStartupFile_ comes from .consedrc and is phd.ball by // default if ( pCP->filFileOfPhdFiles_.isNull() ) { if ( pCP->bFastStartup_ ) { pCP->filFileOfPhdFiles_ = pCP->filFastStartupFile_; } } if ( !pCP->filFileOfPhdFiles_.isNull() && pCP->filFileOfPhdFiles_.bFileByThisNameExists() ) { readFileOfPhdFiles( pCP->filFileOfPhdFiles_ ); } int nNumberOfIndividualPhdFilesRead = 0; for (nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); int nNumFrags = pContig->nGetNumberOfFragsInContig(); for (int nFrag = 0; nFrag < nNumFrags; nFrag++) { ++nFragsSoFar; // get pointer to this located frag from contig LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet(nFrag); try { if ( !pLocFrag->bAlreadyReadPhdFile_ ) { // if the phd file has already been read, this will not // read it again pLocFrag->readPHDFileAndSetQualitiesAndPeakPositions(); ++(pCP->nPhdFilesRead_ ); ++nNumberOfIndividualPhdFilesRead; if ( pCP->nPhdFilesRead_ % 50 == 0 ) { int nPerCentComplete = ( pCP->nPhdFilesRead_ * 100 / nNumberOfReadsInAssembly_ ) / 2 + 50; printf( "%d%% done. %d phd files read so far...\n", nPerCentComplete, pCP->nPhdFilesRead_ ); } } if ( pLocFrag->nVersion_ != 1 && !pLocFrag->bAlreadyReadPhredCallsInPhdBall_ ) { // this is the case in which there is a read // with a higher version and there is no file // for this read in the phd.ball (or else the user // isn't using the phd.ball) try { pLocFrag->readPhredCalls( ); } catch( ExceptionBase eb ) { // couldn't read the phred calls. However, *could* // read the edited phd file (the highest version). // So ok to show quality values in Aligned Reads Window // (don't set it to all quality 0) but don't // allow user to bring up traces. pLocFrag->bReadOnly_ = true; popupErrorMessage( eb.szGetDesc() ); pCP->bErrorMessageDisplayedAtStartup_ = true; } } // if ( pLocFrag->filGetPHDFilename().nGetVersion() != 1 ... } // try catch( ExceptionBase eb ) { // something was wrong in the phd file // So mark it read-only and set the qualities to 0's pLocFrag->setBadPHDFile(); popupErrorMessage( eb.szGetDesc() ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; } // catch... } // for (int nFrag = 0; nFrag < nNumFrags; } // for (nContig = 0; cerr << "Number of individual phd files read: " << nNumberOfIndividualPhdFilesRead << endl; cerr << "Total reads in assembly: " << soAddCommas( nNumberOfReadsInAssembly_ ) << endl; } void Assembly :: setContigMatchTablesInAllContigs() { for ( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->setContigMatchTables(); } } int Assembly :: nGetContigIndexByContig( Contig* pContig ) { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { if ( pGetContig( nContig ) == pContig ) return( nContig ); } return( -1 ); } bool Assembly :: bDoesTheNumberOfTemplatesMakeSense() { if ( pReadsSortedByTemplateName_ ) { pReadsSortedByTemplateName_->clear(); pReadsSortedByTemplateName_->resize( nGetNumberOfReadsInAssembly() ); } else { pReadsSortedByTemplateName_ = new readsSortedByTemplateName( nGetNumberOfReadsInAssembly() ); } int nNumberOfReadsAdded = 0; for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); pLocFrag->setDefaultTemplateNameIfNecessary(); if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) continue; if ( pLocFrag->bIsAFakeRead() ) continue; pReadsSortedByTemplateName_->append( pLocFrag ); } } pReadsSortedByTemplateName_->resort2(); if ( pReadsSortedByTemplateName_->length() == 0 ) { popupErrorMessage( "Warning: there are no reads that are subclone reads. All reads are either whole clone (bac, cosmid, or cDNA) or fake reads. If this is not correct, then you probably have not correctly customized determineReadTypes.perl for your read naming convention. See README.txt and see the documentation within determineReadTypes.perl To help you in figuring out what is wrong, on the Main Consed Window, point to Info, hold down mouse button 1, and release on Show Info For Each Read" ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; return( false ); } const int nTemplatesToReport = 3; int nMostReadsPerTemplate[ nTemplatesToReport ]; RWTValVector< RWCString > aTemplatesAndReads( nTemplatesToReport ); int n; for( n = 0; n < nTemplatesToReport; ++n ) { nMostReadsPerTemplate[ n ] = 0; aTemplatesAndReads[ n ] = ""; } int nNumberOfTemplates = 0; int nSubscriptOfFirstReadOfTemplate = 0; RWCString soLastTemplate = ""; for( int nRead = 0; nRead < pReadsSortedByTemplateName_->length(); ++nRead ) { RWCString soTemplate = (*pReadsSortedByTemplateName_)[ nRead ]->soGetTemplateName(); if ( soTemplate != soLastTemplate && nRead != 0 ) { // check last template to see if it has more reads for it than // other saved templates int nNumberOfReadsInLastTemplate = nRead - nSubscriptOfFirstReadOfTemplate; bool bFoundSlot = false; for( int nSavedTemplate = 0; nSavedTemplate < nTemplatesToReport && !bFoundSlot; ++nSavedTemplate ) { if ( nNumberOfReadsInLastTemplate > nMostReadsPerTemplate[ nSavedTemplate ] ) { // found the slot this one should go in. Push the // others down (I wish I had "push" like perl). for( n = nTemplatesToReport - 1; n > nSavedTemplate; --n ) { nMostReadsPerTemplate[ n ] = nMostReadsPerTemplate[ n - 1 ]; aTemplatesAndReads[ n ] = aTemplatesAndReads[ n - 1 ]; } RWCString soExplanation = soLastTemplate; // often too many reads--don't make such a large box // for( n = nSubscriptOfFirstReadOfTemplate; // n < nRead; // ++n ) { // soExplanation += " "; // soExplanation += // (*pReadsSortedByTemplateName_)[ n ]->soGetName(); // } aTemplatesAndReads[ nSavedTemplate ] = soExplanation; nMostReadsPerTemplate[ nSavedTemplate ] = nNumberOfReadsInLastTemplate; bFoundSlot = true; } } } if ( soTemplate != soLastTemplate ) { // now deal with the next template ++nNumberOfTemplates; soLastTemplate = soTemplate; nSubscriptOfFirstReadOfTemplate = nRead; } } // check last template to see if it has more reads for it than // other saved templates int nNumberOfReadsInLastTemplate = pReadsSortedByTemplateName_->length() - nSubscriptOfFirstReadOfTemplate; bool bFoundSlot = false; for( int nSavedTemplate = 0; nSavedTemplate < nTemplatesToReport && !bFoundSlot; ++nSavedTemplate ) { if ( nNumberOfReadsInLastTemplate > nMostReadsPerTemplate[ nSavedTemplate ] ) { // found the slot this one should go in. Push the // others down (I wish I had "push" like perl). for( n = nTemplatesToReport - 1; n > nSavedTemplate; --n ) { nMostReadsPerTemplate[ n ] = nMostReadsPerTemplate[ n - 1 ]; aTemplatesAndReads[ n ] = aTemplatesAndReads[ n - 1 ]; } RWCString soExplanation = soLastTemplate; // often too many reads--don't make such a large box // for( n = nSubscriptOfFirstReadOfTemplate; // n < nRead; // ++n ) { // soExplanation += " "; // soExplanation += // (*pReadsSortedByTemplateName_)[ n ]->soGetName(); // } bFoundSlot = true; aTemplatesAndReads[ nSavedTemplate ] = soExplanation; nMostReadsPerTemplate[ nSavedTemplate ] = nNumberOfReadsInLastTemplate; } } // What is going to be the rule for checking whether the number of // templates makes sense? If most reads are whole clone reads (as // in the case of cDNA assemblies, then there won't be any subclone // templates at all). So I suggest that we check how many subclone // reads there are (not fake). This is given by the reads in the // pReadsSortedByTemplateName_ array. Then we check how many // templates there are in this and see if the number of templates // is less than the number of reads and more than the number of // reads divided by 5. In autofinish, if this is not true, // autofinish will terminate with an error. We will have a resource // that allows the person to override the error and continue. if ( nNumberOfTemplates > 500 && ( nNumberOfTemplates == pReadsSortedByTemplateName_->length() ) ) { popupErrorMessage( "Warning: there were %d reads that went into the assembly and %d reads from subclones. However, consed believes that each of these was from a different subclone template. Consed does not see a single forward/reverse pair or forward and walk, or redo. If this is not correct, then you should read README.txt about determineReadTypes.perl and customize determineReadTypes.perl for your naming convention. To help you in figuring out what is wrong, on the Main Consed Window, point to Info, hold down mouse button 1, and release on Show Info For Each Read. If this *is* correct, then you can override this error message by setting the consed resource \nconsed.autoFinishContinueEvenThoughReadInfoDoesNotMakeSense: true\nSee README.txt for more info.", nGetNumberOfReadsInAssembly(), pReadsSortedByTemplateName_->length() ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; return( false ); } if ( nNumberOfTemplates < pReadsSortedByTemplateName_->length() / 5 ) { popupErrorMessage( "Warning: there were %d reads that went into the assembly and %d reads from subclones. However, consed/autofinish thinks there were only %d subclone templates. This is probably incorrect and will cause numerous problems: primer picking and template picking within consed will not work and autofinish will not work correctly. This problem is probably due to your not customizing determineReadTypes.perl correctly for your read naming convention. See README.txt (which came with consed) and see the documentation within determineReadType.perl For example, consed thinks that template %s has %d reads, template %s has %d reads and template %s has %d reads. To help you in figuring out what is wrong, on the Main Consed Window, point to Info, hold down mouse button 1, and release on Show Info For Each Read", nGetNumberOfReadsInAssembly(), pReadsSortedByTemplateName_->length(), nNumberOfTemplates, (const char*) aTemplatesAndReads[ 0 ], nMostReadsPerTemplate[ 0 ], (const char*) aTemplatesAndReads[ 1 ], nMostReadsPerTemplate[ 1 ], (const char*) aTemplatesAndReads[ 2 ], nMostReadsPerTemplate[ 2 ] ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; return( false ); } return( true ); } static void toDoAtEndOfReadsForATemplate( const int nSubscriptOfFirstReadOfLastTemplate, const RWCString& soLastTemplate, TextBox* pTextBox, const int nNewTemplateRead, readsSortedByTemplateName* pReadsSortedByTemplateName ) { int nNumberOfReadsOfLastTemplate = nNewTemplateRead - nSubscriptOfFirstReadOfLastTemplate; pTextBox->appendWithArgs( "template %s with %d %s\n", (const char*) soLastTemplate, nNumberOfReadsOfLastTemplate, ( nNumberOfReadsOfLastTemplate == 1 ? "read" : "reads" ) ); for( int nRead = nSubscriptOfFirstReadOfLastTemplate; nRead < nNewTemplateRead; ++nRead ) { LocatedFragment* pLocFrag = (*pReadsSortedByTemplateName)[ nRead ]; pTextBox->appendWithArgs( " %-20s %-8s %-8s ", (const char*) pLocFrag->soGetName(), (const char*) pLocFrag->soChemistry_, szGetReadType2( pLocFrag->nReadType_ ) ); if ( pLocFrag->nReadType_ == pLocFrag->nReadTypeFromPhdFile_ ) pTextBox->append( "(from phd file)\n" ); else pTextBox->appendWithArgs( "(inferred from name--no info in phd file: %d)\n", pLocFrag->nReadTypeFromPhdFile_ ); } } void Assembly :: showReadInfo() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); int nNumberOfReadsOfEachType[ nReadTypesMaxSubscript + 1 ]; int nNumberOfReadsNotSetInPhdFile = 0; for( int n = 0; n <= nReadTypesMaxSubscript; ++n ) nNumberOfReadsOfEachType[ n ] = 0; int nContig; for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); if ( pLocFrag->nReadTypeFromPhdFile_ == 0 ) ++nNumberOfReadsNotSetInPhdFile; ++nNumberOfReadsOfEachType[ pLocFrag->nReadType_ ]; } } processSinglets(); int nSinglet; for( nSinglet = 0; nSinglet < pSingletsSortedByTemplateName_->length(); ++nSinglet ) { singletInfo* pSingletInfo = (*pSingletsSortedByTemplateName_)[ nSinglet ]; if ( pSingletInfo->nReadType_ == 0 ) { ++nNumberOfReadsNotSetInPhdFile; pSingletInfo->nReadType_ = nGetReadTypeFromReadName( pSingletInfo->soReadName_ ); } ++nNumberOfReadsOfEachType[ pSingletInfo->nReadType_ ]; } delete pPleaseWait; const int nNumberOfRowsVisible = 35; TextBox* pTextBox = new TextBox( "Read Info", nNumberOfRowsVisible ); pTextBox->append( "Autofinish and Consed's primer picker must know, \nfor each read, whether it is a forward universal \nprimer read, a reverse universal primer read, a pcr \nend read, or a walk. Autofinish and Consed's \nprimer picker must also know, for each read, what \nthe read's template name is so they know when 2 \nreads come from the same subclone template or \ndifferent subclone templates. If a read comes \nfrom the whole clone (bac, cosmid, or cDNA), it is \nimportant to know that as well.\n\nIf the information below is not correct, it is \nlikely that you have not correctly customized \ndetermineReadTypes.perl for your read naming \nconvention. Please refer to README.txt or Help \non the Aligned Reads Window and refer to the \ndocumentation in determineReadsTypes.perl itself\n" ); pTextBox->append( "Read Type Summary:\n\n" ); for( int nReadType = 0; nReadType <= nReadTypesMaxSubscript; ++nReadType ) { if ( nReadType == 0 ) { pTextBox->appendWithArgs( "Reads whose type is not in phd file and will be \ninferred to be universal forward or universal \nreverse depending on the extension (such reads \nwill never be considered walks): %d\n", nNumberOfReadsNotSetInPhdFile ); } else { pTextBox->appendWithArgs( "Reads of type %s: %d\n", szGetReadType2( nReadType ), nNumberOfReadsOfEachType[ nReadType ] ); } } // now figure out summary information about template names if ( pReadsSortedByTemplateName_ ) { pReadsSortedByTemplateName_->clear(); pReadsSortedByTemplateName_->resize( nGetNumberOfReadsInAssembly() ); } else { pReadsSortedByTemplateName_ = new readsSortedByTemplateName( nGetNumberOfReadsInAssembly() ); } int nSmallFractionOfReads = nGetNumberOfReadsInAssembly() / 100; RWTPtrOrderedVector aWholeCloneReads( (size_t) nSmallFractionOfReads ); RWTPtrOrderedVector aFakeReads( (size_t) nSmallFractionOfReads ); int nNumberOfWholeCloneReads = 0; int nNumberOfFakeReads = 0; int nNumberOfReads = 0; int nNumberOfSubcloneReads = 0; for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); ++nNumberOfReads; pLocFrag->setDefaultTemplateNameIfNecessary(); if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) { ++nNumberOfWholeCloneReads; aWholeCloneReads.append( pLocFrag ); } else if ( pLocFrag->bIsAFakeRead() ) { ++nNumberOfFakeReads; aFakeReads.append( pLocFrag ); } else { ++nNumberOfSubcloneReads; pReadsSortedByTemplateName_->append( pLocFrag ); } } } pReadsSortedByTemplateName_->resort2(); pTextBox->append( "\n\n\nTemplate Name Summary Information:\n\n\n" ); pTextBox->appendWithArgs( "Number of whole clone reads (from bac, cosmid, or cDNA): %d\n", nNumberOfWholeCloneReads ); pTextBox->appendWithArgs( "Number of fake reads: %d\n", nNumberOfFakeReads ); pTextBox->appendWithArgs( "Number of reads from subclones (M13, plasmid): %d\n", nNumberOfSubcloneReads ); pTextBox->appendWithArgs( "Total Number of reads in assembly: %d\n\n\n", nNumberOfReads ); // now make histogram of the number of templates with each # of reads const int nNumberOfBins = 10; int nTemplatesWithParticularNumberOfReads[ nNumberOfBins ]; int nBin; for( nBin = 0; nBin < nNumberOfBins; ++nBin ) nTemplatesWithParticularNumberOfReads[ nBin ] = 0; RWCString soLastTemplate = ""; int nSubscriptOfFirstReadOfLastTemplate = 0; int nRead; for( nRead = 0; nRead < pReadsSortedByTemplateName_->length(); ++nRead ) { RWCString soTemplate = (*pReadsSortedByTemplateName_)[ nRead ]->soGetTemplateName(); if ( soTemplate != soLastTemplate && nRead != 0 ) { int nNumberOfReadsOfLastTemplate = nRead - nSubscriptOfFirstReadOfLastTemplate; if ( nNumberOfReadsOfLastTemplate <= ( nNumberOfBins - 2 ) ) ++nTemplatesWithParticularNumberOfReads[ nNumberOfReadsOfLastTemplate ]; else ++nTemplatesWithParticularNumberOfReads[ nNumberOfBins - 1 ]; } // do this, even for the 1st read in the list if ( soTemplate != soLastTemplate ) { soLastTemplate = soTemplate; nSubscriptOfFirstReadOfLastTemplate = nRead; } } // do for last template group in list int nNumberOfReadsOfLastTemplate = nRead - nSubscriptOfFirstReadOfLastTemplate; if ( nNumberOfReadsOfLastTemplate <= ( nNumberOfBins - 2 ) ) ++nTemplatesWithParticularNumberOfReads[ nNumberOfReadsOfLastTemplate ]; else ++nTemplatesWithParticularNumberOfReads[ nNumberOfBins - 1 ]; // start with 1 because there will be no templates with 0 reads for( nBin = 1; nBin < nNumberOfBins - 1; ++nBin ) { pTextBox->appendWithArgs("# of templates with %d %s: %d\n", nBin, ( nBin == 1 ? "read" : "reads" ), nTemplatesWithParticularNumberOfReads[ nBin ] ); } pTextBox->appendWithArgs("# of templates with %d or more reads: %d\n", nNumberOfBins - 1, nTemplatesWithParticularNumberOfReads[ nNumberOfBins - 1 ] ); pTextBox->append( "\n\n\n\nList of Fake Reads:\n\n" ); for( int nFakeRead = 0; nFakeRead < aFakeReads.length(); ++nFakeRead ) { LocatedFragment* pLocFrag = aFakeReads[ nFakeRead ]; pTextBox->appendWithArgs( " %s\n", (const char*) pLocFrag->soGetName() ); } pTextBox->append( "\n\n\nList of Whole Clone Reads:\n\n" ); if ( aWholeCloneReads.length() != 0 ) pTextBox->append( " name chemistry template\n" ); for( int nWholeCloneRead = 0; nWholeCloneRead < aWholeCloneReads.length(); ++nWholeCloneRead ) { LocatedFragment* pLocFrag = aWholeCloneReads[ nWholeCloneRead ]; pTextBox->appendWithArgs( " %20s %10s %8s\n", (const char*) pLocFrag->soGetName(), (const char*) pLocFrag->soChemistry_, (const char*) pLocFrag->soGetTemplateName() ); } pTextBox->append( "\n\n\nList of Subclone Reads:\n\n" ); pTextBox->append( " name chemistry read type\n\n" ); soLastTemplate = ""; nSubscriptOfFirstReadOfLastTemplate = 0; for( nRead = 0; nRead < pReadsSortedByTemplateName_->length(); ++nRead ) { LocatedFragment* pLocFrag = (*pReadsSortedByTemplateName_)[ nRead ]; if ( pLocFrag->soGetTemplateName() != soLastTemplate && nRead != 0 ) { toDoAtEndOfReadsForATemplate( nSubscriptOfFirstReadOfLastTemplate, soLastTemplate, pTextBox, nRead, pReadsSortedByTemplateName_ ); } if ( pLocFrag->soGetTemplateName() != soLastTemplate ) { soLastTemplate = pLocFrag->soGetTemplateName(); nSubscriptOfFirstReadOfLastTemplate = nRead; } } toDoAtEndOfReadsForATemplate( nSubscriptOfFirstReadOfLastTemplate, soLastTemplate, pTextBox, pReadsSortedByTemplateName_->length(), pReadsSortedByTemplateName_ ); pTextBox->append( "\n\n\nList of Reads Put (by Phrap) into Singlets File:\n\n" ); for( nSinglet = 0; nSinglet < pSingletsSortedByTemplateName_->length(); ++nSinglet ) { singletInfo* pSinglet = (*pSingletsSortedByTemplateName_)[ nSinglet ]; pTextBox->appendWithArgs("%-15s %-5s %-20s %-10s %-15s\n", pSinglet->soTemplate_.data(), pSinglet->soTemplateType_.data(), pSinglet->soReadName_.data(), pSinglet->soChemistry_.data(), szGetReadType2( pSinglet->nReadType_ ) ); } pTextBox->makeVisible(); } void Assembly :: showTemplateQualityInfo() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); setContigTemplateArrays(); const int nNumberOfRowsVisible = 35; TextBox* pTextBox = new TextBox( "Template Quality Info", nNumberOfRowsVisible ); pTextBox->append( "\n\n" ); pTextBox->append( "name error rate\n" ); pTextBox->append( "\n\n" ); for( int nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->calculateErrorRateIfNecessary(); pTextBox->appendWithArgs( "%s %.2e\n", (const char*) pSub->soTemplateName_, pSub->dErrorRate_ ); } pTextBox->makeVisible(); delete pPleaseWait; } void Assembly :: setContigTemplateArrays() { if ( !pCP->bNeedToSetContigTemplateArrays_ ) return; pCP->bNeedToSetContigTemplateArrays_ = false; // first make a list of reads: Assembly :: pReadsSortedByTemplateName_ // Then make list of templates: Assembly :: subcloneTTemplateArray_ // Then process this and split into various Contig :: aSubcloneTemplates_ // Some subcloneTTemplates may be duplicated so it can be put into // more than one Contig :: aSubcloneTemplates_ list, each with a // different subcloneTTemplate :: pContig_ if ( pCP->nWhatIsRunning_ == nGraphicalConsedIsRunning ) { printf( "setting contig template arrays...\n" ); fflush( stdout ); } // this is needed by subcloneTTemplate::bReadsAreConsistent() // to see if read pairs spanning the gap are close enough to the // ends of the contigs. This used to be elsewhere, but the // showTemplateInsertLocations wouldn't work if it wasn't here. setAllContigHighQualitySegments(); clearAndResizeContigTemplateArrays(); subcloneTTemplateArray_.clear(); int nNumberOfReadsInAssemblyWithTemplates = nGetNumberOfReadsInAssemblyWithTemplates(); if ( pReadsSortedByTemplateName_ ) { pReadsSortedByTemplateName_->clear(); pReadsSortedByTemplateName_->resize( nNumberOfReadsInAssemblyWithTemplates ); } else { pReadsSortedByTemplateName_ = new readsSortedByTemplateName( nNumberOfReadsInAssemblyWithTemplates ); } for( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); pLocFrag->setDefaultTemplateNameIfNecessary(); if ( pLocFrag->bIsWholeCloneTemplateNotSubcloneTemplate() ) { continue; } if ( pLocFrag->bIsAFakeRead() ) { continue; } if ( pLocFrag->bIsAnUnpairedShortRead() ) { continue; } pReadsSortedByTemplateName_->append( pLocFrag ); } } pReadsSortedByTemplateName_->resort2(); subcloneTTemplateArray_.resize( (size_t) pReadsSortedByTemplateName_->entries() ); // creating subcloneTTemplate objects RWCString soLastTemplate; subcloneTTemplate* pTemplate = NULL; bool bFirstTime = true; for( int nRead = 0; nRead < pReadsSortedByTemplateName_->entries(); ++nRead ) { LocatedFragment* pLocFrag = (*pReadsSortedByTemplateName_)[ nRead ]; // fixes a UWGC bug in which the first read had a null template // so it matched soLastTemplate, but pTemplate was null if ( pLocFrag->soGetTemplateName() != soLastTemplate || bFirstTime ) { bFirstTime = false; soLastTemplate = pLocFrag->soGetTemplateName(); // new template--process it. // The ctor inserts this subcloneTTemplate into // Assembly::subcloneTTemplateArray_ pTemplate = new subcloneTTemplate( pLocFrag ); } else { pTemplate->aReads_.insert( pLocFrag ); pLocFrag->pSub_ = pTemplate; } } // why do the templates need to be sorted by name? // Because in contig5.cpp, universal reads near the end of the // contig are checked for mates by finding the subcloneTTemplate // object by looking up via the template name. sortTemplatesByTemplateName(); // this must be done before pSub->processingAfterAllReadsAdded // because the latter uses the library in order to check that // the reads from the same template are not too far apart assignLibrariesToSubcloneTTemplates(); // 1st stage in processing subcloneTTemplate arrays int nTemplate; for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->processingAfterAllReadsAdded(); } // calculating library information calculateLibraryInsertSizeFromForwardReversePairs(); // use that library information to mark more templates as // bad for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; if ( pSub->bOKToUseThisTemplate_ ) pSub->markTemplateBadIfInsertSeemsTooBig(); } // we do this after calculating the insert size so that // we can use unavailable (lost/dropped) for calculating the // library insert size setBadTemplateArray(); setBadLibraryArray(); flagAllSubclonesIfBadTemplateOrBadLibrary(); // 2nd stage in processing subcloneTTemplate arrays. this stage // puts the templates into the different contigs. If a template // is in a badTemplate or badLibrary list, it is not put into // a contig. for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->processingAfterAddedAllTemplatesInThisLibrary(); } sortContigTemplateArrays(); cerr << "after sortContigTemplateArrays" << endl; if ( consedParameters::pGetConsedParameters()->bAutoFinishDumpTemplates_ ) dumpTemplates(); } void Assembly :: dumpTemplates() { for( int nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->dumpTemplate(); fprintf( pFILE, "%s with reads: \n", pSub->soTemplateName_.data() ); for( int nRead = 0; nRead < pSub->aReads_.entries(); ++nRead ) { LocatedFragment* pLocFrag = pSub->aReads_[ nRead ]; fprintf( pFILE, " %s\n", pLocFrag->soGetName().data() ); } for( int nContig = 0; nContig < pSub->aContigs_.entries(); ++nContig ) { Contig* pContig = pSub->aContigs_[ nContig ]; fprintf( pFILE, "in contig: %s from %d to %d ", pContig->soGetName().data(), pSub->aUnpaddedLeft_[ nContig ], pSub->aUnpaddedRight_[ nContig ] ); if ( pSub->aFlags_[ nContig ] & LEFT_CONS_POS_EQUAL_OR_FURTHER_LEFT ) fprintf( pFILE, "LEFT_CONS_POS_EQUAL_OR_FURTHER_LEFT " ); if ( pSub->aFlags_[ nContig ] & LEFT_CONS_POS_EQUAL_OR_FURTHER_RIGHT ) fprintf( pFILE, "LEFT_CONS_POS_EQUAL_OR_FURTHER_RIGHT " ); if ( pSub->aFlags_[ nContig ] & RIGHT_CONS_POS_EQUAL_OR_FURTHER_LEFT ) fprintf( pFILE, "RIGHT_CONS_POS_EQUAL_OR_FURTHER_LEFT " ); if ( pSub->aFlags_[ nContig ] & RIGHT_CONS_POS_EQUAL_OR_FURTHER_RIGHT ) fprintf( pFILE, "RIGHT_CONS_POS_EQUAL_OR_FURTHER_RIGHT " ); if ( pSub->aFlags_[ nContig ] & RIGHT_CONS_POS_EXACT ) fprintf( pFILE, "RIGHT_CONS_POS_EXACT " ); if ( pSub->aFlags_[ nContig ] & LEFT_CONS_POS_EXACT ) fprintf( pFILE, "LEFT_CONS_POS_EXACT " ); fprintf( pFILE, "\n" ); } } // for( int nTemplate ... } static int compareTemplates( const subcloneTTemplate** ppSub1, const subcloneTTemplate** ppSub2 ) { if ( (*ppSub1)->soTemplateName_ < (*ppSub2)->soTemplateName_ ) return( -1 ); else if ( (*ppSub1)->soTemplateName_ == (*ppSub2)->soTemplateName_ ) return( 0 ); else return( 1 ); } void Assembly :: sortTemplatesByTemplateName() { void* pArray = (void*) subcloneTTemplateArray_.data(); size_t nNumberOfTemplates = subcloneTTemplateArray_.entries(); size_t nSizeOfAnElement = sizeof( subcloneTTemplate* ); qsort( pArray, nNumberOfTemplates, nSizeOfAnElement, ( (int(*) ( const void*, const void*) ) compareTemplates ) ); if ( !bTemplatesAreSortedAlphabetically() ) { PANIC_OST( ost ) << "Assembly::sortTemplatesByTemplateName didn't work." << ends; throw ProgramLogicError( ost.str() ); } } // static int compareTemplateToTemplateName( const RWCString* pTemplateName, // const subcloneTTemplate* pSub ) { // cerr << "comparing " << pSub->soTemplateName_ << " and " << // (*pTemplateName) << endl; // if ( (*pTemplateName ) < pSub->soTemplateName_ ) // return( -1 ); // else if ( (*pTemplateName) == pSub->soTemplateName_ ) // return( 0 ); // else // return( 1 ); // } subcloneTTemplate* Assembly :: pGetSubcloneTemplateByTemplateName( const RWCString& soTemplateName ) { int nIndex = subcloneTTemplateArray_.nFindFirstOccurrenceOfMatch( soTemplateName ); // I used to assert if no template found. Now I'm more forgiving. if ( nIndex == RW_NPOS ) return( NULL ); subcloneTTemplate* pSub = subcloneTTemplateArray_[ nIndex ]; assert( pSub->soTemplateName_ == soTemplateName ); return( pSub ); } bool Assembly :: bTemplatesAreSortedAlphabetically() { bool bOK = true; for( int nTemplate = 1; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub1 = subcloneTTemplateArray_[ nTemplate - 1 ]; subcloneTTemplate* pSub2 = subcloneTTemplateArray_[ nTemplate ]; if ( pSub1->soTemplateName_ > pSub2->soTemplateName_ ) { cerr << "in Assembly::bTemplatesAreSortedAlphabetically(), templates " << nTemplate - 1 << " (" << pSub1->soTemplateName_ << ") and " << nTemplate << " (" << pSub2->soTemplateName_ << ") are out of order" << endl; bOK = false; } } return( bOK ); } void Assembly :: sortContigTemplateArrays() { // run through all contigs, sorting arrays for ( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->aSubcloneTemplates_.resort(); } } void Assembly :: clearAndResizeContigTemplateArrays() { for ( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->aSubcloneTemplates_.clearAndDestroy(); pContig->aSubcloneTemplates_.resize( (size_t) pContig->nGetNumberOfFragsInContig() ); } } void Assembly :: tagAceFileWithExperiments() { ofstream ofsAceFile( (char*) soFileName_.data(), ios::out | ios::app ); if ( !ofsAceFile ) { ostrstream ost; ost << "Unable to open file " << soFileName_ << "for append " << endl << ends; SysRequestFailed srf( ost.str() ); srf.includeErrnoDescription(); throw srf; } for( int nChosenExp = 0; nChosenExp < pAllChosenExperiments_->length(); ++nChosenExp ) { experiment* pExp = (*pAllChosenExperiments_)[ nChosenExp ]; ofsAceFile << endl; pExp->writeAutoFinishExpTagToAceFile( ofsAceFile ); } ofsAceFile.close(); } void Assembly :: tagAceFileWithOligosForExperiments() { ofstream ofsAceFile( (char*) soFileName_.data(), ios::out | ios::app ); if ( !ofsAceFile ) { ostrstream ost; ost << "Unable to open file " << soFileName_ << "for append " << endl << ends; SysRequestFailed srf( ost.str() ); srf.includeErrnoDescription(); throw srf; } for( int nChosenExp = 0; nChosenExp < pAllChosenExperiments_->length(); ++nChosenExp ) { experiment* pExp = (*pAllChosenExperiments_)[ nChosenExp ]; if ( pExp->nReadType_ != nWalk ) continue; ofsAceFile << endl; bool bComplementedFromWayPhrapCreatedContig = !pExp->bTopStrandNotBottomStrand_; if ( pExp->pContig_->bComplementedFromWayPhrapCreated_ ) bComplementedFromWayPhrapCreatedContig = !bComplementedFromWayPhrapCreatedContig; oligoTag* pOligoTag = new oligoTag( pExp->pCustomPrimer_, pExp->bCloneTemplateNotSubcloneTemplate_, bComplementedFromWayPhrapCreatedContig, pExp->nPrimerID_, "" ); // soComment pOligoTag->writeTagToAceFile( ofsAceFile ); } // also append oligo tags for pcr to the ace file if ( pChosenPrimersForAutofinishPCR_ ) { for( int nOligoTag = 0; nOligoTag < pChosenPrimersForAutofinishPCR_->length(); ++nOligoTag ) { oligoTag* pOligoTag = (*pChosenPrimersForAutofinishPCR_)[ nOligoTag ]; pOligoTag->writeTagToAceFile( ofsAceFile ); } } ofsAceFile.close(); } void Assembly :: saveACopyOfAceFileToBack() { FileName filBackupAceFile = soFileName_ + ".back"; FileName filBackupOrigAceFile = filBackupAceFile + "_orig"; FileName filCopyToAce; if ( filBackupOrigAceFile.bFileByThisNameExists() ) { deleteFileIfItExists( filBackupAceFile ); renameFiles( soFileName_, filBackupAceFile ); copyFiles( filBackupAceFile, soFileName_ ); } else { if ( filBackupAceFile.bFileByThisNameExists() ) { renameFiles( filBackupAceFile, filBackupOrigAceFile ); renameFiles( soFileName_, filBackupAceFile ); copyFiles( filBackupAceFile, soFileName_ ); } else { renameFiles( soFileName_, filBackupOrigAceFile ); copyFiles( filBackupOrigAceFile, soFileName_ ); } } } void Assembly :: showReadChemistries() { RWTValOrderedVector aChemistryNames; RWTValOrderedVector aChemistryNumberOfReads; for( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); RWCString soChemistry = pLocFrag->soChemistry_; if ( soChemistry.isNull() ) soChemistry = "Not in phd file"; int nIndex = aChemistryNames.index( soChemistry ); if ( nIndex == RW_NPOS ) { aChemistryNames.insert( soChemistry ); aChemistryNumberOfReads.insert( 1 ); } else { ++aChemistryNumberOfReads[ nIndex ]; } } } assert( aChemistryNames.length() == aChemistryNumberOfReads.length() ); int nNumberOfRowsNeeded = 6 + aChemistryNames.length(); if ( nNumberOfRowsNeeded > 20 ) nNumberOfRowsNeeded = 20; TextBox* pTB = new TextBox( "Number of Reads for Each Chemistry", nNumberOfRowsNeeded ); pTB->append( "Chemistry Name # of reads\n\n\n" ); for( int nChem = 0; nChem < aChemistryNames.length(); ++nChem ) { pTB->appendWithArgs( "%-20s %6d\n", (char*) aChemistryNames[ nChem ].data(), aChemistryNumberOfReads[ nChem ] ); } pTB->makeVisible(); } int Assembly :: nGetNumberOfForwardUniversalPrimerAutoFinishExpTags() { int nUniversalForwardTags = 0; for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nTag = 0; nTag < pContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pContig->pGetTag( nTag ); if ( pTag->soType_ != "autoFinishExp" ) continue; autoFinishExpTag* pAutoFinishExpTag = (autoFinishExpTag*) pTag; if ( pAutoFinishExpTag->nReadType_ == nUniversalForward ) ++nUniversalForwardTags; } } return( nUniversalForwardTags ); } int Assembly :: nGetNumberOfReverseUniversalPrimerAutoFinishExpTags() { int nUniversalReverseTags = 0; for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nTag = 0; nTag < pContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pContig->pGetTag( nTag ); if ( pTag->soType_ != "autoFinishExp" ) continue; autoFinishExpTag* pAutoFinishExpTag = (autoFinishExpTag*) pTag; if ( pAutoFinishExpTag->nReadType_ == nUniversalReverse ) ++nUniversalReverseTags; } } return( nUniversalReverseTags ); } bool Assembly :: bHasUniversalPrimerReadBeenTriedBeforeByAutoFinish( subcloneTTemplate* pSub, const int nReadType ) { universalPrimerAutoFinishExpTagsSortedByTemplateName* pUniversal; if ( nReadType == nUniversalForward ) pUniversal = pForwardUniversalPrimerAutoFinishExpTags_; else if ( nReadType == nUniversalReverse ) pUniversal = pReverseUniversalPrimerAutoFinishExpTags_; else { cerr << "error condition in bHasUniversalPrimerReadBeenTriedBefore " << pSub->soGetName() << endl; return( false ); } int nIndex = pUniversal->nFindFirstOccurrenceOfMatch( pSub->soGetName() ); if ( nIndex == RW_NPOS ) return( false ); else return( true ); } bool Assembly :: bIsThereAnAutoFinishExpTagForThisReverse( const RWCString& soTemplateName, autoFinishExpTag*& pAutoFinishExpTag ) { int nIndex = pReverseUniversalPrimerAutoFinishExpTags_->nFindIndexOfMatchOrSuccessor2( soTemplateName ); if ( nIndex != RW_NPOS ) { pAutoFinishExpTag = (*pReverseUniversalPrimerAutoFinishExpTags_)[ nIndex ]; if ( pAutoFinishExpTag->aTemplates_[0] == soTemplateName ) return( true ); } // if reached here, must not have been the same template return( false ); } static int cmpChosenExperiments( experiment** ppExp1, experiment** ppExp2 ) { if ( (*ppExp1)->pContig_->soGetName() < (*ppExp2)->pContig_->soGetName() ) return( -1 ); else if ( (*ppExp1)->pContig_->soGetName() > (*ppExp2)->pContig_->soGetName() ) return( 1 ); else { // same contig if ( (*ppExp1)->nUnpaddedLeft_ < (*ppExp2)->nUnpaddedLeft_ ) return( -1 ); else if ( (*ppExp1)->nUnpaddedLeft_ > (*ppExp2)->nUnpaddedLeft_ ) return( 1 ); else { // same contig and same location // Just to give a defined order, let's use expid: if ( (*ppExp1)->aExpID_[0] < (*ppExp2)->aExpID_[0] ) return( -1 ); else if ( (*ppExp1)->aExpID_[0] > (*ppExp2)->aExpID_[0] ) return( 1 ); else return( 0 ); } } } bool Assembly :: bAreChosenExperimentsSorted() { bool bOK = true; // the casting to int below is necesary--otherwise the - 1 will be // done with unsigned which will make 0 - 1 into four billion or so for( int nExp = 0; nExp < ( (int) (pAllChosenExperiments_->length()) - 1 ); ++nExp ) { experiment* pExp1 = (*pAllChosenExperiments_)[ nExp ]; experiment* pExp2 = (*pAllChosenExperiments_)[ nExp + 1 ]; if ( cmpChosenExperiments( &pExp1, &pExp2 ) > 0 ) { cerr << "chosen experiments " << nExp << " and " << nExp + 1 << " are out of order out of a total of " << pAllChosenExperiments_->length() << " experiments" << endl; bOK = false; } } return( bOK ); } void Assembly :: sortChosenExperiments() { void* pArray = (void*) pAllChosenExperiments_->data(); size_t nNumberOfElementsInArray = pAllChosenExperiments_->entries(); size_t nSizeOfAnElement = sizeof( experiment* ); qsort( pArray, nNumberOfElementsInArray, nSizeOfAnElement, ( ( int(*) ( const void*, const void*) ) cmpChosenExperiments )); if ( ! bAreChosenExperimentsSorted() ) { PANIC_OST( ost ) << "sortChosenExperiments failed" << ends; throw ProgramLogicError( ost.str() ); } } void Assembly :: createExpSummaryFiles() { sortChosenExperiments(); fprintf( pAO, "printing experiment summary files:\n %s\n %s\n %s\n %s\n", (const char*) pCP->filAutoFinishForwardsSummary_, (const char*) pCP->filAutoFinishReversesSummary_, (const char*) pCP->filAutoFinishCustomPrimersSummary_, (const char*) pCP->filAutoFinishSortedSummary_ ); fprintf( pFORWARDS, "ace file: %s\nautofinish full output file: %s\n", (const char*) filGetAceFileFullPathname(), (const char*) pCP->filAutoFinishFullOutput_ ); fprintf( pFORWARDS, ",,,(strand),(first base of read),(high quality left),(high quality right),(contig),(exp id),(template)\n" ); fprintf( pREVERSES, "ace file: %s\nautofinish complete output file: %s\n", (const char*) filGetAceFileFullPathname(), (const char*) pCP->filAutoFinishFullOutput_ ); fprintf( pREVERSES, ",,,(strand),(first base of read),(high quality left),(high quality right),(contig),(exp id),(template)\n" ); fprintf( pCUSTOM_PRIMERS, "ace file: %s\nautofinish complete output file: %s\n", (const char*) filGetAceFileFullPathname(), (const char*) pCP->filAutoFinishFullOutput_ ); fprintf( pCUSTOM_PRIMERS, "(primer bases),(melt temp),(primer id),(strand),(first base of read),(high quality left),(high quality right),(contig),(exp id),(template),(exp id),(template),...\n" ); fprintf( pSORTED_SUMMARY, "(contig) (left) (right) (type),(strand),(first base of read),(exp id),(template)\n" ); if ( pCP->bAutoFinishPrintCustomNavigationFileForChosenReads_ ) { fprintf( pCUSTOM_NAVIGATION, "TITLE: autofinish output file %s\n", pCP->filAutoFinishCustomNavigation_.data() ); } if ( pCP->bAutoFinishPrintMinilibrariesSummaryFile_ ) { fprintf( pMINILIBRARIES, "ace file: %s\nautofinish complete output file: %s\n", (const char*) filGetAceFileFullPathname(), (const char*) pCP->filAutoFinishFullOutput_ ); fprintf( pMINILIBRARIES, "(contig),(which end of that contig),(contig),(which end of that contig),(subclone1),(subclone2),...\n"); printMinilibrariesSummaryFile(); } FILE* pFile; for( int nExp = 0; nExp < pAllChosenExperiments_->entries(); ++nExp ) { experiment* pExp = (*pAllChosenExperiments_)[ nExp ]; if ( pExp->nReadType_ == nWalk || pExp->nReadType_ == nPCREnd ) pFile = pCUSTOM_PRIMERS; else if ( pExp->nReadType_ == nUniversalForward ) pFile = pFORWARDS; else if ( pExp->nReadType_ == nUniversalReverse ) pFile = pREVERSES; else assert( false ); pExp->printToSummaryFile( pFile ); pExp->printToSummaryFile2( pSORTED_SUMMARY ); if ( pCP->bAutoFinishPrintCustomNavigationFileForChosenReads_ ) { pExp->printToCustomNavigationFile( pCUSTOM_NAVIGATION ); } } fclose( pREVERSES ); fclose( pFORWARDS ); fclose( pCUSTOM_PRIMERS ); fclose( pSORTED_SUMMARY ); if ( pCP->bAutoFinishPrintCustomNavigationFileForChosenReads_ ) fclose( pCUSTOM_NAVIGATION ); } void Assembly :: dumpReadInfo() { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); printf( "%s %s %s\n", (char*) pLocFrag->soGetName().data(), (char*) pLocFrag->soGetTemplateName().data(), ( pLocFrag->bIsWalkingNotUniversalPrimerRead() ? "walk" : ( pLocFrag->bIsForwardNotReverseRead() ? "univ fwd" : "univ reverse" ) ) ); } } } void Assembly :: assignLibrariesToSubcloneTTemplates() { // assign subcloneTTemplate::soLibrary_ int nTemplate; for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->transferLibraryNameFromReadsToSubcloneTTemplate(); } // guard against doing this over and over if ( !pCP->pDefaultLibrary_ ) { pCP->pDefaultLibrary_ = new library( ); pCP->pDefaultLibrary_->soName_ = "default"; pCP->pDefaultLibrary_->nMeanInsertSize_ = 0; pCP->pDefaultLibrary_->nStandardDeviationOfInsertSize_ = 0; pCP->pDefaultLibrary_->nDefaultInsertSize_ = pCP->nAutoFinishAverageInsertSize_; // nAverageInsertSizeToUse_ and nALittleLessThanAverageInsertSizeToUse_ // will be set in finalProcessingOfInsertSize pCP->pDefaultLibrary_->nMaxInsertSize_ = pCP->nPrimersMaxInsertSizeOfASubclone_; pCP->pDefaultLibrary_->dCostToMakeMinilibrary_ = pCP->dAutoFinishCostOfMinilibrary_; pCP->pDefaultLibrary_->bSingleNotDoubleStranded_ = pCP->bPrimersAssumeTemplatesAreDoubleStrandedUnlessSpecified_; } // make a temporary list of subclone templates sorted by // library name // remember to delete this off the heap when done calculating // insert sizes of the different libraries templatesSortedByLibrary aTemplatesSortedByLibrary; aTemplatesSortedByLibrary.soName_ = "templatesSortedByLibrary in assembly2.cpp"; aTemplatesSortedByLibrary.resize( subcloneTTemplateArray_.length() ); int n; for( n = 0; n < subcloneTTemplateArray_.length(); ++n ) aTemplatesSortedByLibrary.insert( subcloneTTemplateArray_[ n ] ); aTemplatesSortedByLibrary.resort2(); // sorts by library name RWTPtrOrderedVector aLibrariesMissingFromFile; aLibrariesMissingFromFile.soName_ = "aLibrariesMissingFromFile in assembly2.cpp"; RWCString soLastLibrary; library* pLastLibrary = NULL; for( nTemplate = 0; nTemplate < aTemplatesSortedByLibrary.length(); ++nTemplate ) { subcloneTTemplate* pSubTT = aTemplatesSortedByLibrary[ nTemplate ]; if ( pSubTT->soLibrary_.isNull() ) { pSubTT->pLibrary_ = pCP->pDefaultLibrary_; } else { if ( pSubTT->soLibrary_ == soLastLibrary ) { pSubTT->pLibrary_ = pLastLibrary; } else { library* pLib = pCP->pListOfLibraries_->pFindLibraryByName( pSubTT->soLibrary_ ); if ( !pLib ) { pLastLibrary = new library(); pLastLibrary->soName_ = pSubTT->soLibrary_; soLastLibrary = pSubTT->soLibrary_; // add defaults pLastLibrary->nMaxInsertSize_ = pCP->nPrimersMaxInsertSizeOfASubclone_; pLastLibrary->nDefaultInsertSize_ = pCP->nAutoFinishAverageInsertSize_; pLastLibrary->bSingleNotDoubleStranded_ = !pCP->bPrimersAssumeTemplatesAreDoubleStrandedUnlessSpecified_; pLastLibrary->dCostToMakeMinilibrary_ = pCP->dAutoFinishCostOfMinilibrary_; aLibrariesMissingFromFile.insert( pLastLibrary ); } else { pLastLibrary = pLib; soLastLibrary = pLib->soName_; } pSubTT->pLibrary_ = pLastLibrary; } } } // for( int nTemplate = 0; nTemplate < pTemplates->length(); ... // check to make sure don't keep adding the default library if ( ! pCP->pListOfLibraries_->pFindLibraryByName( "default" ) ) { pCP->pListOfLibraries_->aLibraries_.insert( pCP->pDefaultLibrary_ ); } for( n = 0; n < aLibrariesMissingFromFile.length(); ++n ) pCP->pListOfLibraries_->aLibraries_.insert( aLibrariesMissingFromFile[ n ] ); // is it important to keep this sorted? I don't think we will // be doing lookups any more in this list since each subcloneTTemplate // now has a pointer directly to the library. pCP->pListOfLibraries_->sortLibrariesByName(); } void Assembly :: calculateLibraryInsertSizeFromForwardReversePairs() { // make code work in case this has been executed previously, // such as when someone "Show Library Info" more than once int nLib; for( nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pLib->dSumOfSquaresOfInsertSizes_ = 0.0; pLib->nNumberOfForwardReversePairsForCalculation_ = 0; pLib->dSumOfInsertSizes_ = 0; } int nTemplate; for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.length(); ++nTemplate ) { subcloneTTemplate* pSubTT = subcloneTTemplateArray_[ nTemplate ]; if ( !pSubTT->bCalculateInsertSizeFromForwardReversePair() ) continue; ++(pSubTT->pLibrary_->nNumberOfForwardReversePairsForCalculation_); pSubTT->pLibrary_->dSumOfInsertSizes_ += pSubTT->nInsertSizeFromForwardReversePair_; // you can't square 50kb without getting 2,500,000,000 which is // a negative number in some int's double dInsertSize = (double) pSubTT->nInsertSizeFromForwardReversePair_; pSubTT->pLibrary_->dSumOfSquaresOfInsertSizes_ += ( dInsertSize * dInsertSize ); } // for( int nTemplate = 0; nTemplate < pTemplates->length(); ... // now can go through libraries and calculate the mean and standard // deviation for( nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pLib->finalProcessingOfInsertSize(); } // now repeat this process, but exclude any template that is 3 standard // deviations from the mean for( nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pLib->dSumOfSquaresOfInsertSizes_ = 0.0; pLib->nNumberOfForwardReversePairsForCalculation_ = 0; pLib->dSumOfInsertSizes_ = 0; } for( nTemplate = 0; nTemplate < subcloneTTemplateArray_.length(); ++nTemplate ) { subcloneTTemplate* pSubTT = subcloneTTemplateArray_[ nTemplate ]; if ( !pSubTT->bCalculateInsertSizeFromForwardReversePair() ) continue; library* pLib = pSubTT->pLibrary_; if ( !pLib->bAverageInsertSizeFromForwardReversePairs_ ) continue; // check for outliers (In practice, these do not have the // insert size calculated--there is some problem with the // calculation. Thus don't use these for calculating the mean and // standard deviation.) if ( ABS( pSubTT->nInsertSizeFromForwardReversePair_ - pLib->nMeanInsertSize_ ) > 3 * pLib->nStandardDeviationOfInsertSize_ ) continue; ++(pSubTT->pLibrary_->nNumberOfForwardReversePairsForCalculation_); pSubTT->pLibrary_->dSumOfInsertSizes_ += pSubTT->nInsertSizeFromForwardReversePair_; double dInsertSize = (double) pSubTT->nInsertSizeFromForwardReversePair_; pSubTT->pLibrary_->dSumOfSquaresOfInsertSizes_ += ( dInsertSize * dInsertSize ); } // for( int nTemplate = 0; nTemplate < pTemplates->length(); ... // now can go through libraries and calculate the mean and standard // deviation for( nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pLib->finalProcessingOfInsertSize(); pLib->increasePrimersMaxInsertSizeOfASubcloneIfNecessary(); } } void Assembly :: checkAutoFinishReads() { fprintf( pAO, "\n\n\nEVALUATING AUTOFINISH READS\n\n\n" ); processSinglets(); // for each autoFinishExp tag, see if we can find a read that has the // corresponding expID bool bAutoFinishTagsFound = false; for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nTag = 0; nTag < pContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pContig->pGetTag( nTag ); if ( pTag->soType_ != "autoFinishExp" ) continue; bAutoFinishTagsFound = true; autoFinishExpTag* pAutoFinishExpTag = (autoFinishExpTag*) pTag; evalExpCluster* pEval = new evalExpCluster( pAutoFinishExpTag ); for( int nExp = 0; nExp < pAutoFinishExpTag->aExpID_.entries(); ++nExp ) { int nExpID = pAutoFinishExpTag->aExpID_[ nExp ]; RWCString soTemplate = pAutoFinishExpTag->aTemplates_[ nExp ]; LocatedFragment* pLocFrag; if ( bFoundExpInAssembly( nExpID, pLocFrag ) ) { pEval->addFoundReadInAssembly( nExpID, pLocFrag ); } else { // case in which there is no corresponding read for // this experiment singletInfo* pSingletFromAutoFinish; if ( bFoundExpInSinglets( nExpID, pSingletFromAutoFinish ) ) { pEval->addFoundReadInSinglets( nExpID, pSingletFromAutoFinish ); } else { pEval->addNotFoundRead( nExpID ); } } } // for( int nExp = 0; nExp < pAutoF... // this will print out lots about this experiment cluster pEval->evaluate(); } // for( int nTag = 0 ... } // for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { if ( !bAutoFinishTagsFound ) { fprintf( pAO, "No autofinish tags found in ace file. Did you forget \nto put run autofinish with -doExperiments? This \noption is necessary to allow autofinish to evaluate \nhow well it did with the previous round and also to \nprevent it from picking a failed experiment again.\n" ); } } bool Assembly :: bFoundExpInAssembly( const int nExpID, LocatedFragment*& pLocFrag ) { for( int n = 0; n < aExpidAndLocatedFragment_.entries(); ++n ) { if ( aExpidAndLocatedFragment_[ n ]->nExpID_ == nExpID ) { pLocFrag = aExpidAndLocatedFragment_[ n ]->pLocFrag_; return( true ); } } return( false ); } bool Assembly :: bFoundExpInSinglets( const int nExpID, singletInfo*& pSingletFromAutoFinish ) { for( int n = 0; n < aSingletsFromAutoFinish_.entries(); ++n ) { if ( aSingletsFromAutoFinish_[ n ]->nExpID_ == nExpID ) { pSingletFromAutoFinish = aSingletsFromAutoFinish_[ n ]; return( true ); } } return( false ); } void Assembly :: processSinglets() { pSingletsSortedByTemplateName_ = new singletsSortedByTemplateName( aSingletPHDFilenames_.entries() ); for( int nSinglet = 0; nSinglet < aSingletPHDFilenames_.entries(); ++nSinglet ) { singletInfo* pSingletInfo = new singletInfo(); pSingletInfo->filPHD_ = (FileName) aSingletPHDFilenames_[ nSinglet ]; pSingletInfo->soReadName_ = aSingletReadNames_[ nSinglet ]; if ( pSingletInfo->filPHD_.index( ".phd." ) == RW_NPOS ) { fprintf( pAO, "warning--singlet %s has no phd file--autofinish can't check its suggestions\n", (char*) pSingletInfo->filPHD_.data() ); continue; } bool bFoundExpID; readPHDOfSinglet( bFoundExpID, pSingletInfo ); pSingletsSortedByTemplateName_->insert( pSingletInfo ); if ( bFoundExpID ) { aSingletsFromAutoFinish_.insert( pSingletInfo ); } } pSingletsSortedByTemplateName_->resort2(); } void Assembly :: setAllUnpaddedErrorProbabilitiesAndMore() { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->setUnpaddedErrorProbabilitiesAndMore(); } } void Assembly :: setAllFindSingleSubcloneBases() { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->autoFinishSetSingleSubcloneArrays( true ); // bObserveDoNotFinishTags } } void Assembly :: setAllContigHighQualitySegments() { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->setHighQualitySegment(); } } void Assembly :: writeConsensusOfAllContigsToFastaFile() { FileName soAssemblyDirName( soGetAceFileName().soGetDirectory() ); FileName soTryOutFileName = soAssemblyDirName + "allContigs.fasta"; FileName soDirMask = soAssemblyDirName + "*.fasta"; FileName soUserOutFileName = GuiApp::popupFileSelector( "Save all contigs to file", soDirMask, soTryOutFileName ); // if user cancelled or do not supply name, cancel if ( soUserOutFileName.isNull() ) return; bool bGoAhead = true; if ( soUserOutFileName.bFileByThisNameExists() ) { bGoAhead = GuiApp::popupDecisionMessage( "File %s exists. Save anyway?", (const char*) soUserOutFileName ); } if ( bGoAhead ) { FILE* pfilBases = fopen( (const char*) soUserOutFileName, "w" ); if ( pfilBases == NULL ) { ostrstream ost; ost << "Unable to open file " << soUserOutFileName << " for writing bases " << endl << ends; SysRequestFailed srf( ost.str() ); srf.includeErrnoDescription(); throw srf; } for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->writeConsensusToFastaFile3( pfilBases, NULL, // pfilQualities false, // bWriteBothBasesAndQualFiles soUserOutFileName, // soBasesFileName "", // soQualFileName true, // bWriteWholeContig 0, // nStartUnpaddedConsPos 0 ); // nEndUnpaddedConsPos } fclose( pfilBases ); } } void Assembly :: createModelRead() { pModelReadNumberOfReadsAtEachLocation_ = new MBTValOrderedOffsetVector( (size_t) 1500 ); pModelReadNumberOfAlignedReadsAtEachLocation_ = new MBTValOrderedOffsetVector( (size_t) 1500 ); pModelReadDistributions_ = new MBTValOrderedOffsetVector( (size_t) 1500 ); // some reads are complemented and some are not. So go through the // list for top strand, and then go through the list for bottom strand // reads // Only the aligned portion of reads should be used, because the // unaligned portion of reads should be considered completely // wrong--error probability of 1 (or should it be error probability // of 3/4?) int nTotalReadsUsed = 0; for ( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); int nNumFrags = pContig->nGetNumberOfFragsInContig(); for (int nFrag = 0; nFrag < nNumFrags; nFrag++) { // get pointer to this located frag from contig LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet(nFrag); if ( pLocFrag->bIsAFakeRead() ) continue; if ( pLocFrag->bIsWholeReadUnaligned() ) continue; // any other reads that should be eliminated? if ( pLocFrag->length() > pModelReadDistributions_->length() ) { int nNumberOfElementsToAdd = pLocFrag->length() - pModelReadDistributions_->length(); for( int n = 0; n < nNumberOfElementsToAdd; ++n ) { float* pQualityArray = (float*) malloc( ( nMAX_QUALITY_LEVEL2 + 1) * sizeof( float ) ); if ( !pQualityArray ) { PANIC_OST( ost ) << "could not malloc memory for quality array in Assembly :: createModelRead()" << ends; throw SysRequestFailed( ost.str() ); } for( int nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pQualityArray[ nQuality ] = 0.0; pModelReadDistributions_->append( pQualityArray ); pModelReadNumberOfReadsAtEachLocation_->append( 0 ); pModelReadNumberOfAlignedReadsAtEachLocation_->append( 0 ); } } ++nTotalReadsUsed; // first evaluate the beginning of the read (in the direction // of sequencing) to include the quality of any x's // Stop when get to the aligned part of the read. int nConsPosStart; int nConsPosEnd; int nIncrement; if ( pLocFrag->bComp() ) { nIncrement = -1; nConsPosStart = pLocFrag->nGetAlignEnd(); nConsPosEnd = pLocFrag->nGetAlignClipEnd() + 1; } else { nIncrement = 1; nConsPosStart = pLocFrag->nGetAlignStart(); nConsPosEnd = pLocFrag->nGetAlignClipStart() - 1; } int nUnpaddedOrientedReadPos = 0; // will ++ below to 1 on // first base int nConsPos; for( nConsPos = nConsPosStart; ( pLocFrag->bComp() ? ( nConsPos >= nConsPosEnd ) : ( nConsPos <= nConsPosEnd ) ); nConsPos += nIncrement ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; ++nUnpaddedOrientedReadPos; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'x' ) { // in this case, add the quality of the base int nQuality = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); // Phil said (2/2000) to not count such bases if ( nQuality == 98 || nQuality == 99 ) continue; if ( nQuality > nMAX_QUALITY_LEVEL2 ) nQuality = nMAX_QUALITY_LEVEL2; float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedOrientedReadPos ]; ++ ( pArrayOfQualityValues[ nQuality ] ); ++ (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; ++ (*pModelReadNumberOfAlignedReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; } // if ( pLocFrag->ntGetFragFromConsPos( ... == 'x' ) ... else { float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedOrientedReadPos ]; // assume quality 0 if unaligned ++ ( pArrayOfQualityValues[ 0 ] ); ++ (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; } } // for( int nConsPos ... // now look through aligned portion of the read if ( pLocFrag->bComp() ) { nConsPosStart = pLocFrag->nGetAlignClipEnd(); nConsPosEnd = pLocFrag->nGetAlignClipStart(); } else { nConsPosStart = pLocFrag->nGetAlignClipStart(); nConsPosEnd = pLocFrag->nGetAlignClipEnd(); } for( nConsPos = nConsPosStart; ( pLocFrag->bComp() ? ( nConsPos >= nConsPosEnd ) : ( nConsPos <= nConsPosEnd ) ); nConsPos += nIncrement ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; ++nUnpaddedOrientedReadPos; int nQuality = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); // Phil said (2/2000) to not count such bases if ( nQuality == 98 || nQuality == 99 ) continue; // fix for Touchman 5/2006 if ( nQuality > nMAX_QUALITY_LEVEL2 ) nQuality = nMAX_QUALITY_LEVEL2; float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedOrientedReadPos ]; ++ ( pArrayOfQualityValues[ nQuality ] ); ++ (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; ++ (*pModelReadNumberOfAlignedReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; } // now look for any vector bases in the remaining portion of the read if ( pLocFrag->bComp() ) { nConsPosStart = pLocFrag->nGetAlignClipStart() - 1; nConsPosEnd = pLocFrag->nGetAlignStart(); } else { nConsPosStart = pLocFrag->nGetAlignClipEnd() + 1; nConsPosEnd = pLocFrag->nGetAlignEnd(); } for( nConsPos = nConsPosStart; ( pLocFrag->bComp() ? ( nConsPos >= nConsPosEnd ) : ( nConsPos <= nConsPosEnd ) ); nConsPos += nIncrement ) { if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).bIsPad() ) continue; ++nUnpaddedOrientedReadPos; if ( pLocFrag->ntGetFragFromConsPos( nConsPos ).cGetBase() == 'x' ) { int nQuality = pLocFrag->ntGetFragFromConsPos( nConsPos ).qualGetQuality(); if ( nQuality == 98 || nQuality == 99 ) continue; if ( nQuality > nMAX_QUALITY_LEVEL2 ) nQuality = nMAX_QUALITY_LEVEL2; float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedOrientedReadPos ]; ++ ( pArrayOfQualityValues[ nQuality ] ); ++ (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; ++ (*pModelReadNumberOfAlignedReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; } // if ( pLocFrag->ntGetFragFromConsPos ... == 'x' )... else { // this is an unaligned base that is not x, so give it // a quality of 0 float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedOrientedReadPos ]; ++( pArrayOfQualityValues[0] ); ++ (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedOrientedReadPos ]; } } assert( pLocFrag->nGetUnpaddedSeqLength() == nUnpaddedOrientedReadPos ); } // for( int nFrag = ... } nModelReadTotalReads_ = nTotalReadsUsed; // To avoid the model read being influenced by the few reads that // are very long, cut off the model read when less than 10% of // reads are that long. int nMinNumberOfReads = nModelReadTotalReads_ / 10; // bug fix in which cDNA sequencing would not clip model read even // when it had only 0 reads if ( nMinNumberOfReads < 1 ) nMinNumberOfReads = 1; int nUnpaddedSeq; for( nUnpaddedSeq = pModelReadNumberOfReadsAtEachLocation_->nGetEndIndex(); nUnpaddedSeq >= pModelReadNumberOfReadsAtEachLocation_->nGetStartIndex(); --nUnpaddedSeq ) { if ( (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedSeq ] < nMinNumberOfReads ) { float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedSeq ]; free( pArrayOfQualityValues); pModelReadDistributions_->removeLast(); pModelReadNumberOfReadsAtEachLocation_->removeLast(); pModelReadNumberOfAlignedReadsAtEachLocation_->removeLast(); } else break; } // If this lab has finishing reads that are shorter than the // shotgun reads, trim the model read back to this point. if ( pModelReadNumberOfReadsAtEachLocation_->length() > pCP->nAutoFinishMaximumFinishingReadLength_ ) { for( nUnpaddedSeq = pModelReadNumberOfReadsAtEachLocation_->nGetEndIndex(); nUnpaddedSeq > pCP->nAutoFinishMaximumFinishingReadLength_; --nUnpaddedSeq ) { float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedSeq ]; free( pArrayOfQualityValues ); pModelReadDistributions_->removeLast(); pModelReadNumberOfReadsAtEachLocation_->removeLast(); pModelReadNumberOfAlignedReadsAtEachLocation_->removeLast(); } } // convert pModelReadsDistributions_ from arrays with the absolute // number of reads at each quality // to arrays with the cumulative fractional number (per cent) of // reads at each quality for( nUnpaddedSeq = pModelReadNumberOfReadsAtEachLocation_->nGetStartIndex(); nUnpaddedSeq <= pModelReadNumberOfReadsAtEachLocation_->nGetEndIndex(); ++nUnpaddedSeq ) { float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedSeq ]; int nNumberOfReadsAtThisPosition = (*pModelReadNumberOfReadsAtEachLocation_)[ nUnpaddedSeq ]; // in this case, all the numbers of reads at each quality level // will also be zero, so no need to divide if ( nNumberOfReadsAtThisPosition == 0 ) { pArrayOfQualityValues[0] = 1.0; continue; } // if we want to consider shorter reads as having quality 0 at this // read position, add some quality zero counts if ( !pCP->bAutoFinishUseLongModelReadRatherThanShort_ ) { pArrayOfQualityValues[0] += ( nModelReadTotalReads_ - nNumberOfReadsAtThisPosition ); } // convert to cumulative distribution int nQuality; for( nQuality = 1; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pArrayOfQualityValues[ nQuality ] += pArrayOfQualityValues[ nQuality - 1 ]; // convert to fractional distribution float fDenominator = pCP->bAutoFinishUseLongModelReadRatherThanShort_ ? (float) nNumberOfReadsAtThisPosition : (float) nModelReadTotalReads_; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pArrayOfQualityValues[ nQuality ] /= fDenominator; assert( pArrayOfQualityValues[ nMAX_QUALITY_LEVEL2 ] == 1.0 ); } if ( pCP->bAutoFinishEmulate9_66Behavior_ ) addRedundancyToModelReadFor9_66(); // now calculate the model read makeModelRead( pModelRead_, pArrayOfModelReadsWithSeveralTemplates_, nModelReadHighQualitySegmentStart_, pModelReadDistributions_ ); } void Assembly :: makeModelRead( MBTValOrderedOffsetVector*& pModelRead, mbtPtrVector*& pArrayOfModelReadsWithSeveralTemplates, int& nModelReadHighQualitySegmentStart, MBTValOrderedOffsetVector* pModelReadDistributions ) { // have an offset of 1 so that it is indexed by the number of templates, // starting at 1 pArrayOfModelReadsWithSeveralTemplates = new mbtPtrVector( (size_t) pCP->nAutoFinishHowManyTemplatesYouIntendToUseForCustomPrimerSubcloneReactions_, 1 ); // starts with index 1 int nTemplates; for( nTemplates = 1; nTemplates <= pCP->nAutoFinishHowManyTemplatesYouIntendToUseForCustomPrimerSubcloneReactions_; ++nTemplates ) { MBTValOrderedOffsetVector* pModelReadWithSeveralTemplates = new MBTValOrderedOffsetVector( (size_t) pModelReadDistributions->length() ); (*pArrayOfModelReadsWithSeveralTemplates)[ nTemplates ] = pModelReadWithSeveralTemplates; } pModelRead = (*pArrayOfModelReadsWithSeveralTemplates)[ 1 ]; // temporary array of storing the array, the square of the array, // and the cube of the array where the array is the cumulative // error probability distribution float* pArrayOfQualityValues = (float*) malloc( ( nMAX_QUALITY_LEVEL2 + 1) * sizeof( float ) ); int nUnpaddedSeq; for( nUnpaddedSeq = pModelReadDistributions->nGetStartIndex(); nUnpaddedSeq <= pModelReadDistributions->nGetEndIndex(); ++nUnpaddedSeq ) { float* pArrayOfQualityValuesOriginal = (*pModelReadDistributions)[ nUnpaddedSeq ]; int nQuality; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pArrayOfQualityValues[ nQuality ] = 1.0; for( nTemplates = 1; nTemplates <= pCP->nAutoFinishHowManyTemplatesYouIntendToUseForCustomPrimerSubcloneReactions_; ++nTemplates ) { for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) pArrayOfQualityValues[ nQuality ] *= pArrayOfQualityValuesOriginal[ nQuality ]; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( pArrayOfQualityValues[ nQuality ] >= 0.5 ) break; } aModelReadWithSeveralTemplatesType* pModelReadWithSeveralTemplates = (*pArrayOfModelReadsWithSeveralTemplates)[ nTemplates ]; pModelReadWithSeveralTemplates->append( (unsigned char) nQuality ); } } // nUnpaddedSeq = pModelReadDistributions_->nGetStartIndex() ... // calculate nModelReadHighQualitySegmentStart_ nModelReadHighQualitySegmentStart = 1; // default for( nUnpaddedSeq = pModelRead_->nGetStartIndex(); nUnpaddedSeq <= pModelRead_->nGetEndIndex(); ++nUnpaddedSeq ) { if ( (*pModelRead_)[ nUnpaddedSeq ] != 0 ) { nModelReadHighQualitySegmentStart = nUnpaddedSeq; break; } } // calculate nModelReadAlignedSegmentEnd_ // which is used for fixing single subclone regions int nNumberOfAlignedReadsRequired = nModelReadTotalReads_ * pCP->nAutoFinishConfidenceThatReadWillCoverSingleSubcloneRegion_ / 100; nModelReadAlignedSegmentEnd_ = -1; for( nUnpaddedSeq = pModelReadNumberOfAlignedReadsAtEachLocation_->nGetEndIndex(); nUnpaddedSeq >= pModelReadNumberOfAlignedReadsAtEachLocation_->nGetStartIndex(); --nUnpaddedSeq ) { if ( (*pModelReadNumberOfAlignedReadsAtEachLocation_)[ nUnpaddedSeq ] >= nNumberOfAlignedReadsRequired ) { nModelReadAlignedSegmentEnd_ = nUnpaddedSeq; break; } } nModelReadAlignedSegmentStart_ = -1; for( nUnpaddedSeq = pModelReadNumberOfAlignedReadsAtEachLocation_->nGetStartIndex(); nUnpaddedSeq <= pModelReadNumberOfAlignedReadsAtEachLocation_->nGetEndIndex(); ++nUnpaddedSeq ) { if ( (*pModelReadNumberOfAlignedReadsAtEachLocation_)[ nUnpaddedSeq ] >= nNumberOfAlignedReadsRequired ) { nModelReadAlignedSegmentStart_ = nUnpaddedSeq; break; } } if ( nModelReadAlignedSegmentStart_ == -1 || nModelReadAlignedSegmentEnd_ == -1 ) { nModelReadAlignedSegmentStart_ = pCP->nAutoFinishPotentialHighQualityPartOfReadStart_; nModelReadAlignedSegmentEnd_ = pCP->nAutoFinishPotentialHighQualityPartOfReadEnd_; } free( pArrayOfQualityValues ); } void Assembly :: showAutofinishGoodModelRead() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); createModelRead(); delete pPleaseWait; TextBox* pTB = new TextBox( "Autofinish Model Read", 20 ); // number of rows pTB->append( "\n\n" ); pTB->appendWithArgs( "total reads used: %d\n", nModelReadTotalReads_ ); pTB->appendWithArgs( "high quality segment start = %d\n\n", nModelReadHighQualitySegmentStart_ ); pTB->appendWithArgs( "nModelReadAlignedSegmentStart_ = %d\n", nModelReadAlignedSegmentStart_ ); pTB->appendWithArgs( "nModelReadAlignedSegmentEnd_ = %d\n\n", nModelReadAlignedSegmentEnd_ ); pTB->append( "columns:\n" ); pTB->append( "read position, unpadded, in direction of sequencing\n" ); pTB->append( "median quality at that read position\n" ); pTB->append( "number of reads used at that read position to determine quality\n" ); pTB->append( "median maximum quality of 2 independent reads at that position\n" ); pTB->append( "median maximum quality of 3 independent reads at that position\n" ); pTB->append( "median maximum quality of 4 independent reads at that position\n" ); pTB->append( "quality value and % of reads that have that quality value or less at that sequence position\n" ); pTB->append( "\n" ); pTB->append( "read quality error for errors\n" ); pTB->append( "pos prob fixed\n\n\n" ); double dSumErrorProbabilities = 0; int nSeqPos; for( nSeqPos = pModelRead_->nGetStartIndex(); nSeqPos <= pModelRead_->nGetEndIndex(); ++nSeqPos ) { int nQuality = (*pModelRead_)[ nSeqPos ]; float* pDistribution = (*pModelReadDistributions_)[ nSeqPos ]; pTB->appendWithArgs( "%3d %2d %5d ", nSeqPos, nQuality, (*pModelReadNumberOfReadsAtEachLocation_)[ nSeqPos ] ); float dis2Reads[nMAX_QUALITY_LEVEL2 + 1]; float dis3Reads[nMAX_QUALITY_LEVEL2 + 1]; float dis4Reads[nMAX_QUALITY_LEVEL2 + 1]; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { dis2Reads[ nQuality ] = pDistribution[ nQuality ] * pDistribution[ nQuality ]; dis3Reads[ nQuality ] = dis2Reads[ nQuality ] * pDistribution[ nQuality ]; dis4Reads[ nQuality ] = dis3Reads[ nQuality ] * pDistribution[ nQuality ]; } // find median of each distribution int nMedianQuality2Reads = -1; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( dis2Reads[ nQuality ] >= .5 ) { nMedianQuality2Reads = nQuality; break; } } int nMedianQuality3Reads = -1; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( dis3Reads[ nQuality ] >= .5 ) { nMedianQuality3Reads = nQuality; break; } } int nMedianQuality4Reads = -1; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { if ( dis4Reads[ nQuality ] >= .5 ) { nMedianQuality4Reads = nQuality; break; } } pTB->appendWithArgs("%2d %2d %2d ", nMedianQuality2Reads, nMedianQuality3Reads, nMedianQuality4Reads ); for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { float fPercent = pDistribution[ nQuality ]; pTB->appendWithArgs( "%d: %.3f ", nQuality, fPercent ); } pTB->append( "\n" ); } pTB->append("\n\n\n"); pTB->append( "Aligned reads at each position:\n" ); pTB->append( "\n" ); pTB->append( "read pos # of aligned reads fraction of total reads\n" ); for( nSeqPos = pModelReadNumberOfAlignedReadsAtEachLocation_->nGetStartIndex(); nSeqPos <= pModelReadNumberOfAlignedReadsAtEachLocation_->nGetEndIndex(); ++nSeqPos ) { pTB->appendWithArgs("%2d %5d %.5f\n", nSeqPos, (*pModelReadNumberOfAlignedReadsAtEachLocation_)[nSeqPos ], (float) (*pModelReadNumberOfAlignedReadsAtEachLocation_)[nSeqPos ]/ (float) nModelReadTotalReads_ ); } pTB->makeVisible(); } static int compareTemplatesBySize( const subcloneTTemplate** ppSub1, const subcloneTTemplate** ppSub2 ) { if ( (*ppSub1)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ && !(*ppSub2)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ) return( -1 ); else if ( !(*ppSub1)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ && (*ppSub2)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ) return( 1 ); else if ( !(*ppSub1)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ && !(*ppSub2)->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ) { // just to have a defined order, sort bad templates alphabetically return( compareTemplates( ppSub1, ppSub2 ) ); } else { // my favorite case--when there are 2 templates that both have // forward/reverse pairs if ( (*ppSub1)->nInsertSizeFromForwardReversePair_ < (*ppSub2)->nInsertSizeFromForwardReversePair_ ) return( -1 ); else if ( (*ppSub1)->nInsertSizeFromForwardReversePair_ == (*ppSub2)->nInsertSizeFromForwardReversePair_ ) return( 0 ); else return( 1 ); } } void Assembly :: showTemplateInsertLocations() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); setContigTemplateArrays(); RWTPtrOrderedVector aTemplatesSortedBySize; aTemplatesSortedBySize.resize( subcloneTTemplateArray_.length() ); int nSub; for( nSub = 0; nSub < subcloneTTemplateArray_.length(); ++nSub ) { aTemplatesSortedBySize.insert( subcloneTTemplateArray_[ nSub ] ); } void* pArray = (void*) aTemplatesSortedBySize.data(); size_t nNumberOfTemplates = aTemplatesSortedBySize.entries(); size_t nSizeOfAnElement = sizeof( subcloneTTemplate* ); qsort( pArray, nNumberOfTemplates, nSizeOfAnElement, ( (int(*) ( const void*, const void*) ) compareTemplatesBySize ) ); // check that the array is now sorted by template size for( nSub = 0; nSub < aTemplatesSortedBySize.entries() - 1; ++nSub ) { subcloneTTemplate* pSub1 = aTemplatesSortedBySize[ nSub ]; subcloneTTemplate* pSub2 = aTemplatesSortedBySize[ nSub + 1 ]; if ( pSub1->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ && pSub2->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ) { if ( ! ( pSub1->nInsertSizeFromForwardReversePair_ <= pSub2->nInsertSizeFromForwardReversePair_ ) ) { cerr << "Elements " << nSub << " and " << nSub + 1 << " out of order with insert sizes " << pSub1->nInsertSizeFromForwardReversePair_ << " and " << pSub2->nInsertSizeFromForwardReversePair_ << endl; } } } TextBox* pTB = new TextBox( "Template Insert Locations", 20 ); // number of rows pTB->append( "Example:\n" ); pTB->append( "0719 762 ok\n" ); pTB->append( " HAHCA1Q0719.scf univ primer 38 \n" ); pTB->append( " HAHCA1T0719.scf univ primer 33 \n" ); pTB->append( "\n\nIn the above example, \n"); pTB->append( " 0719 is the template name, \n" ); pTB->append( " 762 is the insert size, \n" ); pTB->append( " ok indicates that this template is ok to be used by autofinish\n" ); pTB->append( " 38 is the sequencing position of the start of the insert in universal primer read HAHCA1Q0719.scf, \n" ); pTB->append( " 33 is the sequencing position of the start of the insert in universal primer read HAHCA1T0719.scf\n\n\n" ); for( nSub = 0; nSub < aTemplatesSortedBySize.length(); ++nSub ) { subcloneTTemplate* pSub = aTemplatesSortedBySize[ nSub ]; RWCString soProblems; if ( pSub->bOKToUseThisTemplate_ ) soProblems += "ok "; if ( pSub->bBadTemplateFile_ ) soProblems += "in bad template file "; if ( pSub->bBadLibraryFile_ ) soProblems += "in bad library file "; if ( pSub->bShortInsert_ ) soProblems += "short insert "; if ( pSub->bReadsAreInconsistent_ ) soProblems += "reads inconsistent "; if ( pSub->bReadPairTooFarApart_ ) soProblems += "forward/reverse pair too far apart "; if ( pSub->bUnalignedHighQualityRegionTooLong_ ) soProblems += "unaligned high quality region too long "; if ( pSub->bHasSeriousHighQualityDiscrepancies_ ) soProblems += "has serious high quality discrepancies "; if ( pSub->bIsChimeric_ ) soProblems += "is chimeric "; if ( pSub->bInconsistentGapSpanning_ ) soProblems += "incorrectly indicates the way that 2 contigs are oriented"; int nInsertSize = -1; if ( pSub->bOKToUseThisTemplateForCalculatingInsertSizeFromForwardReversePairs_ ) nInsertSize = pSub->nInsertSizeFromForwardReversePair_; else if ( pSub->bOKToUseThisTemplate_ ) { // this essentially just picks the "a little less than average" // size for the library nInsertSize = pSub->nUnpaddedEnd_ - pSub->nUnpaddedStart_ + 1; } pTB->appendWithArgs( "%-30s %d %s", (char*) pSub->soTemplateName_, nInsertSize, (char*) soProblems ); if ( pSub->soLibrary_.isNull() ) pTB->append( "\n" ); else pTB->appendWithArgs( " library: %s\n", pSub->soLibrary_.data() ); for( int nRead = 0; nRead < pSub->aReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = pSub->aReads_[ nRead ]; if ( pLocFrag->nReadType_ == nWalk ) { pTB->appendWithArgs( " %-20s walk ", (char*) pLocFrag->soGetName() ); if ( pLocFrag->bIsWholeReadUnaligned() ) pTB->append( "totally unaligned " ); else { int nLastBaseBeforeVector; bool bFoundXsAfterAligned; pLocFrag->findVectorInsertJunctionInWalkingRead( bFoundXsAfterAligned, nLastBaseBeforeVector ); if ( bFoundXsAfterAligned ) pTB->appendWithArgs( " found vector after last aligned base %d\n", nLastBaseBeforeVector ); else pTB->appendWithArgs(" did not find vector after last aligned base %d\n", nLastBaseBeforeVector ); } } else { int nFirstVectorInsertJunction; int nSecondVectorInsertJunction; bool bFoundFirstVectorInsertJunction; bool bFoundSecondVectorInsertJunction; pLocFrag->findVectorInsertJunctionsInUniversalPrimerRead( nFirstVectorInsertJunction, nSecondVectorInsertJunction, bFoundSecondVectorInsertJunction ); int nSeqPos = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( nFirstVectorInsertJunction ); pTB->appendWithArgs( " %-20s univ primer %d ", (char*) pLocFrag->soGetName(), nSeqPos ); if ( pLocFrag->bIsWholeReadUnaligned() ) pTB->append( "totally unaligned " ); if ( bFoundSecondVectorInsertJunction ) { int nSeqPos2 = pLocFrag->nOrientedUnpaddedFragPosFromConsPos( nSecondVectorInsertJunction ); pTB->appendWithArgs( " found second v/i junction at %d\n", nSeqPos2 ); } else pTB->append( "\n" ); } } // for( int nRead ... } // for( int nSub ... delete pPleaseWait; pTB->makeVisible(); } void Assembly :: showAutoFinishErrorsFixedFormula() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); createModelRead(); TextBox* pTB = new TextBox( "Autofinish Errors Fixed Formula" ); pTB->append( "\n\n" ); pTB->append( "quality new cons qual ----errors fixed---- ---error probability\n" ); pTB->append( "con read uni red unique redundant consensus read \n" ); pTB->append( "\n" ); for( int nConsensusQuality = 0; nConsensusQuality <= 90; nConsensusQuality += 5 ) { for( int nReadQuality = 0; nReadQuality <= 50; nReadQuality += 10 ) { double dConsensusErrorProbability = dErrorRateFromQuality[ nConsensusQuality ]; double dReadErrorProbability = dErrorRateFromQuality[ nReadQuality ]; double dErrorsFixedRedundantReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dReadErrorProbability, TOP_STRAND_TERMINATOR_READS, TOP_STRAND_TERMINATOR_READS ); double dErrorsFixedUniqueReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dReadErrorProbability, TOP_STRAND_TERMINATOR_READS, BOTTOM_STRAND_TERMINATOR_READS ); double dNewConsensusErrorProbabilityRedundant = dConsensusErrorProbability - dErrorsFixedRedundantReadType; double dNewConsensusErrorProbabilityUnique = dConsensusErrorProbability - dErrorsFixedUniqueReadType; double dQualityNewConsensusRedundant = -10.0 * log10( dNewConsensusErrorProbabilityRedundant ); double dQualityNewConsensusUnique = -10.0 * log10( dNewConsensusErrorProbabilityUnique ); pTB->appendWithArgs( "%2d %2d %4.1f %4.1f ", nConsensusQuality, nReadQuality, dQualityNewConsensusUnique, dQualityNewConsensusRedundant ); // now for the fun--add another read // first, find the location in the model read where the // desired read quality value is attained as a median: bool bFoundSeqPosition = false; int nSeqPos; for( nSeqPos = pModelRead_->nGetStartIndex(); nSeqPos <= pModelRead_->nGetEndIndex(); ++nSeqPos ) { if ( (*pModelRead_)[ nSeqPos ] >= nReadQuality ) { bFoundSeqPosition = true; break; } } // there might not be any such point, though if ( bFoundSeqPosition ) { float* pDistribution = (*pModelReadDistributions_)[ nSeqPos ]; int nMedianQualityOf2Reads; for( nMedianQualityOf2Reads = 0; nMedianQualityOf2Reads <= nMAX_QUALITY_LEVEL2; ++nMedianQualityOf2Reads ) { if ( pDistribution[ nMedianQualityOf2Reads ] * pDistribution[ nMedianQualityOf2Reads ] > 0.5 ) break; } dErrorsFixedRedundantReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dErrorRateFromQuality[ nMedianQualityOf2Reads ], TOP_STRAND_TERMINATOR_READS, TOP_STRAND_TERMINATOR_READS ); dErrorsFixedUniqueReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dErrorRateFromQuality[ nMedianQualityOf2Reads ], TOP_STRAND_TERMINATOR_READS, BOTTOM_STRAND_TERMINATOR_READS ); dNewConsensusErrorProbabilityRedundant = dConsensusErrorProbability - dErrorsFixedRedundantReadType; dNewConsensusErrorProbabilityUnique = dConsensusErrorProbability - dErrorsFixedUniqueReadType; dQualityNewConsensusRedundant = -10.0 * log10( dNewConsensusErrorProbabilityRedundant ); dQualityNewConsensusUnique = -10.0 * log10( dNewConsensusErrorProbabilityUnique ); pTB->appendWithArgs( "%4d 2 reads: %2d %4.1f %4.1f ", nSeqPos, nMedianQualityOf2Reads, dQualityNewConsensusUnique, dQualityNewConsensusRedundant ); int nMedianQualityOf3Reads; for( nMedianQualityOf3Reads = 0; nMedianQualityOf3Reads <= nMAX_QUALITY_LEVEL2; ++nMedianQualityOf3Reads ) { if ( pDistribution[ nMedianQualityOf3Reads ] * pDistribution[ nMedianQualityOf3Reads ] * pDistribution[ nMedianQualityOf3Reads ] > 0.5 ) break; } dErrorsFixedRedundantReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dErrorRateFromQuality[ nMedianQualityOf3Reads ], TOP_STRAND_TERMINATOR_READS, TOP_STRAND_TERMINATOR_READS ); dErrorsFixedUniqueReadType = dGetErrorsFixedAtBase( dConsensusErrorProbability, dErrorRateFromQuality[ nMedianQualityOf3Reads ], TOP_STRAND_TERMINATOR_READS, BOTTOM_STRAND_TERMINATOR_READS ); dNewConsensusErrorProbabilityRedundant = dConsensusErrorProbability - dErrorsFixedRedundantReadType; dNewConsensusErrorProbabilityUnique = dConsensusErrorProbability - dErrorsFixedUniqueReadType; dQualityNewConsensusRedundant = -10.0 * log10( dNewConsensusErrorProbabilityRedundant ); dQualityNewConsensusUnique = -10.0 * log10( dNewConsensusErrorProbabilityUnique ); pTB->appendWithArgs( "3 reads: %2d %4.1f %4.1f\n", nMedianQualityOf3Reads, dQualityNewConsensusUnique, dQualityNewConsensusRedundant ); } // if ( bFoundSeqPosition ) else pTB->append(" no model read position this good\n" ); } } delete pPleaseWait; pTB->makeVisible(); } void Assembly :: showLibraryInfo() { PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); setContigTemplateArrays(); countReadsInEachLibrary(); const int nNumberOfRowsVisible = 35; TextBox* pTextBox = new TextBox( "Library Info", nNumberOfRowsVisible ); pTextBox->append( "columns:\n" ); pTextBox->append( "# : # of fwd/rev pairs used in calculation\n" ); pTextBox->append( "#2 : # of reads in library\n" ); pTextBox->append( " (note: whole clone reads, singlets, and fake reads\n are not included in this number)\n" ); pTextBox->append( "mean : calculated mean insert size of library\n" ); pTextBox->append( "std : standard deviation of insert size of library\n" ); pTextBox->append( "def : default insert size--only used if not enough fwd/rev pairs\n" ); pTextBox->append( "avg : insert size that will be used\n" ); pTextBox->append( "less : a little less than average insert size to be used\n" ); pTextBox->append( "max : maximum insert size to use (usually from statistical analysis of distribution of fwd/rev pairs\n" ); pTextBox->append( "maxf : maximum insert size from librariesInfo.txt\n" ); pTextBox->append( "st : user-specified single-stranded or double-stranded\n" ); pTextBox->append( "f/r : using forward/reverse pairs (not defaults)\n" ); pTextBox->append( "ok : not in bad libraries file\n" ); pTextBox->append( "\n\n" ); pTextBox->append( "name # #2 mean std def avg less max maxf st f/r ok?\n" ); for( int nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pTextBox->appendWithArgs( "%-15s %6d %6d %5d %5d %4d %5d %4d %5d %5d %-2s %-3s %-3s\n", pLib->soName_.data(), pLib->nNumberOfForwardReversePairsForCalculation_, pLib->nNumberOfReadsFromThisLibrary_, pLib->nMeanInsertSize_, pLib->nStandardDeviationOfInsertSize_, pLib->nDefaultInsertSize_, pLib->nAverageInsertSizeToUse_, pLib->nALittleLessThanAverageInsertSizeToUse_, pLib->nMaxInsertSize_, pLib->nMaxInsertSizeFromFile_, ( pLib->bSingleNotDoubleStranded_ ? "SS" : "DS" ), ( pLib->bAverageInsertSizeFromForwardReversePairs_ ? "Yes" : "No" ), ( bIsThisLibraryOKNotBad( pLib->soName_ ) ? "ok" : "bad" ) ); } pTextBox->makeVisible(); delete pPleaseWait; } void Assembly :: countReadsInEachLibrary() { for( int nLib = 0; nLib < pCP->pListOfLibraries_->aLibraries_.length(); ++nLib ) { library* pLib = pCP->pListOfLibraries_->aLibraries_[ nLib ]; pLib->nNumberOfReadsFromThisLibrary_ = 0; } for( int nTemplate = 0; nTemplate < subcloneTTemplateArray_.entries(); ++nTemplate ) { subcloneTTemplate* pSub = subcloneTTemplateArray_[ nTemplate ]; pSub->pLibrary_->nNumberOfReadsFromThisLibrary_ += pSub->aReads_.length(); } } bool Assembly :: bIsThisUniversalPrimerReadInSinglets( const int nReadType, const RWCString& soTemplateName ) { int nIndex = pSingletsSortedByTemplateName_->nFindFirstOccurrenceOfMatch( soTemplateName ); if ( nIndex == RW_NPOS ) return( false ); bool bLookAgain = true; for( ; nIndex < pSingletsSortedByTemplateName_->entries() && bLookAgain; ++nIndex ) { singletInfo* pSinglet = (*pSingletsSortedByTemplateName_)[ nIndex ]; if ( pSinglet->soTemplate_ != soTemplateName ) bLookAgain = false; else { if ( pSinglet->nReadType_ == nReadType ) return( true ); } } return( false ); } void Assembly :: addRedundancyToModelReadFor9_66() { int nAmountToShiftQualityDistribution = (int) pCP->dAutoFinishRedundancy_ - 1; for( int nUnpaddedSeq = pModelReadNumberOfReadsAtEachLocation_->nGetStartIndex(); nUnpaddedSeq <= pModelReadNumberOfReadsAtEachLocation_->nGetEndIndex(); ++nUnpaddedSeq ) { float* pArrayOfQualityValues = (*pModelReadDistributions_)[ nUnpaddedSeq ]; // shift left int nQuality; for( nQuality = 0; nQuality <= nMAX_QUALITY_LEVEL2 - nAmountToShiftQualityDistribution; ++nQuality ) { pArrayOfQualityValues[ nQuality ] = pArrayOfQualityValues[ nQuality + nAmountToShiftQualityDistribution ]; } for( nQuality = nMAX_QUALITY_LEVEL2 - nAmountToShiftQualityDistribution + 1; nQuality <= nMAX_QUALITY_LEVEL2; ++nQuality ) { pArrayOfQualityValues[ nQuality ] = 1.0; } } } void Assembly :: getReadLengthStatistics( double& dMean, double& dStandardDeviation ) { if ( !bAlreadyCalculatedReadLengthStatistics_ ) findReadLengthStatistics(); dMean = dMeanReadLength_; dStandardDeviation = dStandardDeviationOfReadLength_; } void Assembly :: findReadLengthStatistics() { // calculate mean and standard deviation of all reads that // have a high quality segment. Then throw out the outliers and do // it again. double dTotalEndOfHighQualityRegions = 0.0; double dTotalSquaredEndOfHighQualityRegions = 0.0; int nTotalReads = 0; int nContig; for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); if ( pLocFrag->bWholeReadIsLowQuality_ ) continue; int nEndOfHighQualityReadPos; if ( pLocFrag->bComp() ) { // <----------------------------- // lllllHHHHHHHHHHHHHHHHHHlllllllll // ^ // nGetHighQualityStart() // ^ // nGetAlignEnd() nEndOfHighQualityReadPos = pContig->nUnpaddedIndex( pLocFrag->nGetAlignEnd() ) - pContig->nUnpaddedIndex( pLocFrag->nGetHighQualityStart() ) + 1; } else { // --------------------------------> // llllllHHHHHHHHHHHHHHHllllllllllll // ^ // nGetHighQualityEnd() // ^ // nGetAlignStart() nEndOfHighQualityReadPos = pContig->nUnpaddedIndex( pLocFrag->nGetHighQualityEnd() ) - pContig->nUnpaddedIndex( pLocFrag->nGetAlignStart() ) + 1; } dTotalEndOfHighQualityRegions += nEndOfHighQualityReadPos; dTotalSquaredEndOfHighQualityRegions += ( nEndOfHighQualityReadPos * nEndOfHighQualityReadPos ); ++nTotalReads; } } if ( nTotalReads == 0 ) { // this caused floating point exception if there were 0 // reads with high quality segment. This info is not used // in such a case (finding stops) addStopsCausingLCQs so // it really doesn't matter what the value is. // (Helped Kerrie Bubb, 1/14/03) dMeanReadLength_ = 0.0; dStandardDeviationOfReadLength_ = 0.0; bAlreadyCalculatedReadLengthStatistics_ = true; return; } dMeanReadLength_ = dTotalEndOfHighQualityRegions / (double) nTotalReads; dStandardDeviationOfReadLength_ = sqrt( dTotalSquaredEndOfHighQualityRegions / (double) nTotalReads - dMeanReadLength_*dMeanReadLength_ ); // now do this same process again, throwing out outliers int nTooBig = dMeanReadLength_ + 3.0*dStandardDeviationOfReadLength_; dTotalEndOfHighQualityRegions = 0.0; dTotalSquaredEndOfHighQualityRegions = 0.0; nTotalReads = 0; for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); if ( pLocFrag->bWholeReadIsLowQuality_ ) continue; int nEndOfHighQualityReadPos; if ( pLocFrag->bComp() ) { // <----------------------------- // lllllHHHHHHHHHHHHHHHHHHlllllllll // ^ // nGetHighQualityStart() // ^ // nGetAlignEnd() nEndOfHighQualityReadPos = pContig->nUnpaddedIndex( pLocFrag->nGetAlignEnd() ) - pContig->nUnpaddedIndex( pLocFrag->nGetHighQualityStart() ) + 1; } else { // --------------------------------> // llllllHHHHHHHHHHHHHHHllllllllllll // ^ // nGetHighQualityEnd() // ^ // nGetAlignStart() nEndOfHighQualityReadPos = pContig->nUnpaddedIndex( pLocFrag->nGetHighQualityEnd() ) - pContig->nUnpaddedIndex( pLocFrag->nGetAlignStart() ) + 1; } if ( nEndOfHighQualityReadPos >= nTooBig ) continue; dTotalEndOfHighQualityRegions += nEndOfHighQualityReadPos; dTotalSquaredEndOfHighQualityRegions += ( nEndOfHighQualityReadPos * nEndOfHighQualityReadPos ); ++nTotalReads; } } if ( nTotalReads == 0 ) { // this caused floating point exception if there were 0 // reads with high quality segment. This info is not used // in such a case (finding stops) addStopsCausingLCQs so // it really doesn't matter what the value is. // (Helped Kerrie Bubb, 1/14/03) dMeanReadLength_ = 0.0; dStandardDeviationOfReadLength_ = 0.0; bAlreadyCalculatedReadLengthStatistics_ = true; return; } dMeanReadLength_ = dTotalEndOfHighQualityRegions / (double) nTotalReads; dStandardDeviationOfReadLength_ = sqrt( dTotalSquaredEndOfHighQualityRegions / (double) nTotalReads - dMeanReadLength_*dMeanReadLength_ ); bAlreadyCalculatedReadLengthStatistics_ = true; } void Assembly :: showMicrosatellites() { setAllPaddedPositionsArrays(); gotoList* pGotoList = new gotoList(); // first look through the entire assembly for runs for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); const int nExtraXs = 4; int nConsensusLength; char* szUnpaddedConsensus = pContig->szGetUnpaddedConsensus( nConsensusLength, nExtraXs ); // pad on the right with more x's strcat( szUnpaddedConsensus + nConsensusLength, "xxxx" ); int nStringLength = nConsensusLength + nExtraXs; int nBestScore; do { int nEndingRowOfBestScoredAlignment; int nEndingColOfBestScoredAlignment; findRepeat( szUnpaddedConsensus, nStringLength, &nBestScore, &nEndingRowOfBestScoredAlignment, &nEndingColOfBestScoredAlignment ); if ( nBestScore >= pCP->nAutoFinishMinSmithWatermanScoreOfARun_ ) { // report this run RWCString soAlignment1; RWCString soAlignment2; RWCString soAlignment3; int nBeginningRowOfBestScoredAlignment; int nBeginningColOfBestScoredAlignment; findAlignment( szUnpaddedConsensus, nStringLength, nBestScore, nEndingRowOfBestScoredAlignment, nEndingColOfBestScoredAlignment, &nBeginningRowOfBestScoredAlignment, &nBeginningColOfBestScoredAlignment, soAlignment1, soAlignment2, soAlignment3 ); // the unpadded position is 1-based while the // location is zero-based int nUnpaddedLeft = nBeginningRowOfBestScoredAlignment + 1; int nUnpaddedRight = nEndingRowOfBestScoredAlignment + 1; int nUnpaddedLeft2 = nBeginningColOfBestScoredAlignment + 1; int nUnpaddedRight2 = nEndingColOfBestScoredAlignment + 1; // print the alignment to the xterm printf( "\n\n%s score = %d\n", pContig->soGetName().data(), nBestScore ); int nUnpaddedTop = nUnpaddedLeft; int nUnpaddedBottom = nUnpaddedLeft2; for( int nPadded = 0; nPadded <= soAlignment1.length(); nPadded += 50 ) { int nMaxPadded = MIN( soAlignment1.length() - 1, nPadded + 49 ); printf( "%-9d ", nUnpaddedTop ); int nPadded2; for( nPadded2 = nPadded; nPadded2 <= nMaxPadded; ++nPadded2 ) { printf( "%c", soAlignment1[ nPadded2 ] ); if ( soAlignment1[ nPadded2 ] != '*' ) ++nUnpaddedTop; } printf( " %-9d\n", nUnpaddedTop - 1 ); printf( " " ); for( nPadded2 = nPadded; nPadded2 <= nMaxPadded; ++nPadded2 ) printf( "%c", soAlignment2[ nPadded2 ] ); printf( "\n" ); printf( "%-9d ", nUnpaddedBottom ); for( nPadded2 = nPadded; nPadded2 <= nMaxPadded; ++nPadded2 ) { printf( "%c", soAlignment3[ nPadded2 ] ); if ( soAlignment3[ nPadded2 ] != '*' ) ++nUnpaddedBottom; } printf( " %-9d\n", nUnpaddedBottom - 1 ); printf( "\n\n" ); } int nPaddedLeft = pContig->nPaddedIndexFast( nUnpaddedLeft ); int nPaddedRight = pContig->nPaddedIndexFast( nUnpaddedRight ); RWCString soNote( (long) nBestScore ); gotoItem* pGotoItem = new gotoItem( pContig, NULL, nPaddedLeft, nPaddedRight, nUnpaddedLeft, nUnpaddedRight, soNote ); pGotoList->addToList( pGotoItem ); // set up for trying to find another run. // this masks out most of the alignment so that, hopefully, // the next one will be found somewhere else for( int nZeroBased = nUnpaddedLeft - 1; nZeroBased <= ( nUnpaddedRight - 1 ); ++nZeroBased ) { szUnpaddedConsensus[ nZeroBased ] = 'x'; } } } while( nBestScore >= pCP->nAutoFinishMinSmithWatermanScoreOfARun_ ); delete szUnpaddedConsensus; } // for( int nContig ... pGotoList->sortByPosition(); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Microsatellites", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pGotoList, NULL ) ); } void Assembly :: showMononucleotideRichRegions() { gotoList* pGotoList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); bool bFoundRegion; pContig->addMononucleotideRichRegionsToList( pGotoList, pContig->nGetUnpaddedStartIndex(), pContig->nGetUnpaddedEndIndex(), true, // bNeedToSetPaddedPositionsArrays bFoundRegion ); } ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Mononucleotide-Rich Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pGotoList, NULL ) ); } void Assembly :: showDinucleotideRichRegions() { gotoList* pGotoList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); bool bFoundRegion; pContig->addDinucleotideRichRegionsToList( pGotoList, pContig->nGetUnpaddedStartIndex(), pContig->nGetUnpaddedEndIndex(), true, // bNeedToSetPaddedPositionsArrays bFoundRegion ); } ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Dinucleotide-Rich Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pGotoList, NULL ) ); } void Assembly :: showRunsCausingLCQs() { gotoList* pRunsGotoList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->addRunsAndMicrosatellitesCausingLCQs( pRunsGotoList, false ); } pRunsGotoList->sortByPosition(); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Runs and Microsatellites Causing Low Consensus Quality Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pRunsGotoList, NULL ) ); } void Assembly :: showRunsAndStopsCausingLCQs() { gotoList* pRunsAndStopsGotoList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->addStopsCausingLCQs( pRunsAndStopsGotoList, true ); pContig->addRunsAndMicrosatellitesCausingLCQs( pRunsAndStopsGotoList, false ); } pRunsAndStopsGotoList->sortByPosition(); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Runs and Stops Causing Low Consensus Quality Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pRunsAndStopsGotoList, NULL ) ); } void Assembly :: showStopsCausingLCQs() { gotoList* pStopsGotoList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->addStopsCausingLCQs( pStopsGotoList, true ); } pStopsGotoList->sortByPosition(); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Stops Causing Low Consensus Quality Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pStopsGotoList, NULL ) ); } void Assembly :: showEditableLowConsensusQualityRegions() { cerr << "in showEditableLowConsensusQualityRegions" << endl; gotoList* pEditableSitesList = new gotoList(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->addEditableSites( pEditableSitesList ); } ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "Editable Low Consensus Quality Regions", "", "", 75, // nWidthInChars "", NULL, NULL, NULL, pEditableSitesList, NULL ) ); } void Assembly :: printMinilibrariesSummaryFile() { // if there is only one contig if ( !pContigEndPairArray_ ) return; for( int nPair = 0; nPair < pContigEndPairArray_->length(); ++nPair ) { contigEndPair* pPair = (*pContigEndPairArray_)[ nPair ]; // there might not be any minilibrary for this contigEndPair if ( !pPair->pChoiceOfTemplatesForMinilibraries_ ) continue; fprintf( pMINILIBRARIES, "%s,%s,%s,%s", pPair->pContig_[0]->soGetName().data(), ( pPair->nWhichEnd_[0] == nLeftGap ? "left" : "right" ), pPair->pContig_[1]->soGetName().data(), ( pPair->nWhichEnd_[1] == nLeftGap ? "left" : "right" ) ); for( int nMini = 0; ( nMini < pCP->nAutoFinishSuggestThisManyMinilibrariesPerGap_ ) && ( nMini < pPair->pChoiceOfTemplatesForMinilibraries_->length() ); ++nMini ) { templateForMinilibrary* pMini = (*(pPair->pChoiceOfTemplatesForMinilibraries_))[ nMini ]; fprintf( pMINILIBRARIES, ",%s", pMini->pSub_->soTemplateName_.data() ); } // for( int nMini = 0; ... fprintf( pMINILIBRARIES, "\n" ); } // for( int nPair = 0; ... } void Assembly :: figureOutRealContigs() { if ( aRealContigs_.length() == 0 ) aRealContigs_.resize( (size_t) ( nNumContigs()/ 10 ) ); aRealContigs_.clear(); for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); if ( pContig->bIAmARealContig() ) { aRealContigs_.insert( pContig ); } } } static int nCompareContigEndPairs( const contigEndPair** ppCEP1, const contigEndPair** ppCEP2 ) { if ( !(*ppCEP1)->bUserDefined_ && !(*ppCEP2)->bUserDefined_ ) { // most common case--both contigEndPair's are based on forward // reverse pairs if ( (*ppCEP1)->nNumberOfForwardReversePairs_ < (*ppCEP2)->nNumberOfForwardReversePairs_ ) return( 1 ); else if ( (*ppCEP1)->nNumberOfForwardReversePairs_ > (*ppCEP2)->nNumberOfForwardReversePairs_ ) return( -1 ); else return( 0 ); } else if ( (*ppCEP1)->bUserDefined_ && !(*ppCEP2)->bUserDefined_ ) return( -1 ); else if ( !(*ppCEP1)->bUserDefined_ && (*ppCEP2)->bUserDefined_ ) return( 1 ); else { // both are user defined. In this case, I really don't care, // but let's sort by contig name to define an order. if ( (*ppCEP1)->pContig_[0]->soGetName() < (*ppCEP2)->pContig_[0]->soGetName() ) return( -1 ); else if ( (*ppCEP1)->pContig_[0]->soGetName() == (*ppCEP2)->pContig_[0]->soGetName() ) return( 0 ); else return( 1 ); } } // currently (July 2002) I see only 3 uses for this: // assemblyView // restriction digest // showing the scaffold // I do not see this used by autoFinish. void Assembly :: figureOutContigOrderAndOrientation() { FILE* pSaveAO = pAO; if ( pAO == stderr && !pCP->filDumpContigOrderAndOrientationInfoToThisFile_.isNull() ) { pAO = fopen( pCP->filDumpContigOrderAndOrientationInfoToThisFile_.data(), "w" ); if ( !pAO ) { fprintf( stderr, "failed to open %s: %s\n", pCP->filDumpContigOrderAndOrientationInfoToThisFile_.data(), soGetErrno().data() ); } } setContigTemplateArrays(); figureOutRealContigs(); if ( pContigEndPairArray_ ) { pContigEndPairArray_->clearAndDestroy(); } else { pContigEndPairArray_ = new RWTPtrOrderedVector( (size_t) ( aRealContigs_.length() * 2 ), "Assembly::figureOutContigOrderAndOrientation ) pContigEndPairArray_" ); } // if we are clearAndDestroy'ing pContigEndPairArray_, we darn // well better not continue to point to those contigEndPair's. // (added Jan, 2002 to make assemblyView able to come up more // than once) int nContig; for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); pContig->pContig_[ nLeftGap ] = NULL; pContig->pContig_[ nRightGap ] = NULL; pContig->nWhichEnd_[ nLeftGap ] = 0; pContig->nWhichEnd_[ nRightGap ] = 0; pContig->pCEP_[ nLeftGap ] = NULL; pContig->pCEP_[ nRightGap ] = NULL; } getUserDefinedContigEndPairs( *pContigEndPairArray_ ); RWTPtrSortedVector aContigsConsideredSoFar( (size_t) nNumContigs() ); for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nGap = 0; nGap <= 1; ++nGap ) { int nWhichGap = ( nGap == 0 ) ? nLeftGap : nRightGap; pContig->findNeighborForOneGap2( nWhichGap, aContigsConsideredSoFar, pContigEndPairArray_ ); } // for( int nGap = 0; aContigsConsideredSoFar.insert( pContig ); } // for( nContig = 0; nContig // sort the pairs based on the number of forward/reverse pairs // supporting the ordering void* pArray = (void*) pContigEndPairArray_->data(); size_t nNumberOfElementsInArray = pContigEndPairArray_->length(); size_t nSizeOfAnElement = sizeof( contigEndPair* ); qsort( pArray, nNumberOfElementsInArray, nSizeOfAnElement, ( ( int(*) ( const void*, const void*) ) nCompareContigEndPairs ) ); for( int nContigEndPair = 0; nContigEndPair < (int) pContigEndPairArray_->length() - 1; ++nContigEndPair ) { contigEndPair* pPair1 = (*pContigEndPairArray_)[ nContigEndPair ]; contigEndPair* pPair2 = (*pContigEndPairArray_)[ nContigEndPair + 1 ]; if ( !pPair1->bUserDefined_ && !pPair2->bUserDefined_ && pPair1->nNumberOfForwardReversePairs_ < pPair2->nNumberOfForwardReversePairs_ ) { RWCString soMessage = RWCString( "qsort of pContigEndPairArray_ in assembly2.cpp ") + RWCString( (long) __LINE__ ) + " did not work with positions " + RWCString( (long) nContigEndPair ) + " and " + RWCString( (long) ( nContigEndPair + 1 ) ) + " out of a total of " + RWCString( (long) pContigEndPairArray_->length() ) + " pairs"; throw ProgramLogicError( soMessage ); } } // work through the contigEndPairs from front to back, constructing // chains by linking them together RWTPtrOrderedVector aContigEndPairsCouldNotBePlacedInFirstPass( (size_t) pContigEndPairArray_->length() ); int nTryToPlaceThisPair; for( nTryToPlaceThisPair = 0; nTryToPlaceThisPair < pContigEndPairArray_->length(); ++nTryToPlaceThisPair ) { contigEndPair* pTryToPlacePair = (*pContigEndPairArray_)[ nTryToPlaceThisPair ]; // see if this pair can be joined with any of the chains // already constructed Contig* pContig0 = pTryToPlacePair->pContig_[0]; Contig* pContig1 = pTryToPlacePair->pContig_[1]; int nWhichEnd0 = pTryToPlacePair->nWhichEnd_[0]; int nWhichEnd1 = pTryToPlacePair->nWhichEnd_[1]; // int nIndexInContig0 = ( nWhichEnd0 == nLeftGap ) ? 0 : 1; // int nIndexInContig1 = ( nWhichEnd1 == nLeftGap ) ? 0 : 1; // check for consistency with the contig end pairs already // recorded. if ( pContig0->pContig_[ nWhichEnd0 ] && pContig1->pContig_[ nWhichEnd1 ] ) { if ( ( pContig0->pContig_[ nWhichEnd0 ] == pContig1 ) && ( pContig1->pContig_[ nWhichEnd1 ] == pContig0 ) && ( pContig0->nWhichEnd_[ nWhichEnd0 ] == nWhichEnd1 ) && ( pContig1->nWhichEnd_[ nWhichEnd1 ] == nWhichEnd0 ) ) { // we've already placed this pair, so ignore it. // This could happen if // user defined contig end pairs could have 2 copies // of the same pair...DG March 2004) continue; } } if ( pContig0->pContig_[ nWhichEnd0 ] || pContig1->pContig_[ nWhichEnd1 ] ) { // inconsistency, maybe. One (or both) of these ends // is already joined to some other contig end. But perhaps // the two scaffolds can be merged by interlacing them. aContigEndPairsCouldNotBePlacedInFirstPass.insert( pTryToPlacePair ); } else { // neither end has yet been placed pContig0->pContig_[ nWhichEnd0 ] = pContig1; pContig0->nWhichEnd_[ nWhichEnd0 ] = nWhichEnd1; pContig0->pCEP_[ nWhichEnd0 ] = pTryToPlacePair; pContig1->pContig_[ nWhichEnd1 ] = pContig0; pContig1->nWhichEnd_[ nWhichEnd1 ] = nWhichEnd0; pContig1->pCEP_[ nWhichEnd1 ] = pTryToPlacePair; } } // for( int nTryToPlaceThisPair = 0; // Now see if some of the contigEndPairs that couldn't be placed // in the first pass are // actually consistent--they just have to squeeze between // the consistent (and bigger) contigs. Or perhaps they are consistent // in this manner: // ____ ____ (existing linked contigs ) // A B // ____ ______ ("inconsistent linked contigs) // A C // In such a case, I need to add C. for( nTryToPlaceThisPair = 0; nTryToPlaceThisPair < aContigEndPairsCouldNotBePlacedInFirstPass.length(); ++nTryToPlaceThisPair ) { contigEndPair* pInconsistentContigEndPair = aContigEndPairsCouldNotBePlacedInFirstPass[ nTryToPlaceThisPair ]; if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "trying to use inconsistent link %s\n", pInconsistentContigEndPair->soGetDescription().data() ); } tryToMergeContigEndPairWithExistingScaffold( pInconsistentContigEndPair ); } // at this point, how can we find the chains? No, better to just // look through the *contigs*, not the contig-end-pairs, and find // chains that way. Any contig that has one or the other link // missing is the end of a chain. So then follow the links to get // all contigs in the chain. // One other issue: shall we invert the chains? We can deal with // that when we get to the ends. We probably should keep a list of // the chains. I think we are not inverting the scaffolds. (June, 2001) aHeadsOfScaffoldsOfContigs_.clear(); aScaffoldIsCircularNotLinear_.clear(); for( nContig = 0; nContig < nNumContigs(); ++nContig ) { pGetContig( nContig )->bPlacedInAScaffold_ = false; } for( nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); // ignore little contigs that are not connected to anything if ( !pContig->bIAmARealContig() && !pContig->pContig_[ nLeftGap ] && !pContig->pContig_[ nRightGap ] ) continue; // we are only interested in heads of scaffolds. Heads will not // have pointers to previous and next contigs. if ( !pContig->bPlacedInAScaffold_ && ! (pContig->pContig_[nLeftGap] && pContig->pContig_[nRightGap] ) ) { pContig->bPlacedInAScaffold_ = true; aHeadsOfScaffoldsOfContigs_.insert( pContig ); aScaffoldIsCircularNotLinear_.insert( false ); // we've found a contig that is in a scaffold, but // we haven't yet put it in the list of scaffolds pContig->pPreviousContig_ = NULL; // we are looking for the end that is null int nPreviousContigWasConnectedAtWhichEndOfThisContig = ( pContig->pContig_[nLeftGap] ? nRightGap : nLeftGap ); pContig->pNextContig_ = pContig->pGetNextContigInScaffold( nPreviousContigWasConnectedAtWhichEndOfThisContig, pContig->bThisContigIsComplementedInTheScaffold_, nPreviousContigWasConnectedAtWhichEndOfThisContig ); // this contig might be in a chain all by itself. If not, // mark all other contigs in the chain. while( pContig->pNextContig_ ) { Contig* pPreviousContig = pContig; pContig = pContig->pNextContig_; pContig->pPreviousContig_ = pPreviousContig; pContig->bPlacedInAScaffold_ = true; pContig->pNextContig_ = pContig->pGetNextContigInScaffold( nPreviousContigWasConnectedAtWhichEndOfThisContig, pContig->bThisContigIsComplementedInTheScaffold_, nPreviousContigWasConnectedAtWhichEndOfThisContig ); } // when reached here, we have found the end of the chain. } // if ( !pContig->bPlacedInAScaffold_ ) { } // nContig = 0; nContig < aRealContigs_.length(); ++nContig ) { // there may be some circular scaffolds. The contigs in such scaffolds // will not yet be put into any scaffolds. for( nContig = 0; nContig < aRealContigs_.length(); ++nContig ) { Contig* pContig = aRealContigs_[ nContig ]; if ( !pContig->bPlacedInAScaffold_ ) { // this must be in a circular scaffold assert( pContig->pContig_[ nLeftGap ] && pContig->pContig_[ nRightGap ] ); // start a scaffold with this contig pContig->bPlacedInAScaffold_ = true; aHeadsOfScaffoldsOfContigs_.insert( pContig ); aScaffoldIsCircularNotLinear_.insert( true ); // follow the way around this scaffold Contig* pOriginalContig = pContig; pContig->pPreviousContig_ = NULL; pContig->pNextContig_ = pContig->pContig_[ nRightGap ]; pContig->bThisContigIsComplementedInTheScaffold_ = false; int nPreviousContigWasConnectedAtWhichEndOfThisContig = pContig->nWhichEnd_[ nRightGap ]; while( pContig->pNextContig_ != pOriginalContig ) { Contig* pPreviousContig = pContig; pContig = pContig->pNextContig_; pContig->pPreviousContig_ = pPreviousContig; pContig->bPlacedInAScaffold_ = true; pContig->pNextContig_ = pContig->pGetNextContigInScaffold( nPreviousContigWasConnectedAtWhichEndOfThisContig, pContig->bThisContigIsComplementedInTheScaffold_, nPreviousContigWasConnectedAtWhichEndOfThisContig ); } // pContig->pNextContig_ is pointing to the head of the scaffold. // Although this is correct, it will cause problems with the digest // code which wants a null at the end to indicate the end. pContig->pNextContig_ = NULL; } // if ( !pContig->bPlacedInAScaffold_ ) { } // for( nContig = 0; nContig < aRealContigs_.length(); ++nContig ) { // at this point, all real contigs should be placed in scaffolds RWCString soErrorMessage; for( nContig = 0; nContig < aRealContigs_.length(); ++nContig ) { Contig* pContig = aRealContigs_[ nContig ]; if ( !pContig->bPlacedInAScaffold_ ) { soErrorMessage += pContig->soGetName(); soErrorMessage += " is not in a scaffold "; } } if ( ! soErrorMessage.isNull() ) { THROW_ERROR2( soErrorMessage ); } // Some of the chains will not contain any "real" contigs (contigs // that meet all the criteria to be shown). I wish I could have // eliminated those chains earlier, but that is difficult because // any little contig should be included if it has fwd/rev pair data // connecting it to a real contig, so it can only be eliminated // after creating the chain. The heads of chains may be little // contigs, so I can't run the filter on the heads of contigs. // Thus chains will be eliminated at this stage. // Similarly, some scaffolds will not include any user-requested contigs. // Eliminate such scaffolds (except for the case in which the user is // not requesting any contigs--then this filter should not apply). // reverse any chains that must be because most bases are in complemented // contigs RWTPtrOrderedVector aUserDesiredContigs; if ( !pCP->soAssemblyViewContigsToShow_.isNull() ) { RWCTokenizer tokContigs( pCP->soAssemblyViewContigsToShow_ ); RWCString soOneContig; while( !( soOneContig = tokContigs(',')).isNull() ) { Contig* pContig = ConsEd::pGetAssembly()->pGetContigByVariousNames( soOneContig ); if ( pContig ) aUserDesiredContigs.insert( pContig ); } } int nChain; for( nChain = aHeadsOfScaffoldsOfContigs_.length() - 1; nChain >= 0; --nChain ) { Contig* pContig = aHeadsOfScaffoldsOfContigs_[ nChain ]; int nComplementedBases = 0; int nUncomplementedBases = 0; bool bFoundARealContig = false; bool bFoundAUserRequestedContigInThisScaffold = false; while( pContig ) { if ( pContig->bIAmARealContig() ) bFoundARealContig = true; if ( aUserDesiredContigs.index( pContig ) != RW_NPOS ) { bFoundAUserRequestedContigInThisScaffold = true; } if ( pContig->bThisContigIsComplementedInTheScaffold_ ) nComplementedBases += pContig->nGetUnpaddedSeqLength(); else nUncomplementedBases += pContig->nGetUnpaddedSeqLength(); pContig = pContig->pNextContig_; } if ( !bFoundARealContig || (aUserDesiredContigs.length() != 0 && !bFoundAUserRequestedContigInThisScaffold ) ) { // case in which the chain only has little contigs // So delete the chain. aHeadsOfScaffoldsOfContigs_.removeAt( nChain ); continue; // next scaffold } if ( nComplementedBases > nUncomplementedBases ) { // reverse the entire chain Contig* pLastContig = NULL; pContig = aHeadsOfScaffoldsOfContigs_[ nChain ]; while( 1 ) { Contig* pOldNextContig = pContig->pNextContig_; pContig->pNextContig_ = pContig->pPreviousContig_; pContig->pPreviousContig_ = pOldNextContig; pContig->bThisContigIsComplementedInTheScaffold_ = !pContig->bThisContigIsComplementedInTheScaffold_; if ( !pOldNextContig ) { pLastContig = pContig; break; } pContig = pOldNextContig; } aHeadsOfScaffoldsOfContigs_[ nChain ] = pLastContig; } } // for( nChain = 0; ... // print out the chains for( nChain = 0; nChain < aHeadsOfScaffoldsOfContigs_.length(); ++nChain ) { Contig* pContig = aHeadsOfScaffoldsOfContigs_[ nChain ]; // first contig in chain: if ( pContig->bThisContigIsComplementedInTheScaffold_ ) { if ( pContig->bIsThisEndTheEndOfTheClone( nRightGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "right end of %s is the end of the clone\n", pContig->soGetName().data() ); } } else { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "unknown contig on right of %s\n", pContig->soGetName().data() ); } } } else { if ( pContig->bIsThisEndTheEndOfTheClone( nLeftGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "left end of %s is the end of the clone\n", pContig->soGetName().data() ); } } else { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "unknown contig on left of %s\n", pContig->soGetName().data() ); } } } // set up for beginning of connection to next contig, if there is one if ( pContig->pNextContig_ ) { if ( pContig->bThisContigIsComplementedInTheScaffold_ ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "left end of %s ", pContig->soGetName().data() ); } } else { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "right end of %s ", pContig->soGetName().data() ); } } } while( pContig->pNextContig_ ) { pContig = pContig->pNextContig_; if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, "connected to %s end of %s\n", ( pContig->bThisContigIsComplementedInTheScaffold_ ? "right" : "left" ), pContig->soGetName().data() ); } } // reached end of the chain if ( pContig->bThisContigIsComplementedInTheScaffold_ ) { if ( pContig->bIsThisEndTheEndOfTheClone( nLeftGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " and left end of %s is the clone end\n", pContig->soGetName().data() ); } } else { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " and left end of %s is connected to unknown contig\n", pContig->soGetName().data() ); } } } else { if ( pContig->bIsThisEndTheEndOfTheClone( nRightGap ) ) { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " and right end of %s is the clone end\n", pContig->soGetName().data() ); } } else { if ( pCP->bPrintInFigureOutContigOrderAndOrientation_ ) { fprintf( pAO, " and right end of %s is connected to unknown contig\n", pContig->soGetName().data() ); } } } } // for( int nChain = 0; nChain < aChainsOfContigs.length(); ... if ( pAO != pSaveAO ) { fclose( pAO ); pAO = pSaveAO; } } // void Assembly :: figureOutContigOrderAndOrientation() { void Assembly :: insertContigEndPairIfItIsNotAlreadyThere( contigEndPair*& pPairToInsert ) { for( int nContigEndPair = 0; nContigEndPair < pContigEndPairArray_->length(); ++nContigEndPair ) { contigEndPair* pPair = (*pContigEndPairArray_)[ nContigEndPair ]; if ( ( pPair->pContig_[nLeftGap] == pPairToInsert->pContig_[nLeftGap] ) && ( pPair->pContig_[nRightGap] == pPairToInsert->pContig_[nRightGap] ) && ( pPair->nWhichEnd_[nLeftGap] == pPairToInsert->nWhichEnd_[nLeftGap] ) && ( pPair->nWhichEnd_[nRightGap] == pPairToInsert->nWhichEnd_[nRightGap] ) ) { // found the same pair of contig ends. pPairToInsert = pPair; return; } } // if reached here, did not find the same contig/end pair pContigEndPairArray_->insert( pPairToInsert ); }