/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "addNewReadsWithExistingAlignments.h" #include "consedParameters.h" #include "filename.h" #include "mbt_exception.h" #include "consed.h" #include #include #include #include "maybeTerminateIfAnotherReadWriteConsed.h" #include "addNewReads.h" #include "filGetUniqueFilename.h" #include "fileDefines.h" #include "soLine.h" #include "printTime.h" #include "rwctokenizer.h" #include "filFindSolexaPrbFileFromSeqFile.h" #include "szGetTime.h" #include "bIsNumericFloat.h" #include "nNumberFromBase2.h" #include "soGetDateTime.h" #include "soLine.h" #include "rwcregexp.h" #include "crossMatchInfoForRead.h" #include "soGetChemistry.h" // non-solexa call chain: // doIt calls // parseAlignmentsAndAddNewReads which calls // readBasesAndQualitiesForAddedReads() which calls // readFileOfPhdFilesForAddedReads addNewReadsWithExistingAlignments :: addNewReadsWithExistingAlignments( const FileName& filAceFileToOpen, const FileName& filNewAceFile, const FileName& filFOFOfAlignmentFiles, const RWCString& soChemistry ) : filAceFileToOpen_( filAceFileToOpen ), filFOFOfAlignmentFiles_( filFOFOfAlignmentFiles ), filNewAceFile_( filNewAceFile ) { if ( soChemistry == "solexa" ) { nChemistry_ = nSolexa; } else if ( soChemistry == "454" ) { nChemistry_ = n454; } else if ( soChemistry == "sanger" ) { nChemistry_ = nSanger; } else if ( soChemistry == "unknown" ) { // added July 2010 for aligning fake reads (from fasta files) // against existing contigs nChemistry_ = nUnknown; } else { THROW_ERROR( "unrecognized chemistry: " + soChemistry ); } } void addNewReadsWithExistingAlignments :: doIt() { pCP->bOKToUseGui_ = false; // I have not used this file and it is just a nuisance to have to delete // this zero-length file with every run. // pCP->filAutoFinishFullOutput_ = // filGetUniqueFilename( filAceFileToOpen_.soGetProjectFromAceFilename() ) // + ".out"; // pAO = fopen( pCP->filAutoFinishFullOutput_.data(), "w" ); // if ( !pAO ) { // RWCString soError = "could not open log file "; // soError += pCP->filAutoFinishFullOutput_; // soError += " for writing."; // THROW_ERROR( soError ); // } // pFILE = pAO; // FILE* pFOF = fopen( "addNewReads.fof", "w" ); // if ( pFOF ) { // // just ignore the problem if we can't write to this file // fprintf( pFOF, "%s\n", pCP->filAutoFinishFullOutput_.data() ); // fclose( pFOF ); // } try { checkThatCanReadAllAlignmentFiles(); // create the ConsEd object, which owns and manages the Assembly // data structures as well as the lists of currently existing // ContigWin and Teditor objects (which there will be none in // this case). There is a global point to this sole instance, // set in the ctor. ConsEd* pConsed = new ConsEd(); // This is perhaps archaic--it might not be used at all. So why // force user to create it? // terminateIfNoPhdDir(); maybeTerminateIfAnotherReadWriteConsed(); // this does "new Assembly" ConsEd::pGetConsEd()->openAssemblyFile( filAceFileToOpen_ ); parseAlignmentsAndAddNewReads(); printTime( "now saving assembly..." ); saveAssembly(); printTime( "done" ); } catch( ExceptionBase eb ) { if ( !eb.bUserNotified() ) { fprintf( pAO, "Fatal: exception thrown: %s\n", eb.szGetDesc() ); } fclose( pAO ); cerr << "See log file: " << pCP->filAutoFinishFullOutput_ << endl; exit( 1 ); } fclose( pAO ); cerr << "See log file: " << pCP->filAutoFinishFullOutput_ << endl; // ring the bell for( int n = 0; n < 100; ++n ) { fprintf( stderr, "\a\a\a\a\a\a" ); fflush( stderr ); } } void addNewReadsWithExistingAlignments :: parseAlignmentsAndAddNewReads() { FILE* pFOFOfAlignmentFiles = fopen( filFOFOfAlignmentFiles_.data(), "r" ); if ( !pFOFOfAlignmentFiles ) { THROW_FILE_ERROR( filFOFOfAlignmentFiles_ ); } // count alignment files int nTotalAlignmentFiles = 0; char c; while( ( c = getc( pFOFOfAlignmentFiles ) ) != EOF ) { if ( c == '\n' ) { ++nTotalAlignmentFiles; } } rewind( pFOFOfAlignmentFiles ); const bool bUsingAddReads2ConsedScript = false; pAddNewReads_ = new addNewReads( bUsingAddReads2ConsedScript ); pAddNewReads_->pAddNewReadsWithExistingAlignments_ = this; // pAddNewReads_->zeroAddNewReadsContigCounts(); int nPreallocatedSize = pCP->nSolexaAlignmentFilesPerInsertingPadsCycle_ * pCP->nSolexaAlignmentsPerAlignmentFile_; // add 20% more room nPreallocatedSize += nPreallocatedSize / 5; pAddNewReads_->aCrossMatchInfoForReads_.resize( nPreallocatedSize ); int nAlignmentFiles = 0; int nAlignmentFilesReadSoFarBeforeInsertingPads = 0; while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); ++nAlignmentFiles; FileName filAlignmentFile = soLine; // use some of the code from addNewReads--the interactive method // of adding new reads ++nAlignmentFilesReadSoFarBeforeInsertingPads; if ( nAlignmentFilesReadSoFarBeforeInsertingPads == 1 ) { } assert( pAddNewReads_ ); pAddNewReads_->readAlignmentFile( filAlignmentFile ); fprintf( stdout, "read %d of %d alignment files with %d total alignments\n", nAlignmentFiles, nTotalAlignmentFiles, pAddNewReads_->aCrossMatchInfoForReads_.length() ); pAddNewReads_->readBasesAndQualitiesForAddedReads(); } // while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles )... fclose( pFOFOfAlignmentFiles ); // no longer supported--superceded by consed -fixContigEnds (Oct 2010) // if ( pCP->bAddNewReadsExtendConsensusUsingProtrudingNewReads_ ) { // pAddNewReads_->runMiniAssembliesOnProtrudingReads(); // } pAddNewReads_->insertPadsInContigs(); // this adds the read to the contig pAddNewReads_->insertPadsInNewReadsAndSetReadBases(); pAddNewReads_->convertVectorTagsToXsInNewReads(); // RWCString soMessage; // pAddNewReads_->tellUserAllAboutIt2( soMessage ); // fprintf( pAO, "%s", soMessage.data() ); // there are too many of these, too many reads, no one will use it, // and it takes too much disk space // pAddNewReads_->createNavigationFile(); if ( pCP->bAddNewReadsRecalculateConsensusQuality_ ) { pAddNewReads_->recalculateConsensusQualityValues(); } } // parseAlignmentsAndAddNewReads void addNewReadsWithExistingAlignments :: checkThatCanReadAllAlignmentFiles() { FILE* pFOFOfAlignmentFiles = fopen( filFOFOfAlignmentFiles_.data(), "r" ); if ( !pFOFOfAlignmentFiles ) { THROW_FILE_ERROR( filFOFOfAlignmentFiles_ ); } while( fgets( soLine.data(), nMaxLineSize, pFOFOfAlignmentFiles ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); FileName filAlignmentFile = soLine; FILE* pAlignmentFile = fopen( filAlignmentFile.data(), "r" ); if ( !pAlignmentFile ) { THROW_FILE_ERROR( filAlignmentFile ); } fclose( pAlignmentFile ); } fclose( pFOFOfAlignmentFiles ); } void addNewReadsWithExistingAlignments :: saveAssembly() { if ( filNewAceFile_.isNull() ) { filNewAceFile_ = filAceFileToOpen_.filGetNextVersion(); // find next version that doesn't already exist while( filNewAceFile_.bFileByThisNameExists() ) { filNewAceFile_ = filNewAceFile_.filGetNextVersion(); } } ConsEd::pGetAssembly()->saveToFile( filNewAceFile_, false ); // ace file format2 RWCString soAceFileToPrint = filNewAceFile_; // this mismash means the path starts with ./(dkdkdkd) where the // (dkdkdkd) part should not have any "/" in it RWCRegexp regPattern( "^\\./[^/]+$" ); if ( soAceFileToPrint.bContains( regPattern ) ) { soAceFileToPrint.bStartsWithAndRemove( "./" ); } cerr << "See new ace file " << soAceFileToPrint << endl; }