/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #define MY_DEBUG #include "addNewReads.h" #include "popupErrorMessage.h" #include using namespace std; #include "consedParameters.h" #include "filename.h" #include "consed.h" #include "assembly.h" #include "filePopupAndGetFilename.h" #include #include #include "locatedFragment.h" #include "mbt_errors.h" #include "nCountPadsAndConvertFormat.h" #include "quality.h" #include "textbox.h" #include #include "choose_contig.h" #include "fragment_and_contig.h" #include "guiTopWindow.h" #include #include "please_wait.h" #include "bGuiGetAnswerYesNo.h" #include "readPHD.h" #include "fileDefines.h" #include "mbtValOrderedVectorOfRWCString.h" #include "soGetDateTime.h" #include "soFindFlowCellFromPath.h" #include "soLine.h" #include "rwctokenizer.h" #include #include "guiAddNewReads.h" #include "printTime.h" #include "crossMatchInfoForRead.h" #include "bIsNumericMaybeWithWhitespace.h" #include "soAddCommas.h" #include "addNewReadsWithExistingAlignments.h" #include "soGetChemistry.h" #include "regionOfSequence.h" #include "regionOfSequenceArray.h" #include "maybeTerminateIfAnotherReadWriteConsed.h" #include "terminateIfDirectoryIsNotWritable.h" #include "bIsNumeric.h" #include #include "writeNewAceFileNameToFile.h" #include "contigEndReadList.h" #include "szGetTime.h" // I'm going to try to do everything in here, including error messages, // waiting, etc. // throw 1 doesn't do anything except abort the constructor and clean up // the partially built object. The error message is reported within the // routine, so nothing need be done except catch the exception, which // is done in guiTopWindow.cpp #define MYTHROW { InputDataError ide; throw ide; } // This is how adding new reads works: // The user supplies a file containing the list of reads to be added. // 1) bRunAddReads2Consed // The script addReads2Consed.perl runs phred on each of these reads // for which there is no existing phd file. The script makes a fasta // file from the phd files. Then this is masked for vector. Then the // ace file is used to construct a fasta file of the contigs. Then // crossmatch is run between the contigs and the masked new reads. // 2) bParseAlignmentsAndAddReads() // Consed reads the resulting alignments (which may just be for // portions of the reads, and some reads may not have any alignment at // all). This also "new LocatedFragment" // 3) readPHDFilesForNewReads(); // Consed reads the phd files for the reads and sets the phred // called bases. // 4) insertPadsInNewReadsAndSetReadBases() // Then consed moves the bases one by one from the // phred calls to the LocatedFragment, adding pads as determined by // the alignment. // Pads are also inserted into the contigs. Finally, // the user is informed of which reads were inserted and where they // were inserted. // For solexa reads, reads are typically added through // addNewReadsWithExistingAlignments. In this case, crossmatch // is already run. This has the advantage of consed and crossmatch // not being in memory at the same time. // If you are determined to add solexa reads the old way, you can no // longer do so. void addNewReads :: doIt() { if ( !bCheckThatAceFileOnDiskIsCurrent() ) MYTHROW; if ( !bCheckThatAllExecutablesExist() ) MYTHROW; if ( !bAskUserForNameOfFileWithReadNames() ) MYTHROW; PleaseWait* pPleaseWait = new PleaseWait( GuiApp::pGetGuiApp()->widGetTopLevel() ); if ( !bCheckThatFOFExists() ) MYTHROW; if ( !bRunAddReads2Consed() ) MYTHROW; if ( !bParseAlignmentsAndAddReads() ) MYTHROW; ConsEd::pGetConsEd()->closeAllWindows(); // this fixes bug in which newly added reads do not have // pSub_ (template) set and this can lead to segmentation // fault in assemblyView 8/05 (DG) ConsEd::pGetConsEd()->whatToDoBeforeModifyAssembly(); readPHDFilesForNewReads(); insertPadsInContigs(); insertPadsInNewReadsAndSetReadBases(); if ( nMode_ == nAddReads2ConsedScript ) readPHDFilesJustForTagsAndWholeReadItems(); convertVectorTagsToXsInNewReads(); if ( bIfAReadDoesNotFitPutItIntoItsOwnContig_ ) makeReadsThatDidNotAlignIntoTheirOwnContigs(); ConsEd::pGetAssembly()->setNumberOfReadsInAssemblyAccurate(); updateFirstAndLastDisplayableContigPositions(); if ( bRecalculateConsensusQualityValues_ ) recalculateConsensusQualityValues(); updateListOfReadsInTopLevelWindow(); ConsEd::pGetAssembly()->setChanged(); delete pPleaseWait; tellUserAllAboutIt(); // clean up all the memory we used delete this; } addNewReads :: ~addNewReads() { aCrossMatchInfoForReads_.clearAndDestroy(); } #define PARSE_PANIC( szMessage ) \ { ostringstream ost; \ ost << "crossmatch output file error detected from source file " \ << __FILE__ << " at " << __LINE__ <filFullPathnameOfCrossMatch_.bIsExecutable() ) { popupErrorMessage( "I can't find %s or I don't have permission to execute it. The path of this executable is set via a lien in your ~/.consedrc file. You should put a line in that file that looks like this:\nconsed.fullPathnameOfCrossMatch: /usr/local/genome/bin/cross_match\n(That should be all on one line)\nThe path /usr/local/genome/bin/cross_match should be changed to reflect wherever this file is kep on your computer. For more information, click on 'Help' in the top right of the aligned reads window and see the section entitled \"CONSED CUSTOMIZATION\"", pCP->filFullPathnameOfCrossMatch_.data() ); } } else { if ( ! pCP->filFullPathnameOfAddReads2ConsedScript_.bIsExecutable() ) { popupErrorMessage( "I can't find %s or I don't have permission to execute it. The path of this executable is set via a line in your ~/.consedrc file. You should put a line in that file that looks like this:\nconsed.fullPathnameOfAddReads2ConsedScript: /usr/local/genome/bin_common/addReads2Consed.perl\n(That should be all on one line)\nThe path /usr/local/genome/bin/addReads2Consed.perl should be changed to reflect wherever this file is kept on your computer. For more information, click on 'Help' in the top right of the aligned reads window and see the section entitled \"CONSED CUSTOMIZATION\"", (char*) consedParameters::pGetConsedParameters()->filFullPathnameOfAddReads2ConsedScript_.data() ); return( false ); } } // if reached here, all executables are in place return( true ); } // returns false if the user wants to cancel out bool addNewReads :: bAskUserForNameOfFileWithReadNames() { guiAddNewReads* pGuiAddNewReads = new guiAddNewReads(); bool bUserPushedCancel; pGuiAddNewReads->createWindowAndWaitForUserResponse( bUserPushedCancel, filReadsToAddFOF_, bRecalculateConsensusQualityValues_, bIfAReadDoesNotFitPutItIntoItsOwnContig_ ); nMode_ = nAddReads2ConsedScript; if ( bUserPushedCancel ) return false; if ( filReadsToAddFOF_.soGetBasename().isNull() ) { // makes more sense to just return, since the user might // have pushed cancel // popupErrorMessage( "You must specify a filename" ); return( false ); } return( true ); } bool addNewReads :: bCheckThatFOFExists() { if ( !filReadsToAddFOF_.bFileByThisNameExists() ) { popupErrorMessage( "File %s does not exist", (char*) filReadsToAddFOF_.data() ); return( false ); } return( bCheckThatFOFExistsNotSolexa() ); } bool addNewReads :: bCheckThatFOFExistsNotSolexa() { aTryingToAddTheseReads_.clear(); FILE* pReadsToAddFOF = fopen( (char*) filReadsToAddFOF_.data(), "r" ); if ( !pReadsToAddFOF ) { popupErrorMessage( "Couldn't open file %s", (char*) filReadsToAddFOF_.data() ); return( false ); } while( fgets( szLine_, SZLINE_SIZE, pReadsToAddFOF ) != NULL ) { RWCString soLine = szLine_; soLine = soLine.strip( RWCString::TRAILING, '\n' ); // fuss around removing the .exp if it is there int nLength = soLine.length(); if ( nLength > 4 ) { if ( soLine( nLength - 4, 4 ) == ".exp" ) soLine = soLine( 0, nLength - 4 ); } RWCString soNewReadName = soLine; if ( soNewReadName.index("/") != RW_NPOS ) { popupErrorMessage( "The list of reads must be simple filenames--not including the pathname or directory as with %s", (const char*) soNewReadName ); MYTHROW; } // there used to be a bug with extra carriage returns on the end if ( soLine.isNull() ) continue; aTryingToAddTheseReads_.insert( soNewReadName ); LocatedFragment* pLocFrag = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( soNewReadName ); if ( pLocFrag ) { popupErrorMessage( "Sorry--there is already a read by the name %s in the assembly and you are trying to add another one.", (char*) soNewReadName.data() ); MYTHROW; } } fclose( pReadsToAddFOF ); RWCString soErrorMessage; if ( bAreThereAnyDuplicates( soErrorMessage ) ) { popupErrorMessage( soErrorMessage ); MYTHROW; } return( true ); } bool addNewReads :: bAreThereAnyDuplicates( RWCString& soErrorMessage ) { // check that no read is in here twice mbtValOrderedVectorOfRWCString aReadNames( aTryingToAddTheseReads_.length() ); int nRead; for( nRead = 0; nRead < aTryingToAddTheseReads_.length(); ++nRead ) { aReadNames.insert( aTryingToAddTheseReads_[ nRead ] ); } // sort this list aReadNames.resort(); for( nRead = 1; nRead < aReadNames.length(); ++nRead ) { if ( aReadNames[ nRead - 1 ] == aReadNames[ nRead ] ) { soErrorMessage = "Fatal: Bad boy!\nYou are trying to add 2 copies of read "; soErrorMessage += aReadNames[ nRead ]; if ( !filReadsToAddFOF_.isNull() ) { // this is necessary since if bAreThereAnyDuplicates is // called by addNewReadsAutomated, filReadsToAddFOF_ is null soErrorMessage += " in file "; soErrorMessage += filReadsToAddFOF_; } return( true ); } } soErrorMessage = ""; return( false ); } bool addNewReads :: bRunAddReads2Consed() { assert( nMode_ == nAddReads2ConsedScript ); RWCString soCommand = consedParameters::pGetConsedParameters()->filFullPathnameOfAddReads2ConsedScript_; soCommand += " "; soCommand += ConsEd::pGetAssembly()->soGetAceFileName(); if ( !filReadsToAddFOF_.isNull() ) { soCommand += " "; soCommand += filReadsToAddFOF_; } time_t timee; time( &timee ); const int nTimeSize = 100; char szTime[ nTimeSize ]; // all this funny business is just to hide this from SCCS strftime( szTime, nTimeSize, "%H""%M""%S", localtime( &timee ) ); filAlignments_ = "addReadsAlignments"; filAlignments_ += szTime; soCommand += " "; soCommand += filAlignments_; cout << "about to execute: " << soCommand << endl; int nRetStat = system( (char*) soCommand.data() ); if ( nRetStat != 0 ) { char* szErrorMessage = strerror( nRetStat ); if ( szErrorMessage ) { popupErrorMessage( "Something went wrong running %s. Gave error code %d which means %s", (char*) soCommand.data(), nRetStat, szErrorMessage ); } else popupErrorMessage( "Something went wrong running %s. Gave error code %d which is out of range. See xterm for more details on what went wrong.", (char*) soCommand.data(), nRetStat ); return( false ); } else return( true ); } bool addNewReads :: bParseAlignmentsAndAddReads() { ConsEd::pGetAssembly()->setAllPaddedPositionsArrays(); FILE* pAlignmentsFile = fopen( (char*) filAlignments_.data(), "r" ); if ( ! pAlignmentsFile ) { popupErrorMessage( "Can't open file that should have contained the alignments of the new reads with the existing contigs: %s\n Error %d ( %s )", (char*) filAlignments_.data(), errno, strerror( errno ) ); return( false ); } nCurrentLine_ = 0; parseAlignmentsAndAddReads2( pAlignmentsFile ); fclose( pAlignmentsFile ); return( true ); } // this is also called by addNewReadsAutomated (the automated method of // adding new reads) // parseAlignmentHeaderLine does new LocatedFragment void addNewReads :: parseAlignmentsAndAddReads2( FILE* pAlignmentsFile ) { if ( pCP->bAddNewReadsCheckThatCrossMatchRunCorrectly_ ) checkThatCrossMatchRunWithCorrectParameters( pAlignmentsFile ); // look for ALIGNMENT bool bEndOfFile = false; crossMatchInfoForRead* pCrossMatchInfoForRead = NULL; while( !bEndOfFile ) { if ( fgets( szLine_, SZLINE_SIZE, pAlignmentsFile ) == NULL ) { bEndOfFile = true; } else { ++nCurrentLine_; if ( memcmp( szLine_, "ALIGNMENT", 9 ) == 0 ) { parseAlignmentHeaderLine( pCrossMatchInfoForRead ); } else if ( memcmp( szLine_, "DISCREPANCY", 11 ) == 0 ) { parseOneDiscrepancyForOneAlignment( pCrossMatchInfoForRead ); } } } // while( !bEndOfFile ) { } #define ALIGNMENT_LINE_FORM( szMessage ) \ "Line should be of form ALIGNMENT 359 5.23 0.65 0.44 (<# of read bases after end of alignment>) (<# of contig bases after end alignment>) or ALIGNMENT 709 0.14 0.00 0.00 (<# of read bases after end of alignment>) C (<# of contig bases after end of alignment>) " szMessage // this does new LocatedFragment void addNewReads :: parseAlignmentHeaderLine( crossMatchInfoForRead*& pCrossMatchInfoForRead ) { //if reached here, the first token is ALIGNMENT //Typical examples: //uncomplemented: //ALIGNMENT 772 0.49 0.37 0.61 Contig2 5 822 (0) NewContig2 403 1218 (0) //complemented: //ALIGNMENT 382 0.25 0.76 0.00 Contig1 1 397 (248) C NewContig2 (818) 400 1 // This is from an actual run of addNewReads (interactive). Notice the // read comes first. //ALIGNMENT 816 2.11 0.30 1.91 djs74-649.x1 36 1031 (130) C Contig1 (768) 1823 844 const int nMaxLineSize = 1000; char szSavedLine[ nMaxLineSize + 1 ]; // save for error messages strncpy( szSavedLine, szLine_, nMaxLineSize ); char* szAlignmentKeyword = strtok( szLine_, " " ); // ALIGNMENT assert( (strcmp( szAlignmentKeyword, "ALIGNMENT" ) == 0 ) ); strtok( NULL, " " ); strtok( NULL, " " ); strtok( NULL, " " ); strtok( NULL, " " ); char* szReadName = strtok( NULL, " " ); if ( !szReadName ) { PARSE_PANIC( "this line should have contained something like this: ALIGNMENT 772 0.49 0.37 0.61 Contig2 5 822 (0) NewContig2 403 1218 (0) but it contained too few words" ); } char* szUnpaddedReadStart = strtok( NULL, " " ); char* szUnpaddedReadEnd = strtok( NULL, " " ); char* szReadBasesLeftOver = strtok( NULL, " " ); char* szMaybeC = strtok( NULL, " " ); if ( !szMaybeC ) { PARSE_PANIC( "this line should have contained something like this: ALIGNMENT 772 0.49 0.37 0.61 Contig2 5 822 (0) NewContig2 403 1218 (0) but it contained too few words (2)" ); } char* szContigName; char* szUnpaddedContigStart; char* szUnpaddedContigEnd; if ( strcmp( szMaybeC, "C" ) == 0 ) { bCurrentReadIsComplemented_ = true; szContigName = strtok( NULL, " " ); if (! szContigName ) { PARSE_PANIC( "this line should have contained something like this: ALIGNMENT 772 0.49 0.37 0.61 Contig2 5 822 (0) NewContig2 403 1218 (0) but it contained too few words (3)" ); } strtok( NULL, " " ); // number in form (####) szUnpaddedContigEnd = strtok( NULL, " " ); szUnpaddedContigStart = strtok( NULL, " " ); } else { bCurrentReadIsComplemented_ = false; szContigName = szMaybeC; szUnpaddedContigStart = strtok( NULL, " " ); szUnpaddedContigEnd = strtok( NULL, " " ); } soCurrentRead_ = szReadName; pCurrentContig_ = ConsEd::pGetAssembly()->pGetContigByName( szContigName ); if ( ! pCurrentContig_ ) { PARSE_PANIC_1_ARG( "This line refers to contig %s which is not in this assembly", szContigName ); } int nUnpaddedReadStart; if ( nAToIWithError( szUnpaddedReadStart, nUnpaddedReadStart ) != NO_ERROR ) { PARSE_PANIC( ALIGNMENT_LINE_FORM( "but unpadded read start isn't numeric" ) ); } int nUnpaddedReadEnd; if ( nAToIWithError( szUnpaddedReadEnd, nUnpaddedReadEnd ) != NO_ERROR ) { PARSE_PANIC( ALIGNMENT_LINE_FORM( "but unpadded read end isn't numeric" ) ); } int nReadBasesLeftOver; int nTokens = sscanf( szReadBasesLeftOver, "(%d)", &nReadBasesLeftOver ); if ( nTokens != 1 ) { fprintf( stderr, "szReadBasesLeftOver = %s nTokens = %d\n", szReadBasesLeftOver, nTokens ); PARSE_PANIC( ALIGNMENT_LINE_FORM( "but # of read bases after end of alignment isn't numeric" ) ); } int nUnpaddedContigStart; if ( nAToIWithError( szUnpaddedContigStart, nUnpaddedContigStart ) != NO_ERROR ) { PARSE_PANIC( ALIGNMENT_LINE_FORM( "but unpadded contig start isn't numeric" ) ); } int nUnpaddedContigEnd; if ( nAToIWithError( szUnpaddedContigEnd, nUnpaddedContigEnd ) != NO_ERROR ) { PARSE_PANIC( ALIGNMENT_LINE_FORM( "but unpadded contig end isn't numeric" ) ); } // the alignment position will be set below in insertPadsInOneNewRead pCurrentLocFrag_ = new LocatedFragment( soCurrentRead_, 1, // dummy value for alignment position bCurrentReadIsComplemented_, pCurrentContig_ ); ++pCurrentContig_->nReadsToAdd_; if ( nMode_ != nAddReads2ConsedScript ) { // is this necessary? Or is the chemistry already set somewhere else? // I don't think so since there is only the solexa file to read. // The phd ball for this read doesn't yet exist, nor does the ace // file for it. // Actually, this is *not* necessary, and is actually useless since // the chemistry is overwritten in // readFileOfPhdFiles.cpp::processCommentAndDnaForAddedRead // But what about if, by mistake, there is no chemistry in the // phdball? This will save us. pCurrentLocFrag_->soChemistry_ = soGetChemistry( pAddNewReadsWithExistingAlignments_->nChemistry_ ); // this is already set in processCommentAndDnaForAddedRead of // readFileOfPhdFiles.cpp // if ( pAddNewReadsWithExistingAlignments_->nChemistry_ == n454 ) { // RWCString soChromat = // "sff:" + // // pAddNewReadsWithExistingAlignments_->soSffFile_ + ":" + // soCurrentRead_; // pCurrentLocFrag_->setChromatFilename( soChromat ); // } pCurrentLocFrag_->nVersion_ = 1; } else { FileName filPHDFilename = soCurrentRead_ + ".phd.1"; pCurrentLocFrag_->setPHDFilename( filPHDFilename ); pCurrentLocFrag_->setChromatFilename( soCurrentRead_ ); pCurrentLocFrag_->nVersion_ = 1; } pCrossMatchInfoForRead = new crossMatchInfoForRead( pCurrentLocFrag_, nUnpaddedReadStart, nUnpaddedReadEnd, nUnpaddedContigStart, nUnpaddedContigEnd, bCurrentReadIsComplemented_ ); aCrossMatchInfoForReads_.insert( pCrossMatchInfoForRead ); // no longer supported--superceded by consed -fixContigEnds (Oct 2010) // if ( pCP->bAddNewReadsExtendConsensusUsingProtrudingNewReads_ ) { // int nWhichEnd = nNoGap; // // ---------------------------------- consensus // // uuuuuaaaaaaaaaaaaauuuuuuu (read, u=unaligned, a=aligned) // // ^ ^ // // nUnpaddedConsPosLeftEndOfRead // // nUnpaddedConsPosRightEndOfRead // // // // note that the read might be complemented // // check left end // int nUnpaddedConsPosAtLeftEndOfRead; // int nUnpaddedConsPosAtRightEndOfRead; // if ( bCurrentReadIsComplemented_ ) { // // nUnpaddedContigStart // // v // // ---------------------------------- // // uuuuaaaaaaaaaaaauuuuuu // // <--> // // nReadBasesLeftOver // // nUnpaddedContigEnd // // v // // ---------------------------------- // // uuuuaaaaaaaaaaaauuuuuu // // <-----> nUnpaddedReadStart // // ^ nUnpaddedContigEnd + // // nUnpaddedReadStart - 1 // nUnpaddedConsPosAtLeftEndOfRead = // nUnpaddedContigStart - nReadBasesLeftOver; // nUnpaddedConsPosAtRightEndOfRead = // nUnpaddedContigEnd + ( nUnpaddedReadStart - 1 ); // } // else { // // nUnpaddedContigStart // // v // // ---------------------------------- // // uuuuaaaaaaaaaaaauuuuuu // // <---> // // nUnpaddedReadStart // // // // ^ nUnpaddedContigStart - ( nUnpaddedReadStart - 1 ) // nUnpaddedConsPosAtLeftEndOfRead = // nUnpaddedContigStart - ( nUnpaddedReadStart - 1 ); // nUnpaddedConsPosAtRightEndOfRead = // nUnpaddedContigEnd + nReadBasesLeftOver; // } // if ( nUnpaddedConsPosAtLeftEndOfRead < 1 ) { // // sticks out left end of read // nWhichEnd = nLeftGap; // } // else if ( pCurrentContig_->nGetUnpaddedEndIndex() < nUnpaddedConsPosAtRightEndOfRead ) { // nWhichEnd = nRightGap; // } // // note that many reads will align within the contig. We are not // // concerned with those reads--they will be added to the contig // // the old way, using the cross_match alignment and not going through // // phrap. // if ( nWhichEnd != nNoGap ) { // // get list // contigEndReadList myCERL( pCurrentContig_, // nWhichEnd ); // if ( !paContigEndReadList_ ) { // int nEstimatedNumberOfContigEnds = // ConsEd::pGetAssembly()->nNumContigs() * 2; // paContigEndReadList_ = // new mbtPtrOrderedVector( nEstimatedNumberOfContigEnds, // "addNewReads::paContigEndReadList_" ); // } // contigEndReadList* pCERL = paContigEndReadList_->pFindFast( &myCERL ); // if ( !pCERL ) { // pCERL = new contigEndReadList( pCurrentContig_, // nWhichEnd ); // if ( nWhichEnd == nLeftGap ) { // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ = -1; // } // else { // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ = // pCurrentContig_->nGetUnpaddedEndIndex() + 1; // } // paContigEndReadList_->insertSorted( pCERL ); // } // // N.B. I believe this MAX and MIN below is correct: // // for reads at the left end of the contig: // // cccccccccccccccccccccc // // rrrrrrrrrr // // rrrrrrrrrrr // // rrrrrrrrr // // ^this is the point we want // // for nConsPosOfReadFurthestIntoContig_ so we want to take // // MAX // // for read at the right end of the contig: // // cccccccccccccccccccccc // // rrrrrrrrrrrr // // rrrrrrrrrrr // // rrrrrrrrr // // ^ this is the point we want // // for nConsPosOfReadFurthestIntoContig_ so we want to // // take MIN // if ( nWhichEnd == nLeftGap ) { // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ = // MAX( pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_, // nUnpaddedConsPosAtRightEndOfRead ); // } // else { // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ = // MIN( pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_, // nUnpaddedConsPosAtLeftEndOfRead ); // } // pCERL->aCrossMatchInfoForRead_.insert( pCrossMatchInfoForRead ); // } // if ( nWhichEnd != nNoGap ) { // } // if ( pCP->bAddNewReadsExtendConsensusUsingProtrudingNewReads_ ) { } // void addNewReads :: parseAlignmentHeaderLine() { void addNewReads :: tellUserAllAboutIt() { int nNumberOfRowsNeeded = 6 + aCrossMatchInfoForReads_.length(); if ( nNumberOfRowsNeeded > 20 ) nNumberOfRowsNeeded = 20; TextBox* pTextBox = new TextBox( "Reads Added", nNumberOfRowsNeeded ); pTextBox->append( "Warning--you must now save the assembly before doing any\nedits or the .wrk file will not be usable.\n\n" ); ConsEd::pGetConsEd()->disallowUndos(); RWCString soMessage; tellUserAllAboutIt2( soMessage ); pTextBox->append( soMessage ); pTextBox->makeVisible(); createNavigatorWindow(); createNavigationFile(); } void addNewReads :: createNavigationFile() { FileName filAddNewReadsNav = ConsEd::pGetAssembly()->soGetAceFileName().soGetProjectFromAceFilename() + "." + soGetDateTime( nDotInMiddle ) + ".nav"; FileName filAddNewReadsNavFOF = "navFileForAddedReads.fof"; FILE* pFOF = fopen( filAddNewReadsNavFOF.data(), "w" ); if ( pFOF == NULL ) { RWCString soError = "could not write " + filAddNewReadsNavFOF + " due to " + soGetErrno(); popupErrorMessage( soError ); return; } fprintf( pFOF, "%s\n", filAddNewReadsNav.data() ); fclose( pFOF ); FILE* pNav = fopen( filAddNewReadsNav.data(), "w" ); if ( pNav == NULL ) { RWCString soError = "could not write " + filAddNewReadsNav + " due to " + soGetErrno(); popupErrorMessage( soError ); return; } fprintf( pNav, "TITLE: %d reads added\n\n", aCrossMatchInfoForReads_.length() ); for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; // custom navigation entries look like this: // BEGIN_REGION // TYPE: READ // CONTIG: Contig1 // READ: djs74-3174.s1 // UNPADDED_CONS_POS: 1775 1775 // COMMENT: This is an example of a read location // END_REGION // I want to show the read where it is aligned int nStartPaddedConsPos; if (pLocFrag->bComp() ) nStartPaddedConsPos = pLocFrag->nGetHighQualityEndUnlessAllBad(); else nStartPaddedConsPos = pLocFrag->nGetHighQualityStartUnlessAllBad(); int nUnpaddedStart = pLocFrag->pContig_->nUnpaddedIndex( nStartPaddedConsPos ); fprintf( pNav, "BEGIN_REGION\n" ); fprintf( pNav, "TYPE: READ\n" ); fprintf( pNav, "CONTIG: %s\n", pLocFrag->pContig_->soGetName().data() ); fprintf( pNav, "READ: %s\n", pLocFrag->soGetName().data() ); fprintf( pNav, "UNPADDED_CONS_POS: %d %d\n", nUnpaddedStart, nUnpaddedStart ); fprintf( pNav, "COMMENT: Successfully added read\n" ); fprintf( pNav, "END_REGION\n\n" ); } fclose( pNav ); cerr << "see " << filAddNewReadsNav << endl; } void addNewReads :: createNavigatorWindow() { gotoList* pGotoList = new gotoList(); for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFragNew = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; // RWCString soMessage( (size_t) 1000 ); // soMessage.appendFormat( "%-20s %-15s %8d %8d", // (char*) pLocFragNew->soGetName().data(), // (char*) pLocFragNew->pContig_->soGetName().data(), // pLocFragNew->nGetAlignStartUnpadded(), // pLocFragNew->nGetAlignEndUnpadded() ); // I want to show the read where it is aligned int nStartPadded; int nEndPadded; if ( pLocFragNew->bWholeReadIsLowQuality_ ) { nStartPadded = pLocFragNew->nGetAlignClipStart(); nEndPadded = pLocFragNew->nGetAlignClipEnd(); } else { nStartPadded = pLocFragNew->nGetHighQualityStart(); nEndPadded = pLocFragNew->nGetHighQualityEnd(); } gotoItem* pGotoItem = new gotoItem( pLocFragNew->pContig_, pLocFragNew, nStartPadded, nEndPadded, pLocFragNew->pContig_->nUnpaddedIndex( nStartPadded ), pLocFragNew->pContig_->nUnpaddedIndex( nEndPadded ), "", true ); // bPrefixContigToDescription pGotoList->addToList( pGotoItem ); } pGotoList->sortByPosition(); ConsEd::pGetConsEd()->addGuiMultiContigNavigator( new guiMultiContigNavigator( "New Reads In Assembly", "", // soTextForFirstLine "", // soTextForSecondLine 75, // nwidthInChars "", // soTitleOfSpecialPurposeButton NULL, // cbSpecialPurposeButton NULL, // pClientDataForCbSpecialPurposeButton NULL, // widTopLevelShellToBeConnectedTo pGotoList, NULL ) // pContigWin ); } void addNewReads :: tellUserAllAboutIt2( RWCString& soMessage ) { aCrossMatchInfoForReads_.resort(); aTryingToAddTheseReads_.resort(); int nSizeOfBuffer = aCrossMatchInfoForReads_.length() * 100 + 200; soMessage.increaseMaxLength( (size_t) nSizeOfBuffer ); int nReadsThatCouldNotBeEntered = aTryingToAddTheseReads_.length() - aCrossMatchInfoForReads_.length(); if ( nReadsThatCouldNotBeEntered == 0 ) { soMessage += "All reads were successfully added\n"; } else if ( aCrossMatchInfoForReads_.length() == 0 ) { soMessage += "No reads could be entered at all:\n"; for( int nTryingToAddTheseReads = 0; nTryingToAddTheseReads < aTryingToAddTheseReads_.length(); ++nTryingToAddTheseReads ) { RWCString soReadToBeAdded = aTryingToAddTheseReads_[ nTryingToAddTheseReads ]; soMessage += " "; soMessage += soReadToBeAdded; soMessage += "\n"; } } else { soMessage += "The following "; soMessage += RWCString( (long) nReadsThatCouldNotBeEntered ); soMessage += " reads could not be entered:\n"; int nNewLocatedFragment = 0; for( int nTryingToAddTheseReads = 0; nTryingToAddTheseReads < aTryingToAddTheseReads_.length(); ++nTryingToAddTheseReads ) { RWCString soReadToBeAdded = aTryingToAddTheseReads_[ nTryingToAddTheseReads ]; // see if this is in aCrossMatchInfoForReads_ if ( soReadToBeAdded == aCrossMatchInfoForReads_[ nNewLocatedFragment ]->pLocFrag_->soGetName() ) { if ( nNewLocatedFragment < ( aCrossMatchInfoForReads_.length() - 1 ) ) { ++nNewLocatedFragment; // ready for next nTryingToAddTheseReads } } else { soMessage += soReadToBeAdded; soMessage += "\n"; } } } soMessage += "\n"; soMessage.increaseMaxLengthIfNecessary( 1000 ); soMessage.appendFormat( "The following %d read(s) were added:\n", aCrossMatchInfoForReads_.length() ); soMessage += " start end\n"; soMessage += " pos pos\n"; soMessage += "\n\n"; for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFragNew = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; soMessage.increaseMaxLengthIfNecessary( 1000 ); soMessage.appendFormat( "%-20s %-15s %8d %8d\n", (char*) pLocFragNew->soGetName().data(), (char*) pLocFragNew->pContig_->soGetName().data(), pLocFragNew->nGetAlignStartUnpadded(), pLocFragNew->nGetAlignEndUnpadded() ); } } // changed Aug 2009 to not print line contents because it is not necessary // (FGETS_OR_PARSE_PANIC is always used when there is a premature end-of-file) // and because it causes problems since szSavedLine is not always available #define FGETS_OR_PARSE_PANIC( szErrorMessage ) \ if ( fgets( szLine_, SZLINE_SIZE, pAlignmentsFile ) == NULL ) { \ ostringstream ost; \ ost << "crossmatch output file error detected from source file " \ << __FILE__ << " at " << __LINE__ << endl \ << "Error in file " << (char*) filAlignments_.data() << " at line " \ << nCurrentLine_ << endl \ << szErrorMessage << endl << ends; \ InputDataError ide( ost.str().c_str() ); \ throw ide; } \ ++nCurrentLine_; // discrepancy lists look like this: // Notice that they always end with a blank line // ALIGNMENT 26 12.20 0.00 0.00 2_0007_2_7_537_772 1 41 (0) mm8_dna- // Contig 1064794 1064834 (1218865) // DISCREPANCY S 5 T(15) 1064798 ctggTcttttt // DISCREPANCY S 25 T(15) 1064818 ttggagTctttta // DISCREPANCY S 32 T(15) 1064825 cttttaTtttctg // DISCREPANCY S 34 T(15) 1064827 tttattTtctgct // DISCREPANCY S 35 T(15) 1064828 ttatttTctgctt // ALIGNMENT 41 0.00 0.00 0.00 2_0007_2_7_239_455 1 41 (0) mm8_dna- // Contig 53656 53696 (2230003) // ALIGNMENT 40 0.00 0.00 0.00 2_0007_2_7_302_125 1 40 (1) C mm8_dna- // Contig (1230903) 1052796 1052757 // ALIGNMENT 40 0.00 0.00 0.00 2_0007_2_7_37_441 1 40 (1) C mm8_dna-C // ontig (1238384) 1045315 1045276 // 12214 matching entries (first file). // Can also have: // DISCREPANCY S 85 C(15) 194570 tgcaggCcacctt // DISCREPANCY D 15 T(15) 194640 gtgcatTaacagt // DISCREPANCY I 748 G(15) 193899 attacaGgatgcc // DISCREPANCY D-8 187 A(15) 194467 ataagaAgtttgt // DISCREPANCY I-8 564 TTGATAAT(15) 571 tgatacTTGATAATgtctgg void addNewReads :: parseOneDiscrepancyForOneAlignment( crossMatchInfoForRead* pCrossMatchInfoForRead ) { assert( pCrossMatchInfoForRead ); char szType[50]; char szNucleotideAndPosition[50]; int n1ReadPos; int n1ContigPos; int nNumberOfConvertedArguments = sscanf( szLine_, "DISCREPANCY %s %d %s %d", szType, &n1ReadPos, szNucleotideAndPosition, &n1ContigPos ); if ( nNumberOfConvertedArguments != 4 ) { const int nMaxLineSize = 1000; char szSavedLine[ nMaxLineSize + 1 ]; // save for error messages strncpy( szSavedLine, szLine_, nMaxLineSize ); PARSE_PANIC( "should have been DISCREPANCY line but wrong number of arguments" ); } if ( szType[0] == 'S' ) return; assert( szType[0] == 'D' || szType[0] == 'I' ); int nMultiplicity = 1; if ( strlen( szType ) != 1 ) { if ( szType[0] == 'D' ) { assert( sscanf( szType, "D-%d", &nMultiplicity ) == 1 ); } else { assert( sscanf( szType, "I-%d", &nMultiplicity ) == 1 ); } } crossMatchIndel* pCrossMatchIndel = new crossMatchIndel( szType, n1ReadPos, n1ContigPos, nMultiplicity ); if ( !pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { pCrossMatchInfoForRead->pCrossMatchIndelList_ = new mbtPtrOrderedVector( 2, // initial capacity "crossMatchIndelList" ); } pCrossMatchInfoForRead->pCrossMatchIndelList_->append( pCrossMatchIndel ); } // parseOneDiscrepancyForOneAlignment // this only sets the phred calls--it does not set the read bases. // The latter is set in insertPadsInOneNewReadAndSetReadBases void addNewReads :: readPHDFilesForNewReads() { for( int n = 0; n < aCrossMatchInfoForReads_.length(); ++n ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[n]->pLocFrag_; pLocFrag->readPHDFileForAddedRead(); } } void addNewReads :: readPHDFilesJustForTagsAndWholeReadItems() { for( int n = 0; n < aCrossMatchInfoForReads_.length(); ++n ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[n]->pLocFrag_; readPHDFileJustForTagsAndWholeReadItems( pLocFrag ); } } void addNewReads :: insertPadsInContigs() { printTime( "starting insertPadsInContigs" ); cout << "Inserting pads in contigs to accommodate insertions in new reads..."; cout.flush(); Assembly* pAssembly = ConsEd::pGetAssembly(); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); insertPadsInOneContig( pContig ); } cout << "Done" << endl; cout.flush(); printTime( "ending insertPadsInContigs" ); } #define nGET_NEW_CONS_POS_FROM_OLD( nOLD_CONS_POS ) \ ( ( ( 1 <= nOLD_CONS_POS ) && ( nOLD_CONS_POS <= nOldConsensusLength ) ) \ ? pPaddedConsPosNewFromOld[ nOLD_CONS_POS ] : \ ( nOLD_CONS_POS < 1 ? nOLD_CONS_POS : \ nNewConsensusLength + nOLD_CONS_POS - nOldConsensusLength \ ) ) void addNewReads :: insertPadsInOneContig( Contig* pContig ) { // This array is 1-based. Just don't use the 0th position. // Hence the +1 // pPadsToInsert[ nOldConsPos ] gives the number of pads to insert // *after* nOldConsPos int* pPadsToInsert = (int*) calloc( pContig->nGetPaddedConsLength() + 1, sizeof(int) ); for( int nAlignment = 0; nAlignment < aCrossMatchInfoForReads_.length(); ++nAlignment ) { crossMatchInfoForRead* pCrossMatchInfoForRead = aCrossMatchInfoForReads_[ nAlignment ]; // for phrapping reads at end of contig if ( pCrossMatchInfoForRead->bAlreadyUsedThis_ ) continue; if ( pContig == pCrossMatchInfoForRead->pLocFrag_->pContig_ ) { // read through the contig part of the alignment. // For each stretch of pads, find how many pads there // are in the corresponding position in the existing consensus. // If there are none, or not enough, add to the pPadsToInsert array if ( pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { for( int nIndel = 0; nIndel < pCrossMatchInfoForRead->pCrossMatchIndelList_->length(); ++nIndel ) { crossMatchIndel* pCrossMatchIndel = (*(pCrossMatchInfoForRead->pCrossMatchIndelList_))[ nIndel ]; // we are not interested in pads in the read right now if ( pCrossMatchIndel->soType_[0] != 'I' ) continue; // read 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // contig 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // read 101 CAGTACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 150 // --- // contig 101 ---TACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 147 // DISCREPANCY I-3 101 CAG(15) 100 ggtacaCAGtactga // read contig // The 101 refers to the C in CAG // the 100 refers to the A in ACA in the contig // before the gaps in the contig. // complemented case: // C read 750 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 701 // contig 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // C read 700 CAGTACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 651 // --- // contig 101 ---TACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 147 // DISCREPANCY I-3 698 CTG(15) 101 tcagtaCTGtgtacc // So ACACAGTAC The CAG is 700, 699, 698 // ACA---TAC The A---T is 100, - - - 101 // now we know the number of insertions // We must find the number of pads needed in the consensus. // There might already be some. Find how many. int nConsPosOfNonPadBeforePaddedRegion; int nConsPosOfNonPadAfterPaddedRegion; if ( pCrossMatchInfoForRead->bReadIsComplemented_ ) { nConsPosOfNonPadBeforePaddedRegion = pContig->nPaddedIndexFast( pCrossMatchIndel->nUnpaddedConsPos_ - 1 ); nConsPosOfNonPadAfterPaddedRegion = pContig->nPaddedIndexFast( pCrossMatchIndel->nUnpaddedConsPos_ ); } else { nConsPosOfNonPadBeforePaddedRegion = pContig->nPaddedIndexFast( pCrossMatchIndel->nUnpaddedConsPos_ ); nConsPosOfNonPadAfterPaddedRegion = pContig->nPaddedIndexFast( pCrossMatchIndel->nUnpaddedConsPos_ + 1 ); } // typically this is 0--there are no pads in the existing // contig int nNumberOfPadsInExistingContig = nConsPosOfNonPadAfterPaddedRegion - nConsPosOfNonPadBeforePaddedRegion - 1; int nNumberOfInsertions = pCrossMatchIndel->nMultiplicity_; int nMorePadsNeeded = nNumberOfInsertions - nNumberOfPadsInExistingContig; if ( nMorePadsNeeded > 0 ) { // next, see if we are already planning on adding // more pads to that location if ( nMorePadsNeeded > pPadsToInsert[ nConsPosOfNonPadBeforePaddedRegion ] ) { pPadsToInsert[ nConsPosOfNonPadBeforePaddedRegion ] = nMorePadsNeeded; } } } // for( int nIndel = 0; } // if ( pCrossMatchIndelList_ ) { } // if ( pContig == pCrossMatchInfoForRead->pLocFrag_->pContig_ ) { } // for( int nAlignment = nIndexOfFirstAlignmentAgainstThisContig; // convert pPadsToInsert into an array that gives new padded cons pos // from old padded cons pos int* pPaddedConsPosNewFromOld = (int*) malloc( ( pContig->nGetPaddedConsLength() + 1 ) * sizeof(int) ); pPaddedConsPosNewFromOld[1] = 1; for( int nConsPos = 2; nConsPos <= pContig->nGetEndConsensusIndex(); ++nConsPos ) { pPaddedConsPosNewFromOld[nConsPos] = pPaddedConsPosNewFromOld[ nConsPos - 1 ] + pPadsToInsert[nConsPos - 1 ] + 1; } // warning: these are used by nGET_NEW_CONS_POS_FROM_OLD int nOldConsensusLength = pContig->nGetEndConsensusIndex(); int nNewConsensusLength = pPaddedConsPosNewFromOld[ pContig->nGetEndConsensusIndex() ]; assert( nNewConsensusLength >= nOldConsensusLength ); assert( pContig->nGetEndConsensusIndex() == pContig->nGetPaddedConsLength() ); // now increase the length of the consensus if ( pContig->nMaxLength_ < nNewConsensusLength ) { pContig->resize( nNewConsensusLength ); } pContig->nCurrentLength_ = nNewConsensusLength; // move bases like this: // old: bbbbbbbbbbbbbbbbbbbbbbb // // new: bbbb bbbbbb bbbbbbbbb bbbb // Then we need to fill in the holds with pads int nNewConsPosOnRight = nNewConsensusLength + 1; int nOldConsPos; for( nOldConsPos = nOldConsensusLength; nOldConsPos >= 1; --nOldConsPos ) { int nNewConsPos = pPaddedConsPosNewFromOld[ nOldConsPos ]; pContig->setNtideAtPos( nNewConsPos, pContig->Sequence::ntideGet( nOldConsPos ) ); if ( nNewConsPosOnRight - nNewConsPos > 1 ) { // decide on quality for pads // it is ok to use nNewConsPos and nNewConsPosOnRight because // both of those Ntides have already been moved unsigned char ucQualityAfterPads = pContig->ntideGet( nNewConsPosOnRight ).qualGetQualityWithout98or99X0(); unsigned char ucQualityBeforePads = pContig->ntideGet( nNewConsPos ).qualGetQualityWithout98or99X0(); unsigned char ucAverageQuality = ( ucQualityAfterPads + ucQualityBeforePads ) / 2; for( int nNewConsPosPad = nNewConsPos + 1; nNewConsPosPad <= nNewConsPosOnRight - 1; ++nNewConsPosPad ) { pContig->setNtideAtPos( nNewConsPosPad, Ntide('*', ucAverageQuality ) ); } } nNewConsPosOnRight = nNewConsPos; } // now adjust consensus tags for( int nTag = 0; nTag < pContig->nGetNumberOfTags(); ++nTag ) { tag* pTag = pContig->pGetTag( nTag ); pTag->nPaddedConsPosStart_ = nGET_NEW_CONS_POS_FROM_OLD( pTag->nPaddedConsPosStart_ ); pTag->nPaddedConsPosEnd_ = nGET_NEW_CONS_POS_FROM_OLD( pTag->nPaddedConsPosEnd_ ); } // now pass through existing reads (ones present before adding new reads) // and add pads // set these to absurb numbers so that the first read will start // setting them. int nNewFirstDisplayableContigPos = 666; int nNewLastDisplayableContigPos = -666; for (int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); nRead++) { // pick up pointer to this located fragment LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); // consider about this case: // -------------- consensus // --------------- read // ccccccooooooooo ('c' means read on consensus, 'o' means // part of read off consensus) // pads will only be inserted in the 'c' part of the read // since it is only the consensus being aligned against // the new reads. // pads will only have been added int nAlignedStartOnConsensus; int nAlignedEndOnConsensus; if ( bIntersect( pLocFrag->nGetAlignStart(), pLocFrag->nGetAlignEnd(), 1, // pContig->nGetStartConsensusIndex(), nOldConsensusLength, // pContig->nGetEndConsensusIndex()-- can't use this because the consensus has already changed nAlignedStartOnConsensus, nAlignedEndOnConsensus ) ) { // now check if the region from nAlignedStartOnConsensus to // nAlignedEndOnConsensus has expanded int nOldLengthOfReadOnConsensus = nAlignedEndOnConsensus - nAlignedStartOnConsensus + 1; int nNewLengthOfReadOnConsensus = pPaddedConsPosNewFromOld[ nAlignedEndOnConsensus ] - pPaddedConsPosNewFromOld[ nAlignedStartOnConsensus ] + 1; // if there are no pads inserted into this region, there is // no need to insert pads into this read if ( nNewLengthOfReadOnConsensus != nOldLengthOfReadOnConsensus ) { assert( nNewLengthOfReadOnConsensus > nOldLengthOfReadOnConsensus ); // pads have been added somewhere in the read. We are not // interested in pads added after the end of the read. // I was going to use this, but I think there will never // be pads after the last consensus base so we might as well // use the -1 (as above) whether the reads extend past the end of the // consensus or not // if ( pContig->nGetEndConsensusIndex() < pLocFrag->nGetAlignEnd() ) { // nLastReadBaseToAppendPads = pContig->nGetEndConsensusIndex(); // } // else { // nLastReadBaseToAppendPads = pLocFrag->nGetAlignEnd() - 1; // } int nBasesToAdd = nNewLengthOfReadOnConsensus - nOldLengthOfReadOnConsensus; pLocFrag->increaseMaxLengthIfNecessary( nBasesToAdd ); // increase the length of the peak array as well if ( pLocFrag->bHasPeakPositions() ) { pLocFrag->makeCopyOfPhredCallsIfNecessary(); pLocFrag->increaseMaxLengthIfNecessaryOfPeakPositions( nBasesToAdd ); } // set up way of going through read once and moving each base // of the read only once. Have an easy conversion from // old padded consed pos to new read position // nAlignedStartOnConsensus -> pPaddedConsPosNewFromOld[ nAlignedStartOnConsensus ] // nOldConsPos -> pPaddedConsPosNewFromOld[ nOldConsPos ] // Must use Sequence::setNtideAtPos which uses 1-based // read position // pLocFrag->nGetAlignStart() is n1PaddedRead = 1 -> // n1NewPaddedRead = 1 // nOldConsPos -> nNewConsPos = pPaddedConsPosNewFromOld[ nOldConsPos ] // and nNewConsPos -> n1NewPaddedRead = ( nNewConsPos - pPaddedConsPosNewFromOld[ pLocFrag->nGetAlignStart() ] ) + 1 // So nOldConsPos -> n1NewPaddedRead = ( pPaddedConsPosNewFromOld[ nOldConsPos ] - pPaddedConsPosNewFromOld[ pLocFrag->nGetAlignStart() ] ) + 1 // Actually, if pLocFrag->nGetAlignStart() is off the consensus, // pPaddedConsPosNewFromOld[ pLocFrag->nGetAlignStart() ] will // give a segmentation fault. So we need to do the futzing // that is done by nGET_NEW_CONS_POS_FROM_OLD // Does setNtideAtPos allow for setting beyond end of // array? No (see rwtvalorderedvector.h). Perhaps // we can do an "unsafe" version in rwtvalorderedvector for speed // this will change as we add pads so let's store it int nOldGetAlignEnd = pLocFrag->nGetAlignEnd(); int nOldGetAlignStart = pLocFrag->nGetAlignStart(); int nAddToNewPaddedConsPosToGetNewPaddedRead = - nGET_NEW_CONS_POS_FROM_OLD( nOldGetAlignStart ) + 1; // set up for first round. This should be // base just after that corresponding to nOldGetAlignEnd int n1LastNewPaddedRead = -666; // drastic step--program better fill in all bases below // Note: This changes pLocFrag->nGetAlignEnd(): pLocFrag->nCurrentLength_ += nBasesToAdd; if ( pLocFrag->bHasPeakPositions() ) { if ( pLocFrag->cWhichPeakPositionsAreUsed_ == Sequence::cLittle ) pLocFrag->pLittlePeakPositions_->nCurrentLength_ += nBasesToAdd; else pLocFrag->pBigPeakPositions_->nCurrentLength_ += nBasesToAdd; } for( nOldConsPos = nOldGetAlignEnd; nOldConsPos >= nOldGetAlignStart; --nOldConsPos ) { int n1OldPaddedRead = nOldConsPos - nOldGetAlignStart + 1; // we need to use nGET_NEW_CONS_POS_FROM_OLD instead // of the faster pPaddedConsPosNewFromOld since // the read might be partially off the consensus int nNewPaddedConsPos = nGET_NEW_CONS_POS_FROM_OLD( nOldConsPos ); int n1NewPaddedRead = nNewPaddedConsPos + nAddToNewPaddedConsPosToGetNewPaddedRead; // sanity test if ( nOldConsPos == nOldGetAlignStart ) { assert( n1NewPaddedRead == 1 ); } else if ( nOldConsPos == nOldGetAlignEnd ) { assert( n1NewPaddedRead == pLocFrag->nCurrentLength_ ); } pLocFrag->Sequence::setNtideAtPos( n1NewPaddedRead, pLocFrag->Sequence::ntideGet( n1OldPaddedRead ) ); if ( pLocFrag->bHasPeakPositions() ) { pLocFrag->Sequence::setTracePointPosAtSeqPos( n1NewPaddedRead, pLocFrag->Sequence::nGetPointPos( n1OldPaddedRead ) ); } // we also might need to insert pads into the read. Since // this part of the read array is hopefully in cache, // let's put in those pads also int nNumberOfPadsToAdd = ( nOldConsPos == nOldGetAlignEnd ) ? 0 : ( n1LastNewPaddedRead - n1NewPaddedRead - 1 ); // set up for next time n1LastNewPaddedRead = n1NewPaddedRead; // program error check: if ( 1 <= nOldConsPos && nOldConsPos <= nOldConsensusLength ) { // we *never* insert pads after the last base, even // if the new reads cause a pad to be inserted in the // consensus after the last base if ( nOldConsPos != nOldGetAlignEnd ) { assert( pPadsToInsert[ nOldConsPos ] == nNumberOfPadsToAdd ); } } else { assert( nNumberOfPadsToAdd == 0 ); } // these pads will go after n1NewPaddedRead and before // n1LastNewPaddedRead if ( nNumberOfPadsToAdd > 0 ) { // what quality should I assign to the pad? // the old way--ucDefaultQualityEdited which // is 98. But that leaves dark stripes in the old // reads so I'm going to make it the same quality // as the base just added which is before the pads Quality qual = pLocFrag->Sequence::ntideGet( n1NewPaddedRead ).qualGetQuality(); // similarly, for the peak positions of the pads, // make it the same as that of the base just added // before the pads int nPeakPosOfPads; if ( pLocFrag->bHasPeakPositions() ) { nPeakPosOfPads = pLocFrag->Sequence::nGetPointPos( n1NewPaddedRead ); } for( int nPads = 1; nPads <= nNumberOfPadsToAdd; ++nPads ) { int n1NewReadPosOfPad = n1NewPaddedRead + nPads; pLocFrag->Sequence::setNtideAtPos( n1NewReadPosOfPad, Ntide( '*', qual ) ); if ( pLocFrag->bHasPeakPositions() ) { pLocFrag->Sequence::setTracePointPosAtSeqPos( n1NewReadPosOfPad, nPeakPosOfPads ); } } } // if ( nNumberOfPadsToAdd > 0 ) { } // for( nOldConsPos = nOldGetAlignEnd;... } // if ( nNewLengthOfReadOnConsensus != nOldLengthOfReadOnConsensus ) { } // if ( bIntersect... // now adjust all read positions //////////////////////////////////////////////////////// // Attention: // This is the most important line in this routine!!!! //////////////////////////////////////////////////////// pLocFrag->nAlignStartPos_ = nGET_NEW_CONS_POS_FROM_OLD( pLocFrag->nAlignStartPos_ ); if ( !pLocFrag->bIsWholeReadLowQuality() ) { pLocFrag->nQualClipStart_ = nGET_NEW_CONS_POS_FROM_OLD( pLocFrag->nQualClipStart_ ); pLocFrag->nQualClipEnd_ = nGET_NEW_CONS_POS_FROM_OLD( pLocFrag->nQualClipEnd_ ); } if ( !pLocFrag->bIsWholeReadUnaligned() ) { pLocFrag->nAlignClipStart_ = nGET_NEW_CONS_POS_FROM_OLD( pLocFrag->nAlignClipStart_ ); pLocFrag->nAlignClipEnd_ = nGET_NEW_CONS_POS_FROM_OLD( pLocFrag->nAlignClipEnd_ ); } // and adjust all read tag positions for( int nTag = 0; nTag < pLocFrag->aTags_.length(); ++nTag ) { tag* pTag = pLocFrag->aTags_[ nTag ]; pTag->nPaddedConsPosStart_ = nGET_NEW_CONS_POS_FROM_OLD( pTag->nPaddedConsPosStart_ ); pTag->nPaddedConsPosEnd_ = nGET_NEW_CONS_POS_FROM_OLD( pTag->nPaddedConsPosEnd_ ); } if ( nNewLastDisplayableContigPos < pLocFrag->nGetAlignEnd() ) nNewLastDisplayableContigPos = pLocFrag->nGetAlignEnd(); if ( pLocFrag->nGetAlignStart() < nNewFirstDisplayableContigPos ) nNewFirstDisplayableContigPos = pLocFrag->nGetAlignStart(); } // for (int nRead = 0; pContig->nFirstDisplayableContigPos_ = nNewFirstDisplayableContigPos; pContig->nLastDisplayableContigPos_ = nNewLastDisplayableContigPos; // now fix all base segments for( int nSeg = 0; nSeg < pContig->baseSegArray_.nGetNumSegments(); ++nSeg ) { BaseSegment* pSeg = (pContig->baseSegArray_)[ nSeg ]; assert( pSeg->pLocFrag_->pContig_ == pContig ); // doing this expands base segment to the left to include // pads that were just inserted to the left of the base segment // (Aug 2008) // pSeg->nStartInConsensus_ = // nGET_NEW_CONS_POS_FROM_OLD( pSeg->nStartInConsensus_ - 1 ) + 1; // However, (Jan 2009), if the read for this base segment started // at the beginning of this base segment, this causes the base // segment to extend to the left of the read and thus gives // a fatal error. So we will have to content ourselves with // some places where a pad is added and there is no base segment // covering that pad. It will usually be fixed when consed starts up // again, but not always. pSeg->nStartInConsensus_ = nGET_NEW_CONS_POS_FROM_OLD( pSeg->nStartInConsensus_ ); pSeg->nEndInConsensus_ = nGET_NEW_CONS_POS_FROM_OLD( pSeg->nEndInConsensus_ ); } // check that I didn't screw up the base segments assert( pContig->baseSegArray_.bGetDataStructureOk() ); free( (char*) pPaddedConsPosNewFromOld ); free( (char*) pPadsToInsert ); // since we just added a bunch of new pads, must reset the // contig padded/unpadded conversion tables pContig->setPaddedPositionsArray(); pContig->setUnpaddedPositionsArray(); } // // now we know the number of deletions // // pCrossMatch->nUnpaddedRead_ is *always* the read's // // unpadded base before the deleted region // // pCrossMatch->nUnpaddedContig_ depends on // // whether the alignment is complemented or not. If it // // is not complemented, it is the contig's smallest // // unpadded position aligned with gaps in the read. If the // // alignment is complemented, it is the contig's // // largest unpadded position aligned with gaps in the read. // // For example: // // // // read 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // // contig 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // // read 101 ---TACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 147 // // --- // // contig 101 CAGTACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 150 // // DISCREPANCY D-3 100 A(15) 101 aggtacAtactga // // (read) (contig) // // The "100" refers to the A of ACA on the end of // // the previous line before the TAC // // The "101" refers to the C in CAG of the contig. // // If the alignment is complemented: // // // // C read 747 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGT--- 701 // // --- // // contig 51 AATAGCAAAAAAAATGTTCTAGAATGCTAAGGATACAGACATGAGGTACA 100 // // C read 700 CAGTACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 651 // // // // contig 101 CAGTACTGAGTCCCTGCAGGGCTGTCTTCCTCCTCTCACTGAGACCTCTT 150 // // DISCREPANCY D-3 700 G(15) 100 agtactGacctca // // (read) (contig) // // "700" is the C in CAG in the read before the gaps // // (next line) // // "100" is the last A in ACA in the contig. // this adds the read to the contig void addNewReads :: insertPadsInNewReadsAndSetReadBases() { cout << "Inserting pads in reads and setting read bases..." << endl; for( int nAlignment = 0; nAlignment < aCrossMatchInfoForReads_.length(); ++nAlignment ) { crossMatchInfoForRead* pCrossMatchInfoForRead = aCrossMatchInfoForReads_[ nAlignment ]; // this is for case in which this read was already put into // the assembly by the runMiniAssembliesOnProtrudingReads() code if ( pCrossMatchInfoForRead->bAlreadyUsedThis_ ) continue; insertPadsInOneNewReadAndSetReadBases( pCrossMatchInfoForRead ); // futzing to keep apLocatedFragment_ from resizing with // every read that is added Contig* pContig = pCrossMatchInfoForRead->pLocFrag_->pContig_; if ( pContig->nReadsToAdd_ ) { pContig->apLocatedFragment_.increaseMaxLengthIfNecessary( pContig->nReadsToAdd_ ); // this prevents it from being resized again pContig->nReadsToAdd_ = 0; } // add thyself pCrossMatchInfoForRead->pLocFrag_->pContig_->addLocatedFragment( pCrossMatchInfoForRead->pLocFrag_ ); // not necessary since Contig::addLocatedFragment does this // pCrossMatchInfoForRead->pLocFrag_->pContig_->bReadListsNeedFixing_ = //true; } } // void addNewReads :: insertPadsInNewReadsAndSetReadBases() { const int nNOINDEL = 1; const int nDELETION = 2; const int nINSERTION = 3; void addNewReads :: insertPadsInOneNewReadAndSetReadBases( crossMatchInfoForRead* pCrossMatchInfoForRead ) { LocatedFragment* pLocFrag = pCrossMatchInfoForRead->pLocFrag_; if ( !pLocFrag->bAlreadyReadPhdFile_ ) { RWCString soError = pLocFrag->soGetName() + " found in alignments but not found in phdball so bases/qualities unknown"; THROW_ERROR( soError ); } Contig* pContig = pLocFrag->pContig_; // resize the read int nBasesBeforeAlignment = pCrossMatchInfoForRead->nUnpaddedReadStart_ - 1; int nBasesAfterAlignment = pLocFrag->pSeqPhredCalledBases_->nGetEndIndex() - pCrossMatchInfoForRead->nUnpaddedReadEnd_; // padded consensus bases in region where read is aligned int nPaddedConsPosStart = pContig->nPaddedIndexFast( pCrossMatchInfoForRead->nUnpaddedContigStart_ ); int nPaddedConsPosEnd = pContig->nPaddedIndexFast( pCrossMatchInfoForRead->nUnpaddedContigEnd_ ); int nPaddedConsPosSize = nPaddedConsPosEnd - nPaddedConsPosStart + 1; int nCalculatedReadLength = nBasesBeforeAlignment + nBasesAfterAlignment + nPaddedConsPosSize; // seems to be necessary--don't understand why nCalculatedReadLength += 2; pLocFrag->resize( nCalculatedReadLength ); // How many unaligned bases do I copy before starting the alignment? // If the read is uncomplemented, nUnpaddedStart_ is the base after the // unaligned beginning bases // If the read is complemented, then nUnpaddedStart_ is still the base // after the unaligned beginning bases, but it is in coordinates from the // right end of the complemented read // If the read is complemented, then readPhredCalls will already have // complemented the phred calls if ( pLocFrag->bComp() ) { // this is bizarre, but true. // This is because in the case of a complemented alignment, Phil // reverses the positions of the contig (not complemented) and // doesn't reverse the positions of the read--but gives them in // uncomplemented coordinates // ALIGNMENT 397 2.91 0.22 0.22 K26-766c 21 466 (129) C Contig1 (484) 983 538 // Thus the 21 is really the *end* of the alignment and the 466 is // really the beginning of the alignment. However, these numbers // need to be converted into uncomplemented assert( pCrossMatchInfoForRead->nUnpaddedReadStart_ <= pCrossMatchInfoForRead->nUnpaddedReadEnd_ ); // make nUnpaddedStart_ and nUnpaddedEnd_ be with respect to // the left end of the read, regardless whether it is complemented // or not int nNumberOfNonPads = pLocFrag->pSeqPhredCalledBases_->length(); int nTemp = nNumberOfNonPads + 1 - pCrossMatchInfoForRead->nUnpaddedReadStart_; pCrossMatchInfoForRead->nUnpaddedReadStart_ = nNumberOfNonPads + 1 - pCrossMatchInfoForRead->nUnpaddedReadEnd_; pCrossMatchInfoForRead->nUnpaddedReadEnd_ = nTemp; if ( pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { assert( pCrossMatchInfoForRead->pLocFrag_->pSeqPhredCalledBases_ ); int nUnpaddedReadLength = pCrossMatchInfoForRead->pLocFrag_->pSeqPhredCalledBases_->length(); for( int nCrossMatchIndel = 0; nCrossMatchIndel < pCrossMatchInfoForRead->pCrossMatchIndelList_->length(); ++nCrossMatchIndel ) { crossMatchIndel* pCrossMatchIndel = (*(pCrossMatchInfoForRead->pCrossMatchIndelList_))[ nCrossMatchIndel ]; // convert from read pos in direction of sequencing to // left-to-right coordinate pCrossMatchIndel->nUnpaddedReadPos_ = nUnpaddedReadLength + 1 - pCrossMatchIndel->nUnpaddedReadPos_; // however, we have a problem in that this is now on the // right edge of whatever the indel is and we want it on // the left edge. if ( pCrossMatchIndel->soType_[0] == 'I' ) { // If the alignment is complemented and I, the pointers // are here // v // bbbbb***bbbb cons // bbbbbbbbbbbb read // ^ // But we want them to be here, just as with an // uncomplement and I: // v // bbbbb***bbbb cons // bbbbbbbbbbbb read // ^ pCrossMatchIndel->nUnpaddedConsPos_ -= 1; pCrossMatchIndel->nUnpaddedReadPos_ -= ( pCrossMatchIndel->nMultiplicity_ - 1 ); } else { // If the alignment is complemented and D, the pointers // are here: // v // bbbbbbbbb cons // bbb***bbb read // ^ // But we want them here, just as with an // uncomplemented D: // v // bbbbbbbbb cons // bbb***bbb read // ^ pCrossMatchIndel->nUnpaddedConsPos_ -= ( pCrossMatchIndel->nMultiplicity_ - 1 ); pCrossMatchIndel->nUnpaddedReadPos_ -= 1; } // if ( pCrossMatchIndel->soType_[0] == 'I' ) { else } // for( int nCrossMatchIndel = 0; ... } // if ( pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { } // if ( pLocFrag->bComp() ) { // sort the discrep_list now if ( pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { pCrossMatchInfoForRead->pCrossMatchIndelList_->resort(); } int nNumberOfUnalignedBasesAtBeginning = pCrossMatchInfoForRead->nUnpaddedReadStart_ - 1; // will be used many times below and pLocFrag->bHasPeakPositions() is more // expensive bool bReadHasPeakPositions = pLocFrag->pSeqPhredCalledBases_->bHasPeakPositions(); if ( bReadHasPeakPositions ) { pLocFrag->createPointPosArray2( pLocFrag->pSeqPhredCalledBases_->cWhichPeakPositionsAreUsed_, pLocFrag->pSeqPhredCalledBases_->length() * 1.1, false ); // bFillUpArray // copy the unaligned bases at the beginning plus the initial // aligned base for( int nPos = 1; nPos <= nNumberOfUnalignedBasesAtBeginning; ++nPos ) { pLocFrag->appendNtide( pLocFrag->ntideGetPhredCalledBase( nPos ) ); pLocFrag->appendPointPos( pLocFrag->nGetPointPosOfPhredCalledBase(nPos)); } } else { // copy the unaligned bases at the beginning plus the initial // aligned base for( int nPos = 1; nPos <= nNumberOfUnalignedBasesAtBeginning; ++nPos ) { pLocFrag->appendNtide( pLocFrag->ntideGetPhredCalledBase( nPos ) ); } } // this gives the consensus position of the leftmost aligned base in the read int nConsPosOfFirstAlignedBase = pContig->nPaddedIndexFast( pCrossMatchInfoForRead->nUnpaddedContigStart_ ); // this would be a good place to set the alignment clipping pLocFrag->nAlignClipStart_ = nConsPosOfFirstAlignedBase; // This is probably the most important value in the feature of adding reads pLocFrag->nAlignStartPos_ = nConsPosOfFirstAlignedBase - nNumberOfUnalignedBasesAtBeginning; // these point to the next base int nUnpaddedConsPos = pCrossMatchInfoForRead->nUnpaddedContigStart_; int nUnpaddedReadPos = pCrossMatchInfoForRead->nUnpaddedReadStart_; int nIndel = 0; while( nUnpaddedReadPos <= pCrossMatchInfoForRead->nUnpaddedReadEnd_ ) { // check if we are entering an indel int nTypeOfNextAlignedSegment = nNOINDEL; int nMaxReadPos = pCrossMatchInfoForRead->nUnpaddedReadEnd_; if ( pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { if ( nIndel < pCrossMatchInfoForRead->pCrossMatchIndelList_->length() ) { crossMatchIndel* pCrossMatchIndel = (*(pCrossMatchInfoForRead->pCrossMatchIndelList_))[ nIndel ]; if ( pCrossMatchIndel->soType_[0] == 'D' ) { // cons bbbbb // read b***b // ^ pCrossMatchIndel->nUnpaddedReadLeft_ points // here, which is nUnpaddedReadPos - 1 if ( ( nUnpaddedReadPos - 1 ) == pCrossMatchIndel->nUnpaddedReadPos_ ) { nTypeOfNextAlignedSegment = nDELETION; } else { nMaxReadPos = pCrossMatchIndel->nUnpaddedReadPos_; nTypeOfNextAlignedSegment = nNOINDEL; } } else { assert( pCrossMatchIndel->soType_[0] == 'I' ); // consed b***b // read bbbbb // ^ pCrossMatchIndel->nUnpaddedReadLeft_ // points here, which is nUnpaddeReadPos if ( nUnpaddedReadPos == pCrossMatchIndel->nUnpaddedReadPos_ ) { nTypeOfNextAlignedSegment = nINSERTION; } else { nTypeOfNextAlignedSegment = nNOINDEL; nMaxReadPos = pCrossMatchIndel->nUnpaddedReadPos_ - 1; } } } // if ( nIndel < ... } if ( nTypeOfNextAlignedSegment == nNOINDEL ) { // this is just to fake out the system so it doesn't add any // pads when adding the first base. I don't think any should // be added. What came before? Either an indel or the // beginning of the alignment. If indel, we already added // the correct # of pads. If the beginning of the alignment, // we should not add any pads. int nConsPosOfLastBase = pContig->nPaddedIndexFast( nUnpaddedConsPos ) - 1; for( ; nUnpaddedReadPos <= nMaxReadPos; ++nUnpaddedReadPos, ++nUnpaddedConsPos ) { int nConsPosOfCurrentBase = pContig->nPaddedIndexFast( nUnpaddedConsPos ); int nPadsToAdd = nConsPosOfCurrentBase - nConsPosOfLastBase - 1; // typically nPadsToAdd will be zero if ( nPadsToAdd ) { // use quality of last base added int nLastIndex = pLocFrag->nCurrentLength_ - 1; Quality qual = pLocFrag->RWTValOrderedVector::operator[]( nLastIndex ).qualGetQuality(); // since Fragment is 1-based, the last index is // pLocFrag->nCurrentLength_ int nPointPosOfBaseAboutToBeAdded = !bReadHasPeakPositions ? 0 : pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos ); for( int nPad = 0; nPad < nPadsToAdd; ++nPad ) { pLocFrag->appendNtide( Ntide('*', qual ) ); if ( bReadHasPeakPositions ) { pLocFrag->appendPointPos( nPointPosOfBaseAboutToBeAdded ); } } } pLocFrag->appendNtide( pLocFrag->ntideGetPhredCalledBase( nUnpaddedReadPos ) ); if ( bReadHasPeakPositions ) { pLocFrag->appendPointPos( pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos ) ); } nConsPosOfLastBase = nConsPosOfCurrentBase; } // after leaving this loop, nUnpaddedReadPos and // nUnpaddedConsPos both point // to the base *after* this aligned segment } else if ( nTypeOfNextAlignedSegment == nINSERTION ) { // pointers start out: // v nUnpaddedConsPos // consensus: bbb***bbb // read: bbbbbbbbb // ^ nUnpaddedReadPos crossMatchIndel* pCrossMatchIndel = (*(pCrossMatchInfoForRead->pCrossMatchIndelList_))[ nIndel ]; assert( nUnpaddedReadPos == pCrossMatchIndel->nUnpaddedReadPos_ ); assert( pCrossMatchIndel->soType_[0] == 'I' ) ; // "I": // consensus: bbb***bbb // read: bbbbbbbbb // In this case, the nUnpaddedConsPos does not // change. Read bases would be moved // without incrementing nUnpaddedConsPos. int nSizeOfInsertion = pCrossMatchIndel->nMultiplicity_; // notice that the nUnpaddedConsPos is not // incremented for( int nBasesAdded = 0; nBasesAdded < nSizeOfInsertion; ++nBasesAdded, ++nUnpaddedReadPos ) { pLocFrag->appendNtide( pLocFrag->ntideGetPhredCalledBase( nUnpaddedReadPos ) ); if ( bReadHasPeakPositions ) { pLocFrag->appendPointPos( pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos ) ); } } // nUnpaddedReadPos is left after the deletion, // v nUnpaddedConsPos // consensus: bbb***bbb // read: bbbbbbbbb // pointing here: ^ // Note that the consensus may have more than 3 pads (or // whatever the # of the insertion is). In such a case, we // may need to add some pads to the read. int nConsPosBeforeInsertion = pContig->nPaddedIndexFast( nUnpaddedConsPos - 1 ); int nConsPosAfterInsertion = pContig->nPaddedIndexFast( nUnpaddedConsPos ); int nPadsInConsensus = nConsPosAfterInsertion - nConsPosBeforeInsertion - 1; // typically these will be equal int nPadsToAdd = nPadsInConsensus - nSizeOfInsertion; if ( nPadsToAdd > 0 ) { // use quality of last base added int nLastBaseIndex = pLocFrag->nCurrentLength_ - 1; Quality qual = pLocFrag->RWTValOrderedVector::operator[]( nLastBaseIndex ).qualGetQuality(); // the reason for the -1 is that the loop about leaves // nUnpaddedReadPos pointing to the base after the last // base added. It is probably ok to use the point position // of either base nUnpaddedReadPos - 1 or nUnpaddedReadPos, // but I feel more comfortable with the former since we // know it is a base while the latter may be off the end of the // read (wouldn't happen if crossmatch is doing its job) int nPointPosOfLastBaseAdded = !bReadHasPeakPositions ? 0 : pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos - 1 ); for( int nPad = 0; nPad < nPadsToAdd; ++nPad ) { pLocFrag->appendNtide( Ntide('*', qual ) ); if ( bReadHasPeakPositions ) { pLocFrag->appendPointPos( nPointPosOfLastBaseAdded ); } } } ++nIndel; // do not look at that indel again } // if ( nTypeOfNextAlignedSegment == nNOINDEL ) { else else { crossMatchIndel* pCrossMatchIndel = (*(pCrossMatchInfoForRead->pCrossMatchIndelList_))[ nIndel ]; assert( nTypeOfNextAlignedSegment == nDELETION ); assert( pCrossMatchIndel->soType_[0] == 'D' ); // "D": // v nUnpaddedConsPos here, I think // consensus: bbbbbbbbb // read: bbb***bbb // ^ nUnpaddedReadPos here, I think // In this case, we could use the D indel to add pads and increase // nUnpaddedConsPos. // nUnpaddedReadPos is not changed. // v nConsPosBeforeDeletion // consensus: bbbbbbbbb // read: bbb***bbb // ^ nConsPosAfterDeletion int nConsPosBeforeDeletion = pContig->nPaddedIndexFast( nUnpaddedConsPos - 1 ); int nConsPosAfterDeletion = pContig->nPaddedIndexFast( nUnpaddedConsPos + pCrossMatchIndel->nMultiplicity_ ); int nPadsToAdd = nConsPosAfterDeletion - nConsPosBeforeDeletion - 1; // get quality to use for pads--just the quality of the last // base added. int nLastIndex = pLocFrag->nCurrentLength_ - 1; Quality qual = pLocFrag->RWTValOrderedVector::operator[]( nLastIndex ).qualGetQuality(); int nPointPosOfLastAddedBase = !bReadHasPeakPositions ? 0 : pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos - 1 ); for( int nPad = 0; nPad < nPadsToAdd; ++nPad ) { pLocFrag->appendNtide( Ntide('*', qual ) ); if ( bReadHasPeakPositions ) { pLocFrag->appendPointPos( nPointPosOfLastAddedBase ); } } nUnpaddedConsPos += pCrossMatchIndel->nMultiplicity_; // do not move nUnpaddedReadPos // Thus they end up: // v nUnpaddedConsPos here // consensus: bbbbbbbbb // read: bbb***bbb // ^ nUnpaddedReadPos here ++nIndel; } // if ( nTypeOfNextAlignedSegment == nNOINDEL ) { else } // so we've added all the bases within the aligned region. Now add // whatever unaligned bases are left. // now we have finished with the aligned bases // this would be a good place to set the alignment clipping pLocFrag->nAlignClipEnd_ = pContig->nPaddedIndexFast( pCrossMatchInfoForRead->nUnpaddedContigEnd_ ); pLocFrag->nAlignClipEnd_ = pLocFrag->nGetAlignEnd(); // We now need to copy any remaining bases in the read for( nUnpaddedReadPos = pCrossMatchInfoForRead->nUnpaddedReadEnd_ + 1; nUnpaddedReadPos <= pLocFrag->pSeqPhredCalledBases_->nGetEndIndex(); ++nUnpaddedReadPos ) { pLocFrag->appendNtide( pLocFrag->ntideGetPhredCalledBase( nUnpaddedReadPos ) ); if ( bReadHasPeakPositions ) pLocFrag->appendPointPos( pLocFrag->nGetPointPosOfPhredCalledBase( nUnpaddedReadPos ) ); } pLocFrag->calculateHighQualitySegmentOfRead(); pLocFrag->fixTagCoordinates(); } // void addNewReads :: insertPadsInOneNewReadAndSetReadBases( void addNewReads :: updateListOfReadsInTopLevelWindow() { ChooseContig* pChooseContig = ConsEd::pGetConsEd()->pGetChooseContig(); for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFragNew = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; FragmentAndContig* pFragmentAndContig = new FragmentAndContig( pLocFragNew ); pChooseContig->daFragmentAndContig_.insert( pFragmentAndContig ); } if ( bIfAReadDoesNotFitPutItIntoItsOwnContig_ ) { for( int nRead = 0; nRead < aLocatedFragmentsThatDidNotAlign_.length(); ++nRead ) { LocatedFragment* pLocFragNew = aLocatedFragmentsThatDidNotAlign_[ nRead ]; FragmentAndContig* pFragmentAndContig = new FragmentAndContig( pLocFragNew ); pChooseContig->daFragmentAndContig_.insert( pFragmentAndContig ); } } pChooseContig->daFragmentAndContig_.resort(); ConsEd::pGetConsEd()->pGetGuiTopWindow()->showListOfContigs( pChooseContig ); } void addNewReads :: updateFirstAndLastDisplayableContigPositions() { Assembly* pAssembly = ConsEd::pGetAssembly(); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); pContig->updateFirstAndLastDisplayableContigPos(); } } void addNewReads :: makeReadsThatDidNotAlignIntoTheirOwnContigs() { for( int nRead = 0; nRead < aTryingToAddTheseReads_.length(); ++nRead ) { RWCString soTryingToAddThisRead = aTryingToAddTheseReads_[ nRead ]; if ( !bWasThisReadAdded( soTryingToAddThisRead ) ) makeThisReadIntoItsOwnContig( soTryingToAddThisRead ); } } bool addNewReads :: bWasThisReadAdded( const RWCString& soReadName ) { for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; if ( pLocFrag->soGetName() == soReadName ) return( true ); } return( false ); } void addNewReads :: makeThisReadIntoItsOwnContig( const RWCString& soReadName ) { RWCString soNewContigName = ConsEd::pGetConsEd()->pGetAssembly()->soGetNewContigName(); Contig* pContig = new Contig( (char*) soNewContigName.data(), 1, // new number of reads 1, // new number of base segments false, // complemented from the way // phrap created ConsEd::pGetAssembly() ); LocatedFragment* pLocFrag = new LocatedFragment( soReadName, 1, // position within contig false, // complemented pContig ); FileName filPHDFilename = soReadName + ".phd.1"; pLocFrag->setPHDFilename( filPHDFilename ); pLocFrag->setChromatFilename( soReadName ); // we also need to read tags and whole read items // It appears that readPHDFileForAddedRead does not do this. // So this should be fixed. pLocFrag->readPHDFileForAddedRead(); if ( pLocFrag->pSeqPhredCalledBases_->length() == 0 ) { popupErrorMessage( "read %s doesn't have any bases", (const char*) soReadName ); // zero length read so don't make a contig out of it // or else bGetDataStructureOK will complain return; } pContig->resize( pLocFrag->pSeqPhredCalledBases_->length() ); ConsEd::pGetConsEd()->pGetAssembly()->addContig( pContig ); pLocFrag->createPointPosArray2( pLocFrag->pSeqPhredCalledBases_->cWhichPeakPositionsAreUsed_, pLocFrag->pSeqPhredCalledBases_->length() * 1.1, false ); // bFillUpArray for( int nPos = pLocFrag->pSeqPhredCalledBases_->nGetStartIndex(); nPos <= pLocFrag->pSeqPhredCalledBases_->nGetEndIndex(); ++nPos ) { pLocFrag->append( (*(pLocFrag->pSeqPhredCalledBases_))[ nPos ] ); pLocFrag->appendPointPos( pLocFrag->nGetPointPosOfPhredCalledBase( nPos ) ); pContig->append( (*(pLocFrag->pSeqPhredCalledBases_))[ nPos ] ); } pLocFrag->nAlignClipStart_ = pContig->nGetStartIndex(); pLocFrag->nAlignClipEnd_ = pContig->nGetEndIndex(); pLocFrag->nQualClipStart_ = pContig->nGetStartIndex(); pLocFrag->nQualClipEnd_ = pContig->nGetEndIndex(); pContig->addLocatedFragment( pLocFrag ); pContig->updateLastDisplayableContigPos(); pContig->setUnpaddedPositionsArray(); pContig->setPaddedPositionsArray(); BaseSegment* pBaseSeg = new BaseSegment( pLocFrag, pContig->nGetStartIndex(), pContig->nGetEndIndex() ); pContig->baseSegArray_.addPtrToNewBaseSeg( pBaseSeg ); assert( pContig->baseSegArray_.bGetDataStructureOk() ); aLocatedFragmentsThatDidNotAlign_.insert( pLocFrag ); } void addNewReads :: convertVectorTagsToXsInNewReads() { for( int n = 0; n < aCrossMatchInfoForReads_.length(); ++n ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[n]->pLocFrag_; pLocFrag->convertVectorTagsToXs(); } } static int nCompareContigIntervals( contigInterval** ppContigInterval1, contigInterval** ppContigInterval2 ) { if ( (*ppContigInterval1)->pContig_ == (*ppContigInterval2)->pContig_ ) { if ( (*ppContigInterval1)->nConsPosLeft_ < (*ppContigInterval2)->nConsPosLeft_ ) return( -1 ); else if ( (*ppContigInterval1)->nConsPosLeft_ == (*ppContigInterval2)->nConsPosLeft_ ) return( 0 ); else return( 1 ); } else { if ( (*ppContigInterval1)->pContig_->soGetName() < (*ppContigInterval2)->pContig_->soGetName() ) return( -1 ); else if ( (*ppContigInterval1)->pContig_->soGetName() == (*ppContigInterval2)->pContig_->soGetName() ) return( 0 ); else return( 1 ); } } static bool bContigIntervalsIntersect(contigInterval* pCI1, contigInterval* pCI2 ) { if ( pCI1->pContig_ != pCI2->pContig_ ) return( false ); return( bIntervalsIntersect( pCI1->nConsPosLeft_, pCI1->nConsPosRight_, pCI2->nConsPosLeft_, pCI2->nConsPosRight_ ) ); } static void mergeContigIntervals( contigInterval* pCI1, contigInterval* pCI2 ) { // fprintf( pAO, "merging contig intervals %s %d %d and %d %d %s\n", // pCI1->pContig_->soGetName().data(), // pCI1->pContig_->nUnpaddedIndex( pCI1->nConsPosLeft_ ), // pCI1->pContig_->nUnpaddedIndex( pCI1->nConsPosRight_ ), // pCI2->pContig_->nUnpaddedIndex( pCI2->nConsPosLeft_ ), // pCI2->pContig_->nUnpaddedIndex( pCI2->nConsPosRight_ ), // pCI2->pContig_->soGetName().data() ); // pCI1 will become the new joined contigInterval // and pCI2 will become unused int nNewLeft = MIN( pCI1->nConsPosLeft_, pCI2->nConsPosLeft_ ); int nNewRight = MAX( pCI1->nConsPosRight_, pCI2->nConsPosRight_ ); pCI1->nConsPosLeft_ = nNewLeft; pCI1->nConsPosRight_ = nNewRight; pCI2->bInUse_ = false; } void addNewReads :: recalculateConsensusQualityValues() { cerr << "recalculating consensus quality values..."; cerr.flush(); aContigIntervals_.resize( (size_t) aCrossMatchInfoForReads_.length() ); for( int nRead = 0; nRead < aCrossMatchInfoForReads_.length(); ++nRead ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[ nRead ]->pLocFrag_; aContigIntervals_.insert( new contigInterval( pLocFrag->pContig_, pLocFrag->nAlignClipStart_, pLocFrag->nAlignClipEnd_ ) ); } // now sort them qsort( aContigIntervals_.data(), aContigIntervals_.length(), sizeof( contigInterval* ), ( int(*)( const void*, const void*) ) nCompareContigIntervals ); // check that they are sorted bool bSorted = true; int nContigInterval; for( nContigInterval = 1; nContigInterval < aContigIntervals_.length(); ++nContigInterval ) { contigInterval* pContigInterval1 = aContigIntervals_[ nContigInterval - 1 ]; contigInterval* pContigInterval2 = aContigIntervals_[ nContigInterval ]; if ( pContigInterval1->pContig_ == pContigInterval2->pContig_ ) { if ( pContigInterval1->nConsPosLeft_ > pContigInterval2->nConsPosLeft_ ) { fprintf( pAO, "out of order index %d with nConsPosLeft_ = %d and index %d with nConsPosLeft_ = %d in %s\n", nContigInterval - 1, pContigInterval1->nConsPosLeft_, nContigInterval, pContigInterval2->nConsPosLeft_, pContigInterval2->pContig_->soGetName().data() ); bSorted = false; } } else if ( pContigInterval1->pContig_->soGetName() > pContigInterval2->pContig_->soGetName() ) { fprintf( pAO, "out of order index %d with %s and index %d with %s\n", nContigInterval - 1, pContigInterval1->pContig_->soGetName().data(), nContigInterval, pContigInterval2->pContig_->soGetName().data() ); bSorted = false; } } if ( !bSorted ) { RWCString soError = "qsort failed"; THROW_ERROR( soError ); } if ( aContigIntervals_.length() >= 2 ) { // now try to join them together int nLeftContigInterval = 0; int nRightContigInterval = 1; contigInterval* pLeftContigInterval = aContigIntervals_[ nLeftContigInterval ]; contigInterval* pRightContigInterval = aContigIntervals_[ nRightContigInterval ]; while( true ) { assert( pLeftContigInterval->bInUse_ && pRightContigInterval->bInUse_ ); if ( bContigIntervalsIntersect( pLeftContigInterval, pRightContigInterval ) ) { mergeContigIntervals( pLeftContigInterval, pRightContigInterval ); if ( bEndOrAdvanceToNextUnused( nRightContigInterval ) ) break; pRightContigInterval = aContigIntervals_[ nRightContigInterval ]; } else { if ( bEndOrAdvanceToNextUnused( nLeftContigInterval ) ) break; pLeftContigInterval = aContigIntervals_[ nLeftContigInterval ]; nRightContigInterval = nLeftContigInterval; if ( bEndOrAdvanceToNextUnused( nRightContigInterval ) ) break; pRightContigInterval = aContigIntervals_[ nRightContigInterval ]; } } } for( nContigInterval = 0; nContigInterval < aContigIntervals_.length(); ++nContigInterval ) { contigInterval* pContigInterval = aContigIntervals_[ nContigInterval ]; if ( ( nContigInterval < 10 ) || ( nContigInterval < 10000 && nContigInterval % 1000 == 0 ) || ( nContigInterval % 20000 == 0 ) ) { cerr << "now recalculating consensus qualities for interval " << soAddCommas( nContigInterval ) << " out of " << soAddCommas( aContigIntervals_.length() - 1 ) << endl; } if ( !pContigInterval->bInUse_ ) continue; // fprintf( pAO, "recalculating consensus qualities from %d to %d of %s\n", // pContigInterval->pContig_->nUnpaddedIndex( pContigInterval->nConsPosLeft_ ), // pContigInterval->pContig_->nUnpaddedIndex( pContigInterval->nConsPosRight_ ), // pContigInterval->pContig_->soGetName().data() ); // fflush( pAO ); pContigInterval->pContig_->recalculateConsensusQualitiesAndChange( pContigInterval->nConsPosLeft_, pContigInterval->nConsPosRight_ ); } cerr << "done" << endl; cerr.flush(); } bool addNewReads :: bEndOrAdvanceToNextUnused( int& nContigInterval ) { ++nContigInterval; while( nContigInterval < aContigIntervals_.length() ) { if ( aContigIntervals_[ nContigInterval ]->bInUse_ ) return( false ); ++nContigInterval; } return( true ); } bool addNewReads :: bCheckThatAceFileOnDiskIsCurrent() { if ( ConsEd::pGetAssembly()->bChanged() ) { popupErrorMessage( "You have made unsaved changes. You must save the assembly before adding new reads" ); return( false ); } return( true ); } FileName addNewReads :: filMakeAlignmentsFilename() { time_t timee; time( &timee ); const int nTimeSize = 100; char szTime[ nTimeSize ]; // all this funny business is just to hide this from SCCS strftime( szTime, nTimeSize, "%H""%M""%S", localtime( &timee ) ); FileName fil = "addReadsAlignments"; filAlignments_ += szTime; return( fil ); } // void addNewReads :: zeroAddNewReadsContigCounts() { // ConsEd::pGetAssembly()->zeroAddNewReadsContigCounts(); // bOKToUseContigCounts_ = true; // } void addNewReads :: checkThatCrossMatchRunWithCorrectParameters( FILE* pAlignmentsFile ) { if ( fgets( szLine_, SZLINE_SIZE, pAlignmentsFile ) == NULL ) { THROW_ERROR( "premature end of cross_match alignment file" ); } ++nCurrentLine_; if ( strstr( szLine_, "-alignment" ) && !strstr( szLine_, "-discrep_lists" ) ) { RWCString soErrorMessage = "addReads2Consed.perl is not the latest. In subroutine runCrossMatchBetweenNewReadsAndContigs, cross_match should be run with -discrep_lists instead of -alignments Please download a fresh copy from http://bozeman.mbt.washington.edu/consed/consed.html"; popupErrorMessage( soErrorMessage ); // in guiTopWindow, the exception will be caught but no // error message reported THROW_ERROR( soErrorMessage ); } } // not for old addNewReads void addNewReads :: readAlignmentFileHeaderToGetPhdBalls( FILE* pAlignmentsFile ) { // argh! No, this would delete the phdballs from other alignments // I believe the problem was just with selectRegions which now has // its own aArrayofPhdBallsAll_ aArrayOfPhdBallsForOneAlignment_.clear(); bool bFound = false; while( !bFound ) { FGETS_OR_PARSE_PANIC( "premature end of file while looking for BEGIN_PHDBALLS" ); // BEGIN_PHDBALLS is the current header designation // START_PHDBALLS was prior to June 19, 2008 if ( ( memcmp( szLine_, "BEGIN_PHDBALLS", 14 ) == 0 ) || ( memcmp( szLine_, "START_PHDBALLS", 14 ) == 0 ) ) { bFound = true; } } bFound = false; while( !bFound ) { FGETS_OR_PARSE_PANIC( "premature end of file while looking for END_PHDBALLS" ); if( memcmp( szLine_, "END_PHDBALLS", 12 ) == 0 ) { bFound = true; } else { // line looks like: // ../phdball_dir/phd.ball.3 RWCString soPhdBallPath( szLine_ ); soPhdBallPath.stripAllWhitespaceExceptInternal(); aArrayOfPhdBallsForOneAlignment_.insert( soPhdBallPath ); } } } // not for old addNewReads void addNewReads :: readBasesAndQualitiesForAddedReads() { FILE* pNewPhdBall = NULL; // read the phd balls for the reads that are aligned and // set the qualities and full length of bases // The aligned reads are in the addNewReads::aCrossMatchInfoForReads_ // but it isn't fast to search. So make it fast. readsSortedByReadName aAlignedReads; aAlignedReads.soName_ = "addNewReadsWithExistingAlignments::aAlignedReads"; int nNewSize = aCrossMatchInfoForReads_.length() - nLastSizeOfCrossMatchInfoForReads_; aAlignedReads.resize( nNewSize ); for( int n = nLastSizeOfCrossMatchInfoForReads_; n < aCrossMatchInfoForReads_.length(); ++n ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[n]->pLocFrag_; aAlignedReads.insert( pLocFrag ); } // we are now done using nLastSizeOfCrossMatchInfoForReads_ so set // it for next time we need to read phdballs: nLastSizeOfCrossMatchInfoForReads_ = aCrossMatchInfoForReads_.length(); aAlignedReads.resort(); // now aAlignedReads is ready for pFindReadByName which occurs thousands // or millions of times below for( int nPhdBall = 0; nPhdBall < aArrayOfPhdBallsForOneAlignment_.length(); ++nPhdBall ) { FileName filPhdBall = aArrayOfPhdBallsForOneAlignment_[ nPhdBall ]; fprintf( stderr, "reading phdball %s...\n", filPhdBall.data() ); ConsEd::pGetAssembly()->readFileOfPhdFilesForAddedReads( aAlignedReads, filPhdBall, false, // bCreateNewPhdBall, pNewPhdBall ); // now add WA item so that saveAssembly writes it. RWCString soTimestamp = soGetDateTime( nNoSlashes ); wholeAssemblyItem* pWA = new wholeAssemblyItem( "phdBall", "consed", soTimestamp, filPhdBall ); ConsEd::pGetAssembly()->addWholeAssemblyItem( pWA ); } fprintf( stderr, "done\n" ); } // not for old addNewReads void addNewReads :: readBasesAndQualitiesForSelectRegions() { FILE* pNewPhdBall = NULL; openNewPhdBall( pNewPhdBall ); // read the phd balls for the reads that are aligned and // set the qualities and full length of bases // The aligned reads are in the addNewReads::aCrossMatchInfoForReads_ // but it isn't fast to search. So make it fast. readsSortedByReadName aAlignedReads; aAlignedReads.soName_ = "addNewReadsWithExistingAlignments::aAlignedReads"; int nNewSize = aCrossMatchInfoForReads_.length() - nLastSizeOfCrossMatchInfoForReads_; aAlignedReads.resize( nNewSize ); for( int n = nLastSizeOfCrossMatchInfoForReads_; n < aCrossMatchInfoForReads_.length(); ++n ) { LocatedFragment* pLocFrag = aCrossMatchInfoForReads_[n]->pLocFrag_; aAlignedReads.insert( pLocFrag ); } // we are now done using nLastSizeOfCrossMatchInfoForReads_ so set // it for next time we need to read phdballs: nLastSizeOfCrossMatchInfoForReads_ = aCrossMatchInfoForReads_.length(); aAlignedReads.resort(); // now aAlignedReads is ready for pFindReadByName which occurs thousands // or millions of times below for( int nPhdBall = 0; nPhdBall < aArrayOfPhdBallsAll_.length(); ++nPhdBall ) { FileName filPhdBall = aArrayOfPhdBallsAll_[ nPhdBall ]; fprintf( stderr, "reading phdball %s...\n", filPhdBall.data() ); ConsEd::pGetAssembly()->readFileOfPhdFilesForAddedReads( aAlignedReads, filPhdBall, true, // bCreateNewPhdBall, pNewPhdBall ); } fclose( pNewPhdBall ); fprintf( stderr, "done\n" ); } // not for old addNewReads void addNewReads :: readAlignmentFile( const FileName& filAlignmentFile ) { filAlignments_ = filAlignmentFile; // save for error messages FILE* pAlignmentsFile = fopen( filAlignmentFile.data(), "r" ); if ( !pAlignmentsFile ) { THROW_FILE_ERROR( filAlignmentFile ); } nCurrentLine_ = 0; int nNumberOfAlignments = nGetNumberOfAlignments( pAlignmentsFile ); aCrossMatchInfoForReads_.increaseMaxLengthIfNecessary( nNumberOfAlignments ); rewind( pAlignmentsFile ); nCurrentLine_ = 0; readAlignmentFileHeaderToGetPhdBalls( pAlignmentsFile ); // does the new LocatedFragment in parseAlignmentHeaderLine parseAlignmentsAndAddReads2( pAlignmentsFile ); fclose( pAlignmentsFile ); } int addNewReads :: nGetNumberOfAlignments( FILE* pAlignmentsFile ) { int nNumberOfAlignments = 0; bool bEndOfFile = false; while( !bEndOfFile ) { if ( fgets( szLine_, SZLINE_SIZE, pAlignmentsFile ) == NULL ) { bEndOfFile = true; } else { ++nCurrentLine_; if ( memcmp( szLine_, "ALIGNMENT", 9 ) == 0 ) { ++nNumberOfAlignments; } } } // while( !bEndOfFile ) { return nNumberOfAlignments; } void addNewReads :: doItSelectRegions() { pCP->bOKToUseGui_ = false; ConsEd* pConsed = new ConsEd(); maybeTerminateIfAnotherReadWriteConsed(); terminateIfDirectoryIsNotWritable(); // this does "new Assembly" ConsEd::pGetConsEd()->openAssemblyFile( filAceFileToOpen_ ); readAndProcessRegions(); parseAndSelectAlignments(); readBasesAndQualitiesForSelectRegions(); insertPadsInContigs(); insertPadsInNewReadsAndSetReadBases(); convertVectorTagsToXsInNewReads(); FileName filNewAceFile = ConsEd::pGetAssembly()->filSaveAssemblyToUserSpecifiedOrNextAvailableVersionOfAceFile(); writeNewAceFileNameToFile( filNewAceFile ); } static RWCString soRegionUsage = "(seq name) (start pos) (end pos) such as chr1 125,201,573 125,201,764 (commas optional)"; // part of selectRegions void addNewReads :: readAndProcessRegions() { // go through contigs and record those that are made from a region of // a sequence Assembly* pAssembly = ConsEd::pGetAssembly(); pRegionsArray_ = new regionOfSequenceArray( pAssembly->nNumContigs(), "addNewReads::pRegionsArray_" ); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); bool bContigHasReferenceRead = false; if ( pContig->pReferenceSequence_ ) { LocatedFragment* pLocFrag = pContig->pReferenceSequence_; for( int nWR = 0; nWR < pLocFrag->aWholeReadItems_.length(); ++nWR ) { wholeReadItem* pWR = pLocFrag->aWholeReadItems_[ nWR ]; if ( pWR->soType_ == "referenceSequence" ) { RWCTokenizer tok( pWR->soData_ ); RWCString soSequenceName = tok(); RWCString soStart = tok(); RWCString soEnd = tok(); RWCString soFastaFile = tok(); soStart.removeAllCommas(); soEnd.removeAllCommas(); int n1StartPos; int n1EndPos; assert( bIsNumeric( soStart ) ); assert( bIsNumeric( soEnd ) ); n1StartPos = atoi( soStart.data() ); n1EndPos = atoi( soEnd.data() ); regionOfSequence* pRegion = new regionOfSequence( soSequenceName, n1StartPos, n1EndPos, pContig ); pRegionsArray_->insert( pRegion ); bContigHasReferenceRead = true; break; } } // for( int nWR = 0; ... } // if ( pContig->pReferenceSequence_ ) { if ( !bContigHasReferenceRead ) { cerr << "warning: " << pContig->soGetName() << " has no reference sequence" << endl; } } // for( int nContig = 0; ... pRegionsArray_->resort(); // Check that every region has a contig: FILE* pRegions = fopen( filRegions_.data(), "r" ); if ( !pRegions ) { THROW_FILE_ERROR( filRegions_ ); } // I'm now checking that the fasta file in the WR item matches the // fasta file in the regions file. Presumably the contigs were // made by selectRegions.perl which used the fasta file to make // the contig. int nLine = 0; while( fgets( soLine.data(), nMaxLineSize, pRegions ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); ++nLine; if ( soLine.bStartsWith( "END_SEQ_FASTA" ) ) { break; } } while( fgets( soLine.data(), nMaxLineSize, pRegions ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); ++nLine; // ignore whitespace (June 2009) soLine.stripTrailingWhitespaceFast(); if ( soLine.bIsNull() ) continue; // looks like: // chr1 103,205,104 103,205,289 // (might not have commas ); RWCTokenizer tok( soLine ); RWCString soSequenceName = tok(); RWCString soStartPos = tok(); RWCString soEndPos = tok(); if ( soEndPos.bIsNull() ) { RWCString soError = "line " + RWCString( (long) nLine ) + " with contents: " + soLine + " should be of form (seq name) (start) (end) but it doesn't have 3 tokens"; THROW_ERROR2( soError ); } soStartPos.removeAllCommas(); soEndPos.removeAllCommas(); if ( !bIsNumeric( soStartPos ) ) { RWCString soError = "Fatal Error: " + soLine + " has start position " + soStartPos + " which is not numeric " + soRegionUsage; THROW_ERROR( soError ); } if ( !bIsNumeric( soEndPos ) ) { RWCString soError = "Fatal Error: " + soLine + " has end position " + soEndPos + " which is not numeric " + soRegionUsage; THROW_ERROR( soError ); } int n1StartPos = atoi( soStartPos.data() ); int n1EndPos = atoi( soEndPos.data() ); RWCString soStartPosWithCommas = soAddCommas( n1StartPos ); RWCString soContigName = soSequenceName + "_" + soStartPosWithCommas; regionOfSequence myRegionOfSequence( soSequenceName, n1StartPos, n1EndPos, NULL ); // pRegionsArray_ came from the Contigs. So this checks if there // is a contig corresponding to the region just read regionOfSequence* pFoundRegion = pRegionsArray_->pFindFast( &myRegionOfSequence ); if ( !pFoundRegion ) { RWCString soError = "could not find contig corresponding with sequence with WR item indicating it came from sequence "; soError += soSequenceName; soError += " at position "; soError += RWCString( (long) n1StartPos ); THROW_ERROR( soError ); } // Check that the contig is the same length as the region it is // supposed to represent Contig* pFoundContig = pFoundRegion->pContig_; int nLength = n1EndPos - n1StartPos + 1; if ( pFoundContig->nGetUnpaddedSeqLength() != nLength ) { RWCString soError = "Fatal error: region " + soLine + " has length " + RWCString( (long) nLength ) + " while existing contig " + pFoundContig->soGetName() + " has length " + pFoundContig->nGetUnpaddedSeqLength(); THROW_ERROR( soError ); } } fclose( pRegions ); } // part of selectRegions void addNewReads :: parseAndSelectAlignments() { FILE* pAlignmentsFOF = fopen( filAlignmentsFOF_.data(), "r" ); if ( !pAlignmentsFOF ) { THROW_FILE_ERROR( filAlignmentsFOF_ ); } const int nMaxAlignmentsFileLength = 2000; FileName filAlignmentsFile( (size_t) nMaxAlignmentsFileLength ); while( fgets( filAlignmentsFile.data(), nMaxLineSize, pAlignmentsFOF ) != NULL ) { filAlignmentsFile.nCurrentLength_ = strlen( filAlignmentsFile.data() ); filAlignmentsFile.stripTrailingWhitespaceFast(); parseAndSelectAlignmentsFromOneFile( filAlignmentsFile ); } fclose( pAlignmentsFOF ); } // part of selectRegions void addNewReads :: parseAndSelectAlignmentsFromOneFile( const RWCString& filAlignmentsFile ) { filAlignments_ = filAlignmentsFile; // save for error messages FILE* pAlignments = fopen( filAlignmentsFile.data(), "r" ); if ( !pAlignments ) { THROW_FILE_ERROR( filAlignmentsFile ); } nCurrentLine_ = 0; // puts result in aArrayOfPhdBalls_ readAlignmentFileHeaderToGetPhdBalls( pAlignments ); aArrayOfPhdBallsAll_.appendArray( aArrayOfPhdBallsForOneAlignment_ ); regionOfSequence* pRegion; // actually this is a region of a chromosome // or subject sequence that cross_match ran against crossMatchInfoForRead* pCrossMatchInfoForRead; bool bOKToLookForDiscrepancyLines = false; while( fgets( soLine.data(), nMaxLineSize, pAlignments ) != NULL ) { ++nCurrentLine_; if ( memcmp( soLine.data(), "ALIGNMENT", 9 ) == 0 ) { // stores result in aCrossMatchInfoForReads_ if we want // this ALIGNMENT line parseAlignmentHeaderLineSelectRegions( soLine, pCrossMatchInfoForRead, pRegion, bOKToLookForDiscrepancyLines ); } else if ( bOKToLookForDiscrepancyLines ) { if ( memcmp( soLine.data(), "DISCREPANCY", 11 ) == 0 ) { // stores result to list in pCrossMatchInfoForRead parseOneDiscrepancyForOneAlignmentSelectRegions( soLine, pRegion, pCrossMatchInfoForRead ); } } } fclose( pAlignments ); } // just have the memory available so we don't have to malloc // this with each alignment static regionOfSequence queryRegionOfSequence("", 0, 0, NULL ); void addNewReads :: parseAlignmentHeaderLineSelectRegions( RWCString& soLine, crossMatchInfoForRead*& pCrossMatchInfoForRead, regionOfSequence*& pRegion, bool& bOKToLookForDiscrepancyLines ) { // if reached here, this is an ALIGNMENT line // find the sequence and the position of the subject sequence // Looks like this: // ALIGNMENT 28 0.00 0.00 0.00 s_1_1_251_529 1 28 (0) chr5 83941889 83941916 (96915950) // 0 1 2 3 4 5 6 7 8 9 10 11 12 // ALIGNMENT 25 3.57 0.00 0.00 s_1_1_886_8 1 28 (0) C chr5 (854045) 180003821 180003794 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // ALIGNMENT 27 0.00 0.00 0.00 s_1_1_300_628 1 28 (0) C chr5 (126239857) 54618009 54617982 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // find 9th token. It is either a C or the sequence. soLine.nCurrentLength_ = strlen( soLine.data() ); RWCTokenizer tok( soLine ); tok(); // 0 tok(); // 1 tok(); // 2 tok(); // 3 tok(); // 4 soCurrentRead_ = tok(); // 5 RWCString soQueryStart = tok(); // 6 RWCString soQueryEnd = tok(); // 7 tok(); // 8 RWCString soSequenceName = tok(); // 9--might be a C RWCString soSubjectStart; RWCString soSubjectEnd; if ( soSequenceName == "C" ) { bCurrentReadIsComplemented_ = true; soSequenceName = tok(); // 10 tok(); // 11 soSubjectEnd = tok(); // 12 soSubjectStart = tok(); // 13 } else { bCurrentReadIsComplemented_ = false; soSubjectStart = tok(); // 10 soSubjectEnd = tok(); // 11 } if ( !bIsNumeric( soSubjectStart ) ) { THROW_ERROR( "Fatal error: line " + soLine + " has subject start = " + soSubjectStart + " which is not numeric" ); } if ( !bIsNumeric( soSubjectEnd ) ) { THROW_ERROR( "Fatal error: line " + soLine + " has subject end = " + soSubjectEnd + " which is not numeric" ); } int nSubjectStart = atoi( soSubjectStart.data() ); int nSubjectEnd = atoi( soSubjectEnd.data() ); queryRegionOfSequence.soSequenceName_ = soSequenceName; queryRegionOfSequence.n1StartPos_ = nSubjectStart; queryRegionOfSequence.n1EndPos_ = nSubjectEnd; pRegion = pRegionsArray_->pGetOverlappingRegion( &queryRegionOfSequence ); if ( !pRegion ) { bOKToLookForDiscrepancyLines = false; return; // look for another ALIGNMENT line } // convert to contig coordinates int nUnpaddedContigStart = nSubjectStart - pRegion->n1StartPos_ + 1; int nUnpaddedContigEnd = nSubjectEnd - pRegion->n1StartPos_ + 1; // finish parsing assert( bIsNumeric( soQueryStart.data() ) ); assert( bIsNumeric( soQueryEnd.data() ) ); int nUnpaddedReadStart = atoi( soQueryStart.data() ); int nUnpaddedReadEnd = atoi( soQueryEnd.data() ); pCurrentLocFrag_ = new LocatedFragment( soCurrentRead_, 1, // dummy value for alignment position bCurrentReadIsComplemented_, pRegion->pContig_ ); // pCurrentLocFrag_->soChemistry_ = "???"; this will be filled in // by processCommentAndDnaForAddedRead pCurrentLocFrag_->nVersion_ = 1; pCrossMatchInfoForRead = new crossMatchInfoForRead( pCurrentLocFrag_, nUnpaddedReadStart, nUnpaddedReadEnd, nUnpaddedContigStart, nUnpaddedContigEnd, bCurrentReadIsComplemented_ ); aCrossMatchInfoForReads_.insert( pCrossMatchInfoForRead ); // now look for discrepancy lines bOKToLookForDiscrepancyLines = true; } #define PARSE_PANIC_SOLINE( szMessage ) \ { RWCString soError = \ "crossmatch output file error detected from source file "; \ soError += __FILE__; \ soError += " at "; \ soError += ( RWCString( (long) __LINE__ ) + "\n" + \ "Error in file " + filAlignments_ + " at line " + \ RWCString( (long) nCurrentLine_ ) + ". Line:\n " + \ soLine.data() + "\n" + szMessage + "\n" ); \ THROW_ERROR( soError ); } // discrepancy lists look like this: // Notice that they always end with a blank line // ALIGNMENT 26 12.20 0.00 0.00 2_0007_2_7_537_772 1 41 (0) mm8_dna- // Contig 1064794 1064834 (1218865) // DISCREPANCY S 5 T(15) 1064798 ctggTcttttt // DISCREPANCY S 25 T(15) 1064818 ttggagTctttta // DISCREPANCY S 32 T(15) 1064825 cttttaTtttctg // DISCREPANCY S 34 T(15) 1064827 tttattTtctgct // DISCREPANCY S 35 T(15) 1064828 ttatttTctgctt // ALIGNMENT 41 0.00 0.00 0.00 2_0007_2_7_239_455 1 41 (0) mm8_dna- // Contig 53656 53696 (2230003) // ALIGNMENT 40 0.00 0.00 0.00 2_0007_2_7_302_125 1 40 (1) C mm8_dna- // Contig (1230903) 1052796 1052757 // ALIGNMENT 40 0.00 0.00 0.00 2_0007_2_7_37_441 1 40 (1) C mm8_dna-C // ontig (1238384) 1045315 1045276 // 12214 matching entries (first file). // Can also have: // DISCREPANCY S 85 C(15) 194570 tgcaggCcacctt // DISCREPANCY D 15 T(15) 194640 gtgcatTaacagt // DISCREPANCY I 748 G(15) 193899 attacaGgatgcc // DISCREPANCY D-8 187 A(15) 194467 ataagaAgtttgt // DISCREPANCY I-8 564 TTGATAAT(15) 571 tgatacTTGATAATgtctgg void addNewReads :: parseOneDiscrepancyForOneAlignmentSelectRegions( RWCString& soLine, regionOfSequence* pRegion, crossMatchInfoForRead* pCrossMatchInfoForRead ) { assert( pCrossMatchInfoForRead ); char szType[50]; char szNucleotideAndPosition[50]; int n1ReadPos; int n1SubjectPos; int nNumberOfConvertedArguments = sscanf( soLine.data(), "DISCREPANCY %s %d %s %d", szType, &n1ReadPos, szNucleotideAndPosition, &n1SubjectPos ); if ( nNumberOfConvertedArguments != 4 ) { soLine.nCurrentLength_ = strlen( soLine.data() ); PARSE_PANIC_SOLINE( " should have been a DISCREPANCY line but wrong number of tokens" ); } if ( szType[0] == 'S' ) return; // substitutions do not affect the alignment assert( szType[0] == 'D' || szType[0] == 'I' ); int nMultiplicity = 1; if ( strlen( szType ) != 1 ) { if ( szType[0] == 'D' ) { assert( sscanf( szType, "D-%d", &nMultiplicity ) == 1 ); } else { assert( sscanf( szType, "I-%d", &nMultiplicity ) == 1 ); } } // convert subject coordinate to contig coordinate int n1ContigPos = n1SubjectPos - pRegion->n1StartPos_ + 1; crossMatchIndel* pCrossMatchIndel = new crossMatchIndel( szType, n1ReadPos, n1ContigPos, nMultiplicity ); if ( !pCrossMatchInfoForRead->pCrossMatchIndelList_ ) { pCrossMatchInfoForRead->pCrossMatchIndelList_ = new mbtPtrOrderedVector( 2, // initial capacity "crossMatchIndelList" ); } pCrossMatchInfoForRead->pCrossMatchIndelList_->append( pCrossMatchIndel ); } // parseOneDiscrepancyForOneAlignment void addNewReads :: openNewPhdBall( FILE*& pPhdBall ) { FileName filTryThisPhdBall = pCP->filPhdBallDirectory_ + "/" + pCP->filFastStartupFile_; FileName filNewPhdBallFullPath = filTryThisPhdBall.filFindOneHigherThanHighestVersion3(); pCP->filNewFileOfPhdFilesFullPath_ = filNewPhdBallFullPath; pPhdBall = fopen( filNewPhdBallFullPath.data(), "w" ); if ( !pPhdBall ) { THROW_FILE_ERROR( filNewPhdBallFullPath ); } // add WA item so that saveToFile writes this in the ace file RWCString soTimestampForWritingPhdBall = soGetDateTime( nNoSlashes ); wholeAssemblyItem* pWA = new wholeAssemblyItem( "phdBall", "consed", soTimestampForWritingPhdBall, filNewPhdBallFullPath ); ConsEd::pGetAssembly()->addWholeAssemblyItem( pWA ); } // filOpenNewPhdBall( FILE*& pPhdBall ) { // void addNewReads :: runMiniAssembliesOnProtrudingReads() { // if ( !paContigEndReadList_ ) return; // RWCString soDateTimeDotInMiddle = soGetDateTime( nDotInMiddle ); // char* szTimestamp = szGetTime(); // FileName filFastaFOF = "addNewReadsExtendConsensusFasta." + // soDateTimeDotInMiddle + ".fof"; // FileName filAceFileFOF = "addNewReadsExtendConsensusAceFiles." + // soDateTimeDotInMiddle + ".fof"; // FILE* pFastaFOF = fopen( filFastaFOF.data(), "w" ); // if ( !pFastaFOF ) { // THROW_FILE_ERROR( filFastaFOF ); // } // for( int nContigEnd = 0; nContigEnd < paContigEndReadList_->length(); // ++nContigEnd ) { // contigEndReadList* pCERL = (*paContigEndReadList_)[ nContigEnd ]; // FileName filReadsToPhrap( (size_t) 1000 ); // filReadsToPhrap.nCurrentLength_ = // sprintf( filReadsToPhrap.data(), "%s_%s_%s.fasta", // szLeftOrRight( pCERL->nWhichEnd_ ), // pCERL->pContig_->soGetName().data(), // soDateTimeDotInMiddle.data() ); // FileName filReadsToPhrapQual = filReadsToPhrap + ".qual"; // fprintf( pFastaFOF, "%s\n", filReadsToPhrap.data() ); // FILE* pReads = fopen( filReadsToPhrap.data(), "w" ); // if ( !pReads ) { // THROW_FILE_ERROR( filReadsToPhrap ); // } // FILE* pQual = fopen( filReadsToPhrapQual.data(), "w" ); // if ( !pQual ) { // THROW_FILE_ERROR( filReadsToPhrapQual ); // } // for( int nCross = 0; nCross < pCERL->aCrossMatchInfoForRead_.length(); // ++nCross ) { // LocatedFragment* pLocFrag = // pCERL->aCrossMatchInfoForRead_[ nCross ]->pLocFrag_; // pLocFrag->writeReadToFasta( pReads, pQual, // true ); // bUseSeqPhredCalledBases // } // // need to figure out the consensus piece to extract // int nUnpaddedConsPosLeft; // int nUnpaddedConsPosRight; // if ( pCERL->nWhichEnd_ == nLeftGap ) { // nUnpaddedConsPosLeft = 1; // nUnpaddedConsPosRight = // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ // + pCP->nAddNewReadsAdditionalConsensusBasesBeyondReads_; // } // else { // nUnpaddedConsPosLeft = // pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ // - pCP->nAddNewReadsAdditionalConsensusBasesBeyondReads_; // nUnpaddedConsPosRight = // pCERL->pContig_->nGetUnpaddedEndIndex(); // } // if ( nUnpaddedConsPosLeft < 1 ) // nUnpaddedConsPosLeft = 1; // if ( nUnpaddedConsPosRight > pCERL->pContig_->nGetUnpaddedEndIndex() ) // nUnpaddedConsPosRight = pCERL->pContig_->nGetUnpaddedEndIndex(); // pCERL->nReferenceUnpaddedConsPosLeft_ = nUnpaddedConsPosLeft; // pCERL->nReferenceUnpaddedConsPosRight_ = nUnpaddedConsPosRight; // const int nMaxReferenceReadSize = 1000; // RWCString soReferenceRead( (size_t) nMaxReferenceReadSize ); // soReferenceRead.nCurrentLength_ = snprintf( soReferenceRead.data(), // nMaxReferenceReadSize, // "%s_%d_%d", // pCERL->pContig_->soGetName().data(), // nUnpaddedConsPosLeft, // nUnpaddedConsPosRight ); // fprintf( pReads, ">%s VERSION: 1 TIME: %s\n", // soReferenceRead.data(), // szTimestamp ); // fprintf( pQual, ">%s VERSION: 1 TIME: %s\n", // soReferenceRead.data(), // szTimestamp ); // pCERL->soReferenceReadName_ = soReferenceRead; // // extract these bases: // // first find nConsPosStart, the padded position of the // // unpadded position we want to start at // int nConsPosStart; // int nUnpadded = 0; // bool bFound = false; // int nConsPos; // for( nConsPos = 1; nConsPos <= pCERL->pContig_->nGetPaddedSeqLength(); // ++nConsPos ) { // if ( ! pCERL->pContig_->ntGetConsUnsafe( nConsPos ).bIsPad() ) { // ++nUnpadded; // if ( nUnpadded == nUnpaddedConsPosLeft ) { // nConsPosStart = nConsPos; // bFound = true; // break; // } // } // } // if (! bFound ) { // cerr << "couldn't find unpadded left " << nUnpaddedConsPosLeft << // " in contig " << pCERL->pContig_->soGetName() << " end: " << // szWhichGap( pCERL->nWhichEnd_ ) << " of length " << // pCERL->pContig_->nGetUnpaddedEndIndex() << " pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ = " << pCERL->nUnpaddedConsPosOfReadFurthestIntoContig_ << " pCP->nAddNewReadsAdditionalConsensusBasesBeyondReads_ = " << // pCP->nAddNewReadsAdditionalConsensusBasesBeyondReads_ << endl; // assert( false ); // } // pCERL->nReferencePaddedConsPosLeft_ = nConsPosStart; // int nBasesOnLine = 0; // const int nMaxBasesOnLine = 50; // bool bFoundRightEnd = false; // nUnpadded = nUnpaddedConsPosLeft - 1; // prepare for 1st ++ // for( nConsPos = nConsPosStart; // nConsPos <= pCERL->pContig_->nGetPaddedSeqLength(); // ++nConsPos ) { // if ( ! pCERL->pContig_->ntGetConsUnsafe( nConsPos ).bIsPad() ) { // ++nUnpadded; // // write base and quality // if ( nBasesOnLine >= nMaxBasesOnLine ) { // fputc( '\n', pReads ); // fputc( '\n', pQual ); // nBasesOnLine = 0; // } // fputc( pCERL->pContig_->ntGetConsUnsafeOnConsensus( nConsPos ).cGetBase(), // pReads ); // fprintf( pQual, // " %d", // pCERL->pContig_->ntGetConsUnsafeOnConsensus( nConsPos ).qualGetQuality() ); // if ( nUnpadded >= nUnpaddedConsPosRight ) { // pCERL->nReferencePaddedConsPosRight_ = nConsPos; // bFoundRightEnd = true; // break; // } // } // } // for( nConsPos = nConsPosStart; // assert( bFoundRightEnd ); // fputc( '\n', pReads ); // fputc( '\n', pQual ); // fclose( pReads ); // fclose( pQual ); // } // for( int nContigEnd = 0; // fclose( pFastaFOF ); // // run phrap at this point // RWCString soCommand = pCP->filAddNewReadsExtendConsensusScript_ + " " + // filFastaFOF + " " + filAceFileFOF; // cout << "about to run: " << soCommand << endl; // int nRetStat = system( (char*) soCommand.data() ); // if ( nRetStat != 0 ) { // char* szErrorMessage = strerror( nRetStat ); // if ( szErrorMessage ) { // RWCString soError = // "Something went wrong running " + soCommand + // "\n Gave error code " + RWCString( (long) nRetStat ) + // " which means " + // RWCString( szErrorMessage ); // THROW_ERROR2( soError ); // } // else { // RWCString soError = // "Something went wrong running " + soCommand + // "\n Gave error code " + RWCString( (long) nRetStat ) + // " which is out of range."; // THROW_ERROR2( soError ); // } // } // // now read the new assemblies // FILE* pAceFOF = fopen( filAceFileFOF.data(), "r" ); // if ( !pAceFOF ) { // THROW_FILE_ERROR( filAceFileFOF ); // } // const int nAceFileNameSize = 2000; // FileName filAceFileName( (size_t) nAceFileNameSize ); // int nCurrentAceLine = 0; // while( fgets( filAceFileName.data(), nMaxLineSize, pAceFOF ) != NULL ) { // filAceFileName.nCurrentLength_ = strlen( filAceFileName.data() ); // ++nCurrentAceLine; // int nContigEnd = nCurrentAceLine - 1; // contigEndReadList* pCERL = (*paContigEndReadList_)[ nContigEnd ]; // filAceFileName.stripTrailingWhitespaceFast(); // openAceFileAndAlignReads( filAceFileName, pCERL ); // } // fclose( pAceFOF ); // fixCrossMatchOutputCoordinates(); // } // void addNewReads :: runMiniAssembliesOnProtrudingReads() { void addNewReads :: fixCrossMatchOutputCoordinates() { contigEndReadList* pCERL = NULL; for( int nCross = 0; nCross < aCrossMatchInfoForReads_.length(); ++nCross ) { crossMatchInfoForRead* pCross = aCrossMatchInfoForReads_[ nCross ]; // if we are using the phrap alignment, rather than the cross_match // alignment, no need to fix the cross_match positions if ( pCross->bAlreadyUsedThis_ ) continue; // get corresponding contigEndReadList object if ( ! (pCERL && pCERL->pContig_ == pCross->pLocFrag_->pContig_ ) ) { contigEndReadList myCERL( pCross->pLocFrag_->pContig_, nLeftGap ); pCERL = paContigEndReadList_->pFindFast( &myCERL ); if ( !pCERL ) { // I believe this means that we didn't modify this consensus // perhaps because there were no reads protruding from it. // Thus no need to fix the output continue; } } pCross->nUnpaddedContigStart_ += pCERL->nUnpaddedBasesAddedOnLeftEndOfConsensus_; pCross->nUnpaddedContigEnd_ += pCERL->nUnpaddedBasesAddedOnLeftEndOfConsensus_; if ( pCross->pCrossMatchIndelList_ ) { for( int nIndel = 0; nIndel < pCross->pCrossMatchIndelList_->length(); ++nIndel ) { crossMatchIndel* pIndel = (*(pCross->pCrossMatchIndelList_))[ nIndel ]; pIndel->nUnpaddedConsPos_ += pCERL->nUnpaddedBasesAddedOnLeftEndOfConsensus_; } } } } void addNewReads :: openAceFileAndAlignReads( const FileName& filAceFileName, contigEndReadList* pCERL ) { fprintf( stderr, "processing %s of %s using ace file %s\n", szWhichGap( pCERL->nWhichEnd_ ), pCERL->pContig_->soGetName().data(), filAceFileName.data() ); bool bUsingPhdFilesSaved = ConsEd::pGetConsEd()->bUsingPhdFiles_; ConsEd::pGetConsEd()->bUsingPhdFiles_ = false; Assembly* pSecondaryAssembly = new Assembly( filAceFileName, false ); // bSetGlobalAssembly ConsEd::pGetConsEd()->bUsingPhdFiles_ = bUsingPhdFilesSaved; // the assembly should have just one contig which will contain the // the part of the reference sequence. Maybe I'll allow it to // have more than 1 contig, in case there is some minor contig. // But I am only interested in the contig that has the reference // sequence as part of it. LocatedFragment* pReferenceRead = pSecondaryAssembly->pGetLocatedFragmentByName( pCERL->soReferenceReadName_ ); if ( !pReferenceRead ) { // bad news--I guess you can't use this assembly fprintf( stderr, "Error: %s end of contig %s did not assemble correctly (reference read was not part of the assembly) so cannot extend this contig end\n", szLeftOrRight( pCERL->nWhichEnd_ ), pCERL->pContig_->soGetName().data() ); return; } else { printf( "found reference read %s in new assembly\n", pReferenceRead->soGetName().data() ); } Contig* pMainContigNewAssembly = pReferenceRead->pGetContig(); // watch out--phrap may have complemented the reference read printf( "read %s is complemented? %s\n", pReferenceRead->soGetName().data(), szPrintBool( pReferenceRead->bComp() ) ); if ( pReferenceRead->bComp() ) { pMainContigNewAssembly->complementContig(); } // make left contig so it looks like this: // ----------------------- existing consensus // ---------- reference read in phrap // ------ (read) // ------ (read) // . // . // . // int nReferencePaddedConsPosLeft = pCERL->nReferencePaddedConsPosLeft_; int nReferencePaddedConsPosRight = pCERL->nReferencePaddedConsPosRight_; bool bNeedToComplementBack = false; if ( pCERL->nWhichEnd_ == nLeftGap ) { bNeedToComplementBack = true; pCERL->pContig_->complementEndPoints( nReferencePaddedConsPosLeft, nReferencePaddedConsPosRight ); pCERL->pContig_->complementContig(); pMainContigNewAssembly->complementContig(); } // check that the reference read matches the corresponding location in the // consensus bool bSequencesMatch = true; int n1ReadPos = pReferenceRead->nGetStartIndex() - 1; // ready for first ++ int nConsPos; for( nConsPos = nReferencePaddedConsPosLeft; nConsPos <= nReferencePaddedConsPosRight; ++nConsPos ) { if ( pCERL->pContig_->ntGetCons( nConsPos ).bIsPad() ) continue; ++n1ReadPos; while( n1ReadPos <= pReferenceRead->nGetEndIndex() && pReferenceRead->ntideGet( n1ReadPos ).bIsPad() ) { ++n1ReadPos; } if ( n1ReadPos > pReferenceRead->nGetEndIndex() ) { RWCString soError = "the reference read was constructed from the consensus positions " + RWCString( (long) pCERL->nReferencePaddedConsPosLeft_ ) + " to " + RWCString( (long) pCERL->nReferencePaddedConsPosRight_ ) + " but the reference read in the phrap output was only " + RWCString( (long) pReferenceRead->nGetEndIndex() ) + " for " + szWhichGap( pCERL->nWhichEnd_ ) + " of " + pCERL->pContig_->soGetName(); THROW_ERROR( soError ); } if ( pCERL->pContig_->ntGetCons( nConsPos ).cGetBase() != pReferenceRead->ntideGet( n1ReadPos ).cGetBase() ) { bSequencesMatch = false; fprintf( stderr, "bases mismatch nConsPos %d: %c vs read pos %d: %c\n", nConsPos, pCERL->pContig_->ntGetCons( nConsPos ).cGetBase(), n1ReadPos, pReferenceRead->ntideGet( n1ReadPos ).cGetBase() ); } } if ( !bSequencesMatch ) { THROW_ERROR( "the reference read coming from phrap's output doesn't match the read sent to phrap that was part of the consensus" ); } // make a list of how many pads there are after each int nReferenceReadUnpaddedLength = pCERL->nReferenceUnpaddedConsPosRight_ - pCERL->nReferenceUnpaddedConsPosLeft_ + 1; mbtValVector aPadsAfterOldConsensusBases( (size_t) nReferenceReadUnpaddedLength, 1, 0, "aPadsAfterOldConsensusBases" ); mbtValVector aPadsAfterReferenceReadBases( (size_t) nReferenceReadUnpaddedLength, 1, // nOffset 0, // initial value "aPadsAfterReferenceReadBases" ); int n1Unpadded = 0; // ready for ++ assert( !pCERL->pContig_->ntGetCons( nReferencePaddedConsPosLeft ).bIsPad() ); for( nConsPos = nReferencePaddedConsPosLeft; nConsPos <= nReferencePaddedConsPosRight; ++nConsPos ) { if ( pCERL->pContig_->ntGetCons( nConsPos ).bIsPad() ) { ++aPadsAfterOldConsensusBases[ n1Unpadded ]; } else { ++n1Unpadded; } } assert( !pReferenceRead->ntideGet( pReferenceRead->nGetStartIndex() ).bIsPad() ); n1Unpadded = 0; // ready for ++ for( n1ReadPos = pReferenceRead->nGetStartIndex(); n1ReadPos <= pReferenceRead->nGetEndIndex(); ++n1ReadPos ) { if ( pReferenceRead->ntideGet( n1ReadPos ).bIsPad() ) { ++aPadsAfterReferenceReadBases[ n1Unpadded ]; } else { ++n1Unpadded; } } // now equalize pads // first, insert columns of pads into the old consensus. Before // actually doing this, let's see how much bigger we need to // make the consensus and make it that much bigger once int nTotalExtraPadsNeededInOldContig = 0; n1Unpadded = 0; for( nConsPos = nReferencePaddedConsPosLeft; nConsPos <= nReferencePaddedConsPosRight; ++nConsPos ) { if ( !pCERL->pContig_->ntGetCons( nConsPos ).bIsPad() ) { ++n1Unpadded; int nExtraPadsInNewAssembly = aPadsAfterReferenceReadBases[ n1Unpadded ] - aPadsAfterOldConsensusBases[ n1Unpadded ]; nTotalExtraPadsNeededInOldContig += nExtraPadsInNewAssembly; } } // increase the length of the contig once pCERL->pContig_->increaseMaxLengthIfNecessary( nTotalExtraPadsNeededInOldContig ); n1Unpadded = 0; // ready for ++ for( nConsPos = nReferencePaddedConsPosLeft; nConsPos <= nReferencePaddedConsPosRight; ++nConsPos ) { if ( !pCERL->pContig_->ntGetCons( nConsPos ).bIsPad() ) { ++n1Unpadded; int nExtraPadsInNewAssembly = aPadsAfterReferenceReadBases[ n1Unpadded ] - aPadsAfterOldConsensusBases[ n1Unpadded ]; if ( nExtraPadsInNewAssembly > 0 ) { for( int n = 0; n < nExtraPadsInNewAssembly; ++n ) { pCERL->pContig_->insertColOfPads( nConsPos + 1 ); } } } } // for( nConsPos = nReferencePaddedConsPosLeft; // insert columns of pads into the new assembly (I'm not so concerned // about resizing this contig many times since it is tiny) n1Unpadded = 0; // ready for ++ for( n1ReadPos = pReferenceRead->nGetStartIndex(); n1ReadPos <= pReferenceRead->nGetEndIndex(); ++n1ReadPos ) { if ( !pReferenceRead->ntideGet( n1ReadPos ).bIsPad() ) { ++n1Unpadded; int nExtraPadsInOldAssembly = aPadsAfterOldConsensusBases[ n1Unpadded ] - aPadsAfterReferenceReadBases[ n1Unpadded ]; if ( nExtraPadsInOldAssembly > 0 ) { for( int n = 0; n < nExtraPadsInOldAssembly; ++n ) { int nNewContigConsPos = pReferenceRead->nGetConsPosFromFragIndex( n1ReadPos ); // the +1 is before we want to insert the pads *after* // nNewContigConsPos, but insertColOfPads inserts before // the argument pReferenceRead->pGetContig()->insertColOfPads( nNewContigConsPos + 1 ); } } // if ( nExtraPadsInOldAssembly > 0 ) { } // if ( !pReferenceRead->ntideGet( n1ReadPos ).bIsPad() ) { } // for( n1ReadPos = pReferenceRead->nGetStartIndex(); // Extend old consensus. How can we find the new bases? // -----------eeeeee new consensus // ----------- part of old consensus in new assembly ("reference read") // the eeeeee are the new bases--they are the ones past // the end of the "part of old consensus in the new assembly" int nNewBasesConsPosLeft = pReferenceRead->nGetAlignEnd() + 1; int nNewBasesConsPosRight = pReferenceRead->pGetContig()->nGetEndIndex(); int nExtraConsensusBases = nNewBasesConsPosRight - nNewBasesConsPosLeft + 1; pCERL->pContig_->increaseMaxLengthIfNecessary( nExtraConsensusBases ); int nExtraUnpaddedConsensusBases = 0; for( int nNewAssemblyConsPos = nNewBasesConsPosLeft; nNewAssemblyConsPos <= nNewBasesConsPosRight; ++nNewAssemblyConsPos ) { Ntide nt = pReferenceRead->pGetContig()->ntGetCons( nNewAssemblyConsPos ); pCERL->pContig_->appendNtide( nt ); if ( !nt.bIsPad() ) ++nExtraUnpaddedConsensusBases; } // save this for adjusting the crossmatch coordinates for all reads // that are interior to the contig if ( pCERL->nWhichEnd_ == nLeftGap ) { pCERL->nUnpaddedBasesAddedOnLeftEndOfConsensus_ = nExtraUnpaddedConsensusBases; } // we need to assign quality values to the reads in the new assembly Contig* pNewContig = pReferenceRead->pGetContig(); readsSortedByReadName aNewReads( (size_t) pNewContig->nGetNumberOfReads() ); int nRead; for( nRead = 0; nRead < pNewContig->nGetNumberOfReads(); ++nRead ) { LocatedFragment* pLocFrag = pNewContig->pGetLocatedFragment( nRead ); if ( pLocFrag == pReferenceRead ) continue; aNewReads.insert( pLocFrag ); } aNewReads.resort(); // sort the reads for this end by read name pCERL->aCrossMatchInfoForRead_.resort(); // aNewReads will be a shorter list than aCrossMatchInfoForRead_ // try to associate the two lists int nNextCross = 0; for( nRead = 0; nRead < aNewReads.length(); ++nRead ) { LocatedFragment* pNewLocFrag = aNewReads[ nRead ]; crossMatchInfoForRead* pFoundCross = NULL; bool bStillLooking = true; while( bStillLooking && ( nNextCross < pCERL->aCrossMatchInfoForRead_.length() ) ) { crossMatchInfoForRead* pCross = pCERL->aCrossMatchInfoForRead_[ nNextCross ]; if ( pCross->pLocFrag_->soGetName() == pNewLocFrag->soGetName() ) { pFoundCross = pCross; bStillLooking = false; ++nNextCross; // set up for next time around } else if ( pCross->pLocFrag_->soGetName() > pNewLocFrag->soGetName() ) { bStillLooking = false; } else { ++nNextCross; } } // currently I can't think of any reason that we would have a // read in the new assembly that wasn't there in the old if ( !pFoundCross ) { RWCString soError = "new phrap assembly read: " + pNewLocFrag->soGetName() + " couldn't be found among any of the cross_match records. nNextCross = " + RWCString( (long) nNextCross ); THROW_ERROR( soError ); } // this is so that the insertPadsInContigs and // insertPadsInNewReadsAndSetReadBases code doesn't try to add // this read again (using the cross_match alignments) pFoundCross->bAlreadyUsedThis_ = true; // now move base qualities LocatedFragment* pOldLocFrag = pFoundCross->pLocFrag_; if ( pOldLocFrag->bComp() != pNewLocFrag->bComp() ) { pOldLocFrag->complementLocatedFragment(); } assert( pOldLocFrag->bComp() == pNewLocFrag->bComp() ); bool bAssignPeakPositions = pOldLocFrag->bHasPeakPositions(); if ( bAssignPeakPositions ) { pNewLocFrag->createPointPosArray2( pOldLocFrag->cWhichPeakPositionsAreUsed_, pNewLocFrag->nGetPaddedFragLength(), true ); // fill up array so we can use setTracePosAtSeqPos // this is so that we can skip pads and assign peak positions // to them later } // there will be more pNewLocFrag bases than pOldLocFrag bases // since the former have pads int nOldReadPos = 0; // ready for ++ for( int nReadPos = pNewLocFrag->nGetStartIndex(); nReadPos <= pNewLocFrag->nGetEndIndex(); ++nReadPos ) { if ( pNewLocFrag->ntideGet( nReadPos ).bIsPad() ) continue; ++nOldReadPos; Ntide ntOld = (*(pOldLocFrag->pSeqPhredCalledBases_))[ nOldReadPos ]; if ( tolower( ntOld.cGetBase()) != tolower( pNewLocFrag->ntideGet( nReadPos ).cGetBase()) ) { // phrap sometimes changes low quality bases to n's so // allow for that if ( tolower( ntOld.cGetBase() ) != 'n' && tolower( pNewLocFrag->ntideGet( nReadPos ).cGetBase() ) != 'n' ) { RWCString soError = "bases mismatch between what was fed to phrap and what came out. Read: " + pNewLocFrag->soGetName() + " nReadPos = " + RWCString( (long) nReadPos ) + " " + pOldLocFrag->soGetName() + " nOldReadPos = " + RWCString( (long) nOldReadPos ) + " " + szWhichGap( pCERL->nWhichEnd_ ) + " of " + pCERL->pContig_->soGetName() + " orig base: " + RWCString( ntOld.cGetBase() ) + " phrap base: " + RWCString( pNewLocFrag->ntideGet( nReadPos ).cGetBase() ); THROW_ERROR( soError ); } } pNewLocFrag->setQualityAtSeqPos( nReadPos, ntOld.qualGetQuality() ); if ( bAssignPeakPositions ) { pNewLocFrag->setTracePointPosAtSeqPos( nReadPos, pOldLocFrag->nGetPointPos( nOldReadPos ) ); } } pNewLocFrag->assignQualityAndPeakPositionsToPads( bAssignPeakPositions ); // now move the reads over from the new assembly's contig to // the contig in the old assembly int nOldAssemblyConsPosFromNewAssemblyConsPos = nReferencePaddedConsPosLeft - pReferenceRead->nGetAlignStart(); pNewLocFrag->moveReadAndReadTagsToNewContig( pCERL->pContig_, // contig in the old assembly nOldAssemblyConsPosFromNewAssemblyConsPos ); pFoundCross->bAlreadyUsedThis_; // finally, I think I am not even going to bother freeing the memory // of the old LocatedFragment--it will be reclaimed by the operating // system when consed terminates. pFoundCross->pLocFrag_ = pNewLocFrag; } // for( nRead = 0; nRead < aNewReads.length(); ++nRead ) { if ( bNeedToComplementBack ) { pCERL->pContig_->complementContig(); } fprintf( stderr, "done processing aNewReads\n" ); fflush( stderr ); } // void addNewReads :: openAceFileAndAlignReads