/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "addAlignedSequence.h" #include "soGetErrno.h" #include "assert.h" #include "soLine.h" #include using namespace std; #include "rwctokenizer.h" #include "alignmentPiece.h" #include "bIsNumericMaybeWithWhitespace.h" #include "consed.h" #include "sequence.h" #include "szGetTime.h" #include "soGetDateTime.h" #include "consedParameters.h" #include "terminateIfNoPhdDir.h" #include "maybeTerminateIfAnotherReadWriteConsed.h" #include "listOfLibraries.h" #include "fileDefines.h" #include "automatedConsedInit.h" void addAlignedSequence :: doIt() { automatedConsedInit( filAceFileToOpen_ ); readAlignedSequenceFastaFile(); saveAssembly(); cerr << "Wrote new ace file: " << filNewAceFile_ << endl; } void addAlignedSequence :: readAlignedSequenceFastaFile() { FILE* pAlignmentFastaFile = fopen( filAlignmentFastaFile_.data(), "r" ); if ( !pAlignmentFastaFile ) { RWCString soErrorMessage = "couldn't open "; soErrorMessage += filAlignmentFastaFile_; soErrorMessage += " error: "; soErrorMessage += soGetErrno(); THROW_ERROR( soErrorMessage ); } bool bFirstTime = true; while( 1 ) { bool bEndOfFile; alignmentPiece* pAlignmentPiece; getChimpHeaderLine( pAlignmentFastaFile, pAlignmentPiece, bEndOfFile ); if ( bEndOfFile ) { if ( bFirstTime ) { RWCString soErrorMessage = "premature end of "; soErrorMessage += filAlignmentFastaFile_; soErrorMessage += " while looking for 1st line"; THROW_ERROR( soErrorMessage ); } else { break; } } bFirstTime = false; getPaddedChimpSequence( pAlignmentFastaFile, pAlignmentPiece ); getHumanHeaderLine( pAlignmentFastaFile, pAlignmentPiece ); getPaddedHumanSequence( pAlignmentFastaFile, pAlignmentPiece ); findExistingReadWithWRItem(); // process this alignment piece makeReadOutOfThisAlignmentPiece( pAlignmentPiece ); delete pAlignmentPiece; } fclose( pAlignmentFastaFile ); ConsEd::pGetAssembly()->setChanged(); } void addAlignedSequence :: getChimpHeaderLine( FILE* pAlignmentFastaFile, alignmentPiece*& pAlignmentPiece, bool& bEndOfFile ) { bEndOfFile = false; if ( fgets( soLine.data(), nMaxLineSize, pAlignmentFastaFile ) == NULL ) { bEndOfFile = true; return; } soLine.nCurrentLength_ = strlen( soLine.data() ); pAlignmentPiece = new alignmentPiece(); pAlignmentPiece->soOrganismA_ = "chimp"; pAlignmentPiece->soOrganismB_ = "human"; // header line is for chimp and looks like this: // >chr13 chimp_unpadded_chr_positions: start=23459 end=24573 strand=+ padded_seq_length=459 RWCTokenizer tok( soLine ); RWCString soChimpChromosome = tok(); RWCString soKeyword = tok(); RWCString soStart = tok(); RWCString soEnd = tok(); RWCString soChimpStrand = tok(); RWCString soPaddedSeqLength = tok(); #define CHIMP_HEADER_LINE filAlignmentFastaFile_ + " chimp header line should have been of the form: >chr13 chimp_unpadded_chr_positions: start=23459 end=24573 strand=+ padded_seq_length=459\n" if ( soPaddedSeqLength.isNull() ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += "but did not have enough tokens:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } // check each parameter if ( !soChimpChromosome.bStartsWith( ">chr" ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += "but didn't start with >chr Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } // get rid of leading ">" pAlignmentPiece->soChromosomeA_ = soChimpChromosome.soGetRestOfString( 1 ); if ( soKeyword != "chimp_unpadded_chr_positions:" ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"chimp_unpadded_positions:\" was missing. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !soStart.bStartsWithAndRemove( "start=" ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"start=\" was missing. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !soEnd.bStartsWithAndRemove( "end=" ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"end=\" was missing. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !soChimpStrand.bStartsWithAndRemove( "strand=" ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"strand=\" was missing. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( soChimpStrand == "+" ) pAlignmentPiece->bAIsTopNotBottomStrand_ = true; else if ( soChimpStrand == "-" ) pAlignmentPiece->bAIsTopNotBottomStrand_ = false; else { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"strand=\" was not followed by + or -. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !soPaddedSeqLength.bStartsWithAndRemove( "padded_seq_length=" ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but \"padded_seq_length=\" was missing. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !bIsNumericMaybeWithWhitespace( soStart, pAlignmentPiece->n1UnpaddedStartA_ ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but start=(#) was not numeric. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !bIsNumericMaybeWithWhitespace( soEnd, pAlignmentPiece->n1UnpaddedEndA_ ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but end=(#) was not numeric. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } if ( !bIsNumericMaybeWithWhitespace( soPaddedSeqLength, pAlignmentPiece->nPaddedAlignmentLength_ ) ) { RWCString soErrorMessage = CHIMP_HEADER_LINE; soErrorMessage += " but padded_seq_length=(#) was not numeric. Line:\n"; soErrorMessage += soLine; THROW_ERROR( soErrorMessage ); } } void addAlignedSequence :: getPaddedChimpSequence( FILE* pAlignmentFastaFile, alignmentPiece* pAlignmentPiece ) { // this line/sequence could be very, very long. Thus better // not use fgets pAlignmentPiece->soPaddedBasesA_.increaseMaxLength( pAlignmentPiece->nPaddedAlignmentLength_ + 1 ); pAlignmentPiece->soPaddedBasesA_.nCurrentLength_ = 0; // bypassing repeated RWCString checks for efficiency char c; while( ( c = fgetc( pAlignmentFastaFile ) ) != '\n' ) { if ( c == EOF ) { // bad news RWCString soErrorMessage = "premature end of file "; soErrorMessage += filAlignmentFastaFile_; soErrorMessage += " while reading chimp sequence"; THROW_ERROR( soErrorMessage ); } pAlignmentPiece->soPaddedBasesA_.sz_[ pAlignmentPiece->soPaddedBasesA_.nCurrentLength_ ] = c; ++pAlignmentPiece->soPaddedBasesA_.nCurrentLength_; } // null terminate pAlignmentPiece->soPaddedBasesA_.sz_[ pAlignmentPiece->soPaddedBasesA_.nCurrentLength_ ] = 0; assert( pAlignmentPiece->soPaddedBasesA_.nCurrentLength_ == pAlignmentPiece->nPaddedAlignmentLength_ ); } void addAlignedSequence :: getHumanHeaderLine( FILE* pAlignmentFastaFile, alignmentPiece* pAlignmentPiece ) { if ( fgets( soLine.data(), nMaxLineSize, pAlignmentFastaFile ) == NULL ) { RWCString soErrorMessage = filAlignmentFastaFile_; soErrorMessage += " ended prematurely while looking for human header line"; THROW_ERROR( soErrorMessage ); } soLine.nCurrentLength_ = strlen( soLine.data() ); // line is like: // >chr7 human_unpadded_chr_positions: start=29475465 end=29475500 desired_human_unpadded_chr_positions: start=29475465 end=29475500 seq_length=36 // aaaaaaaaattagccgggcatggtggcaggcgcctg RWCTokenizer tok( soLine ); RWCString soHumanChromosome = tok(); RWCString soKeyword1 = tok(); RWCString soStart = tok(); RWCString soEnd = tok(); RWCString soKeyword2 = tok(); RWCString soDesiredStart = tok(); RWCString soDesiredEnd = tok(); RWCString soPaddedSeqLength = tok(); #define THROW_HUMAN_HEADER_ERROR( soVariableErrorMessage ) \ RWCString soErrorMessage = "in file "; \ soErrorMessage += filAlignmentFastaFile_; \ soErrorMessage += " header header line should have been of the form: >chr7 human_unpadded_chr_positions: start=29475465 end=29475500 desired_human_unpadded_chr_positions: start=29475465 end=29475500 seq_length=36 but "; \ soErrorMessage += soVariableErrorMessage; \ soErrorMessage += " line:\n"; \ soErrorMessage += soLine; \ THROW_ERROR( soErrorMessage ); #define HUMAN_HEADER_LINE filAlignmentFastaFile_ + " header header line should have been of the form: >chr7 human_unpadded_chr_positions: start=29475465 end=29475500 desired_human_unpadded_chr_positions: start=29475465 end=29475500 seq_length=36" if ( soPaddedSeqLength.isNull() ) { THROW_HUMAN_HEADER_ERROR( "did not have enough tokens." ); } if ( !soHumanChromosome.bStartsWith( ">chr" ) ) { THROW_HUMAN_HEADER_ERROR( "1st token didn't start with \">chr\"" ); } // get rid of leading ">" pAlignmentPiece->soChromosomeB_ = soHumanChromosome.soGetRestOfString( 1 ); if ( soKeyword1 != "human_unpadded_chr_positions:" ) { THROW_HUMAN_HEADER_ERROR( "2nd token wasn't human_unpadded_chr_positions:" ); } if ( !soStart.bStartsWithAndRemove( "start=" ) ) { THROW_HUMAN_HEADER_ERROR( "couldn't find start= where it should be" ); } if ( !soEnd.bStartsWithAndRemove( "end=" ) ) { THROW_HUMAN_HEADER_ERROR( "couldn't find end= where it should be" ); } if ( soKeyword2 != "desired_human_unpadded_chr_positions:" ) { THROW_HUMAN_HEADER_ERROR( "couldn't find desired_human_unpadded_chr_positions: where it should be" ); } if ( !soDesiredStart.bStartsWithAndRemove( "start=" ) ) { THROW_HUMAN_HEADER_ERROR( "couldn't find start= on desired human unpadded chr positions" ); } if ( !soDesiredEnd.bStartsWithAndRemove( "end=" ) ) { THROW_HUMAN_HEADER_ERROR( "couldn't find end= on desired human unpadded chr positions" ); } if ( !soPaddedSeqLength.bStartsWithAndRemove( "seq_length=" ) ) { THROW_HUMAN_HEADER_ERROR( "couldn't find seq_length= where it should be" ); } if ( !bIsNumericMaybeWithWhitespace( soStart, pAlignmentPiece->n1UnpaddedStartB_ ) ) { THROW_HUMAN_HEADER_ERROR( "human start wasn't numeric" ); } if ( !bIsNumericMaybeWithWhitespace( soEnd, pAlignmentPiece->n1UnpaddedEndB_ ) ) { THROW_HUMAN_HEADER_ERROR( "human end wasn't numeric" ); } if ( !bIsNumericMaybeWithWhitespace( soDesiredStart, pAlignmentPiece->n1DesiredUnpaddedStartB_ ) ) { THROW_HUMAN_HEADER_ERROR( "desired human start wasn't numeric" ); } if ( !bIsNumericMaybeWithWhitespace( soDesiredEnd, pAlignmentPiece->n1DesiredUnpaddedEndB_ ) ) { THROW_HUMAN_HEADER_ERROR( "desired human end wasn't numeric" ); } int nPaddedSeqLength; if ( !bIsNumericMaybeWithWhitespace( soPaddedSeqLength, nPaddedSeqLength ) ) { THROW_HUMAN_HEADER_ERROR( "padded seq length wasn't numeric" ); } if ( nPaddedSeqLength != pAlignmentPiece->nPaddedAlignmentLength_ ) { RWCString soErrorMessage = "chimp says padded length is "; soErrorMessage += RWCString( (long) pAlignmentPiece->nPaddedAlignmentLength_ ); soErrorMessage += " and human says "; soErrorMessage += RWCString( (long) nPaddedSeqLength ); soErrorMessage += " but they should be equal"; THROW_ERROR( soErrorMessage ); } } void addAlignedSequence :: getPaddedHumanSequence( FILE* pAlignmentFastaFile, alignmentPiece* pAlignmentPiece ) { pAlignmentPiece->soPaddedBasesB_.increaseMaxLength( pAlignmentPiece->nPaddedAlignmentLength_ + 1 ); pAlignmentPiece->soPaddedBasesB_.nCurrentLength_ = 0; // bypassing repeated RWCString checks for efficiency // note that the file might end without a terminating newline char c; while( ( c = fgetc( pAlignmentFastaFile ) ) != '\n' && c != EOF ) { pAlignmentPiece->soPaddedBasesB_.sz_[ pAlignmentPiece->soPaddedBasesB_.nCurrentLength_ ] = c; ++pAlignmentPiece->soPaddedBasesB_.nCurrentLength_; } // null terminate pAlignmentPiece->soPaddedBasesB_.sz_[ pAlignmentPiece->soPaddedBasesB_.nCurrentLength_ ] = 0; assert( pAlignmentPiece->soPaddedBasesB_.nCurrentLength_ == pAlignmentPiece->nPaddedAlignmentLength_ ); } void addAlignedSequence :: findExistingReadWithWRItem() { RWCString soReadName = filPhdFileWithWRItem_; assert( soReadName.bContainsAndRemoveToEnd( ".phd." ) ); pExistingReadWithWRItem_ = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( soReadName ); } void addAlignedSequence :: makeReadOutOfThisAlignmentPiece( alignmentPiece* pAlignmentPiece ) { // this code is a mix of compareContigs :: createFakeRead() and // the addNewReads functions // at what point should the insertColOfPads be done? I think it // must be before this read is added to the contig and its // alignstart position set. insertColumnsOfPadsInAssembly( pAlignmentPiece ); RWCString soChimpReadName = "chimp_"; soChimpReadName += pAlignmentPiece->soChromosomeA_; soChimpReadName += "_"; soChimpReadName += RWCString( (long) pAlignmentPiece->n1UnpaddedStartA_ ); pFakeChimpRead_ = new LocatedFragment( soChimpReadName, 1, // temporary value for alignment position false, // current read is complemented pExistingReadWithWRItem_->pGetContig() ); pExistingReadWithWRItem_->pGetContig()->addLocatedFragment( pFakeChimpRead_ ); // not necessary since Contig::addLocatedFragment does this // pExistingReadWithWRItem_->pGetContig()->bReadListsNeedFixing_ = true; pFakeChimpRead_->createPointPosArray2( Sequence::cLittle, // no need for big numbers since we will just put zeros in here pAlignmentPiece->soPaddedBasesB_.length() * 1.1, false ); // bFillUpArray // add bases and alignment information addBasesAndPadsToChimpSequence( pAlignmentPiece ); pFakeChimpRead_->setPHDFilename( soChimpReadName + ".phd.1" ); pFakeChimpRead_->setChromatFilename( "no_chromat" ); // see compareContigs2.cpp char* szTimestamp = szGetTime(); pFakeChimpRead_->setPHDTimestamp( szTimestamp ); wholeReadItem* pWR = new wholeReadItem( pFakeChimpRead_, "fake", // type "consed", // source soGetDateTime( nNoSlashes ), "" ); pFakeChimpRead_->aWholeReadItems_.append( pWR ); // should add another WR item indicating where this read came from RWCString soData = "chimp "; soData += pAlignmentPiece->soChromosomeA_; soData += " 1-based-chromosome-positions: start="; soData += RWCString( (long) pAlignmentPiece->n1UnpaddedStartA_ ); soData += " end="; soData += RWCString( (long) pAlignmentPiece->n1UnpaddedEndA_ ); soData += " strand="; soData += (pAlignmentPiece->bAIsTopNotBottomStrand_ ? "+" : "-" ); wholeReadItem* pWR2 = new wholeReadItem( pFakeChimpRead_, "part-of-chromosome", // type "consed", // source soGetDateTime( nNoSlashes ), soData ); pFakeChimpRead_->aWholeReadItems_.append( pWR2 ); pFakeChimpRead_->writePHD( pFakeChimpRead_->filGetPHDFilename() ); } void addAlignedSequence :: addBasesAndPadsToChimpSequence( alignmentPiece* pAlignmentPiece ) { // must initialize nConsPos and nConsPosOfLastAddedBase // e.g., if both are position 1000 within chromosome, // then starting position within read is 1 int n1UnpaddedStartOfAlignmentWithinExistingRead = pAlignmentPiece->n1UnpaddedStartB_ - pAlignmentPiece->n1DesiredUnpaddedStartB_ + 1; Contig* pContig = pExistingReadWithWRItem_->pGetContig(); pContig->setPaddedPositionsArray(); pContig->setUnpaddedPositionsArray(); int nConsPos = pExistingReadWithWRItem_->nGetConsPosFromUnpaddedFragPos( n1UnpaddedStartOfAlignmentWithinExistingRead ); int nUnpaddedConsPos = pContig->nUnpaddedIndex( nConsPos ); // ----------------------------------------------------- // IMPORTANT: // the alignment starts here // ----------------------------------------------------- pFakeChimpRead_->nAlignStartPos_ = nConsPos; // set these up so first base will show no pads need be added int nInterveningChimpBasesAdded = 0; int nConsPosOfLastReferencePosition = nConsPos; --nUnpaddedConsPos; // set up for first ++ assert( pAlignmentPiece->soPaddedBasesA_.length() == pAlignmentPiece->soPaddedBasesB_.length() ); for( int nzAlignmentPos = 0; nzAlignmentPos < pAlignmentPiece->soPaddedBasesA_.length(); ++nzAlignmentPos ) { char cHumanBaseInAlignment = pAlignmentPiece->soPaddedBasesB_[ nzAlignmentPos ]; char cChimpBaseInAlignment = pAlignmentPiece->soPaddedBasesA_[ nzAlignmentPos ]; if ( cHumanBaseInAlignment != '-' ) { // since the base is not a pad, we have the // chance to find the consensus position of the // corresponding base in the assembly. find the // nConsPos of the corresponding base in the assembly. // The only reason to do this is to decide if we should // add pads to the chimp read (below). // Note: this depends on the consensus and the human read // to have pads in the same places ++nUnpaddedConsPos; nConsPos = pContig->nPaddedIndexFast( nUnpaddedConsPos ); assert( pExistingReadWithWRItem_->ntGetFragFromConsPos( nConsPos ).cGetBase() == tolower( cHumanBaseInAlignment ) ); // this brings nConsPos to the corresponding base in the // assembled human read } if ( cChimpBaseInAlignment != '-' && cHumanBaseInAlignment != '-' ) { // this marks a reference position in which we know // what nConsPos the chimp base should be at. Thus // we can tell if there were any additional pads // in the assembly than in the alignment and, if so, // we should add additional pads to the chimp sequence int nChimpPadsToAdd = nConsPos - nConsPosOfLastReferencePosition - 1 - nInterveningChimpBasesAdded; if ( nChimpPadsToAdd > 0 ) { Quality ucForPads = 0; for( int nPad = 0; nPad < nChimpPadsToAdd; ++nPad ) { pFakeChimpRead_->appendNtide( Ntide( '*', pCP->nAddAlignedSequenceQualityOfBases_ ) ); pFakeChimpRead_->appendPointPos( 0 ); } } // set up for next time nInterveningChimpBasesAdded = 0; nConsPosOfLastReferencePosition = nConsPos; } else { ++nInterveningChimpBasesAdded; } if ( cChimpBaseInAlignment == '-' ) cChimpBaseInAlignment = '*'; // now can add the chimp base pFakeChimpRead_->appendNtide( Ntide( cChimpBaseInAlignment, pCP->nAddAlignedSequenceQualityOfBases_ ) ); pFakeChimpRead_->appendPointPos( 0 ); } // for( int nzAlignmentPos = 0; // quality and alignment clipping pFakeChimpRead_->setQualAndAlignClipping( pFakeChimpRead_->nGetAlignStart(), pFakeChimpRead_->nGetAlignEnd(), pFakeChimpRead_->nGetAlignStart(), pFakeChimpRead_->nGetAlignEnd() ); } void addAlignedSequence :: insertColumnsOfPadsInAssembly( alignmentPiece* pAlignmentPiece ) { // must initialize nConsPos and nConsPosOfLastAddedBase // e.g., if both are position 1000 within chromosome, // then starting position within read is 1 // the start of the read is at n1DesiredUnpaddedStartB_ // (even though the start of the alignment is n1UnpaddedStartB_ ) // so the 1-based end of the read is: int n1UnpaddedEndOfAlignmentWithinExistingRead = pAlignmentPiece->n1UnpaddedEndB_ - pAlignmentPiece->n1DesiredUnpaddedStartB_ + 1; int nConsPos = pExistingReadWithWRItem_->nGetConsPosFromUnpaddedFragPos( n1UnpaddedEndOfAlignmentWithinExistingRead ); // This starts where there really wasn't a base added. // I could instead have had a flag saying "no base added yet" to // signal that it shouldn't add any pads. But then there would // be the problem of pads at the beginning of the alignment // (I can't be sure this never happens with the Santa Cruz alignments.) // In such a case, this works. For example: // bbbb (chimp) // b*** (human) // ^nConsPosOfLastAddedBase // so this would add 3 pads before adding the b, just as we would want. int nConsPosOfLastNonPadBase = nConsPos + 1; int nPadsInAlignedHumanSeq = 0; // if we go backwards, there will be no problems will the // consensus positions being screwed up for( int nzAlignmentPos = pAlignmentPiece->soPaddedBasesB_.length() - 1; nzAlignmentPos >= 0; --nzAlignmentPos ) { char cHumanBaseInAlignment = pAlignmentPiece->soPaddedBasesB_[ nzAlignmentPos ]; if ( cHumanBaseInAlignment == '-' ) { ++nPadsInAlignedHumanSeq; continue; } // if reached here, the human alignment base is not a pad. See if // any pads should be added to the assembly. // move to left in the human read over any pads nConsPos = nConsPosOfLastNonPadBase - 1; while( pExistingReadWithWRItem_->ntGetFragFromConsPos( nConsPos ).bIsPad() ) { --nConsPos; } assert( pExistingReadWithWRItem_->ntGetFragFromConsPos( nConsPos ).cGetBase() == tolower( cHumanBaseInAlignment ) ); // e.g., if bb // ^ nConsPosOfLastNonPadBase // ^ nConsPos // then no pads in alignment int nPadsInReadInAssembly = nConsPosOfLastNonPadBase - nConsPos - 1; int nPadsToAdd = nPadsInAlignedHumanSeq - nPadsInReadInAssembly; // this number could be negative, which is dealt with above when // we added pads to the chimp sequence if ( nPadsToAdd > 0 ) { for( int nPad = 0; nPad < nPadsToAdd; ++nPad ) { // insert just before nConsPos+1 // since nConsPos is a non-pad we have just backed up to. pExistingReadWithWRItem_->pContig_->insertColOfPads( nConsPos + 1 ); } } // set up for next non-pad nPadsInAlignedHumanSeq = 0; nConsPosOfLastNonPadBase = nConsPos; } // for( int nzAlignmentPos = ... } void addAlignedSequence :: saveAssembly() { if ( filNewAceFile_.isNull() ) { filNewAceFile_ = filAceFileToOpen_.filGetNextVersion(); // find next version that doesn't already exist while( filNewAceFile_.bFileByThisNameExists() ) { filNewAceFile_ = filNewAceFile_.filGetNextVersion(); } } ConsEd::pGetAssembly()->saveToFile( filNewAceFile_, false ); // ace file format 2 }