/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "autoEdit.h" #include using namespace std; #include "consed.h" #include "assembly.h" #include "tag.h" #include "rwtptrorderedvector.h" #include "mbt_exception.h" #include "rwcstring.h" #include "rwcregexp.h" #include #include #include "consedParameters.h" #include "terminateIfNoPhdDir.h" #include "maybeTerminateIfAnotherReadWriteConsed.h" #include "fileDefines.h" #include "listOfLibraries.h" #include "szGetTime.h" #include "soGetDateTime.h" #include "bIntervalsIntersect.h" #include "sequence.h" #include "locatedFragment.h" #include "guiRemoveReads.h" #include "filename.h" autoEdit :: autoEdit( const FileName& filAceFileToOpen, const FileName& filNewAceFile ) : filAceFileToOpen_( filAceFileToOpen ), filNewAceFile_( filNewAceFile ) {} void autoEdit :: doIt() { pCP->bOKToUseGui_ = false; // create the ConsEd object, which owns and manages the // Assembly data structure(s) as well as the lists of // currently existing ContigWin and Teditor objects. // there is now a global pointer to this sole instance, // set in the ctor, accessible as ConsEd::global() ConsEd* pConsed = new ConsEd(); openAutoEditOutputFile( filAceFileToOpen_ ); terminateIfNoPhdDir(); maybeTerminateIfAnotherReadWriteConsed(); // I want to read the list of libraries before reading the ace file // to make debugging by the user easier pCP->pListOfLibraries_ = new listOfLibraries(); pCP->pListOfLibraries_->aLibraries_.soName_ = "pCP->pListOfLibraries_->aLibraries_ in autoEdit.cpp"; pCP->pListOfLibraries_->parseLibraryFile(); // this does "new Assembly" ConsEd::pGetConsEd()->openAssemblyFile( filAceFileToOpen_ ); // added Oct 2005 (DG) ConsEd::pGetConsEd()->whatToDoBeforeModifyAssembly(); doItMore(); saveAssembly(); cerr << "Wrote new ace file: " << filNewAceFile_ << endl; cerr << "See output in " << pCP->filAutoFinishFullOutput_ << endl; } void autoEdit :: saveAssembly() { if ( filNewAceFile_.isNull() ) { filNewAceFile_ = filAceFileToOpen_.filGetNextVersion(); } filNewAceFile_ = filNewAceFile_.filFindOneHigherThanHighestVersion( filNewAceFile_.soGetDirectory(), 1 ); // version if none already exists fprintf( pAO, "writing new ace file %s\n", filNewAceFile_.data() ); ConsEd::pGetAssembly()->saveToFile( filNewAceFile_, false ); // ace file format2 } void autoEdit :: doItMore() { Assembly* pAssembly = ConsEd::pGetAssembly(); if ( pCP->bAutoEditRecalculateHighQualitySegmentsOfReads_ ) { for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); pLocFrag->calculateHighQualitySegmentOfRead(); } } } if ( pCP->bAutoEditConvertCloneEndBasesToXs_ ) { // need to find all cloneEnd tags RWTPtrOrderedVector aCloneEndTags; pAssembly->findAllTagsOfAType( "cloneEnd", &aCloneEndTags ); for( int nTag = 0; nTag < aCloneEndTags.length(); ++nTag ) { tag* pCloneEndTag = aCloneEndTags[ nTag ]; Contig* pContig = pCloneEndTag->pContig_; char cChangeToXsRightOrLeft = ' '; int nFirstBaseToMakeAnX; if ( pCloneEndTag->soMiscData_ == "->" ) { cChangeToXsRightOrLeft = 'l'; nFirstBaseToMakeAnX = pCloneEndTag->nPaddedConsPosEnd_ - 1; } else if ( pCloneEndTag->soMiscData_ == "<-" ) { cChangeToXsRightOrLeft = 'r'; nFirstBaseToMakeAnX = pCloneEndTag->nPaddedConsPosStart_ + 1; } else { RWCString soError( (size_t) 500 ); soError.nCurrentLength_ = sprintf( soError.data(), "something wrong with cloneEnd tag in %s from %d to %d since soMiscData = %s but should be either <- or ->", pContig->soGetName().data(), pContig->nUnpaddedIndex( pCloneEndTag->nPaddedConsPosStart_ ), pContig->nUnpaddedIndex( pCloneEndTag->nPaddedConsPosEnd_ ), pCloneEndTag->soMiscData_ ); THROW_ERROR( soError ); } pContig->changeToXsInAllReads( nFirstBaseToMakeAnX, cChangeToXsRightOrLeft ); } } if ( pCP->bAutoEditTellPhrapNotToOverlapMultiplyDiscrepantReads_ ) { pAssembly->setContigTemplateArrays(); gotoList* pGotoList = new gotoList(); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); pContig->findMultipleHighQualityDiscrepancies( pGotoList ); } // now for each gotoItem in the gotoList, tellPhrapNotToOverlap // the reads for( int nGotoItem = 0; nGotoItem < pGotoList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = pGotoList->pGetGotoItem( nGotoItem ); pGotoItem->pContig_->tellPhrapNotToOverlapReadsDiscrepantAtThisLocation( pGotoItem->nGotoItemStart_ ); } } if ( pCP->bAutoEditTagEditableLowConsensusQualityRegions_ ) { gotoList* pEditableSitesList = new gotoList(); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); pContig->addEditableSites( pEditableSitesList ); } // now go through the gotoList, tagging each location for( int nGotoItem = 0; nGotoItem < pEditableSitesList->nGetNumGotoItems(); ++nGotoItem ) { gotoItem* pGotoItem = pEditableSitesList->pGetGotoItem( nGotoItem ); tag* pTag = new tag( NULL, // LocatedFragment pGotoItem->pContig_, "editable", pGotoItem->nGotoItemStart_, pGotoItem->nGotoItemEnd_, false, // bWriteToPhdFileNotAceFile "", // soComment "autoEdit", // source soEmptyString, // current date/time false ); // bNoTrans pGotoItem->pContig_->addConsensusTag( pTag ); } } if ( pCP->bAutoEditMakeFakeRead_ ) { makeFakeRead(); } if ( pCP->bAutoEditMergeAssembly_ ) { mergeAssembly(); } if ( pCP->bAutoEditFixRunsInConsensus_ ) { FileName filFixRunsNav = ConsEd::pGetAssembly()->soGetAceFileName().soGetProjectFromAceFilename() + "." + soGetDateTime( nDotInMiddle ) + ".nav"; FileName filFixRunsNavFOF = "navFileForFixingRuns.fof"; FILE* pFOF = fopen( filFixRunsNavFOF.data(), "w" ); if ( pFOF == NULL ) { RWCString soError = "could not write " + filFixRunsNavFOF + " due to " + soGetErrno(); THROW_ERROR( soError ); } fprintf( pFOF, "%s\n", filFixRunsNav.data() ); fclose( pFOF ); pCUSTOM_NAVIGATION = fopen( filFixRunsNav.data(), "w" ); if ( pCUSTOM_NAVIGATION == NULL ) { RWCString soError = "could not write " + filFixRunsNav + "due to " + soGetErrno(); THROW_ERROR( soError ); } fprintf( pCUSTOM_NAVIGATION, "TITLE: runs fixed\n" ); for( int nContig = 0; nContig < pAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pAssembly->pGetContig( nContig ); pContig->fixRunsInConsensus(); } fclose( pCUSTOM_NAVIGATION ); printf( "see %s\n", filFixRunsNav.data() ); } } void autoEdit :: openAutoEditOutputFile( const FileName& filAceFileToOpen ) { // find project name RWCString soProjectName = filAceFileToOpen; RWCRegexp regFasta("\\.fasta\\.screen\\..*$"); soProjectName( regFasta ) = ""; time_t timee; time( &timee ); char szDate[8]; char szTime[8]; int nNumberOfChars; // for some reason, on HP if you change the 8 to a 7 you // get garbage characters like this: // standard.990924Ð.102551.customPrimers nNumberOfChars = strftime( szDate, 8, "%y%m%d", localtime( &timee ) ); // all this funny business is just to hide this from SCCS #define quote( A ) #A // for some reason, on HP if you change the 8 to a 7 you get // garbage characters like this: // standard.990924Ð.102551.customPrimers nNumberOfChars = strftime( szTime, 8, quote(%H)quote(%M)quote(%S), localtime( &timee ) ); pCP->filAutoFinishFullOutput_ = soProjectName + "." + szDate + "." + szTime + ".out"; pAO = fopen( (const char*) pCP->filAutoFinishFullOutput_, "w" ); if ( ! pAO ) { RWCString soError = "cannot open file: " + pCP->filAutoFinishFullOutput_; THROW_ERROR( soError ); } pFILE = pAO; cerr << "opened file " << pCP->filAutoFinishFullOutput_ << " for output" << endl; // open autoEdit.fof to record the name of the most recent .out file FILE* pAutoEditFOF = fopen( "autoEdit.fof", "w" ); if ( !pAutoEditFOF ) { RWCString soError( (size_t) 500 ); soError.nCurrentLength_ = sprintf( soError.data(), "cannot open autoEdit.fof because of %d which means %s", errno, strerror( errno ) ); THROW_ERROR( soError ); } fprintf( pAutoEditFOF, "%s\n", pCP->filAutoFinishFullOutput_.data() ); fclose( pAutoEditFOF ); } void autoEdit :: makeFakeRead() { LocatedFragment* pLocFrag1 = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( pCP->soAutoEditMakeFakeReadFromRead1_ ); LocatedFragment* pLocFrag2 = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( pCP->soAutoEditMakeFakeReadFromRead2_ ); if ( !pLocFrag1 || !pLocFrag2 ) { if ( !pLocFrag1 ) { fprintf( pAO, "fatal: read %s doesn't exist in assembly\n", pCP->soAutoEditMakeFakeReadFromRead1_.data() ); } else { fprintf( pAO, "fatal: read %s doesn't exist in assembly\n", pCP->soAutoEditMakeFakeReadFromRead2_.data() ); } return; } if ( pLocFrag1->pGetContig() != pLocFrag2->pGetContig() ) { fprintf( pAO, "fatal: read %s is in contig %s but read %s is in contig %s--must be in the same contig\n", pLocFrag1->soGetName().data(), pLocFrag1->pGetContig()->soGetName().data(), pLocFrag2->soGetName().data(), pLocFrag2->pGetContig()->soGetName().data() ); return; } if ( !bIntervalsIntersect( pLocFrag1->nGetAlignStart(), pLocFrag1->nGetAlignEnd(), pLocFrag2->nGetAlignStart(), pLocFrag2->nGetAlignEnd() ) ) { fprintf( pAO, "fatal: reads %s and %s do not overlap\n", pLocFrag1->soGetName().data(), pLocFrag2->soGetName().data() ); return; } Contig* pContig = pLocFrag1->pGetContig(); // if reached here, they are in the same contig int nLeftConsPos = MIN( pLocFrag1->nGetAlignStart(), pLocFrag2->nGetAlignStart() ); int nRightConsPos = MAX( pLocFrag1->nGetAlignEnd(), pLocFrag2->nGetAlignEnd() ); LocatedFragment* pFakeRead = new LocatedFragment( pCP->soAutoEditMakeFakeReadName_, nLeftConsPos, false, // not complemented pContig ); pContig->addLocatedFragment( pFakeRead ); int nReadLength = nRightConsPos - nLeftConsPos + 1; pFakeRead->createPointPosArray( true, // know max trace index 0, // what is max index nReadLength, true ); // bFillUpArray for( int nConsPos = nLeftConsPos; nConsPos <= nRightConsPos; ++nConsPos ) { // 4 cases: the base is just in pLocFrag1 // the base is just in pLocFrag2 // the base is in both // the base is in neither bool bRead1 = pLocFrag1->bIsInRead( nConsPos ); bool bRead2 = pLocFrag2->bIsInRead( nConsPos ); Ntide nt; if ( bRead1 && !bRead2 ) { nt = pLocFrag1->ntGetFragFromConsPos( nConsPos ); } else if ( !bRead1 && bRead2 ) { nt = pLocFrag2->ntGetFragFromConsPos( nConsPos ); } else if ( bRead1 && bRead2 ) { if ( pLocFrag1->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() > pLocFrag2->ntGetFragFromConsPos( nConsPos ).qualGetQualityWithout98or99() ) { nt = pLocFrag1->ntGetFragFromConsPos( nConsPos ); } else { nt = pLocFrag2->ntGetFragFromConsPos( nConsPos ); } } else { // in neither read. So use the consensus nt = pLocFrag1->pGetContig()->ntGetCons( nConsPos ); } pFakeRead->appendNtide( nt ); } // make the whole read low quality // and make the whole read aligned pFakeRead->setQualAndAlignClipping( -1, -1, pFakeRead->nGetAlignStart(), pFakeRead->nGetAlignEnd() ); // this is required--the -1, -1 above will be translated otherwise pFakeRead->setWholeReadIsLowQuality(); // do not include directory with phd filename when writing ace file FileName filPHDFilename = ConsEd::pGetConsEd()->filGetPHDDir() + "/" + pCP->soAutoEditMakeFakeReadName_ + ".phd.1"; pFakeRead->setPHDFilename( filPHDFilename.soGetBasename() ); pFakeRead->setChromatFilename( "no_chromat" ); char* szTimestamp = szGetTime(); pFakeRead->setPHDTimestamp( szTimestamp ); wholeReadItem* pWR = new wholeReadItem( pFakeRead, "fake", // type "consed", // source soGetDateTime( nNoSlashes ), "" ); pFakeRead->aWholeReadItems_.append( pWR ); pFakeRead->writePHD( filPHDFilename ); ConsEd::pGetAssembly()->setNumberOfReadsInAssemblyAccurate(); ConsEd::pGetAssembly()->setChanged(); FILE* pFasta = fopen( pCP->filAutoEditMakeFakeReadFastaFilename_.data(), "w" ); if ( !pFasta ) { RWCString soError = "couldn't open file "; soError += pCP->filAutoEditMakeFakeReadFastaFilename_; soError += " for write"; THROW_ERROR( soError ); } fprintf( pFasta, ">%s\n", pCP->soAutoEditMakeFakeReadName_.data() ); int nBasesOnLine = 0; for( int nSeq = 1; nSeq < pFakeRead->nGetPaddedSeqLength(); ++nSeq ) { Ntide nt = pFakeRead->ntideGet( nSeq ); if ( !nt.bIsPad() ) { if ( nBasesOnLine >= 50 ) { fputc( '\n', pFasta ); nBasesOnLine = 0; } fputc( nt.cGetBase(), pFasta ); ++nBasesOnLine; } } fputc( '\n', pFasta ); fclose( pFasta ); } void autoEdit :: mergeAssembly() { Assembly* pSecondaryAssembly = new Assembly( pCP->filAutoEditSecondaryAceFile_, false ); // which is the fake contig? // pCP->soAutoEditMakeFakeReadName_ LocatedFragment* pFakeReadAmongFwdRevPair = pSecondaryAssembly->pGetLocatedFragmentByName( pCP->soAutoEditMakeFakeReadName_ ); if ( !pFakeReadAmongFwdRevPair ) { RWCString soError = RWCString( "read " ) + pCP->soAutoEditMakeFakeReadName_ + " cannot be found in assembly " + pCP->filAutoEditSecondaryAceFile_; THROW_ERROR( soError ); } // rename fake read pFakeReadAmongFwdRevPair->soSequenceName_ += "2"; // move the entire contig into the main assembly ConsEd::pGetAssembly()->addContig( pFakeReadAmongFwdRevPair->pGetContig() ); Assembly* pPrimaryAssembly = ConsEd::pGetAssembly(); cerr << "contigs in primary assembly now: " << pPrimaryAssembly->nNumContigs() << endl; for( int nContig = 0; nContig < pPrimaryAssembly->nNumContigs(); ++nContig ) { Contig* pContig = pPrimaryAssembly->pGetContig( nContig ); cerr << "contig " << nContig << " " << pContig->soGetName() << endl; } ConsEd::pGetAssembly()->setChanged(); // now we have the fake read in 2 different contigs LocatedFragment* pFakeReadAmongOtherSpecies = ConsEd::pGetAssembly()->pGetLocatedFragmentByName( pCP->soAutoEditMakeFakeReadName_ ); assert( pFakeReadAmongOtherSpecies ); assert( pFakeReadAmongOtherSpecies->pGetContig() != pFakeReadAmongFwdRevPair->pGetContig() ); int nUnpaddedLength = pFakeReadAmongOtherSpecies->nGetUnpaddedSeqLength(); assert( nUnpaddedLength == pFakeReadAmongFwdRevPair->nGetUnpaddedSeqLength() ); // now we need to equalize the positions of the pads in both reads mbtValVector nPadsInFakeReadAmongOtherSpecies( nUnpaddedLength, 1, // offset 0 ); // initial value mbtValVector nPadsInFakeReadAmongFwdRevPair( nUnpaddedLength, 1, 0 ); int nPadsAfterLastBase = 0; int nUnpadded = 0; int nPadded; for( nPadded = pFakeReadAmongOtherSpecies->nGetStartIndex(); nPadded <= pFakeReadAmongOtherSpecies->nGetEndIndex(); ++nPadded ) { bool bPad = pFakeReadAmongOtherSpecies->ntideGet( nPadded ).bIsPad(); if ( bPad ) { ++nPadsAfterLastBase; } else { ++nUnpadded; nPadsInFakeReadAmongOtherSpecies[nUnpadded] = nPadsAfterLastBase; nPadsAfterLastBase = 0; } } nPadsAfterLastBase = 0; nUnpadded = 0; for( nPadded = pFakeReadAmongFwdRevPair->nGetStartIndex(); nPadded <= pFakeReadAmongFwdRevPair->nGetEndIndex(); ++nPadded ) { bool bPad = pFakeReadAmongFwdRevPair->ntideGet( nPadded ).bIsPad(); if ( bPad ) { ++nPadsAfterLastBase; } else { ++nUnpadded; nPadsInFakeReadAmongFwdRevPair[nUnpadded] = nPadsAfterLastBase; nPadsAfterLastBase = 0; } } // now insert pads at the appropriate place so the reads become identical nUnpadded = nUnpaddedLength + 1; // first -- will bring it to // correct starting position int nConsPos; for( nConsPos = pFakeReadAmongOtherSpecies->nGetAlignEnd(); nConsPos >= pFakeReadAmongOtherSpecies->nGetAlignStart(); --nConsPos ) { bool bPad = pFakeReadAmongOtherSpecies->ntGetFragFromConsPos( nConsPos ).bIsPad(); if ( bPad ) continue; // if reached here, base is not a pad --nUnpadded; int nPadsToInsert = nPadsInFakeReadAmongFwdRevPair[nUnpadded] - nPadsInFakeReadAmongOtherSpecies[nUnpadded]; if ( nPadsToInsert > 0 ) { // insert a column of pads. for( int n = 0; n < nPadsToInsert; ++n ) { pFakeReadAmongOtherSpecies->pGetContig()->insertColOfPads( nConsPos ); } } } // do the same thing with the read in the other contig nUnpadded = nUnpaddedLength + 1; for( nConsPos = pFakeReadAmongFwdRevPair->nGetAlignEnd(); nConsPos >= pFakeReadAmongFwdRevPair->nGetAlignStart(); --nConsPos ) { bool bPad = pFakeReadAmongFwdRevPair->ntGetFragFromConsPos( nConsPos ).bIsPad(); if ( bPad ) continue; // if reached here, base is not a pad --nUnpadded; int nPadsToInsert = nPadsInFakeReadAmongOtherSpecies[ nUnpadded ] - nPadsInFakeReadAmongFwdRevPair[ nUnpadded ]; if ( nPadsToInsert > 0 ) { for( int n = 0; n < nPadsToInsert; ++n ) { pFakeReadAmongFwdRevPair->pGetContig()->insertColOfPads( nConsPos ); } } } // at this point, we've put in the same number of pads // On Monday--write out the assembly and check that this is indeed // true. Then move over reads and delete the 2nd contig and delete // the fake read in the 1st contig. Contig* pSecondaryContig = pFakeReadAmongFwdRevPair->pGetContig(); int nNewConsPosFromOldConsPos = pFakeReadAmongOtherSpecies->nGetAlignStart() - pFakeReadAmongFwdRevPair->nGetAlignStart(); for( int nRead = 0; nRead < pSecondaryContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pSecondaryContig->pLocatedFragmentGet( nRead ); if ( pLocFrag == pFakeReadAmongFwdRevPair ) continue; // move this read over pLocFrag->moveReadAndReadTagsToNewContig( pFakeReadAmongOtherSpecies->pGetContig(), nNewConsPosFromOldConsPos ); } // now delete the old contig // (Does this delete the read, too? We don't want it to...No, the // reads are not deleted--just the list of pointers to the reads.) delete pSecondaryContig; // now delete the fake read within the primary contig guiRemoveReads* pGuiRemoveReads = new guiRemoveReads(); pGuiRemoveReads->aReadsToRemove_.insert( pFakeReadAmongOtherSpecies ); pGuiRemoveReads->doItAlmostNoGui( 0, true ); // bDeleteNotJustPutInOwnContig }