/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ // // assembly.h // // header file for the assembly class // // an assembly is basically an array of contigs with // some other stuff thrown in (well, later maybe). // for now, the only ctor is one that takes the names // of three files output by phrap: the reads (padded), // the contigs (also padded) and the output (i.e. stdout // from phrap). the first two are in fasta format, // and are parsed by FastaFile object(s). // The third is parsed by the PhrapFile object (for // the time being). Eventually the phrap routines // will return the data as structs. // // // no longer uses PhrapAceFile objects. // throws exception (either SysRequestFailed or InputDataError) // if ctor fails. chrisa 12-may-95 // #ifndef ASSEMBLY_INCLUDED #define ASSEMBLY_INCLUDED #include "rwcstring.h" #include "sysdepend.h" #include "assert.h" #include "locatedFragment.h" #include "contig.h" #include "filename.h" #include "listOfFragments.h" #include #include #include "stdarg.h" #include #include "matchWithinAssembly.h" #include "wholeAssemblyItem.h" #include "rwtptrorderedvector.h" #include "readsSortedByTemplateName.h" #include "subcloneTTemplate.h" #include "expidAndLocatedFragment.h" #include "singletInfo.h" #include "templatesSortedByName.h" #include "singletsSortedByTemplateName.h" #include "nMAX_QUALITY_LEVEL2.h" #include "mbtValOrderedVectorOfRWCString.h" #include "rwtvalsortedvector.h" class universalPrimerAutoFinishExpTagsSortedByTemplateName; class autoFinishExpTag; class contigEndPair; class readAndEnd; class restrictionFragment; class clScaffold; class arrayOfFwdRevPairs; class oligoTag; class readsSortedByReadName; // max number of bases on DNA lines in output .ace files static const int nAceMaxDnaLineLen = 50; typedef MBTValOrderedOffsetVector aModelReadWithSeveralTemplatesType; class Assembly { public: // for now the null ctor NOT supported Assembly() { assert (false); } Assembly(const char* szPhrapAceFileName, const bool bSetGlobalAssembly ); ~Assembly() {} int nNumContigs() const { return dapContigs_.length(); } int nGetNumberOfContigs() const { return dapContigs_.length(); } Contig* pGetContig( int nContigIndex ) { return( dapContigs_[ nContigIndex ] ); } void addContig( Contig* pContig ) { dapContigs_.insert( pContig ); } // indexing the assembly returns a ref to a contig. // note that this is NOT a const ref, the contig // can and will be modified by the editor object Contig& operator[] (const int index) { return *(dapContigs_[index]); } // returns name of current file FileName soGetAceFileName() const { return soFileName_; } FileName filGetAceFileFullPathname(); RWCString soGetLocusName() { return( filGetAceFileFullPathname().soGetProjectBeforeEditDir() ); } RWCString soGetFirstPartOfAceFileName(); FileName filGetReadListFilename(); // returns a string object containing the path of the // directory where it expects to find the trace files // currently set to same path as .ace file RWCString soGetTraceFileDirectory() const { return soFileName_.soGetDirectory(); } // passed a file name, saves a new version of the // .ace file (but only with '*' i.e. padded lines) // overwrites existing file, throws exception if // it cannot open. void saveToFile(const char* szOutFilePath, const bool bAceFormat1 ); FileName filSaveAssemblyToUserSpecifiedOrNextAvailableVersionOfAceFile(); void saveAssemblyToUserSpecifiedOrNextAvailableVersionOfAceFile( const FileName& filSaveToThisFile ); // saves entire assembly, including a new ace file and // any new versions of phd files void saveAssembly( bool& bUserPushedCancel, Widget widOfCallingWindow ); void writeConsensusOfAllContigsToFastaFile(); RWCString soGetContigName(int nContigIndex) { return( dapContigs_[ nContigIndex ]->soGetName() ); } // returns NULL if contig is not found Contig* pGetContigByName( const RWCString& soContigName ); Contig* pGetContigByNameCaseInsensitive( const RWCString& soContigName ); Contig* pGetContigByVariousNamesCaseInsensitive( const RWCString& soContigName ); Contig* pGetContigByAbbreviatedName( const RWCString& soAbbreviatedContigName ); Contig* pGetContigByVariousNames( const RWCString& soContigName ); // for assemblyView int nGetContigIndexByContig( Contig* pContig ); // Searches the entire assembly for a fragment of name soFragmentName // If found, returns its LocatedFragment pointer. // If not found, returns null. LocatedFragment* pGetLocatedFragmentByName( const RWCString& soFragmentName ); void parseDescriptionLine( const RWCString& soLine, LocatedFragment* pCurLocFrag, const int nCurLine ); void firstPassThroughAceFile( ifstream& ifs, listOfFragments& listOfFragmentss, int& nLinesInAceFile, int& nNumberOfBaseSegments); void cleanUpAfterParsingAceFileFormat1(); void cleanUpAfterParsingAceFileFormat2(); void readPHDBallsAndPHDFilesAndSetQualitiesAndPeakPositions( ); void setChanged() { bChanged_ = true; } void setUnchanged() { bChanged_ = false; } bool bChanged() { return( bChanged_ ); } void clearAllChangedFlags(); void writeEditToEditHistoryFile( char* szFormat, ... ); void setOpenEditHistoryForAppendFlag() { bOpenEditHistoryForAppend_ = true; } void clearOpenEditHistoryForAppendFlag() { bOpenEditHistoryForAppend_ = false; } bool bOpenEditHistoryForAppendFlag() { return( bOpenEditHistoryForAppend_ ); } void processBaseQualityLine( const RWCString& soLine, Contig* pCurrentContig, int& nLastBasePosition ); void setContigMatchTablesInAllContigs(); void tryParsingUsingAceFileFormat1( const char* szAceFilename, bool& bHasConsensusTags, long& lPositionOfConsensusTags); void setNumberOfReadsInAssembly( const int nNumberOfReads ) { nNumberOfReadsInAssembly_ = nNumberOfReads; } int nGetNumberOfReadsInAssembly() { return nNumberOfReadsInAssembly_; } int nGetNumberOfReadsInAssemblyWithTemplates(); // recalculate this number, set nNumberOfReadsInAssembly_ void setNumberOfReadsInAssemblyAccurate(); int nGetNextOligoNumber() { ++nLastUsedOligoNumber_; return( nLastUsedOligoNumber_ ); } RWCString soGetNextOligoName(); RWCString soGetOligoNameFromOligoNumber( const int nPrimerID ); void setLastUsedOligoNumber( const int nLastUsedOligoNumber ) { nLastUsedOligoNumber_ = nLastUsedOligoNumber; } void perhapsIncreaseLastUsedOligoNumber( const RWCString& soOligoName ); void removeContig( Contig* pContig ) { Contig* pRemovedContig = dapContigs_.remove( pContig ); if ( pContig != pRemovedContig ) { RWCString soError = "Assembly::removeContig tried to remove " + RWCString( (long) pContig ) + " but found " + RWCString( (long) pRemovedContig ); cerr << soError << endl; cerr.flush(); cerr << "tried to remove " << pContig->soGetName() << " but removed " << pRemovedContig->soGetName() << endl; cerr.flush(); THROW_ERROR2( soError ); } // assert( pContig == dapContigs_.remove( pContig ) ); } RWCString soGetNewContigName(); void addMatchWithinAssembly( matchWithinAssembly* pMatch ) { aMatchesWithinAssembly_.insert( pMatch ); } int nGetNumberOfMatchesWithinAssembly() { return( aMatchesWithinAssembly_.length() ); } matchWithinAssembly* pGetMatchWithinAssembly( const int nMatch ) { return( aMatchesWithinAssembly_[ nMatch ] ); } void addWholeAssemblyItem( wholeAssemblyItem* pWA ) { aWholeAssemblyItems_.append( pWA ); } void setBadTemplateArray(); void setBadLibraryArray(); void flagAllSubclonesIfBadTemplateOrBadLibrary(); void setAutoFinishDebugUniversalPrimersFile(); void setAutoFinishDebugCustomPrimersFile(); bool bFoundAutoFinishDebugRead( experiment* pExp ); bool bFoundAutoFinishDebugPrimer( primerType* pPrimer ); bool bIsThisTemplateOKNotBad( const RWCString& soTemplateName ); bool bIsThisLibraryOKNotBad( const RWCString& soLibraryName ); void setAllPaddedPositionsArrays(); void showWholeAssemblyItems(); void setReadOnly( const RWCString& soErrorMessage ); void clearReadOnly(); void setContigTemplateArrays(); void sortContigTemplateArrays(); void clearAndResizeContigTemplateArrays(); void dumpTemplates(); void tagAceFileWithExperiments(); void tagAceFileWithOligosForExperiments(); void saveACopyOfAceFileToBack(); void checkReadInfo(); bool bDoesTheNumberOfTemplatesMakeSense(); void showReadChemistries(); void showReadInfo(); void showTemplateQualityInfo(); void showAutofinishGoodModelRead(); void showTemplateInsertLocations(); void showLibraryInfo(); int nGetNumberOfForwardUniversalPrimerAutoFinishExpTags(); int nGetNumberOfReverseUniversalPrimerAutoFinishExpTags(); bool bIsThereAnAutoFinishExpTagForThisReverse( const RWCString& soTemplateName, autoFinishExpTag*& pAutoFinishExpTag ); bool bHasUniversalPrimerReadBeenTriedBeforeByAutoFinish( subcloneTTemplate* pSub, const int nReadType ); void createExpSummaryFiles(); void printMinilibrariesSummaryFile(); void dumpReadInfo(); void calculateLibraryInsertSizeFromForwardReversePairs(); void assignLibrariesToSubcloneTTemplates(); void checkAutoFinishReads(); bool bFoundExpInAssembly( const int nExpID, LocatedFragment*& pLocFrag ); bool bFoundExpInSinglets( const int nExpID, singletInfo*& pSingletFromAutoFinish ); void processSinglets(); void setAllContigHighQualitySegments(); void setAllUnpaddedErrorProbabilitiesAndMore(); void setAllFindSingleSubcloneBases(); bool bTemplatesAreSortedAlphabetically(); void sortTemplatesByTemplateName(); subcloneTTemplate* pGetSubcloneTemplateByTemplateName( const RWCString& soTemplateName ); void createModelRead(); void sortChosenExperiments(); bool bAreChosenExperimentsSorted(); void showAutoFinishErrorsFixedFormula(); void countReadsInEachLibrary(); void makeModelRead( MBTValOrderedOffsetVector*& pModelRead, mbtPtrVector*& pArrayOfModelReadsWithSeveralTemplates, int& nModelReadHighQualitySegmentStart, MBTValOrderedOffsetVector* pModelReadDistributions ); bool bIsThisUniversalPrimerReadInSinglets( const int nReadType, const RWCString& soTemplateName ); void clearAllContigProblemArrays(); void addRedundancyToModelReadFor9_66(); void getReadLengthStatistics( double& dMean, double& dStandardDeviation ); void findReadLengthStatistics(); void showMononucleotideRichRegions(); void showDinucleotideRichRegions(); void showMicrosatellites(); void showRunsCausingLCQs(); void showRunsAndStopsCausingLCQs(); void showStopsCausingLCQs(); void figureOutRealContigs(); void figureOutContigOrderAndOrientation(); void tryToMergeContigEndPairWithExistingScaffold( contigEndPair* pInconsistentContigEndPair ); void tryToMergeContigEndPairwithExistingScaffoldWhereBothEndsInUse( contigEndPair* pInconsistentContigEndPair ); void insertContigEndPairIfItIsNotAlreadyThere( contigEndPair*& pPairToInsert ); contigEndPair* pGetContigEndPairForGap( Contig* pContig1, const int nWhichEnd1, Contig* pContig2, const int nWhichEnd2 ); RWCString soGetContigMap(); void getRestrictionFragmentsForOneScaffold( const RWTPtrOrderedVector& aIndexedByEnzymeBases, RWTPtrOrderedVector& aRestrictionFragmentsNotEndingAtVectorInsertJunction, RWTPtrOrderedVector& aRestrictionFragmentsEndingAtVectorInsertJunction, clScaffold* pScaffold, int& nPositionOrderOfFragment ); void getScaffoldBases( clScaffold* pScaffold, RWCString& soUnpaddedChain, bool& bLeftEndIsCloneEnd, bool& bRightEndIsCloneEnd ); void getScaffolds( RWTPtrOrderedVector& aScaffolds ); // used by the routine above void setContigClonePositions( clScaffold* pScaffold ); void getContigAndPositionFromScaffoldPos( clScaffold* pScaffold, const int nScaffoldPos, Contig*& pContig, int& nUnpaddedConsPos ); void showEditableLowConsensusQualityRegions(); void showScaffoldsOfContigs(); void findRestrictionFragmentInContigMap( restrictionFragment* pRes, int& nStartPos, int& nEndPos ); void createScaffoldsFromUserContigMap( const RWCString& soUserEnteredContigMap, const int nStartUnpaddedConsPos, const int nEndUnpaddedConsPos, RWTPtrOrderedVector* pScaffoldArray, RWCString& soErrorMessage); clScaffold* pCreateScaffold( const int nScaffold ); void navigateByCertainOligoTags( const RWCString& soPartOfTagName ); void navigateByTagsFindStringInComment( const RWCString& soStringToSearchFor ); void navigateByDifferenceBetweenRecalculatedAndOriginalConsensus(); void navigateToUnalignedReads(); void navigateByHighQualityDiscrepanciesInAllReads( const bool bExcludeSpuriousPads ); void navigateByQuestionableConsensusBases(); void sortContigsByName(); void sortContigsByNumberOfReads(); void figureOutConsistentForwardReversePairDepthOfAllContigs( const bool bDoNotShowTemplatesInDoNotShowTemplatesFile ); void filterInconsistentFwdRevPairs(); void findAllTagsOfAType( const RWCString& soTagType, RWTPtrOrderedVector* pArrayOfTags ); void navigateToUnalignedBaseSegments(); void navigateByMultipleHighQualityDiscrepancies( ); int nGetScaffoldForContig( Contig* pContigToFind ); RWCString soGetNextContigEndPairID(); void setHighestContigEndPairID(); void getUserDefinedContigEndPairs( RWTPtrOrderedVector& aUserDefinedContigEndPairs ); void complementContigsIfNecessary(); void reverseScaffold( const int nScaffold ); void findLargestTagIDSoFar( const long lID ); void lookForCloneEndTags3( RWTPtrOrderedVector* pCloneEndTags ); void userPushedTellPhrapToNotOverlapReadsDiscrepantAtThisLocation( const gotoItem* pGotoItem, void* pClientData ); void getHighOrLowDepthOfCoverageRegions( const bool bHighNotLowDepthOfCoverage, const int nQualityThreshold, const int nMinDepth, gotoList*& pGotoList ); void fixReadListsAllContigs(); void setUpReadsSortedByReadName(); void zeroAddNewReadsContigCounts(); void printReadNamesInRegion(); void printAssemblySummary(); bool bReadLengthAndTracesLengthAreEqualAllReads(); public: // these routines are for parsing the ace file void parseConsensusTagsInAceFileFormat1( const char* szAceFilename, const long lPositionOfConsensusTags ); void parseAceFile( const char* szAceFilename, bool& bPerhapsAceFileFormat1 ); void getReadTagAfterFirstLine( FILE* pAceFile, LocatedFragment* pLocFrag ); void getConsensusTagAfterFirstLine( FILE* pAceFile, LocatedFragment* pLocFrag, const bool bIsReadNotConsensusTag ); void getSinglets( const RWCString& soBody ); void getWAAfterFirstLine( FILE* pAceFile ); void getMiscTags( FILE* pAceFile ); bool bGetAssemblyLine( FILE* pAceFile, int& nNumberOfContigs, int& nNumberOfReads ); void getReadTag( FILE* pAceFile, LocatedFragment* pLocFrag ); void getContigLine( FILE* pAceFile, int& nNumberOfBases, int& nNumberOfReads, int& nNumberOfBaseSegments, Contig*& pContig ); void readFileOfPhdFiles( const FileName& filFileOfPhdFiles ); void readFileOfPhdFilesForAddedReads( readsSortedByReadName& aDesiredReads, const FileName& filPhdBall, const bool bCreateNewPhdBall, FILE* pNewPhdBall ); tag* pFindTagWithThisID( const long ID ); void dealWithTagPointers(); // above here are routines for parsing the ace file public: FileName soFileName_; RWCString soTraceFileDirectory_; RWTPtrOrderedVector dapContigs_; enum { cCONTIG_NAME = 'n', cNUMBER_OF_READS = 'u', cUNKNOWN = 'k' }; char cContigsSortedByWhat_; // refers to how dapContigs_ is sorted int nNumberOfReadsInAssembly_; // set to true by any edit to any part of the // data (including contigs, located fragments, etc.). // cleared to false when the file is saved // and renamed. a "dirty" bit for the whole object bool bChanged_; bool bOpenEditHistoryForAppend_; FILE* pfilEditHistory_; int nLastUsedOligoNumber_; long lLastUsedTagID_; RWTPtrOrderedVector aMatchesWithinAssembly_; RWTPtrOrderedVector aWholeAssemblyItems_; RWTPtrOrderedVector aBadTemplatesWithRegexp_; mbtValOrderedVectorOfRWCString aBadTemplatesWithoutRegexp_; RWTValOrderedVector aBadLibraries_; RWTValOrderedVector aDebugAutofinishUniversalPrimerTemplates_; RWTValOrderedVector aDebugAutofinishUniversalPrimerForwardNotReverse_; RWTValOrderedVector aDebugAutofinishCustomPrimers_; // says that the assembly is (temporily) read_only bool bReadOnly_; RWCString soErrorMessageWhenTryToEditReadOnlyAssembly_; // pointer to this array so that when consed starts up, // no space is allocated. It is only allocated after we know // exactly how many reads there are and need to use this array // (for autofinish). readsSortedByTemplateName* pReadsSortedByTemplateName_; bool bReadsSortedByReadNameNeedsFixing_; readsSortedByReadName* pReadsSortedByReadName_; templatesSortedByName subcloneTTemplateArray_; RWTPtrOrderedVector* pAllChosenExperiments_; RWTPtrOrderedVector* pChosenPrimersForAutofinishPCR_; universalPrimerAutoFinishExpTagsSortedByTemplateName* pForwardUniversalPrimerAutoFinishExpTags_; universalPrimerAutoFinishExpTagsSortedByTemplateName* pReverseUniversalPrimerAutoFinishExpTags_; RWTPtrOrderedVector aExpidAndLocatedFragment_; // this is a list of the phd filename of all singlets from the ace file // It includes singlets whether they are autofinish reads or not // These are a pair--they have same length and corresponding // indices are for corresponding reads. If a read doesn't have // a PHD file, it is not listed in either array. RWTValOrderedVector aSingletReadNames_; RWTValOrderedVector aSingletPHDFilenames_; // all singlets singletsSortedByTemplateName* pSingletsSortedByTemplateName_; // just the singlets that have expid's (the autofinish reads) RWTPtrOrderedVector aSingletsFromAutoFinish_; // This is just (*pArrayOfModelReadsWithSeveralTemplates)[ 1 ] // That is, the model read with 1 template. This must be // just the median. MBTValOrderedOffsetVector* pModelRead_; // this is indexed by the number of templates, starting at 1 // This just gives the median quality, not the distribution, // at each base position and for each number of templates. mbtPtrVector* pArrayOfModelReadsWithSeveralTemplates_; int nModelReadHighQualitySegmentStart_; // nModelReadHighQualitySegmentEnd_ is just the end of the model read // these are used for deciding whether a single subclone base // is fixed by an Autofinish read or not int nModelReadAlignedSegmentStart_; int nModelReadAlignedSegmentEnd_; MBTValOrderedOffsetVector* pModelReadNumberOfReadsAtEachLocation_; MBTValOrderedOffsetVector* pModelReadNumberOfAlignedReadsAtEachLocation_; // the above includes reads that are vector int nModelReadTotalReads_; MBTValOrderedOffsetVector* pModelReadDistributions_; // These are not ordered in any particular way. RWTPtrOrderedVector* pContigEndPairArray_; RWTPtrOrderedVector aHeadsOfScaffoldsOfContigs_; RWTValOrderedVector aScaffoldIsCircularNotLinear_; RWTPtrOrderedVector aRealContigs_; bool bAnyReadsToCloseGaps_; bool bAlreadyCalculatedReadLengthStatistics_; double dMeanReadLength_; double dStandardDeviationOfReadLength_; RWCString soContigMap_; arrayOfFwdRevPairs* pArrayOfConfirmedInconsistentFwdRevPairs_; bool bHighestContigEndPairIDSet_; int nHighestContigEndPairID_; }; #endif // ASSEMBLY_INC