/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "assembly.h" #include "parseAceFile.h" #include "consed.h" #include "rwcstring.h" #include "rwcregexp.h" #include "nGetTrailingNumber.h" #include "filename.h" #include "consedParameters.h" #include using namespace std; #include "textbox.h" #include "popupErrorMessage.h" #include "fileDefines.h" #include #include "popupInfoOrErrorMessageNoFormat.h" #include "rwctokenizer.h" #include "bIsNumericMaybeWithWhitespace.h" #include "nCompareContigsByName.h" #include "bIsNumericFloat.h" #include "soFindFlowCellFromPath.h" #include #include "filFindSolexaPrbFileFromSeqFile.h" #include "nNumberFromBase2.h" #include #include "soAddCommas.h" Assembly :: Assembly(const char* szAceFilename, const bool bSetGlobalAssembly ) : soFileName_(szAceFilename), bChanged_(false), pfilEditHistory_(NULL), nLastUsedOligoNumber_( 0 ), lLastUsedTagID_( 0 ), bReadOnly_( false ), pReadsSortedByTemplateName_( NULL ), pReadsSortedByReadName_( NULL ), pAllChosenExperiments_( NULL ), pChosenPrimersForAutofinishPCR_( NULL ), pModelRead_( NULL ), pContigEndPairArray_( NULL ), bAnyReadsToCloseGaps_( false ), bAlreadyCalculatedReadLengthStatistics_( false ), pArrayOfConfirmedInconsistentFwdRevPairs_( NULL ), bHighestContigEndPairIDSet_( false ), bReadsSortedByReadNameNeedsFixing_( true ), cContigsSortedByWhat_( cUNKNOWN ) { // this must be put into the ConsEd object now since // readWholeReadItem and other things called by this ctor depend // on it. if ( bSetGlobalAssembly ) { ConsEd::pGetConsEd()->SetAssembly( this ); } dapContigs_.soName_ = "Assembly::dapContigs_"; pCP->bLimitNumberOfErrorsReported_ = true; // if ( pCP->pDebugFile_ ) // fprintf( pCP->pDebugFile_, "ace file: %s\n", szAceFilename ); bool bPerhapsAceFileFormat1; parseAceFile( szAceFilename, bPerhapsAceFileFormat1 ); setUpReadsSortedByReadName(); if ( bPerhapsAceFileFormat1 ) { popupErrorMessage( "This file has not been assembled using the -new_ace option to phrap. Thus many functions of consed will not give accurate information (these include: search for single subclone regions, single stranded regions, tell phrap to not overlap reads discrepant at this location, tear contigs, etc.) Also, autofinish will not function correctly. When using -new_ace, you will also get faster consed start up performance and much less disk space (the new ace file is about 60% as big as the old). Using phrap without -new_ace is *not* supported. This message will not be repeated, but don't think that means everything is ok. We suggest you immediately exit consed and reassemble with the current version of phrap using -new_ace." ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; bool bHasConsensusTags; long lPositionOfConsensusTags; tryParsingUsingAceFileFormat1( szAceFilename, bHasConsensusTags, lPositionOfConsensusTags ); if ( bHasConsensusTags ) parseConsensusTagsInAceFileFormat1( szAceFilename, lPositionOfConsensusTags ); } else { cleanUpAfterParsingAceFileFormat2(); } cerr << "Now setting quality values" << endl; cerr.flush(); // record time to read all phd files time_t time1 = time( NULL ); if (ConsEd::pGetConsEd()->bUsingPhdFiles_ ) readPHDBallsAndPHDFilesAndSetQualitiesAndPeakPositions(); time_t time2 = time( NULL ); double dTimeToReadPhdFiles = difftime( time2, time1 ); cerr << "Finished setting quality values in " << dTimeToReadPhdFiles << " seconds " << endl; cerr.flush(); if ( nNumberOfReadsInAssembly_ == 0 ) { fprintf( pAO, "Fatal: There were 0 reads in the assembly. Figure out why phrap is not assemblying the reads together.\n" ); fflush( pAO ); THROW_ERROR( "there were 0 reads in the assembly" ); } dealWithTagPointers(); complementContigsIfNecessary(); checkReadInfo(); if ( pCP->nWhatIsRunning_ == nAutoFinishIsRunning ) { bool bOK = bDoesTheNumberOfTemplatesMakeSense(); if ( !bOK && !consedParameters::pGetConsedParameters()->bAutoFinishContinueEvenThoughReadInfoDoesNotMakeSense_ ) { fprintf( stderr, "autofinish terminating due to read info not making sense, probably because you didn't correctly customize determineReadTypes.perl. See the .out file for details. For further info, see README.txt and look for \"determineReadTypes.perl\"" ); fclose( pAO ); exit( 1 ); } } // let's get prepared for the user to bring up the Aligned Reads windows fixReadListsAllContigs(); if ( pCP->nNumberOfStartupErrors_ ) { cerr << "total errors on consed startup: " << soAddCommas( pCP->nNumberOfStartupErrors_ ) << endl; } pCP->bLimitNumberOfErrorsReported_ = false; } void Assembly :: cleanUpAfterParsingAceFileFormat2() { for (int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->assignQualityAndPeakPositionsToPads( false ); // bAndPeakPositions // loop through all reads in assembly to find the left-most and // right-most reads. This must be done before // fixBaseSegments, since the latter uses ntGetCons which // checks against nLastDisplayableContigPos_ pContig->updateLastDisplayableContigPos(); // allow ace files without any base segments if ( pContig->baseSegArray_.nGetNumSegments() != 0 ) { // PANIC_OST( ost ) << "no BS lines for contig " << // pContig->soGetName() << ends; // throw InputDataError( ost.str().c_str() ); // } // remove any base segments that are not within the read // or that don't match the consensus pContig->baseSegArray_.fixBaseSegments(); // clean up any discontiguous base segment arrays pContig->baseSegArray_.forceSegsToBeContiguous(); // check now that bas segments are ok. If they still // aren't, delete them all (DG, July 2010) if ( !pContig->baseSegArray_.bGetDataStructureOk( true ) ) { cerr << "removing all base segments in " << pContig->soGetName() << "--no harm done except you won't be able to see golden path in color means match" << endl; pContig->baseSegArray_.dapBs_.clearAndDestroy(); } assert( pContig->baseSegArray_.bGetDataStructureOk( true ) ); } } } RWCString Assembly :: soGetFirstPartOfAceFileName() { // modified Nov 2002 (DG) so that // project.fasta.screen.ace_final021205 // would return "project", but before // instead returns project.fasta.screen.ace_final021205 // this is used to name oligos, temporary files used by the // miniassemblies, and the name of the file containing read names // that the user can add to from the Aligned Reads Window return( soFileName_.soGetProjectFromAceFilename() ); } FileName Assembly :: filGetAceFileFullPathname() { FileName filAceFileFullPathname = soGetAceFileName(); if ( filAceFileFullPathname.soGetDirectory() == "./" ) { const int nBigNumber = 500; char szCurrentDirectory[ nBigNumber+10 ]; getcwd( szCurrentDirectory, nBigNumber ); RWCString soCurrentDirectory( szCurrentDirectory ); filAceFileFullPathname = soCurrentDirectory + "/" + filAceFileFullPathname; } return( filAceFileFullPathname ); } RWCString Assembly :: soGetNextOligoName() { int nPrimerNumber = nGetNextOligoNumber(); return( soGetOligoNameFromOligoNumber( nPrimerNumber ) ); } RWCString Assembly :: soGetOligoNameFromOligoNumber( const int nOligoNumber ) { RWCString soAssemblyName = soGetFirstPartOfAceFileName(); char szTemp[200]; sprintf( szTemp, "%s.%d", (char*) soAssemblyName.data(), nOligoNumber ); RWCString soName = szTemp; return( soName ); } void Assembly :: perhapsIncreaseLastUsedOligoNumber( const RWCString& soOligoName ) { FileName filOligoName( soOligoName ); int nVersion = filOligoName.nGetVersion(); if ( nVersion > nLastUsedOligoNumber_ ) nLastUsedOligoNumber_ = nVersion; } RWCString Assembly :: soGetNewContigName() { int nLargestContigNumber = 0; for( int nContigIndex = 0; nContigIndex < nNumContigs(); ++nContigIndex ) { Contig* pContig = pGetContig( nContigIndex ); RWCString soContigName = pContig->soGetName(); int nContigNumber = nGetTrailingNumber( soContigName ); if ( nContigNumber != -1 ) { if ( nContigNumber > nLargestContigNumber ) { nLargestContigNumber = nContigNumber; } } } char szTemp[100]; sprintf( szTemp, "Contig%d", nLargestContigNumber + 1 ); return( szTemp ); } FileName Assembly :: filGetReadListFilename() { FileName soReadListFilename = soGetFirstPartOfAceFileName() + "_read_list.txt"; return( soReadListFilename ); } void Assembly :: setBadTemplateArray() { aBadTemplatesWithRegexp_.clear(); aBadTemplatesWithoutRegexp_.clear(); FILE* pBadTemplatesFile = fopen( (char*)consedParameters::pGetConsedParameters()->filPrimersBadTemplatesFile_.data(), "r" ); if ( ! pBadTemplatesFile ) { if ( errno == 2 ) { fprintf( pFILE, "there is no file %s\n", (const char*) pCP->filPrimersBadTemplatesFile_ ); fprintf( pFILE, "will assume that any template in this assembly is fair game to be chosen for a primer\n" ); return; } else { fprintf( pFILE, "could not open bad templates file %s\n", (const char*) pCP->filPrimersBadTemplatesFile_ ); fprintf( pFILE, "error = %d which means %s\n", errno, strerror( errno ) ); fprintf( pFILE, "will assume that any template in this assembly is fair game to be chosen for a primer\n" ); return; } } aBadTemplatesWithRegexp_.resize( 200 ); aBadTemplatesWithoutRegexp_.resize( 200 ); const int SZLINE_SIZE = 2000; char szLine[ SZLINE_SIZE ]; while( fgets( szLine, SZLINE_SIZE, pBadTemplatesFile ) != NULL ) { RWCString soLine = szLine; // chop off the newline (but what if the last line doesn't have one?) soLine.stripTrailingWhitespaceFast(); // look for * and convert it to .* bool bIsRegexp = false; for( int n = (int) soLine.length() - 1; n >= 0; --n ) { if ( soLine[n] == '*' ) { soLine.insert( n, "." ); bIsRegexp = true; } } if ( bIsRegexp ) { // put ^ and $ around it to make the string have to completely // match the template name soLine.insert( 0, "^" ); soLine.append( "$" ); RWCRegexp* pRegExp = new RWCRegexp( soLine ); aBadTemplatesWithRegexp_.append( pRegExp ); } else { aBadTemplatesWithoutRegexp_.append( soLine ); } } printf( "found %d templates with *'s and %d templates without *'s in the bad templates file %s\n", aBadTemplatesWithRegexp_.length(), aBadTemplatesWithoutRegexp_.length(), (char*) consedParameters::pGetConsedParameters()->filPrimersBadTemplatesFile_.data() ); fclose( pBadTemplatesFile ); // now sort the aBadTemplatesWithoutRegexp_ so we can use it // for binary search lookups aBadTemplatesWithoutRegexp_.resort(); } void Assembly :: setBadLibraryArray() { aBadLibraries_.clear(); FILE* pBadLibrariesFile = fopen( (char*)consedParameters::pGetConsedParameters()->filPrimersBadLibrariesFile_.data(), "r" ); if ( ! pBadLibrariesFile ) { if ( errno == 2 ) { fprintf( pFILE, "there is no file %s\n", (const char*) pCP->filPrimersBadLibrariesFile_ ); fprintf( pFILE, "will assume that any library in this assembly is fair game to be chosen for a primer\n" ); return; } else { fprintf( pFILE, "could not open bad libraries file %s\n", (const char*) pCP->filPrimersBadLibrariesFile_ ); fprintf( pFILE, "error = %d which means %s\n", errno, strerror( errno ) ); fprintf( pFILE, "will assume that any library in this assembly is fair game to be chosen for a primer\n" ); return; } } aBadLibraries_.resize( 200 ); const int SZLINE_SIZE = 2000; char szLine[ SZLINE_SIZE ]; while( fgets( szLine, SZLINE_SIZE, pBadLibrariesFile ) != NULL ) { RWCString soLine = szLine; // chop off the newline (but what if the last line doesn't have one?) soLine = soLine.stripWhitespace( RWCString::BOTH ); aBadLibraries_.append( soLine ); } printf( "found %d libraries in the bad libraries file %s\n", aBadLibraries_.length(), (char*) consedParameters::pGetConsedParameters()->filPrimersBadLibrariesFile_.data() ); fclose( pBadLibrariesFile ); } void Assembly :: setAutoFinishDebugCustomPrimersFile() { aDebugAutofinishCustomPrimers_.clear(); FILE* pAutoFinishDebug = fopen( pCP->filAutoFinishDebugCustomPrimerReadsFile_.data(), "r" ); if ( pAutoFinishDebug ) { cerr << "opened Autofinish custom primer debug file " << pCP->filAutoFinishDebugCustomPrimerReadsFile_ << endl; } else if ( !pAutoFinishDebug ) { return; } const int SZLINE_SIZE = 2000; RWCString soLine( (size_t) SZLINE_SIZE ); while( fgets( soLine.data(), SZLINE_SIZE, pAutoFinishDebug ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aDebugAutofinishCustomPrimers_.insert( soLine ); } fclose( pAutoFinishDebug ); cerr << "primers are: " << endl; for( int n = 0; n < aDebugAutofinishCustomPrimers_.length(); ++n ) { cerr << aDebugAutofinishCustomPrimers_[n] << endl; } } void Assembly :: setAutoFinishDebugUniversalPrimersFile() { aDebugAutofinishUniversalPrimerTemplates_.clear(); aDebugAutofinishUniversalPrimerForwardNotReverse_.clear(); FILE* pAutoFinishDebug = fopen( pCP->filAutoFinishDebugUniversalPrimerReadsFile_.data(), "r" ); if ( pAutoFinishDebug ) { cerr << "opened Autofinish universal primer debug file " << pCP->filAutoFinishDebugUniversalPrimerReadsFile_ << endl; } else if ( !pAutoFinishDebug ) { // cerr << "no Autofinish debug file--that's ok" << endl; return; } const int SZLINE_SIZE = 2000; RWCString soLine( (size_t) SZLINE_SIZE ); while( fgets( soLine.data(), SZLINE_SIZE, pAutoFinishDebug ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); RWCTokenizer tok( soLine ); RWCString soTemplate = tok(); RWCString soFwdOrRev = tok(); if ( soTemplate.isNull() ) { PANIC_OST( ost ) << "line doesn't even have a template name. Instead found " << soLine << " in " << pCP->filAutoFinishDebugUniversalPrimerReadsFile_ << ends; } bool bFwdNotReverse; if ( soFwdOrRev == "fwd" ) bFwdNotReverse = true; else if ( soFwdOrRev == "rev" ) bFwdNotReverse = false; else { PANIC_OST( ost ) << "line not of form: template fwd/rev. Instead found " << soLine << " in " << pCP->filAutoFinishDebugUniversalPrimerReadsFile_ << ends; throw InputDataError( ost.str().c_str() ); } aDebugAutofinishUniversalPrimerTemplates_.insert( soTemplate ); aDebugAutofinishUniversalPrimerForwardNotReverse_.insert( bFwdNotReverse ); } fclose( pAutoFinishDebug ); for( int n = 0; n < aDebugAutofinishUniversalPrimerTemplates_.length(); ++n ) { fprintf( pAO, "will look for univ primer read with template %s and %s\n", aDebugAutofinishUniversalPrimerTemplates_[ n ], ( aDebugAutofinishUniversalPrimerForwardNotReverse_[n] ? "fwd" : "rev" ) ); } } bool Assembly :: bFoundAutoFinishDebugRead( experiment* pExp ) { int nIndex = aDebugAutofinishUniversalPrimerTemplates_.index( pExp->pSubcloneTemplate_->soTemplateName_ ); if ( nIndex != RW_NPOS ) { if ( ( aDebugAutofinishUniversalPrimerForwardNotReverse_[ nIndex ] && pExp->nReadType_ == nUniversalForward ) || ( ! aDebugAutofinishUniversalPrimerForwardNotReverse_[ nIndex ] && pExp->nReadType_ == nUniversalReverse ) ) { return( true ); } } return( false ); } bool Assembly :: bFoundAutoFinishDebugPrimer( primerType* pPrimer ) { // improvement in speed in normal case in which there is no // debug_custom.txt file if ( aDebugAutofinishCustomPrimers_.length() == 0 ) return( false ); RWCString soPrimerSequence( (size_t) pPrimer->nUnpaddedLength_ ); for( int n = 0; n < pPrimer->nUnpaddedLength_; ++n ) { soPrimerSequence.append( pPrimer->szPrimer_[n] ); } int nIndex = aDebugAutofinishCustomPrimers_.index( soPrimerSequence ); if ( nIndex == RW_NPOS ) return( false ); else return( true ); } bool Assembly :: bIsThisTemplateOKNotBad( const RWCString& soTemplateName ) { for( int n = 0; n < aBadTemplatesWithRegexp_.length(); ++n ) { RWCRegexp* pRegTemplate = aBadTemplatesWithRegexp_[ n ]; if ( !(soTemplateName( (*pRegTemplate) ).isNull() ) ) return( false ); } if ( aBadTemplatesWithoutRegexp_.bContains( soTemplateName ) ) return( false ); return( true ); } bool Assembly :: bIsThisLibraryOKNotBad( const RWCString& soLibraryName ) { for( int n = 0; n < aBadLibraries_.length(); ++n ) { if ( soLibraryName == aBadLibraries_[n] ) { return( false ); } } return( true ); } void Assembly :: setAllPaddedPositionsArrays() { for( int nContigIndex = 0; nContigIndex < nNumContigs(); ++nContigIndex ) { Contig* pContig = pGetContig( nContigIndex ); pContig->setPaddedPositionsArray(); } } void Assembly :: setNumberOfReadsInAssemblyAccurate() { int nTotalReads = 0; for( int nContigIndex = 0; nContigIndex < nNumContigs(); ++nContigIndex ) { Contig* pContig = pGetContig( nContigIndex ); nTotalReads += pContig->nGetNumberOfFragsInContig(); } nNumberOfReadsInAssembly_ = nTotalReads; } void Assembly :: showWholeAssemblyItems() { TextBox* pTB = new TextBox( "Whole Assembly Items" ); for( int n = 0; n < aWholeAssemblyItems_.length(); ++n ) { wholeAssemblyItem* pWAI = aWholeAssemblyItems_[n]; pTB->append( "-----------------------------------\n" ); pTB->append( "type: " + pWAI->soType_ + " source: " + pWAI->soSource_ + " date: " + pWAI->soDateTime_ + "\n" + pWAI->soText_ + "\n" ); } pTB->makeVisible(); } void Assembly :: setReadOnly( const RWCString& soErrorMessage ) { soErrorMessageWhenTryToEditReadOnlyAssembly_ = soErrorMessage; bReadOnly_ = true; } void Assembly :: clearReadOnly() { bReadOnly_ = false; } void Assembly :: checkReadInfo() { readsSortedByReadName aAllReads; aAllReads.resize( (size_t) nGetNumberOfReadsInAssembly() ); int nNumberOfUnknownChemistry = 0; int nNumberOfUnknownDye = 0; int nNumberOfReads = 0; int nNumberOfUniversalForward = 0; int nNumberOfUniversalReverse = 0; int nNumberOfWalks = 0; int nNumberOfPCREnd = 0; int nNumberOfUnknownReadType = 0; int nNumberOfReadsNotSetInPhdFile = 0; for( int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); // if ( pLocFrag->bIsSolexaRead_ ) continue; aAllReads.insert( pLocFrag ); ++nNumberOfReads; if ( pLocFrag->soDye_.isNull() || ( pLocFrag->soDye_.returnToLower() == "unknown" ) ) ++nNumberOfUnknownDye; if ( pLocFrag->soChemistry_.isNull() || ( pLocFrag->soChemistry_.returnToLower() == "unknown" ) ) ++nNumberOfUnknownChemistry; if ( pLocFrag->nReadType_ == 0 ) { pLocFrag->setReadTypeFromReadName(); } if ( pLocFrag->nReadTypeFromPhdFile_ == 0 ) ++nNumberOfReadsNotSetInPhdFile; if ( pLocFrag->nReadType_ == 0 ) ++nNumberOfUnknownReadType; else if ( pLocFrag->nReadType_ == nUniversalForward ) ++nNumberOfUniversalForward; else if ( pLocFrag->nReadType_ == nUniversalReverse ) ++nNumberOfUniversalReverse; else if ( pLocFrag->nReadType_ == nWalk ) ++nNumberOfWalks; else if ( pLocFrag->nReadType_ == nPCREnd ) ++nNumberOfPCREnd; } } // This message is no longer important (Feb 2008) // Aye Wollam, Pat et al approved deleting it. // if ( nNumberOfReads > 1 ) { // if ( ( nNumberOfUnknownDye > nNumberOfReads / 5 ) // || // ( nNumberOfUnknownChemistry > nNumberOfReads / 5 ) ) // popupErrorMessage( "You may have installed or run phred incorrectly since phred was unable to get information about chemistry from the chromatogram in %d reads and was unable to get information about dye from the chromatogram in %d reads out of a total of %d reads. You may have not correctly set the environment variable PHRED_PARAMETER_FILE or you may not have modified this file to reflect the chemistry and dye that you are using. Please type \"phred -doc\" from the unix prompt or else refer to the phred documentation to understand what you must do. If you fail to do this, phrap will give inaccurate consensus bases and quality values, and various features of consed and autofinish will not work correctly. To aid in your debugging, in the main consed window, find the \"Info\" menu item at the top of the window. Point the cursor at it, hold down mouse button 1, drag to \"Show Chemistries of Reads\" and release the mouse button", // nNumberOfUnknownChemistry, // nNumberOfUnknownDye, // nNumberOfReads ); // consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ // = true; // } if ( pCP->bCheckIfTooManyWalks_ ) { if ( nNumberOfReads > 0 ) { if ( ( nNumberOfWalks + nNumberOfPCREnd + nNumberOfUnknownReadType ) > nNumberOfReads / 5 ) popupErrorMessage( "Consed is detecting that there are %d forward universal primer reads, %d reverse universal primer reads, %d walking reads, %d pcr end reads, %d reads of unknown type, and %d reads did not have their type set in the phd file. Is this correct? If not, consed will not pick templates correctly for primers and autofinish will not function correctly. You probably did not correct customize determineReadTypes.perl to conform to your read naming convention. I suggest you read README.txt that came with consed and also examine the notes in determineReadTypes.perl. To help you in figuring out what is wrong, on the Main Consed Window, point to Info, hold down mouse button 1, and release on Show Info For Each Read", nNumberOfUniversalForward, nNumberOfUniversalReverse, nNumberOfWalks, nNumberOfPCREnd, nNumberOfUnknownReadType, nNumberOfReadsNotSetInPhdFile ); consedParameters::pGetConsedParameters()->bErrorMessageDisplayedAtStartup_ = true; } } // check that there are no 2 reads with the same name aAllReads.resort(); for( int nRead = 1; nRead < aAllReads.length(); ++nRead ) { LocatedFragment* pLocFrag1 = aAllReads[ nRead - 1]; LocatedFragment* pLocFrag2 = aAllReads[ nRead ]; if ( pLocFrag1->soGetName() == pLocFrag2->soGetName() ) { RWCString soError = "There are 2 reads with the same name: "; soError += pLocFrag1->soGetName(); soError += "\nOne is in contig "; soError += pLocFrag1->pGetContig()->soGetName(); soError += " ("; soError += RWCString( (long) pLocFrag1->nGetAlignStartUnpadded() ); soError += ", "; soError += RWCString( (long) pLocFrag1->nGetAlignEndUnpadded() ); soError += ") \nThe other is in contig "; soError += pLocFrag2->pGetContig()->soGetName(); soError += " ("; soError += RWCString( (long) pLocFrag2->nGetAlignStartUnpadded() ); soError += ", "; soError += RWCString( (long) pLocFrag2->nGetAlignEndUnpadded() ); soError += ") \nThis will really mess up consed."; THROW_ERROR( soError ); } } // for( int nRead ... } void Assembly :: clearAllContigProblemArrays() { for( int nContigIndex = 0; nContigIndex < nNumContigs(); ++nContigIndex ) { Contig* pContig = pGetContig( nContigIndex ); delete pContig->pProblemsInUnpaddedContig_; pContig->pProblemsInUnpaddedContig_ = NULL; } } void Assembly :: sortContigsByName() { size_t nNumberOfElements = dapContigs_.length(); size_t nSizeOfAnElement = sizeof( Contig* ); void* pArray = dapContigs_.data(); qsort( pArray, nNumberOfElements, nSizeOfAnElement, ( (int(*) ( const void*, const void*) ) nCompareContigsByName ) ); // check that they are now sorted for( int nContig = 1; nContig < dapContigs_.length(); ++nContig ) { Contig* pContigA = dapContigs_[ nContig - 1 ]; Contig* pContigB = dapContigs_[ nContig ]; assert( nCompareContigsByName( &pContigA, &pContigB ) <= 0 ); } cContigsSortedByWhat_ = cCONTIG_NAME; } int nCompareContigsByNumberOfReads( Contig** ppContigA, Contig** ppContigB ) { if ( (*ppContigA)->nGetNumberOfFragsInContig() < (*ppContigB)->nGetNumberOfFragsInContig() ) return -1; else if ( (*ppContigA)->nGetNumberOfFragsInContig() > (*ppContigB)->nGetNumberOfFragsInContig() ) return 1; else { // 2 contigs have the same number of reads. So sort // them by Contig # (e.g., Contig3 < Contig12, so not // alphabetically) return nCompareContigsByName( ppContigA, ppContigB ); } } void Assembly :: sortContigsByNumberOfReads() { size_t nNumberOfElements = dapContigs_.length(); size_t nSizeOfAnElement = sizeof( Contig* ); void* pArray = dapContigs_.data(); qsort( pArray, nNumberOfElements, nSizeOfAnElement, ( (int(*) ( const void*, const void*) ) nCompareContigsByNumberOfReads ) ); // check that they are now sorted for( int nContig = 1; nContig < dapContigs_.length(); ++nContig ) { Contig* pContigA = dapContigs_[ nContig - 1]; Contig* pContigB = dapContigs_[ nContig ]; assert( pContigA->nGetNumberOfFragsInContig() <= pContigB->nGetNumberOfFragsInContig() ); } cContigsSortedByWhat_ = cNUMBER_OF_READS; } void Assembly :: complementContigsIfNecessary() { for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); RWCString soErrorMessage; pContig->complementContigIfNecessary( soErrorMessage ); if ( !soErrorMessage.isNull() ) popupErrorMessage( soErrorMessage ); } } void Assembly :: dealWithTagPointers() { // how should errors be handled? I think they should not be fatal. // But should the tag be removed? Or left without pointing to anything? // If come up with pointers that don't point to anything, could // cause segmentation fault later. I guess I could warn user of this. RWCString soErrorMessage; for( int nContig = 0; nContig < nNumContigs(); ++nContig ) { Contig* pContig = pGetContig( nContig ); for( int nConsensusTag = 0; nConsensusTag < pContig->nGetNumberOfTags(); ++nConsensusTag ) { tag* pTag = pContig->pGetTag( nConsensusTag ); if ( pTag->pUserDefinedTagType_ ) { pTag->dealWithPointerFields( soErrorMessage ); } } for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); for( int nTag = 0; nTag < pLocFrag->nGetNumberOfTags(); ++nTag ) { tag* pTag = pLocFrag->pGetTag( nTag ); if ( pTag->pUserDefinedTagType_ ) { pTag->dealWithPointerFields( soErrorMessage ); } } } } if ( !soErrorMessage.isNull() ) { soErrorMessage += "\nThis may cause Consed to crash with a segmentation fault later, such as when you might try to display one of these tags."; popupErrorMessage( soErrorMessage ); } } void Assembly :: fixReadListsAllContigs() { for (int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); pContig->fixReadLists(); } } void Assembly :: setUpReadsSortedByReadName() { if ( pReadsSortedByReadName_ ) delete pReadsSortedByReadName_; pReadsSortedByReadName_ = new readsSortedByReadName( nGetNumberOfReadsInAssembly() ); pReadsSortedByReadName_->soName_ = "Assembly::pReadsSortedByReadName_"; // nGetNumberOfReadsInAssembly is set in parseAceFile by the AS line for (int nContig = 0; nContig < nNumContigs(); nContig++) { Contig* pContig = pGetContig( nContig ); for( int nRead = 0; nRead < pContig->nGetNumberOfFragsInContig(); ++nRead ) { LocatedFragment* pLocFrag = pContig->pLocatedFragmentGet( nRead ); pReadsSortedByReadName_->insert( pLocFrag ); } } pReadsSortedByReadName_->resort(); bReadsSortedByReadNameNeedsFixing_ = false; }