/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file SmallAssemblerImpl.cpp ** ** @brief Implementation of the SmallAssembler. ** ** @author Tony Cox, Ole Schulz-Trieglaff **/ #include "assembly/SmallAssemblerImpl.hh" #include #include #include #include #include using namespace std; // stream used by DEBUG_ASBL: #ifdef DEBUG_ASBL ostream& dbg_os(cerr); #endif /** * Adds base @p base to the end (mode=0) or start (mode=1) of the contig. * * @return The extended contig. */ string SmallAssemblerImpl::addBase(const string& contig, const char base, const unsigned int mode) { switch (mode) { case 0: return contig + base; case 1: return base + contig; default: cerr << "ERROR: addBase() : " << contig << " " << base << " " << mode << endl; exit(EXIT_FAILURE); } return string(); } // addBase() /** * Returns a suffix (mode=0) or prefix (mode=1) of @p contig with length @p length. * * @return The suffix or prefix. */ string SmallAssemblerImpl::getEnd(const string& contig, const unsigned length, const unsigned mode) { const unsigned csize(contig.size()); assert(length <= csize); switch (mode) { case 0: return contig.substr((csize-length),length); case 1: return contig.substr(0,length); default: cerr << "ERROR:: getEnd() : " << contig << " " << length << " " << mode << endl; exit(EXIT_FAILURE); } return string(); } // getEnd() /** * Extends the seed contig (aka most frequent k-mer) * * @return The extended contig. */ void SmallAssemblerImpl::walk(const SmallAssemblerOptions& asmOptions, const string& seed, const unsigned wordLength, const str_uint_map_t& wordHash, unsigned& stepsBackward, string& contig) { // we start with the seed contig = seed; set seenBefore; // records k-mers already encountered during extension seenBefore.insert(contig); const str_uint_map_t::const_iterator whe(wordHash.end()); // 0 => walk to the right, 1 => walk to the left for(unsigned mode(0);mode<2;++mode){ while(true) { const string tmp(getEnd(contig,wordLength-1,mode)); #ifdef DEBUG_ASBL dbg_os << "# current contig : " << contig << " size : " << contig.size() << endl << " getEnd : " << tmp << endl; #endif if (seenBefore.find(tmp) != seenBefore.end()) { #ifdef DEBUG_ASBL dbg_os << "Seen word " << tmp << " before on this walk, terminating" << endl; #endif break; } seenBefore.insert(tmp); unsigned maxOcc(0); unsigned totalOcc(0); char maxBase('N'); static const unsigned N_BASES(4); static const char BASES[] = "ACGT"; for(unsigned i(0);isecond); totalOcc += val; if (val > maxOcc) { maxOcc = val; maxBase = b; } #ifdef DEBUG_ASBL dbg_os << b << " " << newKey << " " << val << " " << endl; #endif } #ifdef DEBUG_ASBL dbg_os << "Winner is : " << maxBase << " with " << maxOcc << " occurrences." << endl; #endif if ((maxOcc < asmOptions.minCoverage) or (maxOcc < (1.-asmOptions.maxError)* totalOcc)) { #ifdef DEBUG_ASBL dbg_os << "Coverage or error rate below threshold.\n" << "maxocc : " << maxOcc << " min coverage: " << asmOptions.minCoverage << "\n" << "maxocc : " << maxOcc << " error threshold: " << ((1.0-asmOptions.maxError)* totalOcc) << endl; #endif break; } #ifdef DEBUG_ASBL dbg_os << "Adding base " << contig << " " << maxBase << " " << mode << endl; #endif contig = addBase(contig,maxBase,mode); #ifdef DEBUG_ASBL dbg_os << "New contig: " << contig << endl; #endif if (mode == 1) { ++stepsBackward; } } // while(true) #ifdef DEBUG_ASBL dbg_os << "mode change. Current mode " << mode << endl; #endif } // for(mode) return; } // walk() void SmallAssemblerImpl::iterateWordLengths(const SmallAssemblerOptions& asmOptions, ShadowReadVec& shadows, Assembly& as) { unsigned unused_reads_now(0); for(ShadowReadVec::const_iterator ct = shadows.begin(); ct != shadows.end(); ++ct) { if (!ct->used) ++unused_reads_now; } //cout << "Unused reads " << unused_reads_now << endl; unsigned unused_reads_prev = unused_reads_now; while (unused_reads_now>0) { for(unsigned int wl=asmOptions.wordLength;wl<=asmOptions.maxWordLength;wl+=2) { bool ret = buildContigs(asmOptions,shadows,wl,as,unused_reads_now); if (ret) break; // stop iteration at first succesful assembly } //cout << "unused reads now " << unused_reads_now << " unused reads previous iteration " << unused_reads_prev << endl; if (unused_reads_now == unused_reads_prev) { // stop if no change in number of unused reads break; } unused_reads_prev=unused_reads_now; } } bool SmallAssemblerImpl::buildContigs(const SmallAssemblerOptions& asmOptions, ShadowReadVec& shadows, const unsigned wordLength, Assembly& as, unsigned& unused_reads) { //cout << "In SmallAssemblerImpl::buildContig. word length=" << wordLength << endl; const unsigned shadow_size(shadows.size()); #ifdef DEBUG_ASBL //dbg_os << "----------------------------------------" << endl; for(unsigned i(0);iread" << readNum << "_used" << used << endl; dbg_os << read << endl; } #endif // a set of read hashes; each read hash stores the starting positions of all kmers in the read vector > readHashes; // counts the number of occurences for each kmer in all shadow reads str_uint_map_t wordHash; // most frequent kmer and its number of occurences unsigned maxOcc = 0; string maxWord; for(unsigned i(0);i=wordLength); readHashes.resize(readHashes.size()+1); readHashes.back().first=readNum; str_uint_map_t& readHash = readHashes.back().second; for (unsigned j(0);j<=(read_size-wordLength);++j) { const string word(read.substr(j,wordLength)); if (readHash.find(word) != readHash.end()) { // try again with different k-mer size #ifdef DEBUG_ASBL dbg_os << "word " << word << " repeated in read " << read << "\n" << "... try again with word length " << wordLength+2 << " and " << shadow_size << " shadow reads." << endl; #endif return false; } // record (0-indexed) start point for word in read //cout << "Recording " << word << " at " << j << endl; readHash[word]=j; // count occurences ++wordHash[word]; if (wordHash[word]>maxOcc) { //cout << "Setting max word to " << maxWord << " " << maxOcc << endl; maxOcc = wordHash[word]; maxWord = word; } // if (maxOcc) } } if (maxOcc < asmOptions.minCoverage) { #ifdef DEBUG_ASBL dbg_os << "Coverage too low : " << maxOcc << " " << asmOptions.minCoverage << endl; #endif return false; } #ifdef DEBUG_ASBL dbg_os << "Seeding kmer : " << maxWord << endl; #endif // counts the number of steps taken backwards from the start kmer during the assembly. unsigned stepsBackward(0); // start initial assembly with most frequent kmer as seed AssembledContig ctg; walk(asmOptions,maxWord,wordLength,wordHash,stepsBackward,ctg.seq); // done with this now: wordHash.clear(); const unsigned rh_size(readHashes.size()); const unsigned contig_size(ctg.seq.size()); #ifdef DEBUG_ASBL dbg_os << "First pass assembly resulted in " << ctg.seq << "\n" << " with length " << contig_size << " assembled from " << rh_size << " reads. " << endl; #endif for (unsigned i(0);i(j)-static_cast(rhi->second)); //cout << "pos in read/contig : " << wordHash[word] << " " << j << endl; if (ctg.contigReads.find(readNum) == ctg.contigReads.end() ) { ctg.contigReads[readNum] = startPos; shadows[readNum].used = true; --unused_reads; } } } // for each kmer... } // don't need this anymore: readHashes.clear(); //cout << "final seeding reading count: " << ctg.seedReadCount << endl; as.push_back(ctg); return true; }