// -*- mode: c++; indent-tabs-mode: nil; -*- /** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file SmallAssemblerImpl.hh ** ** @brief Implementation of the SmallAssembler. ** ** @author Tony Cox, Ole Schulz-Trieglaff **/ #ifndef SMALLASSEMBLERIMPL_HH_ #define SMALLASSEMBLERIMPL_HH_ #include "assembly/AssembledContig.hh" #include "applications/SmallAssemblerOptions.hh" #include #include #include #include #include // compile with this macro to get verbose output: //#define DEBUG_ASBL #ifdef DEBUG_ASBL #include extern std::ostream& dbg_os; #endif class SmallAssemblerImpl { public: SmallAssemblerImpl() {}; // a shadow/anomalous/semi-aligning read which stores its position in the cluster and remember if // it has been used in an assembly. struct ShadowRead { ShadowRead() : seq("EMPTY"), pos(0), used(false) {} ShadowRead(std::string s, unsigned p, bool u) : seq(s), pos(p), used(u) {} std::string seq; // sequence unsigned pos; // position in cluster bool used; // used in assembly? }; // Vector of shadow reads with their index (i.e. position in the cluster) //typedef std::vector > ShadowReadVec; typedef std::vector< ShadowRead > ShadowReadVec; // maps kmers to positions in read typedef boost::unordered_map str_uint_map_t; // typedef std::vector Assembly; /** * @brief Performs a simple de-novo assembly using a group of reads * * Assembles a cluster of shadow reads. This function iterates over a range * of word lengths until the first succesful assembly. * * If unused reads remain, the assembly is re-started using this subset. */ void iterateWordLengths(const SmallAssemblerOptions& ,ShadowReadVec& ,Assembly& ); private: /** * @brief Performs a simple de-novo assembly using a group of reads * * We build a hash of the k-mers in the shadow reads. The most * frequent k-mer is used as seed and is then gradually extended. * */ bool buildContigs(const SmallAssemblerOptions& asmOptions, ShadowReadVec& shadows, const unsigned wordLength, std::vector& contigs, unsigned& unused_reads); /** * Adds base @p base to the end (mode=0) or start (mode=1) of the contig. * * @return The extended contig. */ std::string addBase(const std::string& contig, const char base, const unsigned int mode); /** * Returns suffix (mode=0) or prefix (mode=1) of @p contig with length @p length. * * @return The suffix or prefix. */ std::string getEnd(const std::string& contig, const unsigned length, const unsigned mode); /** * Extends the seed contig (aka most frequent k-mer) * * @return The extended contig. */ void walk(const SmallAssemblerOptions& asmOptions, const std::string& seed, const unsigned wordLength, const str_uint_map_t& wordHash, unsigned& stepsBackward, std::string& contig); }; #endif /* SMALLASSEMBLERIMPL_HH_ */