#include #include #include #include #include "CLI/Timer.hpp" #include "CanonicalKmerIterator.hpp" #include "PufferFS.hpp" #include "PufferfishIndex.hpp" #include "cereal/archives/binary.hpp" #include "cereal/archives/json.hpp" #include "jellyfish/mer_dna.hpp" PufferfishIndex::PufferfishIndex() { } PufferfishIndex::PufferfishIndex(const std::string& indexDir, pufferfish::util::IndexLoadingOpts opts) { if (!puffer::fs::DirExists(indexDir.c_str())) { std::cerr << "The index directory " << indexDir << " does not exist!\n"; std::exit(1); } { std::ifstream infoStream(indexDir + "/info.json"); cereal::JSONInputArchive infoArchive(infoStream); infoArchive(cereal::make_nvp("k", k_)); infoArchive(cereal::make_nvp("num_kmers", numKmers_)); infoArchive(cereal::make_nvp("have_edge_vec", haveEdges_)); infoArchive(cereal::make_nvp("have_ref_seq", haveRefSeq_)); infoArchive(cereal::make_nvp("num_decoys", numDecoys_)); infoArchive(cereal::make_nvp("first_decoy_index", firstDecoyIndex_)); infoStream.close(); twok_ = 2 * k_; } haveEdges_ = opts.try_loading_edges and haveEdges_; haveRefSeq_ = opts.try_loading_ref_seqs and haveRefSeq_; haveEqClasses_ = opts.try_loading_eqclasses and haveEqClasses_; { CLI::AutoTimer timer{"Loading contig table", CLI::Timer::Big}; std::ifstream contigTableStream(indexDir + "/" + pufferfish::util::CTABLE); cereal::BinaryInputArchive contigTableArchive(contigTableStream); contigTableArchive(refNames_); contigTableArchive(refExt_); contigTableArchive(contigTable_); contigTableStream.close(); } { CLI::AutoTimer timer{"Loading contig offsets", CLI::Timer::Big}; std::string pfile = indexDir + "/" + pufferfish::util::CONTIG_OFFSETS; contigOffsets_.deserialize(pfile, false); } numContigs_ = contigOffsets_.size()-1; { std::string rlPath = indexDir + "/" + pufferfish::util::REFLENGTH; std::ifstream refLengthStream(rlPath); if(refLengthStream.good()) { CLI::AutoTimer timer{"Loading reference lengths", CLI::Timer::Big}; cereal::BinaryInputArchive refLengthArchive(refLengthStream); refLengthArchive(refLengths_); } else { refLengths_ = std::vector(refNames_.size(), 1000); } } { std::string rlPath = indexDir + "/" + pufferfish::util::COMPLETEREFLENGTH; std::ifstream completeRefLengthStream(rlPath); if(completeRefLengthStream.good()) { cereal::BinaryInputArchive completeRefLengthArchive(completeRefLengthStream); completeRefLengthArchive(completeRefLengths_); } else { throw std::runtime_error("could not load complete reference lengths!"); } } if (haveEqClasses_) { CLI::AutoTimer timer{"Loading eq table", CLI::Timer::Big}; std::ifstream eqTableStream(indexDir + "/" + pufferfish::util::EQTABLE); cereal::BinaryInputArchive eqTableArchive(eqTableStream); eqTableArchive(eqClassIDs_); eqTableArchive(eqLabels_); eqTableStream.close(); } { CLI::AutoTimer timer{"Loading mphf table", CLI::Timer::Big}; std::string hfile = indexDir + "/" + pufferfish::util::MPH; std::ifstream hstream(hfile); hash_.reset(new boophf_t); hash_->load(hstream); hstream.close(); hash_raw_ = hash_.get(); } { CLI::AutoTimer timer{"Loading contig boundaries", CLI::Timer::Big}; std::string bfile = indexDir + "/" + pufferfish::util::RANK; contigBoundary_.deserialize(bfile, false); rankSelDict = rank9sel(&contigBoundary_, (uint64_t)contigBoundary_.size()); } /* selectPrecomp_.reserve(numContigs_+1); selectPrecomp_.push_back(0); for (size_t i = 1; i < numContigs_; ++i) { selectPrecomp_.push_back(contigSelect_(i)); } selectPrecomp_.push_back(contigSelect_(numContigs_)); */ { CLI::AutoTimer timer{"Loading sequence", CLI::Timer::Big}; std::string sfile = indexDir + "/" + pufferfish::util::SEQ; seq_.deserialize(sfile, false); lastSeqPos_ = seq_.size() - k_; } { CLI::AutoTimer timer{"Loading positions", CLI::Timer::Big}; std::string pfile = indexDir + "/" + pufferfish::util::POS; pos_.deserialize(pfile, false); //auto f = std::async(std::launch::async, &pos_vector_t::touch_all_pages, &pos_, bits_per_element); } if (haveRefSeq_) { CLI::AutoTimer timer{"Loading reference sequence", CLI::Timer::Big}; std::string pfile = indexDir + "/" + pufferfish::util::REFSEQ; refseq_.deserialize(pfile, false); } { std::string rlPath = indexDir + "/" + pufferfish::util::REFACCUMLENGTH; std::ifstream refLengthStream(rlPath); if(refLengthStream.good()) { CLI::AutoTimer timer{"Loading reference accumulative lengths", CLI::Timer::Big}; cereal::BinaryInputArchive refLengthArchive(refLengthStream); refLengthArchive(refAccumLengths_); } else { refAccumLengths_ = std::vector(refNames_.size(), 1000); } } if (haveEdges_) { CLI::AutoTimer timer{"Loading edges", CLI::Timer::Big}; std::string pfile = indexDir + "/" + pufferfish::util::EDGE; edge_.deserialize(pfile, false); } firstDecoyEncodedIndex_ = (numDecoys_ > 0) ? getRefId(firstDecoyIndex_) : std::numeric_limits::max(); } /** * Returns a ProjectedHits object containing all of the reference loci matching this * provided Canonical kmer (including the oritentation of the match). The provided * QueryCache argument will be used to avoid redundant rank / select operations if feasible. */ auto PufferfishIndex::getRefPos(CanonicalKmer& mer, pufferfish::util::QueryCache& qc) -> pufferfish::util::ProjectedHits { using IterT = std::vector::iterator; auto km = mer.getCanonicalWord(); size_t res = hash_raw_->lookup(km); if (res < numKmers_) { uint64_t pos = const_cast(pos_)[res]; // if using quasi-dictionary idea (https://arxiv.org/pdf/1703.00667.pdf) /* uint64_t hashbits = pos & 0xF; pos = pos >> 4; if ((km & 0xF) != hashbits) { return {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), true, 0, k_, core::range{}}; } */ uint64_t fk = seq_.get_int(2*pos, 2*k_); // say how the kmer fk matches mer; either // identity, twin (i.e. rev-comp), or no match auto keq = mer.isEquivalent(fk); if (keq != KmerMatchType::NO_MATCH) { uint64_t rank = 0; uint64_t sp = 0; uint64_t contigEnd = 0; // check if the current position is on the same contig // as the cached contig or not. const bool same_contig = ((qc.contigStart <= pos) and (pos <= qc.contigEnd)); if (same_contig) { rank = qc.prevRank; sp = qc.contigStart; contigEnd = qc.contigEnd; } else { // the index of this contig rank = rankSelDict.rank(pos); sp = (rank == 0) ? 0 : static_cast(rankSelDict.select(rank - 1)) + 1; contigEnd = rankSelDict.select(rank); qc.prevRank = rank; qc.contigStart = sp; qc.contigEnd = contigEnd; } // the reference information in the contig table auto contigIterRange = contigRange(rank); // relative offset of this k-mer in the contig uint32_t relPos = static_cast(pos - sp); // start position of the next contig - start position of this one auto clen = static_cast(contigEnd + 1 - sp); // auto clen = // cPosInfo_[rank].length();//static_cast(contigSelect_(rank + // 1) + 1 - sp); // how the k-mer hits the contig (true if k-mer in fwd orientation, false // otherwise) bool hitFW = (keq == KmerMatchType::IDENTITY_MATCH); return {static_cast(rank), pos, relPos, hitFW, static_cast(clen), k_, contigIterRange}; //core::range{pvec.begin(), pvec.end()}}; } else { return {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), true, 0, k_, core::range{}}; } } return {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), true, 0, k_, core::range{}}; } auto PufferfishIndex::getRefPos(CanonicalKmer& mer) -> pufferfish::util::ProjectedHits { using IterT = std::vector::iterator; auto km = mer.getCanonicalWord(); size_t res = hash_raw_->lookup(km); if (res < numKmers_) { uint64_t pos = const_cast(pos_)[res]; uint64_t fk = seq_.get_int(2*pos, 2*k_); // say how the kmer fk matches mer; either // identity, twin (i.e. rev-comp), or no match auto keq = mer.isEquivalent(fk); if (keq != KmerMatchType::NO_MATCH) { // the index of this contig auto rank = rankSelDict.rank(pos);//contigRank_(pos); // the reference information in the contig table auto contigIterRange = contigRange(rank); // start position of this contig uint64_t sp = (rank == 0) ? 0 : static_cast(rankSelDict.select(rank - 1)) + 1; uint64_t contigEnd = rankSelDict.select(rank); // relative offset of this k-mer in the contig uint32_t relPos = static_cast(pos - sp); // start position of the next contig - start position of this one auto clen = static_cast(contigEnd + 1 - sp); // auto clen = // cPosInfo_[rank].length();//static_cast(contigSelect_(rank + // 1) + 1 - sp); // how the k-mer hits the contig (true if k-mer in fwd orientation, false // otherwise) bool hitFW = (keq == KmerMatchType::IDENTITY_MATCH); return {static_cast(rank), pos, relPos, hitFW, static_cast(clen), k_, contigIterRange}; } else { return {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), true, 0, k_, core::range{}}; } } return {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max(), true, 0, k_, core::range{}}; } PufferfishIndex::~PufferfishIndex() { }