/** >HEADER Copyright (c) 2013-2019 Rob Patro rob@cs.umd.edu This file is part of Salmon. Salmon is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Salmon is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Salmon. If not, see .
#include #include #include #include #include #include #include // C++ string formatting library #include "spdlog/fmt/fmt.h" #include "spdlog/spdlog.h" // Future C++ convenience classes #include "core/range.hpp" #include "AlevinOpts.hpp" #include "AlignmentGroup.hpp" #include "ProgramOptionsGenerator.hpp" #include "ReadExperiment.hpp" #include "SalmonDefaults.hpp" #include "SalmonMath.hpp" #include "SalmonOpts.hpp" #include "SalmonUtils.hpp" #include "Transcript.hpp" #include "parallel_hashmap/phmap.h" #include "pufferfish/MemChainer.hpp" #include "pufferfish/MemCollector.hpp" #include "pufferfish/PuffAligner.hpp" #include "pufferfish/SAMWriter.hpp" #include "pufferfish/SelectiveAlignmentUtils.hpp" #include "pufferfish/Util.hpp" #include "pufferfish/itlib/small_vector.hpp" #include "pufferfish/ksw2pp/KSW2Aligner.hpp" #include "pufferfish/metro/metrohash64.h" namespace salmon { namespace mapping_utils { using MateStatus = pufferfish::util::MateStatus; constexpr const int32_t invalid_score_ = std::numeric_limits::min(); constexpr const int32_t invalid_index_ = std::numeric_limits::min(); constexpr const size_t static_vec_size = 32; class MappingScoreInfo { public: MappingScoreInfo() : bestScore(invalid_score_), secondBestScore(invalid_score_), bestDecoyScore(invalid_score_), decoyThresh(1.0), collect_decoy_info_(false) {} MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() { decoyThresh = decoyThreshIn; } void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; } bool collect_decoys() const { return collect_decoy_info_; } // clear everything but the decoy threshold void clear(size_t num_hits) { bestScore = invalid_score_; secondBestScore = invalid_score_; bestDecoyScore = invalid_score_; scores_.clear(); scores_.resize(num_hits, invalid_score_); bestScorePerTranscript_.clear(); perm_.clear(); // NOTE: we do _not_ reset decoyThresh here if (collect_decoy_info_) { bool revert_to_static = best_decoy_hits.size() > static_vec_size; best_decoy_hits.clear(); if (revert_to_static) { best_decoy_hits.revert_to_static(); } } } bool haveOnlyDecoyMappings() const { // if the best non-decoy mapping has score less than decoyThresh * // bestDecoyScore and if the bestDecoyScore is a valid value, then we // have no valid non-decoy mappings. return (bestScore < static_cast(decoyThresh * bestDecoyScore)) and (bestDecoyScore > std::numeric_limits::min()); } inline bool update_decoy_mappings(int32_t hitScore, size_t idx, uint32_t tid) { const bool better_score = hitScore > bestDecoyScore; if (hitScore > bestDecoyScore) { bestDecoyScore = hitScore; if (collect_decoy_info_) { best_decoy_hits.clear(); best_decoy_hits.push_back(std::make_pair(static_cast(idx), static_cast(tid))); } } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)) { best_decoy_hits.push_back( std::make_pair(static_cast(idx), static_cast(tid))); } return better_score; } int32_t bestScore; int32_t secondBestScore; int32_t bestDecoyScore; double decoyThresh; itlib::small_vector> best_decoy_hits; bool collect_decoy_info_; std::vector scores_; phmap::flat_hash_map> bestScorePerTranscript_; std::vector> perm_; }; template inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& memCollector, ksw2pp::KSW2Aligner& aligner, pufferfish::util::AlignmentConfig& aconf, pufferfish::util::MappingConstraintPolicy& mpol) { memCollector.configureMemClusterer(salmonOpts.maxOccsPerHit); double consensusFraction = (salmonOpts.consensusSlack == 0.0) ? 1.0 : (1.0 - salmonOpts.consensusSlack); memCollector.setConsensusFraction(consensusFraction); memCollector.setHitFilterPolicy(salmonOpts.hitFilterPolicy); memCollector.setAltSkip(salmonOpts.mismatchSeedSkip); memCollector.setChainSubOptThresh(salmonOpts.pre_merge_chain_sub_thresh); // Initialize ksw aligner ksw2pp::KSW2Config config; config.dropoff = -1; config.gapo = salmonOpts.gapOpenPenalty; config.gape = salmonOpts.gapExtendPenalty; config.bandwidth = salmonOpts.dpBandwidth; config.flag = 0; config.flag |= KSW_EZ_RIGHT; config.flag |= KSW_EZ_SCORE_ONLY; int8_t a = static_cast(salmonOpts.matchScore); int8_t b = static_cast(salmonOpts.mismatchPenalty); ksw2pp::KSW2Aligner aligner2(static_cast(a), static_cast(b)); aligner2.config() = config; std::swap(aligner, aligner2); aconf.refExtendLength = 20; aconf.fullAlignment = salmonOpts.fullLengthAlignment; aconf.mismatchPenalty = salmonOpts.mismatchPenalty; aconf.bestStrata = false; aconf.decoyPresent = false; aconf.matchScore = salmonOpts.matchScore; aconf.gapExtendPenalty = salmonOpts.gapExtendPenalty; aconf.gapOpenPenalty = salmonOpts.gapOpenPenalty; aconf.minScoreFraction = salmonOpts.minScoreFraction; aconf.mimicBT2 = salmonOpts.mimicBT2; aconf.mimicBT2Strict = salmonOpts.mimicStrictBT2; aconf.allowOverhangSoftclip = salmonOpts.softclipOverhangs; aconf.allowSoftclip = salmonOpts.softclip; aconf.useAlignmentCache = !salmonOpts.disableAlignmentCache; aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::SCORE_ONLY; // we actually care about the softclips in the cigar string // if we are writing output and softclipping (or softclipping of overhangs) is // enabled if ((!salmonOpts.qmFileName.empty()) and (salmonOpts.softclip or salmonOpts.softclipOverhangs)) { aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; } mpol.noOrphans = !salmonOpts.allowOrphans; // TODO : PF_INTEGRATION // decide how we want to set this // I think we don't want to allow general discordant reads // e.g. both map to same strand or map too far away, but // we want the "allowDovetail" option to determine if // a dovetail read is considered concordant or discordant mpol.noDiscordant = true; mpol.noDovetail = !salmonOpts.allowDovetail; aconf.noDovetail = mpol.noDovetail; mpol.setPostMergeChainSubThresh(salmonOpts.post_merge_chain_sub_thresh); mpol.setOrphanChainSubThresh(salmonOpts.orphan_chain_sub_thresh); return true; } inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, size_t idx, const std::vector& transcripts, int32_t invalidScore, salmon::mapping_utils::MappingScoreInfo& msi) { auto& scores = msi.scores_; scores[idx] = hitScore; auto& t = transcripts[tid]; bool isDecoy = t.isDecoy(); double decoyCutoff = static_cast(msi.decoyThresh * msi.bestDecoyScore); // if (hitScore < decoyCutoff or (hitScore == invalidScore)) { } if (isDecoy) { // NOTE: decide here if we need to process any of this if the // current score is < the best (non-decoy) score. I think not. // if this is a decoy and its score is better than the best decoy score bool did_update = msi.update_decoy_mappings(hitScore, idx, tid); (void)did_update; return; } else if (hitScore < decoyCutoff or (hitScore == invalidScore)) { // if the current score is to a valid target but doesn't // exceed the necessary decoy threshold, then skip it. return; } // otherwise, we have a "high-scoring" hit to a non-decoy auto& perm = msi.perm_; auto& bestScorePerTranscript = msi.bestScorePerTranscript_; // removing duplicate hits from a read to the same transcript auto it = bestScorePerTranscript.find(tid); if (it == bestScorePerTranscript.end()) { // if we didn't have any alignment for this transcript yet, then // this is the current best bestScorePerTranscript[tid].first = hitScore; bestScorePerTranscript[tid].second = idx; } else if ((hitScore > it->second.first) or (hitScore == it->second.first and isCompat)) { // otherwise, if we had an alignment for this transcript and it's // better than the current best, then set the best score to this // alignment's score, and invalidate the previous alignment it->second.first = hitScore; scores[it->second.second] = invalidScore; it->second.second = idx; } else { // otherwise, there is already a better mapping for this transcript. scores[idx] = invalidScore; } if (hitScore > msi.bestScore) { msi.secondBestScore = msi.bestScore; msi.bestScore = hitScore; } perm.push_back(std::make_pair(idx, tid)); } inline void filterAndCollectAlignments( std::vector& jointHits, uint32_t readLen, uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter, double scoreExp, double minAlnProb, salmon::mapping_utils::MappingScoreInfo& msi, std::vector& jointAlignments) { auto invalidScore = std::numeric_limits::min(); if (msi.bestDecoyScore == invalidScore) { msi.bestDecoyScore = invalidScore + 1; } int32_t decoyThreshold = static_cast( msi.decoyThresh * msi.bestDecoyScore); // auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore : // bestDecoyScore; auto& scores = msi.scores_; auto& perm = msi.perm_; // throw away any pairs for which we should not produce valid alignments : // ====== // If we are doing soft-filtering (default), we remove those not exceeding the // bestDecoyScore If we are doing hard-filtering, we remove those less than // the bestScore perm.erase(std::remove_if( perm.begin(), perm.end(), [&scores, hardFilter, &msi, decoyThreshold, invalidScore]( const std::pair& idxtid) -> bool { return !hardFilter ? scores[idxtid.first] < decoyThreshold : scores[idxtid.first] < msi.bestScore; // return !hardFilter ? scores[idxtid.first] < filterScore : // scores[idxtid.first] < bestScore; }), perm.end()); // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in // order by transcript, so we find the permutation of indices that puts things // in transcript order. std::sort(perm.begin(), perm.end(), [](const std::pair& p1, const std::pair& p2) { return p1.second < p2.second; }); // moving our alinged / score jointMEMs over to QuasiAlignment objects double bestScoreD = static_cast(msi.bestScore); for (auto& idxTxp : perm) { int32_t ctr = idxTxp.first; int32_t tid = idxTxp.second; auto& jointHit = jointHits[ctr]; double currScore = scores[ctr]; double v = bestScoreD - currScore; // why -1? double estAlnProb = hardFilter ? -1.0 : std::exp(-scoreExp * v); // skip any alignment with aln prob < minAlnProb if (!hardFilter and (estAlnProb < minAlnProb)) { continue; } if (singleEnd or jointHit.isOrphan()) { readLen = jointHit.isLeftAvailable() ? readLen : mateLen; jointAlignments.emplace_back( tid, // reference id jointHit.orphanClust()->getTrFirstHitPos(), // reference pos jointHit.orphanClust()->isFw, // fwd direction readLen, // read length jointHit.orphanClust()->cigar, // cigar string jointHit.fragmentLen, // fragment length false); auto& qaln = jointAlignments.back(); // NOTE : score should not be filled in from a double qaln.score = !tryAlign ? static_cast(jointHit.orphanClust()->coverage) : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); // NOTE : wth is numHits? qaln.numHits = static_cast(jointHits.size()); // orphanClust()->coverage; qaln.mateStatus = jointHit.mateStatus; if (singleEnd) { qaln.mateLen = readLen; qaln.mateCigar.clear(); qaln.matePos = 0; qaln.mateIsFwd = true; qaln.mateScore = 0; qaln.mateStatus = MateStatus::SINGLE_END; } } else { jointAlignments.emplace_back( tid, // reference id jointHit.leftClust->getTrFirstHitPos(), // reference pos jointHit.leftClust->isFw, // fwd direction readLen, // read length jointHit.leftClust->cigar, // cigar string jointHit.fragmentLen, // fragment length true); // properly paired // Fill in the mate info auto& qaln = jointAlignments.back(); qaln.mateLen = mateLen; qaln.mateCigar = jointHit.rightClust->cigar; qaln.matePos = static_cast(jointHit.rightClust->getTrFirstHitPos()); qaln.mateIsFwd = jointHit.rightClust->isFw; qaln.mateStatus = MateStatus::PAIRED_END_PAIRED; // NOTE : wth is numHits? qaln.numHits = static_cast(jointHits.size()); // NOTE : score should not be filled in from a double qaln.score = !tryAlign ? static_cast(jointHit.leftClust->coverage) : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); qaln.mateScore = !tryAlign ? static_cast(jointHit.rightClust->coverage) : jointHit.mateAlignmentScore; } } // done moving our alinged / score jointMEMs over to QuasiAlignment objects } inline void filterAndCollectAlignmentsDecoy( std::vector& jointHits, uint32_t readLen, uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter, double scoreExp, double minAlnProb, salmon::mapping_utils::MappingScoreInfo& msi, std::vector& jointAlignments) { // NOTE: this function should only be called in the case that we have valid // decoy mappings to report. Currently, this happens only when there are *no // valid non-decoy* mappings. Further, this function will only add equally // *best* decoy mappings to the output jointAlignments object regardless of // the the status of hardFilter (i.e. no sub-optimal decoy mappings will be // reported). (void)hardFilter; (void)minAlnProb; (void)scoreExp; double estAlnProb = 1.0; // std::exp(-scoreExp * 0.0); for (auto& idxTxp : msi.best_decoy_hits) { int32_t ctr = idxTxp.first; int32_t tid = idxTxp.second; auto& jointHit = jointHits[ctr]; if (singleEnd or jointHit.isOrphan()) { readLen = jointHit.isLeftAvailable() ? readLen : mateLen; jointAlignments.emplace_back( tid, // reference id jointHit.orphanClust()->getTrFirstHitPos(), // reference pos jointHit.orphanClust()->isFw, // fwd direction readLen, // read length jointHit.orphanClust()->cigar, // cigar string jointHit.fragmentLen, // fragment length false); auto& qaln = jointAlignments.back(); // NOTE : score should not be filled in from a double qaln.score = !tryAlign ? static_cast(jointHit.orphanClust()->coverage) : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); // NOTE : wth is numHits? qaln.numHits = static_cast(jointHits.size()); // orphanClust()->coverage; qaln.mateStatus = jointHit.mateStatus; if (singleEnd) { qaln.mateLen = readLen; qaln.mateCigar.clear(); qaln.matePos = 0; qaln.mateIsFwd = true; qaln.mateScore = 0; qaln.mateStatus = MateStatus::SINGLE_END; } } else { jointAlignments.emplace_back( tid, // reference id jointHit.leftClust->getTrFirstHitPos(), // reference pos jointHit.leftClust->isFw, // fwd direction readLen, // read length jointHit.leftClust->cigar, // cigar string jointHit.fragmentLen, // fragment length true); // properly paired // Fill in the mate info auto& qaln = jointAlignments.back(); qaln.mateLen = mateLen; qaln.mateCigar = jointHit.rightClust->cigar; qaln.matePos = static_cast(jointHit.rightClust->getTrFirstHitPos()); qaln.mateIsFwd = jointHit.rightClust->isFw; qaln.mateStatus = MateStatus::PAIRED_END_PAIRED; // NOTE : wth is numHits? qaln.numHits = static_cast(jointHits.size()); // NOTE : score should not be filled in from a double qaln.score = !tryAlign ? static_cast(jointHit.leftClust->coverage) : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); qaln.mateScore = !tryAlign ? static_cast(jointHit.rightClust->coverage) : jointHit.mateAlignmentScore; } } // end for over best decoy hits } namespace pasc { constexpr int32_t invalid_frag_len = std::numeric_limits::min(); constexpr int32_t invalid_mate_pos = std::numeric_limits::min(); /* from pesc struct simple_hit { bool is_fw{false}; bool mate_is_fw{false}; int32_t pos{-1}; float score{0.0}; uint32_t num_hits{0}; uint32_t tid{std::numeric_limits::max()}; int32_t mate_pos{std::numeric_limits::max()}; int32_t fragment_length{std::numeric_limits::max()}; inline bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) { int32_t signed_txp_len = static_cast(txp_len); return (pos > -max_over) and ((pos + read_len) < (signed_txp_len + max_over)); } inline bool has_mate() const { return mate_pos != invalid_mate_pos; } inline bool mate_is_mapped() const { return mate_pos != invalid_mate_pos; } inline int32_t frag_len() const { return (fragment_length != invalid_frag_len) ? fragment_length : 0; } }; */ struct simple_hit { bool is_fw{false}; int32_t pos{-1}; float score{0.0}; uint32_t num_hits{0}; uint32_t tid{std::numeric_limits::max()}; // int32_t mate_pos{std::numeric_limits::max()}; // int32_t fragment_length{std::numeric_limits::max()}; // UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF PairingStatus pairing_status{PairingStatus::UNPAIRED_RIGHT}; bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) { int32_t signed_txp_len = static_cast(txp_len); return (pos > -max_over) and ((pos + read_len) < (signed_txp_len + max_over)); } bool operator<(const simple_hit& hit2) { if (tid != hit2.tid) { // hit1 and hit2 are on different transcripts return tid < hit2.tid; } else { // hit1 and hit2 are on the same transcript // NOTE: Is this what we actually want? if (is_fw) { // fw < rc return true; } else { return false; } } } }; enum class HitDirection : uint8_t { FW, RC, BOTH }; /* struct sketch_hit_info { // add a hit to the current target that occurs in the forward // orientation with respect to the target. bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { (void)rl; (void)k; bool added{false}; // since hits are collected by moving _forward_ in the // read, if this is a fw hit, it should be moving // forward in the reference. Only add it if this is // the case. This ensure that we don't // double-count a k-mer that might occur twice on // this target. if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) { if (last_read_pos_fw == -1) { approx_pos_fw = ref_pos - read_pos; } else { if ((ref_pos - approx_pos_fw) > max_stretch) { return false; } } // if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return // false; } last_ref_pos_fw = ref_pos; last_read_pos_fw = read_pos; fw_score += score_inc; ++fw_hits; added = true; } return added; } // add a hit to the current target that occurs in the forward // orientation with respect to the target. bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { bool added{false}; // since hits are collected by moving _forward_ in the // read, if this is an rc hit, it should be moving // backwards in the reference. Only add it if this is // the case. // This ensures that we don't double-count a k-mer that // might occur twice on this target. if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { approx_pos_rc = (ref_pos - (rl - (read_pos + k))); if (last_read_pos_rc == -1) { approx_end_pos_rc = ref_pos + read_pos; } else { if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false; } } // if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return // false; } last_ref_pos_rc = ref_pos; last_read_pos_rc = read_pos; rc_score += score_inc; ++rc_hits; added = true; } return added; } inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); } // true if forward, false if rc // second element is score inline HitDirection best_hit_direction() { int32_t fw_minus_rc = static_cast(fw_hits) - static_cast(rc_hits); return (fw_minus_rc > 0) ? HitDirection::FW : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH); } inline simple_hit get_fw_hit(PairingStatus ps) { return simple_hit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits::max(), ps}; } inline simple_hit get_rc_hit(PairingStatus ps) { return simple_hit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits::max(), ps}; } inline std::string to_string() { std::stringstream ss; ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; return ss.str(); } int32_t last_read_pos_fw{-1}; int32_t last_read_pos_rc{-1}; int32_t last_ref_pos_fw{-1}; int32_t last_ref_pos_rc{std::numeric_limits::max()}; int32_t approx_pos_fw{-1}; int32_t approx_pos_rc{-1}; int32_t approx_end_pos_rc{-1}; uint32_t fw_hits{0}; uint32_t rc_hits{0}; float fw_score{0.0}; float rc_score{0.0}; };*/ struct sketch_hit_info { // add a hit to the current target that occurs in the forward // orientation with respect to the target. bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { (void)rl; (void)k; bool added{false}; // since hits are collected by moving _forward_ in the // read, if this is a fw hit, it should be moving // forward in the reference. Only add it if this is // the case. This ensure that we don't // double-count a k-mer that might occur twice on // this target. if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) { if (last_read_pos_fw == -1) { approx_pos_fw = ref_pos - read_pos; } else { if ((ref_pos - approx_pos_fw) > max_stretch) { return false; } } last_ref_pos_fw = ref_pos; last_read_pos_fw = read_pos; fw_score += score_inc; ++fw_hits; added = true; } return added; } // add a hit to the current target that occurs in the forward // orientation with respect to the target. bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { bool added{false}; // since hits are collected by moving _forward_ in the // read, if this is an rc hit, it should be moving // backwards in the reference. // In general, we only add the hit if this is the case. // This ensures that we don't double-count a k-mer that // might occur twice on this target. // we have a special case here; what if the same exact // k-mer (i.e. not just the same sequence but same position // on the query) occurs more than one time on this refernece? // // In that case, the GENERAL case code will already have // processed and seen a k-mer with the read position // equal to `read_pos`. In the case below, we see // a hit with the *same* read pos again (one or more times). // // Here, we swap out the previous hit having this read_pos // if the position of the current hit on the read is // the same and the position on the reference is greater // (this is a heuristic to help in the case of tandem repeats or // highly-repetitive subsequence). // NOTE: consider if a similar heuristic should be // adopted for the forward case. if ((read_pos == last_read_pos_rc) and (ref_pos > last_ref_pos_rc) and (ref_pos < rightmost_bound_rc)) { last_ref_pos_rc = ref_pos; // if the read_pos was the same as the first read pos // then also update the approx_end_pos_rc accordingly // NOTE: for the time being don't mess with this position // empirically this does better, but if we really want // to optimize this for accuracy we need a better general // heuristic. // if (read_pos == first_read_pos_rc) { // approx_end_pos_rc = ref_pos + read_pos; //} return added; } // GENERAL case if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { approx_pos_rc = (ref_pos - (rl - (read_pos + k))); if (last_read_pos_rc == -1) { approx_end_pos_rc = ref_pos + read_pos; first_read_pos_rc = read_pos; } else { if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false; } } rc_score += score_inc; ++rc_hits; // new rightmost_bound_rc = last_ref_pos_rc; last_ref_pos_rc = ref_pos; last_read_pos_rc = read_pos; added = true; } return added; } inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); } // true if forward, false if rc // second element is score inline HitDirection best_hit_direction() { int32_t fw_minus_rc = static_cast(fw_hits) - static_cast(rc_hits); return (fw_minus_rc > 0) ? HitDirection::FW : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH); } inline simple_hit get_fw_hit(PairingStatus ps) { return simple_hit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits::max(), ps}; } inline simple_hit get_rc_hit(PairingStatus ps) { return simple_hit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits::max(), ps}; } inline std::string to_string() { std::stringstream ss; ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; return ss.str(); } int32_t last_read_pos_fw{-1}; int32_t last_read_pos_rc{-1}; int32_t rightmost_bound_rc{std::numeric_limits::max()}; // marks the read position (key) of the // first hit we see in the rc direction int32_t first_read_pos_rc{-1}; int32_t last_ref_pos_fw{-1}; int32_t last_ref_pos_rc{std::numeric_limits::max()}; int32_t approx_pos_fw{-1}; int32_t approx_pos_rc{-1}; int32_t approx_end_pos_rc{-1}; uint32_t fw_hits{0}; uint32_t rc_hits{0}; float fw_score{0.0}; float rc_score{0.0}; }; /** * This contains the information necessary to collect hits and perform * mapping on a read using PASC. **/ template struct mapping_cache_info { public: mapping_cache_info(IndexT* midx, size_t max_occ_default_in, size_t max_occ_recover_in, size_t max_read_occ_in) : idx(midx), k(midx->k()), hit_searcher(midx), max_occ_default(max_occ_default_in), max_occ_recover(max_occ_recover_in), attempt_occ_recover(max_occ_recover > max_occ_default), max_read_occ(max_read_occ_in), alt_max_occ(max_read_occ_in) {} inline void clear() { map_type = salmon::utils::MappingType::UNMAPPED; hit_searcher.clear(); hit_map.clear(); accepted_hits.clear(); alt_max_occ = max_read_occ; has_matching_kmers = false; } // will store how the read mapped salmon::utils::MappingType map_type{salmon::utils::MappingType::UNMAPPED}; // map from reference id to hit info phmap::flat_hash_map hit_map; // where the mappings that pass will be stored std::vector accepted_hits; // map to recall the number of unmapped reads we see // for each barcode phmap::flat_hash_map unmapped_bc_map; // pointer to the underlying index IndexT* idx{nullptr}; // the k-mer size of our index size_t k{0}; // implements the PASC algorithm MemCollector hit_searcher; // the number of occurrences of a hit above which we // won't consider it size_t max_occ_default; // the number of occurences of a hit above which we // won't consider it, even in recovery mode size_t max_occ_recover; // will we attempt recovery if there are no hits with // frequency below max_occ_default? bool attempt_occ_recover; // maximum number of places a read can occur and still // be considered. size_t max_read_occ; size_t alt_max_occ; // to perform queries pufferfish::util::QueryCache qc; // regardless of having full mappings, did any k-mers match bool has_matching_kmers{false}; }; /** * This function will map the read given by `read_seq` * using PASC. The relevant parameters (e.g. maximum ambiguity * of seed and maximum number of allowable mappings) are stored * in the `map_cache` structure. The `PairingStatus` is passed in * by the caller and designates if the read being mapped is the * left end or the right end. **/ template inline bool map_read(std::string* read_seq, mapping_cache_info& map_cache, PairingStatus ps, bool verbose = false) { map_cache.clear(); // rebind map_cache variables to // local names IndexT* qidx = map_cache.idx; auto& qc = map_cache.qc; auto& hit_searcher = map_cache.hit_searcher; auto& hit_map = map_cache.hit_map; auto& accepted_hits = map_cache.accepted_hits; auto& map_type = map_cache.map_type; const bool attempt_occ_recover = map_cache.attempt_occ_recover; auto k = map_cache.k; int32_t signed_k = static_cast(k); // collect the set of matching seeds map_cache.has_matching_kmers = hit_searcher.get_raw_hits_sketch(*read_seq, qc, true, false); bool early_stop = false; // if there were hits if (map_cache.has_matching_kmers) { uint32_t num_valid_hits{0}; uint64_t total_occs{0}; uint64_t largest_occ{0}; auto& raw_hits = hit_searcher.get_left_hits(); // SANITY decltype(raw_hits[0].first) prev_read_pos = -1; // the maximum span the supporting k-mers of a // mapping position are allowed to have. // NOTE this is still > read_length b/c the stretch is measured wrt the // START of the terminal k-mer. int32_t max_stretch = static_cast(read_seq->length() * 1.0); // a raw hit is a pair of read_pos and a projected hit // the least frequent hit for this fragment. uint64_t min_occ = std::numeric_limits::max(); // this is false by default and will be set to true // if *every* collected hit for this fragment occurs // max_occ_default times or more. bool had_alt_max_occ = false; int32_t signed_rl = static_cast(read_seq->length()); auto collect_mappings_from_hits = [&max_stretch, &min_occ, &hit_map, &num_valid_hits, &total_occs, &largest_occ, &early_stop, signed_rl, k, signed_k, qidx, verbose](auto& raw_hits, auto& prev_read_pos, auto& max_allowed_occ, auto& had_alt_max_occ) -> bool { for (auto& raw_hit : raw_hits) { auto& read_pos = raw_hit.first; auto& proj_hits = raw_hit.second; auto& refs = proj_hits.refRange; // Keep track of the *least* ambiguous hit we see. // If all hits occur more than `max_allowed_occ` times, // but the least ambiguous hit (which occurs `min_occ` // times) has < `max_occ_recover` occurrences, then // we will use all hits occurring `min_occ` number // of times or less to try top recover the read's mappings. uint64_t num_occ = static_cast(refs.size()); min_occ = std::min(min_occ, num_occ); had_alt_max_occ = true; // we visit every hit that occurs the allowable number of time bool still_have_valid_target = false; prev_read_pos = read_pos; if (num_occ <= max_allowed_occ) { total_occs += num_occ; largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ; float score_inc = 1.0; // vist each mapped reference position (mrp) where this hit // occurs. for (auto& pos_it : refs) { const auto& ref_pos_ori = proj_hits.decodeHit(pos_it); uint32_t tid = static_cast(qidx->getRefId(pos_it.transcript_id())); int32_t pos = static_cast(ref_pos_ori.pos); bool ori = ref_pos_ori.isFW; auto& target = hit_map[tid]; // Why >= here instead of == ? // Because hits can happen on the same target in both the forward // and rc orientations, it is possible that we start the loop with // the target having num_valid_hits hits in a given orientation (o) // we see a new hit for this target in oriention o (now it has // num_valid_hits + 1) then we see a hit for this target in // orientation rc(o). We still want to add / consider this hit, but // max_hits_for_target() > num_valid_hits. So, we must allow for // that here. if (target.max_hits_for_target() >= num_valid_hits) { if (ori) { target.add_fw(pos, static_cast(read_pos), signed_rl, signed_k, max_stretch, score_inc); } else { target.add_rc(pos, static_cast(read_pos), signed_rl, signed_k, max_stretch, score_inc); } // if this target is still valid after evaluating this hit // then we will have at least one globally valid target. If // no targets are valid after evaluating all mrps for this // hit then we can exit the search early. still_have_valid_target |= (target.max_hits_for_target() >= num_valid_hits + 1); } } // DONE: for (auto &pos_it : refs) ++num_valid_hits; // if there are no targets reaching the valid hit threshold, then // break early if (!still_have_valid_target) { return true; } } // DONE : if (num_occ <= max_allowed_occ) } // DONE : for (auto& raw_hit : raw_hits) return false; }; bool _discard = false; auto mao_first_pass = map_cache.max_occ_default - 1; early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, mao_first_pass, _discard); // If our default threshold was too stringent, then fallback to a more // liberal threshold and look up the k-mers that occur the least frequently. // Specifically, if the min occuring hits have frequency < max_occ_recover // (2500 by default) times, then collect the min occuring hits to get the // mapping. if (attempt_occ_recover and (min_occ >= map_cache.max_occ_default) and (min_occ < map_cache.max_occ_recover)) { prev_read_pos = -1; uint64_t max_allowed_occ = min_occ; early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, max_allowed_occ, had_alt_max_occ); } uint32_t best_alt_hits = 0; for (auto& kv : hit_map) { auto best_hit_dir = kv.second.best_hit_direction(); // if the best direction is FW or BOTH, add the fw hit // otherwise add the RC. auto mapping = (best_hit_dir != HitDirection::RC) ? kv.second.get_fw_hit(ps) : kv.second.get_rc_hit(ps); if (mapping.num_hits >= num_valid_hits) { mapping.tid = kv.first; accepted_hits.emplace_back(mapping); // if we had equally good hits in both directions // add the rc hit here (since we added the fw) // above if the best hit was either FW or BOTH if (best_hit_dir == HitDirection::BOTH) { auto second_hit = kv.second.get_rc_hit(ps); second_hit.tid = kv.first; accepted_hits.emplace_back(second_hit); } } else { // best_alt_score = mapping.score > best_alt_score ? mapping.score : // best_alt_score; best_alt_hits = mapping.num_hits > best_alt_hits ? mapping.num_hits : best_alt_hits; } } map_cache.alt_max_occ = had_alt_max_occ ? accepted_hits.size() : map_cache.max_occ_default; /* * This rule; if enabled, allows through mappings missing a single hit, if there * was no mapping with all hits. NOTE: this won't work with the current early-exit * optimization however. if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) { for (auto& kv : hit_map) { auto mapping = kv.second.get_best_hit(); if (mapping.num_hits >= best_alt_hits) { //if (mapping.valid_pos(signed_read_len, transcripts[kv.first].RefLength, 10)) { mapping.tid = kv.first; accepted_hits.emplace_back(mapping); //} } } } */ } // DONE : if (rh) // If the read mapped to > maxReadOccs places, discard it if (accepted_hits.size() > map_cache.alt_max_occ) { accepted_hits.clear(); map_type = salmon::utils::MappingType::UNMAPPED; } else if (!accepted_hits.empty()) { map_type = salmon::utils::MappingType::SINGLE_MAPPED; } return early_stop; } } // namespace pasc } // namespace mapping_utils } // namespace salmon #endif //__SALMON_MAPPING_UTILS__