#include "nonstd/string_view.hpp" #include "PuffAligner.hpp" #include "Util.hpp" #include "libdivide/libdivide.h" std::string extractReadSeq(const std::string& readSeq, uint32_t rstart, uint32_t rend, bool isFw) { std::string subseq = readSeq.substr(rstart, rend - rstart); if (isFw) return subseq; return pufferfish::util::reverseComplement(subseq); //reverse-complement the substring } // decode CIGAR, just like : https://github.com/lh3/ksw2/blob/master/cli.c#L134 std::string cigar2str(const ksw_extz_t *ez) { std::string cigar; if (ez->n_cigar > 0) { cigar.resize(ez->n_cigar * 2); for (int i = 0; i < ez->n_cigar; ++i) { cigar += (std::to_string(ez->cigar[i] >> 4) + "MID"[ez->cigar[i] & 0xf]); } } return cigar; } int32_t addCigar(pufferfish::util::CIGARGenerator &cigarGen, ksw_extz_t ez, bool beginGap) { if (!beginGap) { int32_t insertionDeletion = 0; for (int i = 0; i < ez.n_cigar; ++i) { char cigar_type = 'N'; if ((ez.cigar[i] & 0xf) == 0) cigar_type = 'M'; if ((ez.cigar[i] & 0xf) == 1) { cigar_type = 'I'; insertionDeletion += ez.cigar[i] >> 4; } if ((ez.cigar[i] & 0xf) == 2) { cigar_type = 'D'; insertionDeletion -= ez.cigar[i] >> 4; } cigarGen.add_item(ez.cigar[i] >> 4, cigar_type); } return insertionDeletion; } else { uint32_t gapSize = 0; for (int i = ez.n_cigar - 1; i >= 0; --i) { char cigar_type = 'N'; if ((ez.cigar[i] & 0xf) == 0) { cigar_type = 'M'; gapSize += ez.cigar[i] >> 4; } if ((ez.cigar[i] & 0xf) == 1) { cigar_type = 'I'; } if ((ez.cigar[i] & 0xf) == 2) { cigar_type = 'D'; gapSize += ez.cigar[i] >> 4; } cigarGen.add_item(ez.cigar[i] >> 4, cigar_type); } return gapSize; } } std::string getRefSeq(compact::vector &refseq, uint64_t refAccPos, size_t tpos, uint32_t memlen) { if (memlen == 0) return ""; std::string tseq; tseq.reserve(memlen); uint64_t bucket_offset = (refAccPos + tpos) * 2; auto len_on_vector = memlen * 2; for (uint32_t w = 0; w <= len_on_vector / 64; w++) { uint32_t len = std::min((uint32_t) 64, len_on_vector - w * 64); if (len == 0) continue; uint64_t word = refseq.get_int(bucket_offset, len); for (uint32_t i = 0; i < len; i += 2) { uint8_t next_bits = ((word >> i) & 0x03); char next = 'A'; if (next_bits == 1) next = 'C'; else if (next_bits == 2) next = 'G'; else if (next_bits == 3) next = 'T'; tseq += next; } bucket_offset += len; } return tseq; } bool fillRefSeqBuffer(compact::vector &refseq, uint64_t refAccPos, size_t tpos, uint32_t memlen, std::string& refBuffer_) { refBuffer_.clear(); if (memlen == 0) return false; uint64_t bucket_offset = (refAccPos + tpos) * 2; auto len_on_vector = memlen * 2; int32_t toFetch = len_on_vector; while (toFetch > 0) { uint32_t len = (toFetch >= 64) ? 64 : toFetch; toFetch -= len; uint64_t word = refseq.get_int(bucket_offset, len); for (uint32_t i = 0; i < len; i += 2) { uint8_t next_bits = ((word >> i) & 0x03); refBuffer_ += "ACGT"[next_bits]; } bucket_offset += len; } return true; } // NOTE: This fills in refBuffer_ with the reference sequence from tpos to tpos+memlen in *reverse* order. // refBuffer will contain the reverse of the reference substring, *NOT* the reverse-complement. bool fillRefSeqBufferReverse(compact::vector &refseq, uint64_t refAccPos, size_t tpos, uint32_t memlen, std::string& refBuffer_) { refBuffer_.resize(memlen, 'N'); if (memlen == 0) return false; uint64_t bucket_offset = (refAccPos + tpos) * 2; auto len_on_vector = memlen * 2; int32_t toFetch = len_on_vector; int32_t offset = static_cast(memlen)-1; while (toFetch > 0) { uint32_t len = (toFetch >= 64) ? 64 : toFetch; toFetch -= len; uint64_t word = refseq.get_int(bucket_offset, len); for (uint32_t i = 0; i < len; i += 2) { uint8_t next_bits = ((word >> i) & 0x03); refBuffer_[offset] = "ACGT"[next_bits]; --offset; } bucket_offset += len; } return true; } // Ungapped Alignment of two sequences with the same length int32_t PuffAligner::align_ungapped(const char* const query, const char* const target, int32_t len) { int32_t score = 0; bool computeCIGAR = !(aligner.config().flag & KSW_EZ_SCORE_ONLY); auto& cigarGen = cigarGen_; for (int32_t i = 0; i < len; i++) { if (computeCIGAR) cigarGen.add_item(1, 'M'); if (query[i] != 'N' and target[i] != 'N') score += query[i] == target[i] ? mopts.matchScore : mopts.mismatchPenalty; } return score; } /** * Align the read `original_read`, whose mems consist of `mems` against the index and return the result * in `arOut`. How the alignment is computed (i.e. full vs between-mem and CIGAR vs. score only) depends * on the parameters of how this PuffAligner object was constructed. **/ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::vector &mems, uint64_t queryChainHash, bool perfectChain, bool isFw, size_t tid, AlnCacheMap &alnCache, HitCounters &hctr, AlignmentResult& arOut, bool /*verbose*/) { int32_t alignmentScore{std::numeric_limits::min()}; if (mems.empty()) { arOut.score = alignmentScore; return false; } int32_t refExtLength = static_cast(mopts.refExtendLength); // bool firstMem = true; // int32_t lastHitEnd_read = -1; int32_t currHitStart_read = 0; // int64_t lastHitEnd_ref = -1; int64_t currHitStart_ref = 0; // int32_t alignment{0}; uint32_t openGapLen{0}; // will we be computing CIGAR strings for this alignment bool computeCIGAR = !(aligner.config().flag & KSW_EZ_SCORE_ONLY); bool approximateCIGAR = mopts.alignmentMode == pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; // the cigar generator we will use auto& cigarGen = cigarGen_; cigarGen.clear(); std::string cigar = ""; ksw_reset_extz(&ez); // where this reference starts, and its length. int64_t refAccPos = tid > 0 ? refAccumLengths[tid - 1] : 0; int64_t refTotalLength = refAccumLengths[tid] - refAccPos; const auto& frontMem = mems.front(); auto rpos = frontMem.rpos; auto memlen = frontMem.extendedlen; auto readLen = read.length(); auto tpos = frontMem.tpos; if (perfectChain) { arOut.score = alignmentScore = readLen * mopts.matchScore; if (computeCIGAR) { cigarGen.add_item(readLen, 'M'); } hctr.skippedAlignments_byCov += 1; SPDLOG_DEBUG(logger_,"[["); SPDLOG_DEBUG(logger_,"read sequence ({}) : {}", (isFw ? "FW" : "RC"), readView); SPDLOG_DEBUG(logger_,"ref sequence : {}", (doFullAlignment ? tseq : refSeqBuffer_)); SPDLOG_DEBUG(logger_,"perfect chain!\n]]\n"); arOut.cigar = cigar; arOut.openGapLen = openGapLen; return true; } // do full alignment if we are in that mode, or if the // current read was recovered via orphan recovery. // @mohsen & @fataltes : we need a better signal than memlen == 1 // to designate this was a recovered orphan. bool recoveredOrphan = memlen == 1; bool doFullAlignment = mopts.fullAlignment or recoveredOrphan; currHitStart_read = isFw ? rpos : readLen - (rpos + memlen); if (currHitStart_read < 0 or currHitStart_read >= (int32_t) readLen) { std::cerr << "[ERROR in PuffAligner::alignRead :] currHitStart_read is invalid; this hould not happen!\n"; return false; } currHitStart_ref = tpos; // the length of the reference string we will use to check the // alignment cache (or to compute a full alignment). int32_t keyLen = 0; std::string tseq; uint64_t hashKey{0}; bool didHash{false}; uint32_t refStart, readStart{0}; int32_t signedRefStartPos = currHitStart_ref - currHitStart_read; int32_t signedRefEndPos = signedRefStartPos + read.length(); bool invalidStart = (signedRefStartPos < 0); bool invalidEnd = (signedRefEndPos > refTotalLength); bool allowSoftclip = mopts.allowSoftclip; // allowOverhangSoftclip is a special case of allowing soft-clipping // if we allow softclipping generally, then this option is redundant. bool allowOverhangSoftclip = (mopts.allowOverhangSoftclip or allowSoftclip); if (mopts.mimicBT2Strict and (invalidStart or invalidEnd)) { arOut.score = std::numeric_limits::min(); arOut.cigar = ""; arOut.openGapLen = 0; return false; } auto& bandwidth = aligner.config().bandwidth; libdivide::divider gapExtDivisor(static_cast(mopts.gapExtendPenalty)); const int32_t minAcceptedScore = scoreStatus_.getCutoff(read.length()); //mopts.minScoreFraction * mopts.matchScore * readLen; // compute the maximum gap length that would be allowed given the length of read aligned so far and the current // alignment score. auto maxAllowedGaps = [&minAcceptedScore, &readLen, &gapExtDivisor, this] (uint32_t alignedLen, int32_t alignmentScore) -> int { int maxAllowedGaps = (alignmentScore + static_cast(mopts.matchScore) * static_cast(readLen - alignedLen) - minAcceptedScore - static_cast(mopts.gapOpenPenalty)) / gapExtDivisor; return std::max(maxAllowedGaps + 1, 1); }; auto alignable = [&minAcceptedScore, &readLen] (uint32_t alignedLen, uint32_t matchScore, int32_t alignmentScore) -> bool { return minAcceptedScore <= alignmentScore + static_cast(matchScore*(readLen-alignedLen)); }; const auto& lastMem = mems.back(); int32_t last_rpos = isFw ? lastMem.rpos : readLen - (lastMem.rpos + lastMem.extendedlen); int32_t leftEndRead = readLen - (last_rpos + lastMem.extendedlen); int32_t dataDepBuff = maxAllowedGaps(0, 0) + 1; // std::min(refExtLength, 5 * leftEndRead); bool overhangingEnd = static_cast(lastMem.tpos + lastMem.extendedlen + leftEndRead + dataDepBuff) > refTotalLength; // bool overhangingStart = currHitStart_ref < currHitStart_read; uint32_t remLen = 0; // If we are only aligning between MEMs if (!doFullAlignment) { refStart = (currHitStart_ref >= currHitStart_read) ? currHitStart_ref - currHitStart_read : 0; // overhangingStart = static_cast(refStart) < refExtLength; int32_t refStartExtLength = (refExtLength < static_cast(refStart)) ? refExtLength : refStart; refStart = static_cast(refStart) > refExtLength ? static_cast(refStart) - refExtLength : 0; remLen = (currHitStart_ref >= currHitStart_read) ? readLen : readLen - (currHitStart_read - currHitStart_ref); keyLen = (static_cast(refStart + refStartExtLength + remLen + refExtLength) < refTotalLength) ? refStartExtLength + remLen + refExtLength : refTotalLength - refStart; } else { // we are aligning from the start of the read // If the first hit starts further in the reference than in the // read, then we align from the beginning of the read and (ref_start - read_start) on // the reference. if (currHitStart_ref > currHitStart_read) { refStart = currHitStart_ref - currHitStart_read; readStart = 0; remLen = readLen; } else if (currHitStart_ref < currHitStart_read) { // If the first his starts further in the read than in the reference, than // the read overhangs the beginning of the reference and we start aligning // from the beginning of the reference and from position (read_start - ref_start) // in the read. readStart = currHitStart_read - currHitStart_ref; refStart = 0; remLen = readLen - readStart; } else { // If the first hit starts at the same position in the reference and the read // ... what is this case? readStart = 0; refStart = currHitStart_ref - currHitStart_read; remLen = readLen; } keyLen = (static_cast(refStart + remLen + refExtLength) < refTotalLength) ? remLen + refExtLength : refTotalLength - refStart; } fillRefSeqBuffer(allRefSeq, refAccPos, refStart, keyLen, refSeqBuffer_); // int32_t originalRefSeqLen = static_cast(refSeqBuffer_.length()); // If we're not using fullAlignment, we'll need the full reference sequence later // so copy it into tseq. if (!doFullAlignment) { tseq = refSeqBuffer_; } bool useAlnCache = mopts.useAlignmentCache and isMultimapping_ and !perfectChain and !overhangingEnd; // first, check if we can skip this by perfect chaining // if not, check if we can skip it via the alignment cache if (perfectChain) { arOut.score = alignmentScore = readLen * mopts.matchScore; if (computeCIGAR or approximateCIGAR) { cigarGen.add_item(readLen, 'M'); } hctr.skippedAlignments_byCov += 1; /*SPDLOG_DEBUG(logger_,"[["); SPDLOG_DEBUG(logger_,"read sequence ({}) : {}", (isFw ? "FW" : "RC"), readView); SPDLOG_DEBUG(logger_,"ref sequence : {}", (doFullAlignment ? tseq : refSeqBuffer_)); SPDLOG_DEBUG(logger_,"perfect chain!\n]]\n");*/ } else if (useAlnCache and !alnCache.empty()) { // and !overhangingStart) { // mopts.useAlignmentCache and !alnCache.empty() and isMultimapping_ and !overhangingEnd) { // and !overhangingStart) { // hash the reference string MetroHash64::Hash(reinterpret_cast(const_cast(refSeqBuffer_.data())), keyLen, reinterpret_cast(&hashKey), 0); hashKey ^= queryChainHash; didHash = true; // see if we have this hash auto hit = alnCache.find(hashKey); // if so, we know the alignment score if (hit != alnCache.end() and (isFw == hit->second.isFw) ) {//}and refStart + readLen + refExtLength < refTotalLength) { hctr.skippedAlignments_byCache += 1; arOut.score = hit->second.score; if (computeCIGAR or approximateCIGAR) { arOut.cigar = hit->second.cigar; } arOut.openGapLen = hit->second.openGapLen; return true; } } //auto logger_ = spdlog::get("console"); //spdlog::set_level(spdlog::level::debug); // Set global log level to debug //logger_->set_pattern("%v"); // @mohsen & @fataltes --- we should figure out how to // avoid computing the rc of a read if we've already done it. if (!isFw and read_rc.empty()) { read_rc = pufferfish::util::reverseComplement(read); } nonstd::string_view readView = (isFw) ? read : read_rc; if (!perfectChain) { // NOTE: We don't worry about soft clipping within the `doFullAlignment` // branch, since these two options are currently incompatible. if (doFullAlignment) { // if we allow softclipping of overhanging bases, then we can cut off the // part of the read before the start of the reference decltype(readStart) readOffset = allowOverhangSoftclip ? readStart : 0; nonstd::string_view readSeq = readView.substr(readOffset); ksw_reset_extz(&ez); auto cutoff = minAcceptedScore - mopts.matchScore * read.length(); aligner(readSeq.data(), readSeq.length(), refSeqBuffer_.data(), refSeqBuffer_.length(), &ez, cutoff, ksw2pp::EnumToType()); // if we allow softclipping of overhaning bases, then we only care about // the best score to the end of the query or the end of the reference. // Otherwise, we care about the best score all the way until the end of // the query. alignmentScore = allowOverhangSoftclip ? std::max(ez.mqe, ez.mte) : ez.mqe; SPDLOG_DEBUG(logger_, "readSeq : {}\nrefSeq : {}\nscore : {}\nreadStart : {}", readSeq, refSeqBuffer_, alignmentScore, readStart); SPDLOG_DEBUG( logger_, "currHitStart_read : {}, currHitStart_ref : {}\nmqe : {}, mte : {}\n", currHitStart_read, currHitStart_ref, ez.mqe, ez.mte); if (computeCIGAR) { openGapLen = addCigar(cigarGen, ez, false); } else { // can make start pos negative, but sam writer deals with this right now openGapLen = currHitStart_read; //((currHitStart_ref - currHitStart_read) < 0) ? 0 : //(currHitStart_read); } } else { alignmentScore = 0; // std::stringstream ss; SPDLOG_DEBUG(logger_, "[["); SPDLOG_DEBUG(logger_, "read sequence ({}) : {}", (isFw ? "FW" : "RC"), readView); SPDLOG_DEBUG(logger_, "ref sequence : {}\nrefID : {}", tseq, tid); // If the first mem does not start at the beginning of the // read, then there is a gap to align. int32_t firstMemStart_read = (isFw) ? rpos : readLen - (rpos + memlen); if (firstMemStart_read > 0) { // align the part before the first mem // the gap is of length firstMemStart_read, so grab that much (plus // buffer) before the first start position on the reference. int32_t // firstMemStart_ref = tpos; int32_t readStartPosOnRef = tpos - firstMemStart_read; // int32_t dataDepBuff = std::min(refExtLength, 5*firstMemStart_read); int32_t refWindowStart = (readStartPosOnRef - refExtLength) > 0 ? (readStartPosOnRef - refExtLength) : 0; int32_t refWindowLength = tpos - refWindowStart; fillRefSeqBufferReverse(allRefSeq, refAccPos, refWindowStart, refWindowLength, refSeqBuffer_); if (refSeqBuffer_.length() > 0) { auto readWindow = std::string(readView.substr(0, firstMemStart_read)); std::reverse(readWindow.begin(), readWindow.end()); SPDLOG_DEBUG(logger_, "PRE:\nreadStartPosOnRef : {}\nrefWindowStart : {}", readStartPosOnRef, refWindowStart); SPDLOG_DEBUG(logger_, "refWindowLength : {}\nread : [{}]\nref : [{}]", refWindowLength, readWindow, refSeqBuffer_); ksw_reset_extz(&ez); bandwidth = maxAllowedGaps(0, 0) + 1; auto cutoff = minAcceptedScore - mopts.matchScore * read.length(); aligner(readWindow.data(), readWindow.length(), refSeqBuffer_.data(), refSeqBuffer_.length(), &ez, cutoff, ksw2pp::EnumToType()); if (ez.stopped) hctr.skippedAlignments_notAlignable += 1; if (ez.mqe != KSW_NEG_INF and ez.stopped>0) ez.stopped = 0; // If we are doing approximate soft clipping, then we will retain the // higher of the two scores between extending the alignment before the // first MEM, or simply soft clipping the part of the alignment before // the first MEM and setting the associated part of the alignment // score to 0 (we gain no benefit, but it costs us nothing either). // we start out with the score we obtain if we extend all the // way to the beginning of the **query**. // simply deleting the rest of the read //int32_t delCost = (-1 * mopts.gapOpenPenalty + // -1 * mopts.gapExtendPenalty * readWindow.length()); decltype(alignmentScore) part_score = ez.mqe;//std::max(delCost, ez.mqe); int32_t num_soft_clipped{0}; if (allowSoftclip) { auto max_score = std::max(ez.mqe, ez.mte); // if we are allowing softclipping, then we consider the // best score we could get as the maximum of // 1. extending to the beginning of the query. // 2. extending to the beginning of the reference. // 3. 0 : i.e. performing no extension and starting the alignment at // the first MEM. if (max_score < 0) { // if we are in case 3 part_score = 0; // we soft clip the entire read part num_soft_clipped = readWindow.length(); } else if (ez.mqe >= ez.mte) { // if we are in case 1 part_score = ez.mqe; // we soft-clip nothing } else { // if we are in case 2 part_score = ez.mte; // we soft clip the difference between the read part length // and the number of aligned bases in the read part num_soft_clipped = readWindow.length() - ez.max_q - 1; } arOut.softclip_start = static_cast(num_soft_clipped); } else if (allowOverhangSoftclip) { if (ez.mte > ez.mqe) { // we soft clip the difference between the read part length // and the number of aligned bases in the read part num_soft_clipped = readWindow.length() - ez.max_q - 1; part_score = ez.mte; } else { part_score = ez.mqe; } } alignmentScore += part_score; if (approximateCIGAR and num_soft_clipped > 0) { cigarGen.begin_softclip_len = num_soft_clipped; } // NOTE: pre soft-clip code for adjusting the alignment score. // alignmentScore += allowOverhangSoftclip ? std::max(ez.mqe, ez.mte) : ez.mqe; // NOTE: If we are not writing down the CIGAR, we assume *for the // purpose of computing positions* that the prefix of the read we are // aligning here is optimally aligned in a gapless fashion. This, // obviously, may not be the case. However, if the aligned prefix // does contain indels, and we don't write down the appropriate CIGAR // string to designate this fact, then downstream tools will not be // able to properly adjust (so that e.g. the MEMs in the chain are // properly positioned). This approximation means that the position we // report may not be the one corresponding to the optimal alignment // score that we actually calculate. However, if we are not writing // out the actual CIGAR string, this is the best we can do. (addresses // https://github.com/COMBINE-lab/salmon/issues/475). openGapLen = computeCIGAR ? addCigar(cigarGen, ez, true) : firstMemStart_read - num_soft_clipped; SPDLOG_DEBUG(logger_, "score : {}", std::max(ez.mqe, ez.mte)); } else { // overhangingStart = true; // do any special soft clipping penalty here if we want alignmentScore += allowOverhangSoftclip ? 0 : (-1 * mopts.gapOpenPenalty + -1 * mopts.gapExtendPenalty * firstMemStart_read); openGapLen = firstMemStart_read; if (approximateCIGAR) { cigarGen.begin_softclip_len = firstMemStart_read; cigarGen.beginOverhang = true; openGapLen = 0; } } } int32_t prevMemEnd_read = firstMemStart_read - 1; //isFw ? rpos : readLen - (rpos + memlen); int32_t prevMemEnd_ref = tpos - 1; if (!alignable(prevMemEnd_read + 1, mopts.matchScore, alignmentScore)) { hctr.skippedAlignments_notAlignable += 1; ez.stopped = 1; SPDLOG_DEBUG(logger_,"ez stopped"); } SPDLOG_DEBUG(logger_, "\t Aligning through MEM chain : "); // for the second through the last mem for(auto it = mems.begin(); it != mems.end() and !ez.stopped; ++it) { auto& mem = *it; rpos = mem.rpos; memlen = mem.extendedlen; tpos = mem.tpos; // first score the mem match int32_t score = mopts.matchScore * memlen; int32_t currMemStart_ref = tpos; int32_t currMemStart_read = isFw ? rpos : readLen - (rpos + memlen); int32_t gapRef = currMemStart_ref - prevMemEnd_ref - 1; // both inclusive int32_t gapRead = currMemStart_read - prevMemEnd_read - 1; // both inclusive if ((gapRef <= 0 or gapRead <= 0) and gapRef != gapRead) { int32_t gapDiff = std::abs(gapRef - gapRead); score += (-1 * mopts.gapOpenPenalty + -1 * mopts.gapExtendPenalty * gapDiff); // subtract off extra matches score += mopts.matchScore * std::min(gapRead, gapRef); SPDLOG_DEBUG(logger_, "\t GAP NOT THE SAME:\n\t gapRef : {}, gapRead : {}", gapRef, gapRead); SPDLOG_DEBUG(logger_, "\t totalScore (MEM + gapDiff) : {}", score); } else if (gapRead > 0 and gapRef > 0) { SPDLOG_DEBUG(logger_, "\t\t overlaps : \n\t\t gapRef : {}, gapRead : {}", gapRef, gapRead); auto readWindow = readView.substr(prevMemEnd_read + 1, gapRead); const char* refSeq1 = tseq.data() + (prevMemEnd_ref)-refStart + 1; SPDLOG_DEBUG(logger_, "\t\t aligning\n\t\t [{}]\n\t\t\ [{}]", readWindow, nonstd::string_view(refSeq1, gapRef)); if (prevMemEnd_ref - refStart + 1 + gapRef >= tseq.size()) { SPDLOG_DEBUG(logger_, "\t\t tseq was not long enough; need to fetch more!"); } if (readWindow.length() <= minLengthGapRequired and gapRead == gapRef) { score += align_ungapped(readWindow.data(), refSeq1, gapRead); } else { ksw_reset_extz(&ez); bandwidth = maxAllowedGaps(prevMemEnd_read + 1, alignmentScore) + 1; score += aligner( readWindow.data(), readWindow.length(), refSeq1, gapRef, &ez, ksw2pp::EnumToType()); if (computeCIGAR) { addCigar(cigarGen, ez, false); } } } else if (it > mems.begin() and ((currMemStart_read <= prevMemEnd_read) or (currMemStart_ref <= prevMemEnd_ref))) { SPDLOG_DEBUG(logger_, "]]\n"); std::cerr << "[ERROR in PuffAligner, between-MEM alignment] : " "Improperly compacted MEM chain. Should not happen!\n"; std::cerr << "gapRef : " << gapRef << ", gapRead : " << gapRead << ", memlen : " << memlen << "\n"; std::cerr << "prevMemEnd_read : " << prevMemEnd_read << ", currMemStart_read : " << currMemStart_read << "\n"; std::cerr << "prevMemEnd_ref : " << prevMemEnd_ref << ", currMemStart_ref : " << currMemStart_ref << "\n"; std::exit(1); } SPDLOG_DEBUG(logger_, "\t MEM (rpos : {}, memlen : {}, tpos : {})", rpos, memlen, tpos); SPDLOG_DEBUG(logger_, "\t gapRef : {}, gapRead : {}", gapRef, gapRead); auto printView = readView.substr(currMemStart_read, memlen); auto refView = nonstd::string_view(tseq.c_str() + tpos - refStart, memlen); SPDLOG_DEBUG(logger_, "\t read [{}], pos : {}, len : {}, ori : {}", printView, currMemStart_read, memlen, (isFw ? "FW" : "RC")); SPDLOG_DEBUG(logger_, "\t ref [{}], pos : {}, len : {}", refView, currMemStart_ref, memlen); if (printView.length() != refView.length()) { SPDLOG_DEBUG( logger_, "\t readView length != refView length; should not happen!"); std::exit(1); } prevMemEnd_read = currMemStart_read + memlen - 1; prevMemEnd_ref = tpos + memlen - 1; alignmentScore += score; if (!alignable(prevMemEnd_read + 1, mopts.matchScore, alignmentScore)) { hctr.skippedAlignments_notAlignable += 1; ez.stopped = 1; SPDLOG_DEBUG(logger_,"ez stopped"); } } // If we got to the end, and there is a read gap left, then align that as // well SPDLOG_DEBUG(logger_, "prevMemEnd_read : {}, readLen : {}", prevMemEnd_read, readLen); bool gapAtEnd = (prevMemEnd_read + 1) <= (static_cast(readLen) - 1); if (gapAtEnd and !ez.stopped) { int32_t gapRead = (readLen - 1) - (prevMemEnd_read + 1) + 1; int32_t refTailStart = prevMemEnd_ref + 1; bandwidth = maxAllowedGaps(prevMemEnd_read + 1, alignmentScore) + 1; int32_t dataDepBuff = bandwidth; // std::min(refExtLength, 5 * gapRead); int32_t refTailEnd = refTailStart + gapRead + dataDepBuff; //overhangingEnd = (refTailStart + gapRead + dataDepBuff) > refTotalLength; if (refTailEnd >= refTotalLength) { refTailEnd = refTotalLength - 1; } int32_t refLen = (refTailEnd > refTailStart) ? refTailEnd - refTailStart + 1 : 0; auto readWindow = readView.substr(prevMemEnd_read + 1); fillRefSeqBuffer(allRefSeq, refAccPos, refTailStart, refLen, refSeqBuffer_); SPDLOG_DEBUG(logger_, "POST:"); SPDLOG_DEBUG(logger_, "read : [{}]", readWindow); SPDLOG_DEBUG(logger_, "ref : [{}]", refSeqBuffer_); SPDLOG_DEBUG(logger_, "gapRead : {}, refLen : {}, refBuffer_.size() : {}, " "refTotalLength : {}", gapRead, refLen, refSeqBuffer_.size(), refTotalLength); if (refLen > 0) { ksw_reset_extz(&ez); auto cutoff = minAcceptedScore - alignmentScore - mopts.matchScore * readWindow.length(); aligner(readWindow.data(), readWindow.length(), refSeqBuffer_.data(), refLen, &ez, cutoff, ksw2pp::EnumToType()); if (ez.stopped) hctr.skippedAlignments_notAlignable += 1; if (ez.mqe != KSW_NEG_INF) ez.stopped = 0; // we start out with the score we obtain if we extend all the // way to the end of the **query**. This is the max score we can // get by either // simply deleting the rest of the read int32_t delCost = (-1 * mopts.gapOpenPenalty + -1 * mopts.gapExtendPenalty * readWindow.length()); // or taking the ksw2 alignment score to the end fo the read decltype(alignmentScore) part_score = std::max(ez.mqe, delCost); int32_t num_soft_clipped{0}; if (allowSoftclip) { // if we are allowing softclipping, then we consider the // best score we could get as the maximum of // 1. extending to the end of the query. (including deleting the // rest of the query) // 2. extending to the end of the reference. // 3. 0 : i.e. performing no extension and end the alignment at the last MEM. // NOTE: Since it won't affect the read position, we do not need to explicitly // compute the number of soft-clipped bases like we have above for the read part // before the first MEM // int32_t bases_clipped{0}; // 1 is the default case here, and corresponds to clipping no bases if (ez.mte > 0 and ez.mte > part_score) { // if we are in case 2 // if extending to the end of the reference is better than the end // of the query and if the score is greater than 0 (which we can // always get when soft clipping) then take that part_score = ez.mte; num_soft_clipped = readWindow.length() - ez.max_q - 1; } else if (part_score < 0) { // if we are in case 3 part_score = 0; num_soft_clipped = readWindow.length(); } } else if (allowOverhangSoftclip) { part_score = std::max(std::max(ez.mqe, ez.mte), part_score); } alignmentScore += part_score; if (approximateCIGAR) { cigarGen.end_softclip_len = num_soft_clipped; } // NOTE: pre soft-clip code for adjusting the alignment score. // int32_t alnCost = allowOverhangSoftclip ? std::max(ez.mqe, ez.mte) // : ez.mqe; int32_t delCost = (-1 * mopts.gapOpenPenalty + -1 * // mopts.gapExtendPenalty * readWindow.length()); alignmentScore += // std::max(alnCost, delCost); SPDLOG_DEBUG(logger_, "POST score : {}", part_score); } else { //overhangingEnd = true; // do any special soft clipping penalty here if we want alignmentScore += allowOverhangSoftclip ? 0 : (-1 * mopts.gapOpenPenalty + -1 * mopts.gapExtendPenalty * readWindow.length()); if (approximateCIGAR) { cigarGen.end_softclip_len = readWindow.length(); cigarGen.endOverhang = true; } } } SPDLOG_DEBUG(logger_, "score : {}\n]]\n", alignmentScore); } } // not a perfect chain bool cigar_fixed{false}; if (computeCIGAR) { cigar = cigarGen.get_cigar(readLen, cigar_fixed); } if (approximateCIGAR) { cigarGen.get_approx_cigar(readLen, cigar); } if (cigar_fixed) { hctr.cigar_fixed_count++; } if (useAlnCache) { // don't bother to fill up a cache unless this is a multi-mapping read //mopts.useAlignmentCache and isMultimapping_ and !perfectChain and !overhangingEnd) { // don't bother to fill up a cache unless this is a multi-mapping read if (!didHash) { // We want the alignment cache to be on the hash of the full underlying reference sequence. // If we are using fullAlignment, this is in refSeqBuffer_, but if we are using between-mem alignment // then refSeqBuffer_ could have been used to store shorter portions of the reference during // the alignment procedure. In that case, get the original reference sequence from tseq, which // was copied from the full reference sequence in the beginning of the function. char* ptr = const_cast((doFullAlignment ? refSeqBuffer_.data() : tseq.data())); MetroHash64::Hash(reinterpret_cast(ptr), keyLen, reinterpret_cast(&hashKey), 0); hashKey ^= queryChainHash; } AlignmentResult aln; aln.isFw = isFw; aln.score = alignmentScore; if (computeCIGAR or approximateCIGAR) { aln.cigar = cigar; } aln.openGapLen = openGapLen; alnCache[hashKey] = aln; } arOut.score = alignmentScore; arOut.cigar = cigar; arOut.openGapLen = openGapLen; return true; } /** * Align read_left and read_right, filling the relevant alignment information into the output joinHit structure. * The behavior of alignment (whether the alignment is done only between MEMs or over the full read length, and * if CIGAR strings are computed or just scores, is controlled by the configuration that has been passed to this * PuffAligner object). **/ int32_t PuffAligner::calculateAlignments(std::string& read_left, std::string& read_right, pufferfish::util::JointMems& jointHit, HitCounters& hctr, bool isMultimapping, bool verbose) { isMultimapping_ = isMultimapping; auto tid = jointHit.tid; double optFrac{mopts.minScoreFraction}; bool computeCIGAR = !(aligner.config().flag & KSW_EZ_SCORE_ONLY); bool approximateCIGAR = mopts.alignmentMode == pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; auto threshold = [&, optFrac] (uint64_t len) -> int32_t { return static_cast(std::floor((!mopts.matchScore)?(-0.6+-0.6*len):optFrac*mopts.matchScore*len)); }; constexpr const auto invalidScore = std::numeric_limits::min(); // If this read is in an orphaned mapping if (jointHit.isOrphan()) { hctr.totalAlignmentAttempts += 1; // If this mapping was an orphan, then this is the orphaned read bool isLeft = jointHit.isLeftAvailable(); std::string& read_orphan = isLeft ? read_left : read_right; std::string& rc_orphan = isLeft ? read_left_rc_ : read_right_rc_; auto& ar_orphan = isLeft ? ar_left : ar_right; auto& orphan_aln_cache = isLeft ? alnCacheLeft : alnCacheRight; ar_orphan.score = invalidScore; alignRead(read_orphan, rc_orphan, jointHit.orphanClust()->mems, jointHit.orphanClust()->queryChainHash, jointHit.orphanClust()->perfectChain, jointHit.orphanClust()->isFw, tid, orphan_aln_cache, hctr, ar_orphan, verbose); jointHit.alignmentScore = ar_orphan.score > threshold(read_orphan.length()) ? ar_orphan.score : invalidScore; jointHit.orphanClust()->cigar = (approximateCIGAR or computeCIGAR) ? ar_orphan.cigar : ""; jointHit.orphanClust()->openGapLen = ar_orphan.openGapLen; jointHit.orphanClust()->softClipStart = ar_orphan.softclip_start; // jointHit.orphanClust()->coverage = jointHit.alignmentScore; if (jointHit.alignmentScore < 0 and verbose) { std::cerr << read_orphan.length() << " " << threshold(read_orphan.length()) << " " << ar_left.score << "\n"; } return jointHit.alignmentScore; } else { hctr.totalAlignmentAttempts += 2; ar_left.score = ar_right.score = invalidScore; if (verbose) { std::cerr << "left\n"; } alignRead(read_left, read_left_rc_, jointHit.leftClust->mems, jointHit.leftClust->queryChainHash, jointHit.leftClust->perfectChain, jointHit.leftClust->isFw, tid, alnCacheLeft, hctr, ar_left, verbose); if (verbose) { std::cerr << "right\n"; } alignRead(read_right, read_right_rc_, jointHit.rightClust->mems, jointHit.rightClust->queryChainHash, jointHit.rightClust->perfectChain, jointHit.rightClust->isFw, tid, alnCacheRight, hctr, ar_right, verbose); jointHit.alignmentScore = ar_left.score > threshold(read_left.length()) ? ar_left.score : invalidScore; jointHit.mateAlignmentScore = ar_right.score > threshold(read_right.length()) ? ar_right.score : invalidScore; /* jointHit.alignmentScore = (score_left == invalidScore or score_right == invalidScore)? invalidScore : score_left + score_right; */ // jointHit.leftClust->coverage = score_left; jointHit.leftClust->openGapLen = ar_left.openGapLen; jointHit.leftClust->softClipStart = ar_left.softclip_start; // jointHit.rightClust->coverage = score_right; jointHit.rightClust->openGapLen = ar_right.openGapLen; jointHit.rightClust->softClipStart = ar_right.softclip_start; if (computeCIGAR or approximateCIGAR) { jointHit.leftClust->cigar = ar_left.cigar; jointHit.rightClust->cigar = ar_right.cigar; } else { jointHit.leftClust->cigar = ""; jointHit.rightClust->cigar = ""; } return (jointHit.alignmentScore == invalidScore or jointHit.mateAlignmentScore == invalidScore) ? invalidScore:jointHit.alignmentScore+jointHit.mateAlignmentScore; } } /** * Align read, filling the relevant alignment information into the output joinHit structure. * The behavior of alignment (whether the alignment is done only between MEMs or over the full read length, and * if CIGAR strings are computed or just scores, is controlled by the configuration that has been passed to this * PuffAligner object). **/ int32_t PuffAligner::calculateAlignments(std::string& read, pufferfish::util::JointMems& jointHit, HitCounters& hctr, bool isMultimapping, bool verbose) { isMultimapping_ = isMultimapping; auto tid = jointHit.tid; double optFrac{mopts.minScoreFraction}; bool computeCIGAR = !(aligner.config().flag & KSW_EZ_SCORE_ONLY); bool approximateCIGAR = mopts.alignmentMode == pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; auto threshold = [&, optFrac] (uint64_t len) -> int32_t { return static_cast(std::floor((!mopts.matchScore)?(-0.6+-0.6*len):optFrac*mopts.matchScore*len)); }; constexpr const auto invalidScore = std::numeric_limits::min(); hctr.totalAlignmentAttempts += 1; ar_left.score = invalidScore; const auto& oc = jointHit.orphanClust(); alignRead(read, read_left_rc_, oc->mems, oc->queryChainHash, oc->perfectChain, oc->isFw, tid, alnCacheLeft, hctr, ar_left, verbose); jointHit.alignmentScore = ar_left.score > threshold(read.length()) ? ar_left.score : invalidScore; jointHit.orphanClust()->cigar = (approximateCIGAR or computeCIGAR) ? ar_left.cigar : ""; jointHit.orphanClust()->openGapLen = ar_left.openGapLen; // jointHit.orphanClust()->coverage = jointHit.alignmentScore; if (jointHit.alignmentScore < 0 and verbose) { std::cerr << read.length() << " " << threshold(read.length()) << " " << ar_left.score << "\n"; } return jointHit.alignmentScore; } bool PuffAligner::recoverSingleOrphan(std::string& read_left, std::string& read_right, pufferfish::util::MemCluster& clust, std::vector &recoveredMemClusters, uint32_t tid, bool anchorIsLeft, bool verbose) { int32_t anchorLen = anchorIsLeft ? read_left.length() : read_right.length(); /*auto tpos = clust.mems[0].tpos; auto anchorStart = clust.mems[0].isFw ? clust.mems[0].rpos : anchorLen - (clust.mems[0].rpos + clust.mems[0].extendedlen); uint32_t anchorPos = tpos >= anchorStart ? tpos - anchorStart : 0;*/ uint32_t anchorPos = clust.approxReadStartPos(); bool recovered_fwd; uint32_t recovered_pos=-1; auto* r1 = read_left.data(); auto* r2 = read_right.data(); auto l1 = static_cast(read_left.length()); auto l2 = static_cast(read_right.length()); const char* rptr{nullptr}; bool anchorFwd{clust.isFw}; int32_t startPos = -1, maxDist = -1, otherLen = -1, rlen = -1; std::string* otherReadPtr{nullptr}; const char* otherRead{nullptr}; char* otherReadRC{nullptr}; std::string* otherRCSpace{nullptr}; char* r1rc = nullptr; char* r2rc = nullptr; std::unique_ptr windowSeq{nullptr}; int32_t windowLength = -1; int32_t maxDistRight = std::max(1, l2 / 4); int32_t maxDistLeft = std::max(1, l1 / 4); constexpr const int32_t signedZero{0}; bool noDovetail = mopts.noDovetail; if (anchorIsLeft) { anchorLen = l1; otherLen = l2; maxDist = maxDistRight; otherReadPtr = &read_right; otherRCSpace = &rc2_; otherRead = r2; otherReadRC = r2rc; /* from rapmap anchorLen = l1; otherLen = l2; maxDist = maxDistRight; lpos = anchorPos; rpos = -1; lfwd = anchorFwd; rfwd = !lfwd; otherReadPtr = &rightRead; otherRCSpace = &rc2; otherRead = r2; otherReadRC = r2rc; leftChainStatus = anchorHit.chainStatus.getLeft(); */ } else { anchorLen = l2; otherLen = l1; maxDist = maxDistLeft; otherReadPtr = &read_left; otherRCSpace = &rc1_; otherRead = r1; otherReadRC = r1rc; } uint64_t refAccPos = tid > 0 ? refAccumLengths[tid - 1] : 0; uint64_t refLength = refAccumLengths[tid] - refAccPos; if (anchorFwd) { if (!otherReadRC){ pufferfish::util::reverseRead(*otherReadPtr, *otherRCSpace); otherReadRC = const_cast(otherRCSpace->data()); } rptr = otherReadRC; rlen = otherLen; startPos = std::max(signedZero, static_cast(anchorPos)); windowLength = std::min(static_cast(mopts.maxFragmentLength), static_cast(refLength - startPos)); if (!noDovetail) { startPos = std::max(signedZero, static_cast(anchorPos - mopts.maxFragmentLength)); windowLength = std::min(2*static_cast(mopts.maxFragmentLength), static_cast(refLength - startPos)); } } else { rptr = otherRead; rlen = otherLen; startPos = std::max(signedZero, static_cast(anchorPos + anchorLen - mopts.maxFragmentLength)); windowLength = std::min(static_cast(mopts.maxFragmentLength), static_cast(anchorPos + anchorLen)); if (!noDovetail) { windowLength = std::min(2*static_cast(mopts.maxFragmentLength), static_cast(refLength - startPos)); } } if (verbose) { std::cerr<< anchorPos<< "\n"; } fillRefSeqBuffer(allRefSeq, refAccPos, startPos, windowLength, refSeqBuffer_); /*windowSeq.reset(new char[tseq.length() + 1]); strcpy(windowSeq.get(), tseq.c_str()); */ // Note -- we use score only mode to find approx end position in rapmap, can we // do the same here? EdlibAlignResult result = edlibAlign(rptr, rlen, refSeqBuffer_.data(), windowLength, edlibNewAlignConfig(maxDist, EDLIB_MODE_HW, EDLIB_TASK_LOC)); if (result.editDistance > -1) { recovered_fwd = !anchorFwd; recovered_pos = startPos + result.startLocations[0]; if (noDovetail and (recovered_fwd and static_cast(recovered_pos + rlen) > static_cast(anchorPos + anchorLen))) { edlibFreeAlignResult(result); return false; } recoveredMemClusters.push_back(pufferfish::util::MemCluster(recovered_fwd, rlen)); auto it = recoveredMemClusters.begin() + recoveredMemClusters.size() - 1; if (verbose) { std::cerr<< anchorIsLeft << " " << anchorFwd << " " << anchorPos << " " << startPos + result.startLocations[0] <<" " << result.editDistance << "\n"; } orphanRecoveryMemCollection.push_back(pufferfish::util::UniMemInfo()); auto memItr = orphanRecoveryMemCollection.begin() + orphanRecoveryMemCollection.size() - 1; if (verbose) { std::cerr<addMem(memItr, recovered_pos, 1, recovered_fwd ? 0 : rlen-1, recovered_fwd); //delete windowSeq; edlibFreeAlignResult(result); return true; } else { //delete windowSeq; edlibFreeAlignResult(result); return false; } }