#include #include #include #include #include #include #include #include #include #include #include #include "json.hpp" #include "oneapi/tbb/combinable.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/task_arena.h" #include "AlignmentLibrary.hpp" #include "DistributionUtils.hpp" #include "GCFragModel.hpp" #include "KmerContext.hpp" #include "LibraryFormat.hpp" #include "ReadExperiment.hpp" #include "ReadPair.hpp" #include "SBModel.hpp" #include "SalmonMath.hpp" #include "SalmonUtils.hpp" #include "TryableSpinLock.hpp" #include "UnpairedRead.hpp" #include "TranscriptGroup.hpp" #include "Transcript.hpp" #include "spdlog/fmt/fmt.h" #include "spdlog/fmt/ostr.h" #include "spdlog/sinks/ostream_sink.h" #include "spdlog/spdlog.h" #include "gff.h" #include "FastxParser.hpp" //#include "jellyfish/mer_dna.hpp" #include "GenomicFeature.hpp" #include "SGSmooth.hpp" #include "TranscriptGeneMap.hpp" #include "StadenUtils.hpp" #include "SalmonDefaults.hpp" #include "pufferfish/Util.hpp" #include "zstr.hpp" namespace salmon { namespace utils { using MateStatus = pufferfish::util::MateStatus; std::string str(const MappingType& mt) { switch (mt) { case MappingType::UNMAPPED: return "u"; case MappingType::LEFT_ORPHAN: return "m1"; case MappingType::RIGHT_ORPHAN: return "m2"; case MappingType::BOTH_ORPHAN: return "m12"; case MappingType::PAIRED_MAPPED: return "mp"; case MappingType::SINGLE_MAPPED: return "ms"; case MappingType::DECOY: return "d"; } // should never get here! return "E"; } bool headersAreConsistent(SAM_hdr* h1, SAM_hdr* h2) { bool consistent{true}; // Both files must contain the same number of targets if (h1->nref != h2->nref) { consistent = false; } // Check each target to ensure that the name and length are the same. size_t i = 0; size_t n = h1->nref; while (consistent and i < n) { size_t l1 = h1->ref[i].len; size_t l2 = h2->ref[i].len; consistent = (l1 == l2) and (strcmp(h1->ref[i].name, h2->ref[i].name) == 0); ++i; } return consistent; } bool headersAreConsistent(std::vector&& headers) { if (headers.size() == 1) { return true; } // Ensure that all of the headers are consistent (i.e. the same), by // comparing each with the first. bool consistent{true}; auto itFirst = headers.begin(); auto it = itFirst; while (++it != headers.end()) { if (!headersAreConsistent(*itFirst, *it)) { consistent = false; break; } } return consistent; } std::ostream& operator<<(std::ostream& os, OrphanStatus s) { switch (s) { case OrphanStatus::LeftOrphan: os << "left orphan"; break; case OrphanStatus::RightOrphan: os << "right orphan"; break; case OrphanStatus::Paired: os << "paired"; break; } return os; } bool isCompatible(const LibraryFormat observed, const LibraryFormat expected, int32_t start, bool isForward, MateStatus ms) { // If we're dealing with a single end read. bool compat{false}; if (ms != MateStatus::PAIRED_END_PAIRED) { compat = compatibleHit(expected, start, isForward, ms); } else { compat = compatibleHit(expected, observed); } return compat; } double logAlignFormatProb(const LibraryFormat observed, const LibraryFormat expected, int32_t start, bool isForward, MateStatus ms, double incompatPrior) { // If we're dealing with a single end read. bool compat{false}; if (ms != MateStatus::PAIRED_END_PAIRED) { compat = compatibleHit(expected, start, isForward, ms); } else { compat = compatibleHit(expected, observed); } return (compat) ? salmon::math::LOG_1 : incompatPrior; /** Old compat code if (expected.type == ReadType::PAIRED_END and observed.type == ReadType::SINGLE_END) { double logOrphanProb = salmon::math::LOG_ORPHAN_PROB; if (expected.strandedness == ReadStrandedness::U or expected.strandedness == ReadStrandedness::AS or expected.strandedness == ReadStrandedness::SA) { return salmon::math::LOG_1; } else { return (expected.strandedness == observed.strandedness) ? logOrphanProb : incompatPrior; } } else if (observed.type != expected.type or observed.orientation != expected.orientation ) { return incompatPrior; } else { if (expected.strandedness == ReadStrandedness::U) { return salmon::math::LOG_ONEHALF; } else { if (expected.strandedness == observed.strandedness) { return salmon::math::LOG_1; } else { return incompatPrior; } } } fmt::print(stderr, "WARNING: logAlignFormatProb --- should not get here"); return salmon::math::LOG_0; */ } // for single end reads or orphans bool compatibleHit(const LibraryFormat expected, int32_t start, bool isForward, MateStatus ms) { auto expectedStrand = expected.strandedness; auto expectedType = expected.type; switch (ms) { case MateStatus::SINGLE_END: if (isForward) { // U, SF return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::S); } else { // U, SR return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::A); } break; // The next two cases are for *orphaned* PE reads // This case is where the mapped read belongs to the "left" (i.e. 1) end // of the pair. case MateStatus::PAIRED_END_LEFT: // "M"atching or same orientation is a special case /* if (expectedType == ReadType::SINGLE_END) { return (expectedStrand == ReadStrandedness::U or (expectedStrand == ReadStrandedness::S and isForward) or (expectedStrand == ReadStrandedness::A and !isForward)); } else */ if (expected.orientation == ReadOrientation::SAME) { return (expectedStrand == ReadStrandedness::U or (expectedStrand == ReadStrandedness::S and isForward) or (expectedStrand == ReadStrandedness::A and !isForward)); } else if (isForward) { // IU, ISF, OU, OSF, MU, MSF return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::SA); } else { // IU, ISR, OU, OSR, MU, MSR return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::AS); } break; // This case is where the mapped read belongs to the "right" (i.e. 2) end // of the pair. case MateStatus::PAIRED_END_RIGHT: // "M"atching or same orientation is a special case /* if (expectedType == ReadType::SINGLE_END) { return (expectedStrand == ReadStrandedness::U or (expectedStrand == ReadStrandedness::A and isForward) or (expectedStrand == ReadStrandedness::S and !isForward)); } else */ if (expected.orientation == ReadOrientation::SAME) { return (expectedStrand == ReadStrandedness::U or (expectedStrand == ReadStrandedness::S and isForward) or (expectedStrand == ReadStrandedness::A and !isForward)); } else if (isForward) { // IU, ISR, OU, OSR, MU, MSR return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::AS); } else { // IU, ISF, OU, OSF, MU, MSF return (expectedStrand == ReadStrandedness::U or expectedStrand == ReadStrandedness::SA); } break; default: // SHOULD NOT GET HERE fmt::print(stderr, "WARNING: Could not associate known library type with read!\n"); return false; break; } // SHOULD NOT GET HERE fmt::print(stderr, "WARNING: Could not associate known library type with read!\n"); return false; } // for paired-end reads bool compatibleHit(const LibraryFormat expected, const LibraryFormat observed) { if (observed.type != ReadType::PAIRED_END) { // SHOULD NOT GET HERE fmt::print(stderr, "WARNING: PE compatibility function called with SE read!\n"); fmt::print(stderr, "expected: {}, observed: {}\n", expected, observed); return false; } auto es = expected.strandedness; auto eo = expected.orientation; auto os = observed.strandedness; auto oo = observed.orientation; // If the orientations are different, they are incompatible if (eo != oo) { return false; } else { // In this branch, the orientations are always compatible return (es == ReadStrandedness::U or es == os); } // SHOULD NOT GET HERE fmt::print(stderr, "WARNING: Could not determine strand compatibility!"); fmt::print(stderr, "please report this.\n"); return false; } template void writeAbundancesFromCollapsed(const SalmonOpts& sopt, ExpLib& alnLib, boost::filesystem::path& fname, std::string headerComments) { using salmon::math::LOG_0; using salmon::math::LOG_1; // If we're using lightweight-alignment (FMD) // and not allowing orphans. bool useScaledCounts = !(sopt.useQuasi or sopt.allowOrphans); std::unique_ptr output( std::fopen(fname.c_str(), "w"), std::fclose); fmt::print(output.get(), "{}", headerComments); fmt::print(output.get(), "Name\tLength\tEffectiveLength\tTPM\tNumReads\n"); double numMappedFrags = alnLib.upperBoundHits(); std::vector& transcripts_ = alnLib.transcripts(); for (auto& transcript : transcripts_) { transcript.projectedCounts = useScaledCounts ? (transcript.mass(false) * numMappedFrags) : transcript.sharedCount(); } double tfracDenom{0.0}; for (auto& transcript : transcripts_) { double refLength = sopt.noEffectiveLengthCorrection ? transcript.RefLength : std::exp(transcript.getCachedLogEffectiveLength()); if (sopt.noLengthCorrection) { refLength = 100.0; } tfracDenom += (transcript.projectedCounts / numMappedFrags) / refLength; } double million = 1000000.0; // Now posterior has the transcript fraction for (auto& transcript : transcripts_) { double logLength = sopt.noEffectiveLengthCorrection ? std::log(transcript.RefLength) : transcript.getCachedLogEffectiveLength(); double count = transcript.projectedCounts; double npm = (transcript.projectedCounts / numMappedFrags); double effLength = std::exp(logLength); if (sopt.noLengthCorrection) { effLength = 100.0; } double tfrac = (npm / effLength) / tfracDenom; double tpm = tfrac * million; fmt::print(output.get(), "{}\t{}\t{}\t{}\t{}\n", transcript.RefName, transcript.CompleteLength, effLength, tpm, count); } } template void writeAbundances(const SalmonOpts& sopt, ExpLib& alnLib, boost::filesystem::path& fname, std::string headerComments) { using salmon::math::LOG_0; using salmon::math::LOG_1; std::unique_ptr output( std::fopen(fname.c_str(), "w"), std::fclose); fmt::print(output.get(), "{}", headerComments); fmt::print(output.get(), "# Name\tLength\tTPM\tFPKM\tNumReads\n"); auto& refs = alnLib.transcripts(); auto numMappedFragments = alnLib.numMappedFragments(); const double logBillion = std::log(1000000000.0); const double million = 1000000.0; const double logNumFragments = std::log(static_cast(numMappedFragments)); const double upperBoundFactor = static_cast(alnLib.upperBoundHits()) / numMappedFragments; auto clusters = alnLib.clusterForest().getClusters(); size_t clusterID = 0; for (auto cptr : clusters) { // double logClusterMass = cptr->logMass(); // EDIT double logClusterMass = salmon::math::LOG_0; double logClusterCount = std::log(upperBoundFactor * static_cast(cptr->numHits())); bool requiresProjection{false}; auto& members = cptr->members(); size_t clusterSize{0}; for (auto transcriptID : members) { Transcript& t = refs[transcriptID]; t.uniqueCounts = t.uniqueCount(); t.totalCounts = t.totalCount(); logClusterMass = salmon::math::logAdd(logClusterMass, t.mass(false)); ++clusterSize; } if (logClusterMass == LOG_0) { // std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n"; } for (auto transcriptID : members) { Transcript& t = refs[transcriptID]; double logTranscriptMass = t.mass(false); // Try bias /* double logBias = t.bias(); logTranscriptMass += t.bias(); */ if (logTranscriptMass == LOG_0) { t.projectedCounts = 0; } else { double logClusterFraction = logTranscriptMass - logClusterMass; t.projectedCounts = std::exp(logClusterFraction + logClusterCount); requiresProjection |= t.projectedCounts > static_cast(t.totalCounts) or t.projectedCounts < static_cast(t.uniqueCounts); } } if (clusterSize > 1 and requiresProjection) { cptr->projectToPolytope(refs); } ++clusterID; } auto& transcripts_ = refs; double tfracDenom{0.0}; for (auto& transcript : transcripts_) { double refLength = sopt.noEffectiveLengthCorrection ? transcript.RefLength : std::exp(transcript.getCachedLogEffectiveLength()); // refLength = transcript.RefLength; tfracDenom += (transcript.projectedCounts / numMappedFragments) / refLength; } // Now posterior has the transcript fraction for (auto& transcript : transcripts_) { double logLength = sopt.noEffectiveLengthCorrection ? std::log(transcript.RefLength) : transcript.getCachedLogEffectiveLength(); if (sopt.noLengthCorrection) { logLength = 1.0; } // logLength = std::log(transcript.RefLength); double fpkmFactor = std::exp(logBillion - logLength - logNumFragments); double count = transcript.projectedCounts; // double countTotal = transcripts_[transcriptID].totalCounts; // double countUnique = transcripts_[transcriptID].uniqueCounts; double fpkm = count > 0 ? fpkmFactor * count : 0.0; double npm = (transcript.projectedCounts / numMappedFragments); double refLength = std::exp(logLength); double tfrac = (npm / refLength) / tfracDenom; double tpm = tfrac * million; fmt::print(output.get(), "{}\t{}\t{}\t{}\t{}\n", transcript.RefName, transcript.RefLength, tpm, fpkm, count); } } template void normalizeAlphas(const SalmonOpts& sopt, AlnLibT& alnLib) { using salmon::math::LOG_0; using salmon::math::LOG_1; auto& refs = alnLib.transcripts(); auto numMappedFragments = alnLib.numMappedFragments(); const double logNumFragments = std::log(static_cast(numMappedFragments)); auto clusters = alnLib.clusterForest().getClusters(); size_t clusterID = 0; for (auto cptr : clusters) { // double logClusterMass = cptr->logMass(); // EDIT double logClusterMass = salmon::math::LOG_0; double logClusterCount = std::log(static_cast(cptr->numHits())); bool requiresProjection{false}; auto& members = cptr->members(); size_t clusterSize{0}; for (auto transcriptID : members) { Transcript& t = refs[transcriptID]; t.uniqueCounts = t.uniqueCount(); t.totalCounts = t.totalCount(); logClusterMass = salmon::math::logAdd(logClusterMass, t.mass(false)); // + t.bias()); ++clusterSize; } if (logClusterMass == LOG_0) { // std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n"; } for (auto transcriptID : members) { Transcript& t = refs[transcriptID]; double logTranscriptMass = t.mass(false); // Try bias // double logBias = t.bias(); // logTranscriptMass += t.bias(); if (logTranscriptMass == LOG_0) { t.projectedCounts = 0; } else { double logClusterFraction = logTranscriptMass - logClusterMass; t.projectedCounts = std::exp(logClusterFraction + logClusterCount); requiresProjection |= t.projectedCounts > static_cast(t.totalCounts) or t.projectedCounts < static_cast(t.uniqueCounts); } } if (clusterSize > 1 and requiresProjection) { cptr->projectToPolytope(refs); } ++clusterID; } auto& transcripts_ = refs; double nFracDenom{0.0}; for (auto& transcript : transcripts_) { nFracDenom += (transcript.projectedCounts / numMappedFragments); } double invNFracTotal = 1.0 / nFracDenom; for (auto& transcript : transcripts_) { double v = transcript.projectedCounts / numMappedFragments; // transcript.setMass(v * invNFracTotal); transcript.setMass(transcript.projectedCounts); } } LibraryFormat hitType(int32_t end1Start, bool end1Fwd, int32_t end2Start, bool end2Fwd) { // If the reads come from opposite strands if (end1Fwd != end2Fwd) { // and if read 1 comes from the forward strand if (end1Fwd) { // then if read 1 start < read 2 start ==> ISF if (end1Start <= end2Start) { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA); } // otherwise read 2 start < read 1 start ==> OSF else { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA); } } // and if read 2 comes from the forward strand if (end2Fwd) { // then if read 2 start <= read 1 start ==> ISR if (end2Start <= end1Start) { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS); } // otherwise, read 2 start > read 1 start ==> OSR else { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS); } } } else { // Otherwise, the reads come from the same strand if (end1Fwd) { // if it's the forward strand ==> MSF return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S); } else { // if it's the reverse strand ==> MSR return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A); } } // SHOULD NOT GET HERE spdlog::get("jointLog") ->error("ERROR: Could not associate any known library type with read! " "Please report this bug!\n"); std::exit(-1); return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U); } LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1, int32_t end2Start, bool end2Fwd, uint32_t len2, bool canDovetail) { // If the reads come from opposite strands if (end1Fwd != end2Fwd) { // and if read 1 comes from the forward strand if (end1Fwd) { // then if read 1 start < read 2 start ==> ISF // NOTE: We can't really delineate between inward facing reads that // stretch // past each other and outward facing reads --- the purpose of stretch is // to help // make this determinateion. int32_t stretch = canDovetail ? len2 : 0; if (end1Start <= end2Start + stretch) { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA); } // otherwise read 2 start < read 1 start ==> OSF else { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA); } } // and if read 2 comes from the forward strand if (end2Fwd) { // then if read 2 start <= read 1 start ==> ISR // NOTE: We can't really delineate between inward facing reads that // stretch // past each other and outward facing reads --- the purpose of stretch is // to help // make this determinateion. int32_t stretch = canDovetail ? len1 : 0; if (end2Start <= end1Start + stretch) { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS); } // otherwise, read 2 start > read 1 start ==> OSR else { return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS); } } } else { // Otherwise, the reads come from the same strand if (end1Fwd) { // if it's the forward strand ==> MSF return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S); } else { // if it's the reverse strand ==> MSR return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A); } } // SHOULD NOT GET HERE spdlog::get("jointLog") ->error("ERROR: Could not associate any known library type with read! " "Please report this bug!\n"); std::exit(-1); return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U); } LibraryFormat hitType(int32_t start, bool isForward) { // If the read comes from the forward strand if (isForward) { return LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S); } else { return LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A); } // SHOULD NOT GET HERE fmt::print(stderr, "WARNING: Could not associate known library type with read!\n"); return LibraryFormat(ReadType::PAIRED_END, ReadOrientation::NONE, ReadStrandedness::U); } using std::string; using NameVector = std::vector; using IndexVector = std::vector; using KmerVector = std::vector; /** * This function parses the library format string that specifies the format in * which * the reads are to be expected. */ LibraryFormat parseLibraryFormatStringNew(std::string& fmt) { using std::vector; using std::string; using std::map; using std::stringstream; map formatMap = { {"IU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U)}, {"ISF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA)}, {"ISR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS)}, {"OU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::U)}, {"OSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA)}, {"OSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS)}, {"MU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::U)}, {"MSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S)}, {"MSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A)}, {"U", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U)}, {"SF", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S)}, {"SR", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A)}}; // inspired by // http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c // first convert the string to upper-case for (auto& c : fmt) { c = std::toupper(c); } auto libFmtIt = formatMap.find(fmt); if (libFmtIt == formatMap.end()) { stringstream errstr; errstr << "unknown library format string : " << fmt; throw std::invalid_argument(errstr.str()); } return libFmtIt->second; } /** * Parses a set of __ordered__ command line options and extracts the relevant * read libraries from them. */ std::vector extractReadLibraries(boost::program_options::parsed_options& orderedOptions) { // The current (default) format for paired end data LibraryFormat peFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U); // The current (default) format for single end data LibraryFormat seFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U); auto isAutoLibType = [](std::string& fmt) -> bool { return (fmt.length() == 1 and (fmt.front() == 'a' or fmt.front() == 'A')); }; auto log = spdlog::get("jointLog"); bool sawLibType{false}; bool sawPairedLibrary{false}; bool sawUnpairedLibrary{false}; bool autoLibType{false}; std::vector peLibs{peFormat}; std::vector seLibs{seFormat}; for (auto& opt : orderedOptions.options) { // Update the library type if (opt.string_key == "libType") { if (!isAutoLibType(opt.value[0])) { auto libFmt = parseLibraryFormatStringNew(opt.value[0]); if (libFmt.type == ReadType::PAIRED_END) { peFormat = libFmt; peLibs.emplace_back(libFmt); } else { seFormat = libFmt; seLibs.emplace_back(libFmt); } } else { autoLibType = true; } sawLibType = true; } if (opt.string_key == "mates1") { if (!sawLibType) { log->warn("Encountered a read file (--mates1/-1) before a " "library type specification. The (--libType/-l) " "option must precede the input files."); peLibs.clear(); return peLibs; } peLibs.back().addMates1(opt.value); if (autoLibType) { peLibs.back().enableAutodetect(); } sawPairedLibrary = true; } if (opt.string_key == "mates2") { if (!sawLibType) { log->warn("Encountered a read file (--mates2/-2) before a " "library type specification. The (--libType/-l) " "option must precede the input files."); peLibs.clear(); return peLibs; } peLibs.back().addMates2(opt.value); if (autoLibType) { peLibs.back().enableAutodetect(); } sawPairedLibrary = true; } if (opt.string_key == "unmatedReads") { if (!sawLibType) { log->warn("Encountered a read file (--unmatedReads/-r) before a " "library type specification. The (--libType/-l) " "option must precede the input files."); seLibs.clear(); return seLibs; } seLibs.back().addUnmated(opt.value); if (autoLibType) { seLibs.back().enableAutodetect(); } sawUnpairedLibrary = true; } } std::vector libs; // @Avi : Allow this temporarily for now, since there is some use to hijack // this behavior in Alevin. However, we should figure out a proper parsing // strategy for that rather than abusing single & PE library types. Once we // fix that, we should uncomment the below. /* if (sawPairedLibrary and sawUnpairedLibrary) { log->warn("It seems you have specified both paired-end and unpaired read " "libraries. Salmon does not accepted mixed library types, and " "different library types should typically not be quantified together " "anyway. Please quantifiy distinct library types separately."); return libs; } */ (void)sawPairedLibrary; (void)sawUnpairedLibrary; libs.reserve(peLibs.size() + seLibs.size()); for (auto& lib : boost::range::join(seLibs, peLibs)) { if (lib.format().type == ReadType::SINGLE_END) { if (lib.unmated().size() == 0) { // Didn't use default single end library type continue; } } else if (lib.format().type == ReadType::PAIRED_END) { if (lib.mates1().size() == 0 or lib.mates2().size() == 0) { // Didn't use default paired-end library type continue; } } libs.push_back(lib); } size_t numLibs = libs.size(); if (numLibs == 1) { log->info("There is 1 library."); } else if (numLibs > 1) { log->info("There are {} libraries.", numLibs); } return libs; } /** * This function parses the library format string that specifies the format in * which * the reads are to be expected. */ LibraryFormat parseLibraryFormatString(std::string& fmt) { using std::vector; using std::string; using std::map; using std::stringstream; // inspired by // http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c // first convert the string to upper-case for (auto& c : fmt) { c = std::toupper(c); } // split on the delimiter ':', and put the key, value (k=v) pairs into a map stringstream ss(fmt); string item; map kvmap; while (std::getline(ss, item, ':')) { auto splitPos = item.find('=', 0); string key{item.substr(0, splitPos)}; string value{item.substr(splitPos + 1)}; kvmap[key] = value; } map readType = {{"SE", ReadType::SINGLE_END}, {"PE", ReadType::PAIRED_END}}; map orientationType = { {">>", ReadOrientation::SAME}, {"<>", ReadOrientation::AWAY}, {"><", ReadOrientation::TOWARD}, {"*", ReadOrientation::NONE}}; map strandType = {{"SA", ReadStrandedness::SA}, {"AS", ReadStrandedness::AS}, {"A", ReadStrandedness::A}, {"S", ReadStrandedness::S}, {"U", ReadStrandedness::U}}; auto it = kvmap.find("T"); string typeStr = ""; if (it != kvmap.end()) { typeStr = it->second; } else { it = kvmap.find("TYPE"); if (it != kvmap.end()) { typeStr = it->second; } } if (typeStr != "SE" and typeStr != "PE") { string e = typeStr + " is not a valid read type; must be one of {SE, PE}"; throw std::invalid_argument(e); } ReadType type = (typeStr == "SE") ? ReadType::SINGLE_END : ReadType::PAIRED_END; ReadOrientation orientation = (type == ReadType::SINGLE_END) ? ReadOrientation::NONE : ReadOrientation::TOWARD; ReadStrandedness strandedness{ReadStrandedness::U}; // Construct the LibraryFormat class from the key, value map for (auto& kv : kvmap) { auto& k = kv.first; auto& v = kv.second; if (k == "O" or k == "ORIENTATION") { auto it = orientationType.find(v); if (it != orientationType.end()) { orientation = orientationType[it->first]; } else { string e = v + " is not a valid orientation type; must be one of {>>, <>, ><}"; throw std::invalid_argument(e); } } if (k == "S" or k == "STRAND") { auto it = strandType.find(v); if (it != strandType.end()) { strandedness = strandType[it->first]; } else { string e = v + " is not a valid strand type; must be one of {SA, AS, S, A, U}"; throw std::invalid_argument(e); } } } LibraryFormat lf(type, orientation, strandedness); return lf; } bool peekBAMIsPaired(const boost::filesystem::path& file) { namespace bfs = boost::filesystem; std::string readMode = "r"; if (bfs::is_regular_file(file)) { if (bfs::is_empty(file)) { fmt::MemoryWriter errstr; errstr << "file [" << file.string() << "] appears to be empty " "(i.e. it has size 0). This is likely an error. " "Please re-run salmon with a corrected input file.\n\n"; throw std::invalid_argument(errstr.str()); return false; } } if (file.extension() == ".bam") { readMode = "rb"; } auto* fp = scram_open(file.c_str(), readMode.c_str()); // If we couldn't open the file, then report this and exit. if (fp == NULL) { fmt::MemoryWriter errstr; errstr << "ERROR: Failed to open file " << file.string() << ", exiting!\n"; throw std::invalid_argument(errstr.str()); return false; } bam_seq_t* read = nullptr; read = staden::utils::bam_init(); bool didRead = (scram_get_seq(fp, &read) >= 0); bool isPaired{false}; if (didRead) { isPaired = bam_flag(read) & BAM_FPAIRED; } else { fmt::MemoryWriter errstr; errstr << "ERROR: Failed to read alignment from " << file.string() << ", exiting!\n"; staden::utils::bam_destroy(read); throw std::invalid_argument(errstr.str()); return false; } scram_close(fp); staden::utils::bam_destroy(read); return isPaired; } uint64_t encode(uint64_t tid, uint64_t offset) { uint64_t res = (((tid & 0xFFFFFFFF) << 32) | (offset & 0xFFFFFFFF)); return res; } uint32_t transcript(uint64_t enc) { uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32; return t; } uint32_t offset(uint64_t enc) { uint32_t o = enc & 0xFFFFFFFF; return o; } size_t numberOfReadsInFastaFile(const std::string& fname) { constexpr size_t bufferSize = 16184; char buffer[bufferSize]; std::ifstream ifile(fname, std::ifstream::in); ifile.rdbuf()->pubsetbuf(buffer, bufferSize); size_t numReads = 0; std::string s; while (ifile >> s) { if (s.front() == '>') { ++numReads; } } ifile.close(); return numReads; } bool readKmerOrder(const std::string& fname, std::vector& kmers) { std::ifstream mlist(fname, std::ios::in | std::ios::binary); // Get the number of kmers from file size_t numKmers{0}; mlist.read(reinterpret_cast(&numKmers), sizeof(size_t)); // Resize the array that will hold the sorted kmers kmers.resize(numKmers, 0); mlist.read(reinterpret_cast(&kmers[0]), sizeof(uint64_t) * kmers.size()); mlist.close(); return true; } template