/**
>HEADER
Copyright (c) 2013, 2014, 2015, 2016 Rob Patro rob.patro@cs.stonybrook.edu
This file is part of Salmon.
Salmon is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Salmon is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Salmon. If not, see .
#include
#include
#include
#include
// C++ string formatting library #include "spdlog/fmt/fmt.h"
// logger includes
#include "spdlog/spdlog.h"
enum class TargetColumn { LEN, ELEN, TPM, NREADS };
class QuantMergeOptions {
public:
std::vector samples;
std::vector names;
std::string outputName;
bool genesQuant;
std::string missingValue;
std::string outputCol;
std::shared_ptr log;
TargetColumn tcol;
void print() {
std::string slist;
for (size_t n = 0; n < samples.size(); ++n) {
if (n > 0) {
slist += ", ";
}
slist += samples[n];
}
log->info("samples: [ {} ]", slist);
if (names.size() > 0) {
std::string nlist;
for (size_t n = 0; n < names.size(); ++n) {
if (n > 0) {
nlist += ", ";
}
nlist += names[n];
}
log->info("sample names : [ {} ]", nlist);
}
log->info("output column : {}", outputCol);
log->info("output file : {}", outputName);
}
};
bool validateOptions(boost::program_options::variables_map& vm,
QuantMergeOptions& qmOpts,
std::shared_ptr log) {
// check that if we have a list of names, it's the same length
// as the list of samples
if (vm.count("names")) {
if (qmOpts.names.size() != qmOpts.samples.size()) {
log->critical("If you provide names for your samples, they number of "
"names must be the "
"same as the number of samples. You provided {} samples "
"but {} names",
qmOpts.samples.size(), qmOpts.names.size());
std::exit(1);
}
} else {
for (auto& sp : qmOpts.samples) {
qmOpts.names.push_back(boost::filesystem::path(sp).filename().string());
}
}
std::transform(qmOpts.outputCol.begin(), qmOpts.outputCol.end(),
qmOpts.outputCol.begin(),
[](unsigned char c) { return std::toupper(c); });
if (qmOpts.outputCol == "LEN" or qmOpts.outputCol == "LENGTH") {
qmOpts.tcol = TargetColumn::LEN;
} else if (qmOpts.outputCol == "ELEN" or qmOpts.outputCol == "ELENGTH" or
qmOpts.outputCol == "EFFLEN" or qmOpts.outputCol == "EFFLENGTH" or
qmOpts.outputCol == "EFFECTIVELEN" or
qmOpts.outputCol == "EFFECTIVELENGTH") {
qmOpts.tcol = TargetColumn::ELEN;
} else if (qmOpts.outputCol == "TPM") {
qmOpts.tcol = TargetColumn::TPM;
} else if (qmOpts.outputCol == "NUMREADS" or
qmOpts.outputCol == "NUM_READS" or qmOpts.outputCol == "NREADS") {
qmOpts.tcol = TargetColumn::NREADS;
} else {
qmOpts.log->critical(
"I do not understand the requested output column name, {}. "
"The output column should be one of {{len, elen, tmp, numreads}}.",
qmOpts.outputCol);
std::exit(1);
}
return true;
}
struct ExpressionRecord {
uint32_t snum;
uint32_t len;
double effectiveLen;
double tpm;
double numReads;
};
bool doMerge(QuantMergeOptions& qmOpts) {
std::unordered_map> recs;
for (uint32_t n = 0; n < qmOpts.samples.size(); ++n) {
auto& sampDir = qmOpts.samples[n];
auto quantFile = boost::filesystem::path(sampDir) /
(qmOpts.genesQuant?"quant.genes.sf":"quant.sf");
if (boost::filesystem::exists(quantFile) and
boost::filesystem::is_regular_file(quantFile)) {
qmOpts.log->info("Parsing {}", quantFile.string());
std::ifstream ifile(quantFile.string());
std::string header;
std::getline(ifile, header);
std::string targetName;
uint32_t len;
double effectiveLen, tpm, numReads;
while (ifile >> targetName >> len >> effectiveLen >> tpm >> numReads) {
recs[targetName].push_back({n, len, effectiveLen, tpm, numReads});
}
ifile.close();
} else {
qmOpts.log->critical("The sample directory {} either doesn't exist, "
"or doesn't contain a quant.sf file",
sampDir);
return false;
}
}
auto outputPath =
boost::filesystem::absolute(boost::filesystem::path(qmOpts.outputName))
.parent_path();
if (!boost::filesystem::exists(outputPath)) {
if (!boost::filesystem::create_directories(outputPath)) {
qmOpts.log->critical("Couldn't create output path {}",
outputPath.string());
std::exit(1);
}
}
size_t missingValues{0};
// Now, the path exists
std::ofstream outFile(qmOpts.outputName);
if (!outFile.is_open()) {
qmOpts.log->critical("Couldn't create output file {}", qmOpts.outputName);
outFile.close();
std::exit(1);
}
outFile << "Name";
for (size_t n = 0; n < qmOpts.samples.size(); ++n) {
outFile << '\t' << qmOpts.names[n];
}
outFile << '\n';
for (auto& kv : recs) {
auto& trecs = kv.second;
outFile << kv.first;
uint32_t nextTrec{0};
for (uint32_t n = 0; n < qmOpts.samples.size(); ++n) {
if (nextTrec < trecs.size() and trecs[nextTrec].snum == n) {
switch (qmOpts.tcol) {
case TargetColumn::LEN:
outFile << '\t' << trecs[nextTrec].len;
break;
case TargetColumn::ELEN:
outFile << '\t' << trecs[nextTrec].effectiveLen;
break;
case TargetColumn::TPM:
outFile << '\t' << trecs[nextTrec].tpm;
break;
case TargetColumn::NREADS:
outFile << '\t' << trecs[nextTrec].numReads;
break;
}
++nextTrec;
} else {
++missingValues;
outFile << '\t' << qmOpts.missingValue;
}
}
outFile << '\n';
}
outFile.close();
if (missingValues > 0) {
qmOpts.log->warn(
"There were {} missing entries (recorded as \"{}\") in the output",
missingValues, qmOpts.missingValue);
}
return true;
}
int salmonQuantMerge(int argc, const char* argv[]) {
using std::cerr;
using std::vector;
using std::string;
namespace bfs = boost::filesystem;
namespace po = boost::program_options;
QuantMergeOptions qmOpts;
po::options_description generic("\n"
"basic options");
generic.add_options()("version,v", "print version string")
("help,h", "produce help message")
("quants",
po::value>(&qmOpts.samples)->multitoken()->required(),
"List of quantification directories.")
("names", po::value>(&qmOpts.names)->multitoken(),
"Optional list of names to give to the samples.")
("column,c",
po::value(&qmOpts.outputCol)->required()->default_value("TPM"),
"The name of the column that will be merged together into the output "
"files. "
"The options are {len, elen, tpm, numreads}")
("genes", po::bool_switch(&(qmOpts.genesQuant))->default_value(false),
"Use gene quantification instead of transcript.")
("missing",
po::value(&qmOpts.missingValue)->required()->default_value("NA"),
"The value of missing values.")
("output,o", po::value(&qmOpts.outputName)->required(),
"Output quantification file.");
po::options_description all("salmon quantmerge options");
all.add(generic);
po::options_description visible("salmon quantmerge options");
visible.add(generic);
po::variables_map vm;
try {
auto orderedOptions =
po::command_line_parser(argc, argv).options(all).run();
po::store(orderedOptions, vm);
if (vm.count("help")) {
auto hstring = R"(
quantmerge
==========
Merge multiple quantification results into
a single file.
)";
std::cerr << hstring << std::endl;
std::cerr << visible << std::endl;
std::exit(0);
}
po::notify(vm);
size_t max_q_size = 131072;
spdlog::set_async_mode(max_q_size);
auto consoleSink =
std::make_shared();
auto consoleLog = spdlog::create("mergeLog", {consoleSink});
qmOpts.log = consoleLog;
validateOptions(vm, qmOpts, consoleLog);
qmOpts.print();
doMerge(qmOpts);
} catch (po::error& e) {
std::cerr << "Exception : [" << e.what() << "]. Exiting.\n";
std::exit(1);
} catch (const spdlog::spdlog_ex& ex) {
std::cerr << "logger failed with : [" << ex.what() << "]. Exiting.\n";
std::exit(1);
} catch (std::exception& e) {
std::cerr << "Exception : [" << e.what() << "]\n";
std::cerr << argv[0] << " quant was invoked improperly.\n";
std::cerr << "For usage information, try " << argv[0]
<< " quant --help\nExiting.\n";
std::exit(1);
}
return 0;
}