#include "WhiteList.hpp" //#include "tbb/task_scheduler_init.h" #include #include namespace alevin { namespace whitelist { double get_log_likelihood(double prior, DoubleVectorT& sigma, DoubleVectorT& theta, DoubleVectorT& query){ double logPrior = prior ? std::log(prior) : 0.0; double mean, variance, likelihood {0.5}; // Gaussian Probability for (size_t i=0; i& classCount, uint32_t is_true){ size_t numFeatures = features[0].size(); for ( auto& feature: features ){ for (size_t i=0; i& classCount, DoubleVectorT& classPrior, /*size_t numClasses,*/ size_t numTrueCells, size_t numAmbiguousCells, size_t numFalseCells){ //std::assert( theta.size() == sigma.size() ); size_t numCells = features.size(); size_t numFeatures = features[0].size(); size_t trueStartIdx{0}, trueEndIdx{numTrueCells-1}; size_t falseStartIdx{numTrueCells+numAmbiguousCells}; size_t falseEndIdx{numCells-1}; //std::assert( falseEndIdx-falseStartIdx == numFalseCells); classCount[0] = numTrueCells; classCount[1] = numFalseCells; update_mean_variance(DoubleMatrixT (features.begin()+trueStartIdx, features.begin()+trueEndIdx), theta, sigma, classCount, 0); update_mean_variance(DoubleMatrixT (features.begin()+falseStartIdx, features.begin()+falseEndIdx), theta, sigma, classCount, 1); // calculate prior size_t norm = classCount[0] + classCount[1]; classPrior[0] = classCount[0] / norm; classPrior[1] = classCount[1] / norm; } // Implementation from https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py std::vector classifyBarcodes(DoubleMatrixT& featureCountsMatrix, size_t numCells, size_t numFeatures, size_t numLowConfidentBarcode, size_t numTrueCells, std::vector& trueProb, std::vector& falseProb){ size_t numFalseCells { numLowConfidentBarcode }; size_t numAmbiguousCells { numCells - numTrueCells - numLowConfidentBarcode }; size_t i, numClasses { 2 }; std::vector selectedBarcodes; std::vector classCount (numClasses, 0); std::vector classPrior (numClasses, 0.0); DoubleMatrixT theta (numClasses, DoubleVectorT (numFeatures, 0.0)); DoubleMatrixT sigma (numClasses, DoubleVectorT (numFeatures, 0.0)); naive_bayes_fit(featureCountsMatrix, theta, sigma, classCount, classPrior, /*numClasses,*/ numTrueCells, numAmbiguousCells, numFalseCells); //trueProb.resize(numAmbiguousCells, 0.0); //falseProb.resize(numAmbiguousCells, 0.0); for (auto vec: sigma){ for (auto cell : vec){ std::cout< bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode){ size_t numCells = trueBarcodes.size(); size_t numFeatures {5}; if (useMito) { ++numFeatures; } if (useRibo) { ++numFeatures; } aopt.numFeatures = numFeatures; aopt.jointLog->info("Starting to make feature Matrix"); size_t numTrueCells = ( numCells - numLowConfidentBarcode ) / 2; DoubleMatrixT featureCountsMatrix( numCells, DoubleVectorT (numFeatures, 0.0)); { // populating features boost::filesystem::path featuresFileName = aopt.outputDirectory / "featureDump.txt"; std::ifstream featuresfile(featuresFileName.string()); std::string line, token; size_t cellId {0}; while( std::getline( featuresfile, line) ) { cellId += 1; if (cellId == 1) { continue; } size_t featureId {0}; std::vector features; std::stringstream buffer(line); while( getline( buffer, token, '\t') ) { featureId += 1; if (aopt.dumpUmiGraph and featureId > 4+numFeatures) { break; } if (featureId > 4) { features.emplace_back( std::strtod(token.c_str(), nullptr)); } } if (features.size() != numFeatures) { aopt.jointLog->error("Size of the feature file doesn't match.\n" "Please report the error on github."); aopt.jointLog->flush(); exit(84); } featureCountsMatrix[cellId-2] = features; } if (cellId-1 != numCells) { aopt.jointLog->error("The features file has less number of cells.\n" "Please report the error on github."); aopt.jointLog->flush(); exit(84); } aopt.jointLog->info("Done making feature Matrix"); aopt.jointLog->flush(); } // done populating features std::vector trueProbs, falseProbs; std::vector whitelistBarcodes = classifyBarcodes(featureCountsMatrix, numCells, numFeatures, numLowConfidentBarcode, numTrueCells, trueProbs, falseProbs); auto whitelistFileName = aopt.outputDirectory / "whitelist.txt"; std::ofstream whitelistStream(whitelistFileName.string()); for (auto i: whitelistBarcodes){ whitelistStream << trueBarcodes[i] << std::endl; } whitelistStream.close(); aopt.intelligentCutoff = whitelistBarcodes.size(); if (aopt.dumpfeatures) { std::ofstream predictionStream; auto predictionFileName = aopt.outputDirectory / "predictions.txt"; predictionStream.open(predictionFileName.string()); predictionStream << "cb\ttrue_prob\tFalse_prob\n"; for (auto i: whitelistBarcodes){ if (i >= numTrueCells) { predictionStream << trueBarcodes[i] << "\t" << trueProbs[i-numTrueCells] << "\t" << falseProbs[i-numTrueCells] << "\t\n"; } } predictionStream.close(); }//end-if dumpfeatures return true; } template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); template bool performWhitelisting(AlevinOpts& aopt, std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); } }