#include "AlevinHash.hpp" void getTxpToGeneMap(spp::sparse_hash_map& txpToGeneMap, std::vector& transcripts, const std::string& geneMapFile, spp::sparse_hash_map& geneIdxMap){ std::ifstream t2gFile(geneMapFile); spp::sparse_hash_map txpIdxMap(transcripts.size()); for (size_t i=0; i> tStr >> gStr; if(not txpIdxMap.contains(tStr)){ continue; } tid = txpIdxMap[tStr]; if (geneIdxMap.contains(gStr)){ gid = geneIdxMap[gStr]; } else{ gid = geneCount; geneIdxMap[gStr] = gid; geneCount++; } txpToGeneMap[tid] = gid; } t2gFile.close(); } if(txpToGeneMap.size() < transcripts.size()){ std::cerr << "ERROR: " << "Txp to Gene Map not found for " << transcripts.size() - txpToGeneMap.size() <<" transcripts. Exiting" << std::flush; exit(1); } } size_t readBfh(bfs::path& eqFilePath, std::vector& txpNames, size_t bcLength, EqMapT &countMap, std::vector& bcNames, CFreqMapT& freqCounter, TrueBcsT& trueBarcodes, size_t& totalNormalized, bool hasWhitelist ) { if (hasWhitelist && trueBarcodes.size() == 0) { fmt::print(stderr, "whitelist file had 0 CB"); std::cerr<> numTxps; // Number of barcodes equivFile >> numBcs; // Number of equivalence classes equivFile >> numEqclasses; txpNames.resize(numTxps); for (size_t i=0; i> txpNames[i] ; } bcNames.resize(numBcs); for (size_t i=0; i> bcNames[i] ; if (bcNames[i].size() != bcLength) { fmt::print(stderr, "CB {} has wrong length", bcNames[i]); std::cerr< barcodeMap; { // bcname and bc index rearrangement based on external whitelist if (not hasWhitelist) { for (size_t i=0; i trueBarcodeMap; for (auto& bc: trueBarcodes) { trueBarcodeMap[bc] = idx; idx += 1; } // extracting relevant barcodes for (size_t i=0; i(fmt::GREEN); char red[] = "\x1b[30m"; red[3] = '0' + static_cast(fmt::RED); std::cerr<> labelSize; std::vector txps(labelSize); for (auto& tid : txps) { equivFile >> tid; } auto txGroup = TranscriptGroup (txps); size_t bgroupSize; equivFile >> count >> bgroupSize; size_t normalizer{0}; uint32_t countValidator {0}; for (size_t j=0; j> old_bc >> ugroupSize; if (not barcodeMap.contains(old_bc)) { skipCB = true; } else { new_bc = barcodeMap[old_bc]; bcName = bcNames[new_bc]; } for (size_t k=0; k> umiSeq >> umiCount; countValidator += umiCount; if (skipCB) {normalizer += umiCount; continue;} uint64_t umiIndex; bool isUmiIdxOk = umiObj.fromChars(umiSeq); if(isUmiIdxOk){ umiIndex = umiObj.word(0); auto upfn = [new_bc, umiIndex, umiCount](SCTGValue& x) -> void { // update the count x.count += umiCount; x.updateBarcodeGroup(new_bc, umiIndex, umiCount); }; SCTGValue value(umiCount, new_bc, umiIndex, true); countMap.upsert(txGroup, upfn, value); freqCounter[bcName] += umiCount; } }// end-ugroup for }//end-bgroup for if (count != countValidator){ fmt::print(stderr, "BFH eqclass count mismatch" "{} Orignial, validator {} " "Eqclass number {}", count, countValidator, i); std::cerr<(completionFrac)}; if ( percentCompletion % 10 == 0 || percentCompletion > 95) { fmt::print(stderr, "\r{}Done Reading : {}{}%{} and skipped reads: {}{}{}", green, red, percentCompletion, green, red, normalizer, RESET_COLOR); } }//end-eqclass for std::cerr< int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter) { TrueBcsT trueBarcodes; bool hasWhitelist = boost::filesystem::exists(aopt.whitelistFile); if( hasWhitelist ){ alevin::utils::readWhitelist(aopt.whitelistFile, trueBarcodes); aopt.jointLog->info("Done importing white-list Barcodes"); aopt.jointLog->info("Total {} white-listed Barcodes", trueBarcodes.size()); } EqMapT countMap; size_t numReads {0}; size_t totalNormalized {0}; std::vector txpNames, bcNames; { // Populating Bfh aopt.jointLog->info("Reading BFH"); aopt.jointLog->flush(); numReads = readBfh( aopt.bfhFile, txpNames, aopt.protocol.barcodeLength, countMap, bcNames, freqCounter, trueBarcodes, totalNormalized, hasWhitelist ); if( numReads == 0 ){ aopt.jointLog->error("error reading bfh"); aopt.jointLog->flush(); std::exit(74); } if (totalNormalized > 0) { aopt.jointLog->warn("Skipped {} reads due to external whitelist", totalNormalized); } aopt.jointLog->info("Fount total {} reads in bfh", numReads); aopt.jointLog->flush(); } // Done populating Bfh // extracting meta data for calling alevinOptimize aopt.jointLog->info("Reading transcript to gene Map"); spp::sparse_hash_map txpToGeneMap; spp::sparse_hash_map geneIdxMap; getTxpToGeneMap(txpToGeneMap, txpNames, aopt.geneMapFile.string(), geneIdxMap); GZipWriter gzw(outputDirectory, aopt.jointLog); alevinOptimize(bcNames, txpToGeneMap, geneIdxMap, countMap, aopt, gzw, freqCounter, 0); return 0; } template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter);