#include "DedupUMI.hpp" #include "tsl/hopscotch_map.h" uint32_t getGeneId(spp::sparse_hash_map &txpToGeneMap, uint32_t tId ) { if( txpToGeneMap.contains(tId) ){ return txpToGeneMap.at(tId); } else{ std::cerr << "\n\n\nOut of Range error for txp to gene Map: " << tId << "\t not found" << std::flush; exit(74); } } // choosing list for edges and vector for adjacency container void graphFromCell(std::vector& txpGroups, std::vector& umiGroups, alevin::graph::Graph& g, uint32_t umiEditDistance, uint32_t umiLength, std::atomic& totalUniEdgesCounts, std::atomic& totalBiEdgesCounts) { using namespace alevin::graph; spp::sparse_hash_map> tidMap; // Get a map from each transcript to it's list of eq class size_t numClasses = txpGroups.size(); for (size_t eqId=0; eqId gSet; for ( uint32_t txp : txpGroups[eqId] ) { tidMap[txp].emplace_back(eqId); } } // alevin kmer object alevin::types::AlevinUMIKmer umiObj; spp::sparse_hash_map> vertexIndexMap; vertexIndexMap.reserve(numClasses); //iterating over all eqclasses for (size_t eqId=0; eqId> umiSeqCounts; for(auto& it: umiGroups[eqId]) { umiSeqCounts.emplace_back(std::make_pair(it.first, it.second)); } for ( size_t uId=0; uId(eqId), static_cast(uId)); uint32_t v1 = alevin::graph::getVertexIndex(vertexIndexMap, node); for ( size_t uId_second=uId+1; uId_second(eqId), static_cast(uId_second)); uint32_t v2 = alevin::graph::getVertexIndex(vertexIndexMap, node_second); switch ( edge ) { case EdgeType::BiDirected: totalBiEdgesCounts += 1; g.add_edge(v1, v2); g.add_edge(v2, v1); break; case EdgeType::XToY: totalUniEdgesCounts += 1; g.add_edge(v1, v2); break; case EdgeType::YToX: totalUniEdgesCounts += 1; g.add_edge(v2, v1); break; case EdgeType::NoEdge: break; }; } }//end-inner-for spp::sparse_hash_set hSet; TGroupT& tgroup = txpGroups[eqId]; size_t numTxps = tgroup.size(); // iterate over all the transcripts for ( auto& txp: tgroup ) { for (uint32_t eq2Id: tidMap[txp]) { if (eq2Id <= eqId) { continue; } if ( hSet.contains(eq2Id) ) { continue; } hSet.insert(eq2Id); size_t num2Umis = umiGroups[eq2Id].size(); // extracting umi sequences std::vector> umi2SeqCounts; for(auto& it: umiGroups[eq2Id]) { umi2SeqCounts.emplace_back(std::make_pair(it.first, it.second)); } for ( size_t uId=0; uId(eqId), static_cast(uId)); uint32_t v1 = alevin::graph::getVertexIndex(vertexIndexMap, node); for ( size_t uId_second=0; uId_second(eq2Id), static_cast(uId_second)); if ( node == node_second or edge == EdgeType::NoEdge ) { continue; } uint32_t v2 = alevin::graph::getVertexIndex(vertexIndexMap, node_second); switch ( edge ) { case EdgeType::BiDirected: totalBiEdgesCounts += 1; g.add_edge(v1, v2); g.add_edge(v2, v1); break; case EdgeType::XToY: totalUniEdgesCounts += 1; g.add_edge(v1, v2); break; case EdgeType::YToX: totalUniEdgesCounts += 1; g.add_edge(v2, v1); break; case EdgeType::NoEdge: break; }; } //end-for inner UMI }//end-for outerUMI }//end-for eq2Id }//end-inner for for txp }//end-outer-for size_t num_vertices = vertexIndexMap.size(); g.vertexNames.resize(num_vertices); for (auto& it: vertexIndexMap) { g.vertexNames[it.second] = it.first; } // Done populating graph object } void collapseVertices(uint32_t vertex, alevin::graph::Graph& g, std::vector& txpGroups, std::vector& largestMcc, spp::sparse_hash_set& processedSet) { uint32_t eqclassId = g.getEqclassId(vertex); for (uint32_t txp: txpGroups[eqclassId]){ std::deque bfsList; bfsList.push_back(vertex); spp::sparse_hash_set visitedSet; visitedSet.insert(vertex); std::vector currentMcc; while ( bfsList.size() != 0 ){ uint32_t cv = bfsList.front(); bfsList.pop_front(); currentMcc.emplace_back(cv); for (auto nextVertex: g.getNeighbors(vertex)) { if (visitedSet.contains(nextVertex) || !processedSet.contains(nextVertex)) { continue; } else{ visitedSet.insert(nextVertex); } // extract transcripts from new vertex uint32_t neqclassId = g.getEqclassId( nextVertex ); for (uint32_t ntxp: txpGroups[neqclassId]) { if (ntxp == txp){ bfsList.emplace_back(nextVertex); break; } }//end-txp group for }//end-neighbors-for }//end-while if (largestMcc.size() < currentMcc.size()) { largestMcc = currentMcc; } } //end-for } void getNumMoleculesWithArborescence(alevin::graph::Graph& g, std::vector& txpGroups, std::vector& umiGroups, spp::sparse_hash_map& t2gMap, std::vector>& arboEqClassCount, std::vector& salmonEqclasses){ // get connected components std::vector component; uint32_t numComps = g.connected_components(component); spp::sparse_hash_map, std::pair>, boost::hash>> eqclassHash; // making sets of relevant connected vertices std::vector> comps (numComps); for (size_t i=0; i(i)); } spp::sparse_hash_map compCounter; // iterating over connected components for (auto& comp: comps) { compCounter[comp.size()] += 1; // more than one vertex in the component if ( comp.size() > 1 ) { spp::sparse_hash_set vset(comp.begin(), comp.end()); while ( vset.size() != 0 ){ std::vector bestMcc; for (uint32_t vertex: vset) { std::vector newMcc; collapseVertices(vertex, g, txpGroups, newMcc, vset); //choose the longer collapse: Greedy if (bestMcc.size() < newMcc.size()) { bestMcc = newMcc; } }// end-vset for tsl::hopscotch_map globalTxpCounts; for (size_t vId=0; vId globalTxps; globalTxps.reserve(globalTxpCounts.size()); for (auto kv : globalTxpCounts) { if (kv.second == requiredCount) { globalTxps.push_back(kv.first); } } if( globalTxps.size() == 0 ) { std::cerr << "can't find a representative transcript for a molecule\n" << "Please report this on github "; exit(74); } uint16_t readspmol {0}; for (auto rv: bestMcc){ uint32_t eId = g.getEqclassId(rv); uint32_t uId = g.getUId(rv); auto it = umiGroups[eId].begin(); std::advance(it, uId); readspmol += it->second; vset.erase(rv); } spp::sparse_hash_set globalGenes; for(auto txp: globalTxps){ uint32_t geneId = getGeneId(t2gMap, txp); globalGenes.insert(geneId); } std::vector genesVec (globalGenes.begin(), globalGenes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec].first += 1; eqclassHash[genesVec].second[readspmol] += 1; }//end-while } // end-if comp.size()>1 else{ assert(comp.size() == 1); uint32_t vertex = comp[0]; uint32_t eId = g.getEqclassId(vertex); uint32_t uId = g.getUId(vertex); auto it = umiGroups[eId].begin(); std::advance(it, uId); uint16_t readspmol = it->second; uint32_t eqclassId = g.getEqclassId(vertex); TGroupT txps = txpGroups[eqclassId]; spp::sparse_hash_set genes; for (auto txp: txps) { uint32_t gId = getGeneId(t2gMap, txp); genes.insert(gId); } assert(genes.size() > 0); std::vector genesVec (genes.begin(), genes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec].first += 1; eqclassHash[genesVec].second[readspmol] += 1; }//end-else comp.size()==1 } //end-outer for comps iterator for (auto& it: eqclassHash) { SalmonEqClass eqclass = { it.first, it.second.first, }; salmonEqclasses.emplace_back(eqclass); arboEqClassCount.emplace_back(it.second.second); } } void getNumMolecules(alevin::graph::Graph& g, std::vector& txpGroups, std::vector& umiGroups, spp::sparse_hash_map& t2gMap, std::vector& salmonEqclasses){ // get connected components std::vector component; uint32_t numComps = g.connected_components(component); spp::sparse_hash_map, uint32_t, boost::hash>> eqclassHash; // making sets of relevant connected vertices std::vector> comps (numComps); for (size_t i=0; i(i)); } spp::sparse_hash_map compCounter; // iterating over connected components for (auto& comp: comps) { compCounter[comp.size()] += 1; // more than one vertex in the component if ( comp.size() > 1 ) { spp::sparse_hash_set vset(comp.begin(), comp.end()); while ( vset.size() != 0 ){ std::vector bestMcc; for (uint32_t vertex: vset) { std::vector newMcc; collapseVertices(vertex, g, txpGroups, newMcc, vset); //choose the longer collapse: Greedy if (bestMcc.size() < newMcc.size()) { bestMcc = newMcc; } }// end-vset for tsl::hopscotch_map globalTxpCounts; for (size_t vId=0; vId globalTxps; globalTxps.reserve(globalTxpCounts.size()); for (auto kv : globalTxpCounts) { if (kv.second == requiredCount) { globalTxps.push_back(kv.first); } } if( globalTxps.size() == 0 ) { std::cerr << "can't find a representative transcript for a molecule\n" << "Please report this on github "; exit(74); } uint16_t readspmol {0}; for (auto rv: bestMcc){ uint32_t eId = g.getEqclassId(rv); uint32_t uId = g.getUId(rv); auto it = umiGroups[eId].begin(); std::advance(it, uId); readspmol += it->second; vset.erase(rv); } spp::sparse_hash_set globalGenes; for(auto txp: globalTxps){ uint32_t geneId = getGeneId(t2gMap, txp); globalGenes.insert(geneId); } std::vector genesVec (globalGenes.begin(), globalGenes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec] += 1; }//end-while } // end-if comp.size()>1 else{ assert(comp.size() == 1); uint32_t vertex = comp[0]; uint32_t eId = g.getEqclassId(vertex); uint32_t uId = g.getUId(vertex); auto it = umiGroups[eId].begin(); std::advance(it, uId); uint16_t readspmol = it->second; uint32_t eqclassId = g.getEqclassId(vertex); TGroupT txps = txpGroups[eqclassId]; spp::sparse_hash_set genes; for (auto txp: txps) { uint32_t gId = getGeneId(t2gMap, txp); genes.insert(gId); } assert(genes.size() > 0); std::vector genesVec (genes.begin(), genes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec] += 1; }//end-else comp.size()==1 } //end-outer for comps iterator for (auto& it: eqclassHash) { SalmonEqClass eqclass = { it.first, it.second, }; salmonEqclasses.emplace_back(eqclass); } } void assignTiers(std::vector& txpGroups, spp::sparse_hash_map& txpToGeneMap, std::vector& tiers) { // adding tiers to the genes std::vector> geneClasses; spp::sparse_hash_map vertexIndices; for (auto& eclass: txpGroups) { spp::sparse_hash_set genes; for(auto txp: eclass){ uint32_t gene = getGeneId(txpToGeneMap, txp); genes.insert(gene); } // first tier if (genes.size() == 1){ tiers[*genes.begin()] = 1; } else { // have to re parse for second and third tier std::vector geneClass (genes.begin(), genes.end()); geneClasses.emplace_back(geneClass); // populate gene indices for(auto gene: geneClass){ if (!vertexIndices.contains(gene)){ auto gid = vertexIndices.size(); vertexIndices[gene] = gid; } } }//end-else }//end-for //generating edges for the graph spp::sparse_hash_map> edges; for (auto& geneClass: geneClasses) { for (size_t i=0; i AdjList; AdjList adjList; for (auto& it: edges) { uint32_t source = it.first; for(uint32_t target: it.second) { boost::add_edge(source, target, adjList); } } //find the connected component std::vector component(num_vertices(adjList)); uint32_t numComps = boost::connected_components(adjList, component.data()); // making sets of relevant connected vertices std::vector> comps (numComps); for (size_t i=0; i(i)); } std::vector indexToGene(vertexIndices.size()); for(auto& it: vertexIndices){ indexToGene[it.second] = it.first; } if(component.size() != vertexIndices.size()){ std::cerr<<"ERROR: tiers size doesn't match"; std::exit(74); } // iterating over connected components and assigning tiers for (auto& comp: comps) { bool tier2flag = false; for(auto geneIndex: comp) { if (geneIndex >= indexToGene.size()){ std::cerr<<"ERROR:" << geneIndex <<" gene Index > indexToGene size" << indexToGene.size() << std::flush; std::exit(74); } uint32_t gene = indexToGene[geneIndex]; if (tiers[gene] == 1){ tier2flag = true; break; } }//end gene for uint8_t tierCategory = tier2flag ? 2:3; for(auto geneIndex: comp){ uint32_t gene = indexToGene[geneIndex]; tiers[gene] = tierCategory; } //end-for }//end eclass for } bool dedupClasses(std::vector& geneAlphas, double& totalUMICount, std::vector& txpGroups, std::vector& umiGroups, std::vector& salmonEqclasses, uint32_t umiLength, spp::sparse_hash_map& txpToGeneMap, std::vector& tiers, GZipWriter& gzw, uint32_t umiEditDistance, bool dumpUmiGraph, std::string& trueBarcodeStr, std::vector>& arboEqClassCount, bool dumpArborescences, std::atomic& totalUniEdgesCounts, std::atomic& totalBiEdgesCounts){ // make directed graph from eqclasses alevin::graph::Graph g; graphFromCell(txpGroups, umiGroups, g, umiEditDistance, umiLength, totalUniEdgesCounts, totalBiEdgesCounts); if (dumpUmiGraph){ gzw.writeUmiGraph(g, trueBarcodeStr); } // assign tiers to the genes assignTiers(txpGroups, txpToGeneMap, tiers); if (dumpArborescences) { // make gene based eqclasses getNumMoleculesWithArborescence(g, txpGroups, umiGroups, txpToGeneMap, arboEqClassCount, salmonEqclasses); } else { // make gene based eqclasses getNumMolecules(g, txpGroups, umiGroups, txpToGeneMap, salmonEqclasses); } for( auto& eqclass: salmonEqclasses ) { if ( eqclass.labels.size() == 1 ) { totalUMICount += eqclass.count; geneAlphas[eqclass.labels.front()] += eqclass.count; } else if (eqclass.labels.size() == 0){ std::cerr<<"Eqclasses with No gene labels\n"; return false; } } if (dumpArborescences and arboEqClassCount.size() != salmonEqclasses.size()) { std::cerr<<"Incorrect Arborescence features\n"; return false; } return true; }