#include "DedupUMI.hpp" #include "tsl/hopscotch_map.h" uint32_t getGeneId(spp::sparse_hash_map &txpToGeneMap, uint32_t tId ) { if( txpToGeneMap.contains(tId) ){ return txpToGeneMap.at(tId); } else{ std::cerr << "Out of Range error for txp to gene Map: " << '\n' << std::flush; std::cerr << tId << "\t not found" << std::flush; exit(1); } } // choosing list for edges and vector for adjacency container void graphFromCell(std::vector& txpGroups, std::vector& umiGroups, alevin::graph::Graph& g, std::atomic& totalUniEdgesCounts, std::atomic& totalBiEdgesCounts) { using namespace alevin::graph; spp::sparse_hash_map> tidMap; // Get a map from each transcript to it's list of eq class size_t numClasses = txpGroups.size(); for (size_t eqId=0; eqId gSet; for ( uint32_t txp : txpGroups[eqId] ) { tidMap[txp].emplace_back(eqId); } } // alevin kmer object alevin::types::AlevinUMIKmer umiObj; spp::sparse_hash_map> vertexIndexMap; vertexIndexMap.reserve(numClasses); //iterating over all eqclasses for (size_t eqId=0; eqId> umiSeqCounts; for(auto& it: umiGroups[eqId]) { umiSeqCounts.emplace_back(std::make_pair(it.first, it.second)); } for ( size_t uId=0; uId(eqId), static_cast(uId)); uint32_t v1 = alevin::graph::getVertexIndex(vertexIndexMap, node); for ( size_t uId_second=uId+1; uId_second(eqId), static_cast(uId_second)); uint32_t v2 = alevin::graph::getVertexIndex(vertexIndexMap, node_second); //check if two UMI can be connected EdgeType edge = alevin::graph::hasEdge( umiSeqCounts[uId], umiSeqCounts[uId_second]); switch ( edge ) { case EdgeType::BiDirected: totalBiEdgesCounts += 1; g.add_edge(v1, v2); g.add_edge(v2, v1); break; case EdgeType::XToY: totalUniEdgesCounts += 1; g.add_edge(v1, v2); break; case EdgeType::YToX: totalUniEdgesCounts += 1; g.add_edge(v2, v1); break; case EdgeType::NoEdge: break; }; } }//end-inner-for spp::sparse_hash_set hSet; TGroupT& tgroup = txpGroups[eqId]; size_t numTxps = tgroup.size(); // iterate over all the transcripts for ( auto& txp: tgroup ) { for (uint32_t eq2Id: tidMap[txp]) { if (eq2Id <= eqId) { continue; } if ( hSet.contains(eq2Id) ) { continue; } hSet.insert(eq2Id); size_t num2Umis = umiGroups[eq2Id].size(); // extracting umi sequences std::vector> umi2SeqCounts; for(auto& it: umiGroups[eq2Id]) { umi2SeqCounts.emplace_back(std::make_pair(it.first, it.second)); } for ( size_t uId=0; uId(eqId), static_cast(uId)); uint32_t v1 = alevin::graph::getVertexIndex(vertexIndexMap, node); for ( size_t uId_second=0; uId_second(eq2Id), static_cast(uId_second)); if ( node == node_second ) { continue; } uint32_t v2 = alevin::graph::getVertexIndex(vertexIndexMap, node_second); //check if two UMI can be connected EdgeType edge = alevin::graph::hasEdge( umiSeqCounts[uId], umi2SeqCounts[uId_second]); switch ( edge ) { case EdgeType::BiDirected: totalBiEdgesCounts += 1; g.add_edge(v1, v2); g.add_edge(v2, v1); break; case EdgeType::XToY: totalUniEdgesCounts += 1; g.add_edge(v1, v2); break; case EdgeType::YToX: totalUniEdgesCounts += 1; g.add_edge(v2, v1); break; case EdgeType::NoEdge: break; }; } //end-for inner UMI }//end-for outerUMI }//end-for eq2Id }//end-inner for for txp }//end-outer-for size_t num_vertices = vertexIndexMap.size(); g.vertexNames.resize(num_vertices); for (auto& it: vertexIndexMap) { g.vertexNames[it.second] = it.first; } // Done populating graph object } void collapseVertices(uint32_t vertex, alevin::graph::Graph& g, std::vector& txpGroups, uint32_t& chosenTxp, std::vector& largestMcc, spp::sparse_hash_set& processedSet) { uint32_t eqclassId = g.getEqclassId(vertex); for (uint32_t txp: txpGroups[eqclassId]){ std::deque bfsList; bfsList.push_back(vertex); spp::sparse_hash_set visitedSet; visitedSet.insert(vertex); std::vector currentMcc; while ( bfsList.size() != 0 ){ uint32_t cv = bfsList.front(); bfsList.pop_front(); currentMcc.emplace_back(cv); for (auto nextVertex: g.getNeighbors(vertex)) { if (visitedSet.contains(nextVertex) || !processedSet.contains(nextVertex)) { continue; } else{ visitedSet.insert(nextVertex); } // extract transcripts from new vertex eqclassId = g.getEqclassId( nextVertex ); for (uint32_t ntxp: txpGroups[eqclassId]) { if (ntxp == txp){ bfsList.emplace_back(nextVertex); break; } }//end-txp group for }//end-neighbors-for }//end-while if (largestMcc.size() < currentMcc.size()) { largestMcc = currentMcc; chosenTxp = txp; } } //end-for } void getNumMolecules(alevin::graph::Graph& g, std::vector& txpGroups, spp::sparse_hash_map& t2gMap, std::vector& salmonEqclasses){ // get connected components std::vector component; uint32_t numComps = g.connected_components(component); spp::sparse_hash_map, uint32_t, boost::hash>> eqclassHash; // making sets of relevant connected vertices std::vector> comps (numComps); for (size_t i=0; i(i)); } spp::sparse_hash_map compCounter; // iterating over connected components for (auto& comp: comps) { compCounter[comp.size()] += 1; // more than one vertex in the component if ( comp.size() > 1 ) { spp::sparse_hash_set vset(comp.begin(), comp.end()); while ( vset.size() != 0 ){ std::vector bestMcc; for (uint32_t vertex: vset) { uint32_t coveringTxp; std::vector newMcc; collapseVertices(vertex, g, txpGroups, coveringTxp, newMcc, vset); //choose the longer collapse: Greedy if (bestMcc.size() < newMcc.size()) { bestMcc = newMcc; } }// end-vset for tsl::hopscotch_map globalTxpCounts; for (size_t vId=0; vId localTxps; uint32_t eqclassId = g.getEqclassId(vertex); for (uint32_t txp: txpGroups[eqclassId]){ globalTxpCounts[txp] += 1; } }//end-mcc for // only transcripts that occur in *every* vertex, and hence // have a count of bestMcc.size(), are in the proper intersection. uint32_t requiredCount = bestMcc.size(); std::vector globalTxps; globalTxps.reserve(globalTxpCounts.size()); for (auto kv : globalTxpCounts) { if (kv.second == requiredCount) { globalTxps.push_back(kv.first); } } if( globalTxps.size() == 0 ) { std::cerr << "can't find a representative transcript for a molecule\n" << "Please report this on github"; exit(1); } for (auto rv: bestMcc){ vset.erase(rv); } spp::sparse_hash_set globalGenes; for(auto txp: globalTxps){ uint32_t geneId = getGeneId(t2gMap, txp); globalGenes.insert(geneId); } std::vector genesVec (globalGenes.begin(), globalGenes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec] += 1; }//end-while } // end-if comp.size()>1 else{ assert(comp.size() == 1); uint32_t vertex = comp[0]; uint32_t eqclassId = g.getEqclassId(vertex); TGroupT txps = txpGroups[eqclassId]; spp::sparse_hash_set genes; for (auto txp: txps) { uint32_t gId = getGeneId(t2gMap, txp); genes.insert(gId); } assert(genes.size() > 0); std::vector genesVec (genes.begin(), genes.end()); std::sort (genesVec.begin(), genesVec.end()); eqclassHash[genesVec] += 1; }//end-else comp.size()==1 } //end-outer for comps iterator //size_t count{0}; //for (auto& it: compCounter) { // count += it.second; // std::cerr << "Size: " << it.first << " Count: " << it.second << std::endl; //} //std::cerr << "Total" << count << std::endl; //std::cerr<<"\n\n\n\n"; for (auto& it: eqclassHash) { SalmonEqClass eqclass = { it.first, it.second, }; salmonEqclasses.emplace_back(eqclass); } } void assignTiers(std::vector& txpGroups, spp::sparse_hash_map& txpToGeneMap, std::vector& tiers) { // adding tiers to the genes std::vector> geneClasses; spp::sparse_hash_map vertexIndices; for (auto& eclass: txpGroups) { spp::sparse_hash_set genes; for(auto txp: eclass){ uint32_t gene = getGeneId(txpToGeneMap, txp); genes.insert(gene); } // first tier if (genes.size() == 1){ tiers[*genes.begin()] = 1; } else { // have to re parse for second and third tier std::vector geneClass (genes.begin(), genes.end()); geneClasses.emplace_back(geneClass); // populate gene indices for(auto gene: geneClass){ if (!vertexIndices.contains(gene)){ auto gid = vertexIndices.size(); vertexIndices[gene] = gid; } } }//end-else }//end-for //generating edges for the graph spp::sparse_hash_map> edges; for (auto& geneClass: geneClasses) { for (size_t i=0; i AdjList; AdjList adjList; for (auto& it: edges) { uint32_t source = it.first; for(uint32_t target: it.second) { boost::add_edge(source, target, adjList); } } //find the connected component std::vector component(num_vertices(adjList)); uint32_t numComps = boost::connected_components(adjList, component.data()); // making sets of relevant connected vertices std::vector> comps (numComps); for (size_t i=0; i(i)); } std::vector indexToGene(vertexIndices.size()); for(auto& it: vertexIndices){ indexToGene[it.second] = it.first; } if(component.size() != vertexIndices.size()){ std::cerr<<"ERROR: tiers size doesn't match"; std::exit(1); } // iterating over connected components and assigning tiers for (auto& comp: comps) { bool tier2flag = false; for(auto geneIndex: comp) { if (geneIndex >= indexToGene.size()){ std::cerr<<"ERROR:" << geneIndex <<" gene Index > indexToGene size" << indexToGene.size() << std::flush; std::exit(1); } uint32_t gene = indexToGene[geneIndex]; if (tiers[gene] == 1){ tier2flag = true; break; } }//end gene for uint8_t tierCategory = tier2flag ? 2:3; for(auto geneIndex: comp){ uint32_t gene = indexToGene[geneIndex]; tiers[gene] = tierCategory; } //end-for }//end eclass for } bool dedupClasses(std::vector& geneAlphas, double& totalUMICount, std::vector& txpGroups, std::vector& umiGroups, std::vector& salmonEqclasses, spp::sparse_hash_map& txpToGeneMap, std::vector& tiers, GZipWriter& gzw, bool dumpUmiGraph, std::string& trueBarcodeStr, std::atomic& totalUniEdgesCounts, std::atomic& totalBiEdgesCounts){ // make directed graph from eqclasses alevin::graph::Graph g; graphFromCell(txpGroups, umiGroups, g, totalUniEdgesCounts, totalBiEdgesCounts); if (dumpUmiGraph){ gzw.writeUmiGraph(g, trueBarcodeStr); } // assign tiers to the genes assignTiers(txpGroups, txpToGeneMap, tiers); // make gene based eqclasses getNumMolecules(g, txpGroups, txpToGeneMap, salmonEqclasses); for( auto& eqclass: salmonEqclasses ) { if ( eqclass.labels.size() == 1 ) { totalUMICount += eqclass.count; geneAlphas[eqclass.labels.front()] += eqclass.count; } else if (eqclass.labels.size() == 0){ std::cerr<<"Eqclasses with No gene labels\n"; exit(1); } } return true; }