// Declaration of pufferfish::GFAReader. Only declarations live in this
// header; every member function is defined elsewhere.
//
// NOTE(review): this copy of the header looks damaged by extraction — the
// eight bare `#include` directives below have no header name, and most
// template argument lists are missing (e.g. `std::unique_ptr file`,
// `spp::sparse_hash_map contigid2seq`, `std::map, bool, ...>`). Restore the
// missing pieces from version control before building; nothing below tries
// to guess them.
#ifndef OUR_GFA_READER_H
#define OUR_GFA_READER_H

#include "FatPufferGraph.hpp"
#include "Util.hpp"
#include "cereal/types/string.hpp"
#include "cereal/types/vector.hpp"
//#include "sdsl/int_vector.hpp"
#include "sparsepp/spp.h"
#include "spdlog/spdlog.h"
#include "compact_vector/compact_vector.hpp"
#include "string_view.hpp"
#include "zstr/zstr.hpp"
// NOTE(review): the header names of the following eight includes are missing.
#include
#include
#include
#include
#include
#include
#include
#include

namespace pufferfish {

// Reader for a GFA file (see the constructor's `gfaFileName` argument).
// Holds the contig / reference / path tables declared below and exposes them
// by reference through the get*() accessors.
class GFAReader {
private:
  // presumably the path given to the constructor as gfaFileName — confirm
  // against the constructor definition.
  std::string filename_;
  // Owning handle to the input stream. NOTE(review): pointee type is missing
  // here; "zstr/zstr.hpp" is included above, so presumably a zstr stream —
  // confirm.
  std::unique_ptr file;
  // presumably the k-mer size passed to the constructor as input_k — confirm.
  size_t k;

  // Plain pair of a contig's sequence and its identifier.
  struct Contig {
    std::string seq;
    std::string id;
  };

  spp::sparse_hash_map contigid2seq; // map of contig_id to # of letters in contig (contig
                                     // length)
  // NOTE(review): by its name, maps a sequence to its contig id; key/value
  // types are missing in this copy — confirm.
  spp::sparse_hash_map seq2contigid;

  // path maps each transcript_id to a pair of
  // orientation : +/true main, -/false reverse
  // NOTE(review): the first element of the pair is missing from the comment
  // above (and the template arguments from the declaration below).
  spp::sparse_hash_map>> path;
  spp::sparse_hash_map refIDs;

  // spp::sparse_hash_map refMap;
  // Returned by getRefIDs() according to the accessor's return type; the
  // map-based variant above is disabled.
  std::vector refMap;
  // Exposed through getRefLengths(); presumably one entry per reference —
  // confirm.
  std::vector refLengths;

  // maps each contig to a list of positions in different transcripts
  // NOTE(review): the comment above may have originally belonged to
  // `contig2pos` (declared in the public section) rather than to explode().
  std::vector> explode(const stx::string_view str, const char& ch);

  // Ordered maps using pufferfish::util::cmpByPair as comparator; exposed via
  // getPathStart() / getPathEnd(). Key type is missing in this copy.
  std::map, bool, pufferfish::util::cmpByPair> pathStart;
  std::map, bool, pufferfish::util::cmpByPair> pathEnd;

  // Compact storage exposed through getContigSeqVec().
  compact::vector seqVec_;
  //edge table
  //ATGC|ATGC = 8 bits
  compact::vector edgeVec_;
  //predecessor,stores the same
  //transcript in reverse order
  //improve walkability
  //sdsl::int_vector<8> edgeVec2_;

  // Exposed through getNewSegments().
  std::vector> newSegments;
  // Graph object exposed through getSemiCG().
  pufg::Graph semiCG;

  // Private helper; by its name it populates the contig info map and returns
  // a size_t — what the return value means is not visible here.
  size_t fillContigInfoMap_();
  // Private helper; presumably tests whether `s` is numeric — confirm against
  // the definition.
  bool is_number(const std::string& s);

  // Avoiding un-necessary stream creation + replacing strings with string view
  // is a bit > than a 2x win!
  // implementation from : https://marcoarena.wordpress.com/tag/string_view/
  std::vector split(stx::string_view str, char delims);

  // Defaults to false; presumably set from the constructor's buildEdgeVEc
  // argument — confirm.
  bool buildEdgeVec_{false};
  // Defaults to nullptr. NOTE(review): pointee type is missing here;
  // "spdlog/spdlog.h" is included above, so presumably an spdlog logger —
  // confirm.
  std::shared_ptr logger_{nullptr};

public:
  // Public data member (not behind an accessor). By its name, maps a contig
  // to positions; element types are missing in this copy.
  spp::sparse_hash_map> contig2pos;

  // Construct a reader.
  //   gfaFileName  - C string naming the GFA input.
  //   input_k      - presumably the k-mer size (see member `k`) — confirm.
  //   buildEdgeVEc - presumably enables edge-vector construction (see
  //                  `buildEdgeVec_`) — confirm.
  //   logger       - shared logger handle.
  GFAReader(const char* gfaFileName, size_t input_k, bool buildEdgeVEc, std::shared_ptr logger);

  /*void encodeSeq(sdsl::int_vector<2>& seqVec, size_t offset,
                   stx::string_view str);
  */
  // Takes a compact vector by non-const reference, an offset, and a string
  // view; presumably writes an encoding of `str` into `seqVec` starting at
  // `offset` — confirm against the definition.
  void encodeSeq(compact::vector& seqVec, size_t offset, stx::string_view str);

  // Accessors below return non-const references, so callers can modify the
  // reader's tables in place.
  // spp::sparse_hash_map& getContigNameMap();
  spp::sparse_hash_map& getContigNameMap();
  spp::sparse_hash_map& getContigIDMap();

  // spp::sparse_hash_map& getRefIDs();
  std::vector& getRefIDs();
  std::vector& getRefLengths();
  std::map, bool, pufferfish::util::cmpByPair>& getPathStart();
  std::map, bool, pufferfish::util::cmpByPair>& getPathEnd();
  std::vector>& getNewSegments();
  compact::vector& getContigSeqVec();
  compact::vector& getEdgeVec();
  // NOTE(review): no `edgeVec2_` member is declared in this class (its
  // declaration above is commented out) — check what the definition returns.
  compact::vector& getEdgeVec2();
  // spp::sparse_hash_map
  // > >& getPaths() {return path;}
  // spp::sparse_hash_map
  // > >& getPaths() {return path;}
  pufg::Graph& getSemiCG();

  // Processing steps; the required call order is not visible from this
  // header — check the definitions and call sites.
  void parseFile();
  void mapContig2Pos();
  void clearContigTable();
  // Takes an output directory name.
  void serializeContigTable(const std::string& odir);
  // NOTE(review): takes no directory argument, unlike serializeContigTable —
  // confirm where it reads from.
  void deserializeContigTable();
  // void writeFile(std::string fileName);
};
} // end namespace pufferfish

#endif