/**
>HEADER
    Copyright (c) 2013 Rob Patro robp@cs.cmu.edu

    This file is part of Sailfish.

    Sailfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Sailfish is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/
#ifndef GENOMIC_FEATURE_HPP
#define GENOMIC_FEATURE_HPP

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

// The parser below no longer needs TBB.  The include is retained (when the
// header is available) so that translation units which relied on getting it
// transitively through this file keep compiling.
#if defined(__has_include)
#if __has_include("tbb/concurrent_queue.h")
#include "tbb/concurrent_queue.h"
#endif
#else
#include "tbb/concurrent_queue.h"
#endif

/// Attribute holder that keeps only the `transcript_id` and `gene_id`
/// attributes of a record; every other attribute is dropped.
struct TranscriptGeneID {
  std::string transcript_id;
  std::string gene_id;

  /// Stores `val` if `key` is one of the two recognised attribute names.
  ///
  /// @param key attribute name.
  /// @param val attribute value.
  /// @return true if the attribute was recognised and stored, false otherwise.
  bool parseAttribute(const std::string& key, const std::string& val) {
    if (key == "transcript_id") {
      transcript_id = val;
      return true;
    }
    if (key == "gene_id") {
      gene_id = val;
      return true;
    }
    return false;
  }
};

std::ostream& operator<<(std::ostream& out, const TranscriptGeneID& ids);

/// One tab-separated annotation record.
///
/// @tparam StaticAttributes type of the `sattr` member.  The parser in
///         GTFParser calls `sattr.parseAttribute(key, val)` once for every
///         key/value pair found in the attribute column.
template <typename StaticAttributes>
class GenomicFeature {
 public:
  std::string seqID;
  std::string source;
  std::string type;
  int start, end;
  float score;
  char strand;
  char phase;
  StaticAttributes sattr;

  template <typename T>
  friend std::ostream& operator<<(std::ostream& out,
                                  const GenomicFeature<T>& gf);
  // NOTE(review): this friend takes a const reference while the free
  // declaration of operator>> below takes a non-const one, so they name two
  // different function templates.  Left unchanged here; confirm which one is
  // actually defined before unifying them.
  template <typename T>
  friend std::istream& operator>>(std::istream& in,
                                  const GenomicFeature<T>& gf);
};

template <typename StaticAttributes>
std::ostream& operator<<(std::ostream& out,
                         const GenomicFeature<StaticAttributes>& gf);

template <typename StaticAttributes>
std::istream& operator>>(std::istream& in,
                         GenomicFeature<StaticAttributes>& gf);

namespace GTFParser {

namespace detail {

/// Returns the tab-delimited field of `l` that starts at `head` and advances
/// `head` to the start of the next field.  When no further field exists,
/// `head` becomes `std::string::npos` and later calls return an empty string.
inline std::string nextField(const std::string& l, std::size_t& head) {
  if (head == std::string::npos || head > l.size()) {
    head = std::string::npos;
    return std::string();
  }
  const std::size_t tail = l.find('\t', head);
  if (tail == std::string::npos) {
    std::string last = l.substr(head);
    head = std::string::npos;
    return last;
  }
  std::string field = l.substr(head, tail - head);
  head = tail + 1;
  return field;
}

/// Returns the first character of `field`, or `fallback` if it is empty.
inline char firstCharOr(const std::string& field, char fallback) {
  return field.empty() ? fallback : field[0];
}

/// Splits one attribute token of the form `key value`, `key "value"`,
/// `key=value` or `key = "value"` into `key` and `val`.
///
/// Surrounding blanks are ignored.  A quoted value runs up to the closing
/// quote (or the end of the token if the quote is never closed); an unquoted
/// value runs up to the last non-blank character.
///
/// @return false if the token holds no key (blank token, or it starts with
///         '='), true otherwise.  A key without a value yields an empty `val`.
inline bool splitAttribute(const std::string& token, std::string& key,
                           std::string& val) {
  static const char* const blanks = " \t";
  key.clear();
  val.clear();

  const std::size_t keyBegin = token.find_first_not_of(blanks);
  if (keyBegin == std::string::npos) {
    return false;
  }
  const std::size_t keyEnd = token.find_first_of(" \t=", keyBegin);
  key = token.substr(keyBegin, keyEnd == std::string::npos
                                   ? std::string::npos
                                   : keyEnd - keyBegin);
  if (key.empty()) {
    return false;
  }
  if (keyEnd == std::string::npos) {
    return true;
  }

  // Skip the separator: blanks, optionally a single '=', then more blanks.
  std::size_t valBegin = token.find_first_not_of(blanks, keyEnd);
  if (valBegin != std::string::npos && token[valBegin] == '=') {
    valBegin = token.find_first_not_of(blanks, valBegin + 1);
  }
  if (valBegin == std::string::npos) {
    return true;
  }

  if (token[valBegin] == '"') {
    const std::size_t close = token.find('"', valBegin + 1);
    val = token.substr(valBegin + 1, close == std::string::npos
                                         ? std::string::npos
                                         : close - (valBegin + 1));
  } else {
    const std::size_t valEnd = token.find_last_not_of(blanks);
    val = token.substr(valBegin, valEnd - valBegin + 1);
  }
  return true;
}

/// True if `line` holds a record: it is neither blank nor a line whose first
/// non-blank character is '#'.
inline bool isRecordLine(const std::string& line) {
  const std::size_t first = line.find_first_not_of(" \t\r");
  return first != std::string::npos && line[first] != '#';
}

}  // namespace detail

/// Fills `gf` from one tab-separated record line.
///
/// The first eight columns are stored, in order, into `seqID`, `source`,
/// `type`, `start`, `end`, `score`, `strand` and `phase`.  The remainder of
/// the line (up to a newline, if any) is split on ';' and each key/value pair
/// is passed to `gf.sattr.parseAttribute(key, val)`.
///
/// Every one of the eight fixed fields is always assigned.  Columns that are
/// missing from a short line produce an empty string, 0, or '.' for
/// `strand`/`phase`; non-numeric numeric columns (such as a "." score)
/// produce 0.
///
/// @param l  the record line.
/// @param gf feature to populate; `sattr` is only touched through
///           `parseAttribute`.
template <typename CustomGenomicFeature>
void genomicFeatureFromLine(const std::string& l, CustomGenomicFeature& gf) {
  std::size_t head = 0;
  gf.seqID = detail::nextField(l, head);
  gf.source = detail::nextField(l, head);
  gf.type = detail::nextField(l, head);
  gf.start = std::atoi(detail::nextField(l, head).c_str());
  gf.end = std::atoi(detail::nextField(l, head).c_str());
  // The score member is a float, so parse it as floating point.
  gf.score = static_cast<float>(std::atof(detail::nextField(l, head).c_str()));
  gf.strand = detail::firstCharOr(detail::nextField(l, head), '.');
  gf.phase = detail::firstCharOr(detail::nextField(l, head), '.');

  std::string attrs;
  if (head != std::string::npos) {
    const std::size_t tail = l.find('\n', head);
    attrs = l.substr(head, tail == std::string::npos ? std::string::npos
                                                     : tail - head);
  }

  // parseAttribute is handed non-const lvalues so that attribute types whose
  // parseAttribute takes `std::string&` parameters remain usable.
  std::string key;
  std::string val;
  std::size_t tokBegin = 0;
  while (tokBegin < attrs.size()) {
    std::size_t tokEnd = attrs.find(';', tokBegin);
    if (tokEnd == std::string::npos) {
      tokEnd = attrs.size();
    }
    if (detail::splitAttribute(attrs.substr(tokBegin, tokEnd - tokBegin), key,
                               val)) {
      gf.sattr.parseAttribute(key, val);
    }
    tokBegin = tokEnd + 1;
  }
}

/// Reads every record of the file `fname` and returns the parsed features.
///
/// The file is read line by line; blank lines and lines whose first non-blank
/// character is '#' are skipped.  The remaining lines are parsed with
/// genomicFeatureFromLine, split into contiguous slices across at most ten
/// threads, each thread writing only to its own slice of the result.  The
/// returned features are in file order.
///
/// @param fname path of the file to read.
/// @return one feature per record line; empty if the file cannot be opened.
template <typename StaticAttributes>
std::vector<GenomicFeature<StaticAttributes>> readGTFFile(
    const std::string& fname) {
  std::vector<GenomicFeature<StaticAttributes>> feats;

  std::ifstream ifile(fname);
  if (!ifile) {
    return feats;
  }

  std::vector<std::string> lines;
  for (std::string line; std::getline(ifile, line);) {
    if (detail::isRecordLine(line)) {
      lines.push_back(std::move(line));
    }
  }
  ifile.close();

  const std::size_t n = lines.size();
  feats.resize(n);

  const std::size_t maxParsers = 10;
  const std::size_t nworkers = std::min<std::size_t>(maxParsers, n);
  if (nworkers <= 1) {
    for (std::size_t i = 0; i < n; ++i) {
      genomicFeatureFromLine(lines[i], feats[i]);
    }
    return feats;
  }

  const std::size_t chunk = (n + nworkers - 1) / nworkers;
  std::vector<std::thread> threads;
  threads.reserve(nworkers);
  for (std::size_t w = 0; w < nworkers; ++w) {
    const std::size_t begin = std::min(n, w * chunk);
    const std::size_t end = std::min(n, begin + chunk);
    threads.emplace_back([&lines, &feats, begin, end]() {
      for (std::size_t i = begin; i < end; ++i) {
        genomicFeatureFromLine(lines[i], feats[i]);
      }
    });
  }

  // Wait for all of the threads to finish
  for (auto& thread : threads) {
    thread.join();
  }
  return feats;
}

}  // namespace GTFParser

#endif  // GENOMIC_FEATURE_HPP