#include #include #include #include #include #include #include #include #include "Fasta_reader.hpp" #include "sequenceUtil.hpp" #include "argProcessor.hpp" #include "DeBruijnGraph.hpp" #include "string_util.hpp" #include "irke_common.hpp" #ifdef _OPENMP #include #else #define omp_get_max_threads() 1 #define omp_get_num_threads() 1 #define omp_get_thread_num() 0 #endif using namespace std; unsigned int IRKE_COMMON::MONITOR = 0; // prototypes string usage (ArgProcessor args); int fastaToDeBruijn (int argc, char* argv[]); DeBruijnGraph constructDeBruijnGraph (vector fasta_filenames, int kmer_length, int component_val, bool sStrand, ArgProcessor args); void createGraphPerRecord(vector fasta_filenames, int kmer_length, bool sStrand, ArgProcessor args); string usage (ArgProcessor) { stringstream usage_info; usage_info << endl << endl << "**Required" << endl << " --fasta " << ":fasta file containing reads" << endl << " -K " << ":kmer length" << endl << " and " << endl << " -C " << ":component identifier" << endl << " or " << endl << " --graph_per_record " << ": write separate graph for each fasta entry." << endl << endl << " **Optional" << endl << " --SS " << ":indicates strand-specific" << endl << " --toString " << ": dump graph as descriptive output" << endl << " --toDOT " << ": dump graph in dot format " << endl << " --monitor " << ": verbosity level" << endl << " --threads " << ": number of threads to utilize. " << endl << endl; return(usage_info.str()); } int fastaToDeBruijn (int argc, char* argv[]) { try { ArgProcessor args(argc, argv); /* Check for essential options */ if (args.isArgSet("--help") || args.isArgSet("-h") || (! (args.isArgSet("--fasta") && args.isArgSet("-K") && ( args.isArgSet("-C") || args.isArgSet("--graph_per_record") ) ) ) ) { cerr << usage(args) << endl << endl; return 1; } // required string fasta_filename = args.getStringVal("--fasta"); int kmer_length = args.getIntVal("-K"); // optional bool sStrand = false; if (args.isArgSet("--SS")) { sStrand = true; } if (args.isArgSet("--monitor")) { IRKE_COMMON::MONITOR = args.getIntVal("--monitor"); } if (args.isArgSet("--threads")) { int num_threads = args.getIntVal("--threads"); omp_set_num_threads(num_threads); } vector fasta_filenames; if (fasta_filename.find(',') != string::npos) { // cerr << "Splitting filenames out: " << fasta_filename << endl; string_util::tokenize(fasta_filename, fasta_filenames, ","); } else { //cerr << "Using single fasta filename: " << fasta_filename << endl; fasta_filenames.push_back(fasta_filename); } if (args.isArgSet("--graph_per_record")) { createGraphPerRecord(fasta_filenames, kmer_length, sStrand, args); } else { // one big graph int component_val = args.getIntVal("-C"); constructDeBruijnGraph(fasta_filenames, kmer_length, component_val, sStrand, args); } } catch(exception& e) { cerr << "error: " << e.what() << "\n"; return 1; } return(0); } void createGraphPerRecord(vector fasta_file_names, int kmer_length, bool sStrand, ArgProcessor args) { for (size_t i = 0; i < fasta_file_names.size(); i++) { string fasta_filename = fasta_file_names[i]; if (IRKE_COMMON::MONITOR > 1) cerr << "Parsing file: " << fasta_filename << endl; Fasta_reader fasta_reader(fasta_filename); #pragma omp parallel while (true) { Fasta_entry fe; bool got_fe = false; #pragma omp critical if (fasta_reader.hasNext()) { fe = fasta_reader.getNext(); got_fe = true; } if (! got_fe) { break; // no more fasta entries. } string accession = fe.get_accession(); // get component value vector acc_pts; string_util::tokenize(accession, acc_pts, "_"); int component_id = atoi(acc_pts[1].c_str()); // get iworm coverage values string header = fe.get_header(); vector iworm_cov_vals_str_vec; string_util::tokenize(header, iworm_cov_vals_str_vec, " "); iworm_cov_vals_str_vec.erase(iworm_cov_vals_str_vec.begin()); // remove acc value string sequence = fe.get_sequence(); DeBruijnGraph g(kmer_length); vector seq_regions; string_util::tokenize(sequence, seq_regions, "X"); // inchworm bundles concatenated with 'X' delimiters by Chrysalis if (seq_regions.size() != iworm_cov_vals_str_vec.size()) { cerr << "Error, number of seqs and number of cov vals don't match" << endl; exit(1); } for (size_t s = 0; s < seq_regions.size(); s++) { string seq_region = seq_regions[s]; unsigned int cov_val = atoi(iworm_cov_vals_str_vec[s].c_str()); //FIXME: change to strtoul if (seq_region.size() < kmer_length) { continue; } // can be encountered in jaccard-clip mode (rarely) if (contains_non_gatc(seq_region)) { seq_region = replace_nonGATC_chars_with_A(seq_region); } if (IRKE_COMMON::MONITOR > 2) cerr << "Adding sequence to graph: " << seq_region << endl; stringstream s_acc_val; s_acc_val << accession << "^" << s; g.add_sequence(s_acc_val.str(), seq_region, sStrand, cov_val); } // end sequence region if (args.isArgSet("--toString")) { #pragma omp critical cout << g.toString(); } else if (args.isArgSet("--toDOT")) { #pragma omp critical cout << g.toDOT(sStrand); } else { #pragma omp critical cout << g.toChrysalisFormat(component_id, sStrand); } } // end fasta entry } // end fasta file return; } DeBruijnGraph constructDeBruijnGraph (vector fasta_file_names, int kmer_length, int component_val, bool sStrand, ArgProcessor args) { DeBruijnGraph g(kmer_length); for (size_t i = 0; i < fasta_file_names.size(); i++) { string fasta_filename = fasta_file_names[i]; if (IRKE_COMMON::MONITOR > 1) cerr << "Parsing file: " << fasta_filename << endl; Fasta_reader fasta_reader(fasta_filename); while (fasta_reader.hasNext()) { Fasta_entry fe = fasta_reader.getNext(); string sequence = fe.get_sequence(); string accession = fe.get_accession(); // get iworm coverage values string header = fe.get_header(); vector iworm_cov_vals_str_vec; string_util::tokenize(header, iworm_cov_vals_str_vec, " "); iworm_cov_vals_str_vec.erase(iworm_cov_vals_str_vec.begin()); // remove acc value vector seq_regions; string_util::tokenize(sequence, seq_regions, "X"); // inchworm bundles concatenated with 'X' delimiters by Chrysalis for (size_t s = 0; s < seq_regions.size(); s++) { string seq_region = seq_regions[s]; unsigned int cov_val = atoi(iworm_cov_vals_str_vec[s].c_str()); //FIXME: change to strtoul if (contains_non_gatc(seq_region)) { seq_region = replace_nonGATC_chars_with_A(seq_region); } if (IRKE_COMMON::MONITOR > 2) cerr << "Adding sequence to graph: " << seq_region << endl; stringstream s_acc_val; s_acc_val << accession << "^" << s; g.add_sequence(s_acc_val.str(), seq_region, sStrand, cov_val); } } } if (args.isArgSet("--toString")) { cout << g.toString(); } else if (args.isArgSet("--toDOT")) { cout << g.toDOT(sStrand); } else { cout << g.toChrysalisFormat(component_val, sStrand); } return(g); } int main (int argc, char* argv[]) { try { return(fastaToDeBruijn(argc, argv)); } catch (string err) { cerr << err << endl; } return(1); }