// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// #include #include #include #include #include #include #include "alignment/GlobalUtilities.hh" std::ostream& log_os(std::cerr); std::ostream& report_os(std::cout); const char* prog_name = "countFastaBases"; void usage() { std::ostream& os(log_os); os << "\n" << prog_name << " - count bases in a fasta formatted dna sequence file\n" "\n" "usage: \n" << prog_name << " < fasta_file\nor:\n" << prog_name << " fasta_file1 [[fasta_file2]...]\n" "\n" "Prints tab-delimited pair of knownBases/totalBases counts.\n"; exit(EXIT_FAILURE); } bool is_non(const char c){ return nc != realBaseCodes[static_cast(c)]; } struct scan_result { scan_result(const std::string &fileName, const std::string &contigName, unsigned known, unsigned total) : fileName(fileName), contigName(contigName), knownCount(known), totalCount(total){} std::string fileName; std::string contigName; unsigned knownCount; unsigned totalCount; }; std::ostream & operator <<(std::ostream &os, const scan_result &sr) { return os << sr.fileName << '\t' << sr.contigName << '\t' << sr.knownCount << '\t' << sr.totalCount; } bool is_chrom_name_end(char c) { return !c || isspace(c); } bool get_seq_counts(std::istream &ref_is, const std::string fileName, std::ostream &report_os) { const unsigned buff_size(1024); char buff[buff_size] = {0}; ref_is.getline(buff, buff_size - 1); unsigned line_no(1); while(ref_is){ std::string contigName; if(buff[0] != '>') { log_os << "ERROR: missing fasta header on line " << line_no << "\n"; return false; } else { const char *nameEnd = std::find_if(buff + 1, buff + buff_size, is_chrom_name_end); contigName = std::string(buff + 1, nameEnd - buff - 1); } unsigned knownCount(0); unsigned totalCount(0); bool inSequence(false); while(true){ ref_is.getline(buff,buff_size); if (ref_is) { ++line_no; } else if (buff[0]) { //assume the line was longer than the buffer } else { //assume a failure or end of data. break; } if (buff[0] == '>') { if(inSequence) break; else continue; } else { inSequence = true; } for(const char* b(buff); *b; ++b){ if(is_non(*b)) ++knownCount; if('\r' != *b) ++totalCount; //some weird newline files don't go through getline properly } buff[0] = 0; ref_is.clear(); } if (inSequence) report_os << scan_result(fileName, contigName, knownCount, totalCount) << "\n"; } if (!ref_is.eof()) { log_os << "ERROR: unexpected failure while attempting to read sequence on line " << line_no << " buffer:" << buff << "\n"; return false; } return true; } int main(int argc,char* argv[]){ static const std::string helpStrings[] = {"--help", "-help", "-h"}; static const std::string *helpStringsEnd = helpStrings + sizeof(helpStrings) / sizeof(helpStrings[0]); if(argc > 1) { for (int i = 1; i < argc; ++i){ if (helpStringsEnd != std::find(helpStrings, helpStringsEnd, argv[i])){ usage(); } else { std::ifstream ifs(argv[i]); if (!ifs){ log_os << "ERROR: Failed to open " << argv[i] << "\n"; exit(EXIT_FAILURE); } boost::filesystem::path p(argv[i]); if (!get_seq_counts(ifs, p.filename(), report_os)){ log_os << "ERROR: Failed to parse " << argv[i] << "\n"; exit(EXIT_FAILURE); } } } } else { if (!get_seq_counts(std::cin, "", report_os)){ log_os << "ERROR: Failed to parse stdin\n"; exit(EXIT_FAILURE); } } return EXIT_SUCCESS; }