/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file produceAlignStats.cpp ** ** \brief Produce alignment stats. ** ** Produce alignment stats. ** ** \author Richard Shaw **/ #include #include #include #include #include #include "common/Exceptions.hh" #include "common/Tiling.h" #include "statistics/Align_Data.h" #include "statistics/Align_Data_Factory.h" #include "statistics/Align_File_Data.h" #include "statistics/Export_File_Data.h" #include "statistics/PAS_Options.h" #include "statistics/Read_Align_Info.h" #include "statistics/Tile_Mgr.h" #include "statistics/Tile_Stats.h" namespace cc=casava::common; namespace cs=casava::statistics; namespace fs=boost::filesystem; /*****************************************************************************/ enum { ERR_OK=0, ERR_ARGS=1, ERR_READ_FILE=2, ERR_WRITE_FILE=3, ERR_PROCESSING=4 }; typedef std::vector > Align_Data_Ptr_List; typedef Align_Data_Ptr_List::iterator Align_Data_Ptr_List_Iter; typedef std::list Quality_Type_List; typedef std::list::const_iterator Quality_Type_List_CIter; /*****************************************************************************/ void get_output_filename(const std::string& output_dir_str, const std::string& output_qualifier_str, const unsigned int lane_num, const unsigned int tile_num, const Tile_Stats::Quality_Type quality_type, std::string& output_filename) { const std::string prefix_str(!output_dir_str.empty() ? (output_dir_str + "/s") : "s"); RunFolderFileName run_folder_file_name(prefix_str.c_str()); RunFolderFileName::filename_type file_name_type(RunFolderFileName::SCORE_FILE); switch (quality_type) { case Tile_Stats::QUALITY_RAW: file_name_type = RunFolderFileName::SCORE_FILE; break; case Tile_Stats::QUALITY_FILTERED: file_name_type = RunFolderFileName::RESCORE_FILE; break; } // Using getQualifiedTileFilename pending any transition to distinguishing // corresponding files output at different stages by subdirectory. output_filename = (!output_qualifier_str.empty() ? run_folder_file_name. getQualifiedTileFilename(file_name_type, output_qualifier_str, lane_num - 1, tile_num - 1) : run_folder_file_name. getTileFilename(file_name_type, lane_num - 1, tile_num - 1)); } const fs::path get_score_xml_file_path(const PAS_Options &options) { fs::path score_path(options.my_output_dir_str); score_path /= options.my_stats_file_prefix + "score.xml"; return score_path; } const fs::path get_rescore_xml_file_path(const PAS_Options &options) { fs::path rescore_path(options.my_output_dir_str); rescore_path /= options.my_stats_file_prefix + "rescore.xml"; return rescore_path; } bool is_export_gzipped(std::string &filename) { bool isCompressed = false; // open the export file std::ifstream exportStream(filename.c_str(), ios::binary); if(exportStream.fail()) { BOOST_THROW_EXCEPTION(casava::common::IoException(errno, "Couldn't open " + filename)); } // check the magic number isCompressed = (exportStream.get() == 037) && (exportStream.get() == 0213); if(exportStream.fail()) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Unable to read the magic number from file " + filename)); } // close the export file exportStream.close(); return isCompressed; } void report_one_tile(Tile_Stats &tile_stats, Tile_Mgr& tile_mgr, const PAS_Options& pas_options, cs::AlignStatsXmlWriter &score_xml_writer, cs::AlignStatsXmlWriter &rescore_xml_writer) { std::string output_rescore_filename; get_output_filename(pas_options.my_output_dir_str, pas_options.my_output_qualifier_str, tile_stats.lane_num(), tile_stats.tile_num(), Tile_Stats::QUALITY_FILTERED, output_rescore_filename); File_Buffer rescore_file_buffer(output_rescore_filename, File_Buffer::compressionNone); rescore_file_buffer.write_some_lines(std::vector()); //ensure file gets created std::string output_score_filename; get_output_filename(pas_options.my_output_dir_str, pas_options.my_output_qualifier_str, tile_stats.lane_num(), tile_stats.tile_num(), Tile_Stats::QUALITY_RAW, output_score_filename); File_Buffer score_file_buffer(output_score_filename, File_Buffer::compressionNone); score_file_buffer.write_some_lines(std::vector()); //ensure file gets created tile_stats.derive_stats(); tile_stats.write_stat_files(Tile_Stats::QUALITY_FILTERED, rescore_file_buffer, pas_options.my_sample_name, pas_options.my_barcode, rescore_xml_writer); tile_stats.write_stat_files(Tile_Stats::QUALITY_RAW, score_file_buffer, pas_options.my_sample_name, pas_options.my_barcode, score_xml_writer); tile_mgr.report_tile(tile_stats.lane_num(), tile_stats.tile_num()); } /*****************************************************************************/ /** * @brief Collects and prints per-tile alignment statistics * * The data comes in the form of files where a single tile can span more than one * file and a single file can contain more than one tile. The files are ordered * by set number and read number. The convoluted logic below scans the input * so that one read of a tile is scanned fully before the stats begins being * accumulated for the next read of the same tile or for the next tile. Reads of * the same tile must be accumulated subsequently. * * The primary reason for this particular implementation is to avoid changes in the * rest of the statistics accumulation code which was implemented much earlier based * on the (now false) premise that data files wholly contain tiles. * * Most likely the logic will fail to detect input data that is misordered or has * varying number of reads per tile. */ bool process_tiles(Align_Data_Ptr_List& align_data_ptr_list, Tile_Mgr& tile_mgr, const PAS_Options& pas_options, cs::AlignStatsXmlWriter &score_xml_writer, cs::AlignStatsXmlWriter &rescore_xml_writer) { bool at_end_of_tile(false); bool at_eof(false); Read_Align_Info last_read_align_info; Read_Align_Info read_align_info; boost::shared_ptr current_tile_stats(new Tile_Stats(pas_options.my_read_list, pas_options.max_patterns_to_store)); boost::shared_ptr next_tile_stats(new Tile_Stats(pas_options.my_read_list, pas_options.max_patterns_to_store)); size_t readIndex(0); Align_Data_Ptr_List_Iter align_data_ptr_iter(align_data_ptr_list.begin()); Align_Data_Ptr_List_Iter align_first_read_data_ptr_iter(align_data_ptr_iter); size_t readsToProcess(pas_options.my_read_list.size()); while (true) { at_eof = false; if((*align_data_ptr_iter)->get_next_read_align_info(read_align_info, at_end_of_tile, at_eof)) { // std::cerr << "read prev tile: " << last_read_align_info.my_tile_num << ", cur tile: " << read_align_info.my_tile_num << " at eof:" << at_eof << "\n"; if(last_read_align_info.my_tile_num && last_read_align_info.my_tile_num != read_align_info.my_tile_num) { // eof reads get processed twice if (!at_eof) { next_tile_stats->add_read_align_info(read_align_info); } // std::cerr << "chtile add. tile: " << read_align_info.my_tile_num << "\n"; if(++readIndex == pas_options.my_read_list.size()) { // std::cerr << "tile change. Old: " << last_read_align_info.my_tile_num << ", New " << read_align_info.my_tile_num << "\n"; report_one_tile(*current_tile_stats, tile_mgr, pas_options, score_xml_writer, rescore_xml_writer); current_tile_stats = next_tile_stats; next_tile_stats = boost::shared_ptr(new Tile_Stats(pas_options.my_read_list, pas_options.max_patterns_to_store)); readIndex = 0; // ensure that template is instantiated with distance as signed! // (some compilers treat the expression 1 - pas_options.my_read_list.size() as unsigned) std::advance(align_data_ptr_iter, 1 - (long)pas_options.my_read_list.size()); // std::cerr << "at: " << align_data_ptr_iter - align_data_ptr_list.begin() << "\n"; align_first_read_data_ptr_iter = align_data_ptr_iter; last_read_align_info = read_align_info; at_eof = false; } else { // std::cerr << "read change. Read: " << pas_options.my_read_list[readIndex] << "\n"; ++align_first_read_data_ptr_iter; align_data_ptr_iter = align_first_read_data_ptr_iter; at_eof = false; } } else { current_tile_stats->add_read_align_info(read_align_info); // std::cerr << "tile add. tile: " << read_align_info.my_tile_num << "\n"; last_read_align_info = read_align_info; } } // else // { // //some empty input file... // } if (at_eof) { // std::cerr << "at eof. Tile: " << read_align_info.my_tile_num << ", Read: " << read_align_info.my_read_num << "Reads to process:" << readsToProcess << "\n"; Align_Data_Ptr_List_Iter next_file_of_same_read_iter(align_data_ptr_iter); //advance to next file containing the same read //avoid advancing over the end() // std::cerr << "before eofat: " << next_file_of_same_read_iter - align_data_ptr_list.begin() << "\n"; std::advance(next_file_of_same_read_iter, readsToProcess); // std::cerr << "eofat: " << next_file_of_same_read_iter - align_data_ptr_list.begin() << "\n"; if(align_data_ptr_list.end() == next_file_of_same_read_iter) { // End of data, similar logic to when the tile ends. // std::cerr << "end of tiles Read: " << pas_options.my_read_list[readIndex] << "\n"; if (--readsToProcess) { ++align_first_read_data_ptr_iter; align_data_ptr_iter = align_first_read_data_ptr_iter; } else { report_one_tile(*current_tile_stats, tile_mgr, pas_options, score_xml_writer, rescore_xml_writer); current_tile_stats = next_tile_stats; next_tile_stats = boost::shared_ptr(new Tile_Stats(pas_options.my_read_list, pas_options.max_patterns_to_store)); break; } } else { // std::cerr << "moving to next file for read : " << pas_options.my_read_list[readIndex] << "\n"; // finish advancing, we're not going to hit end here. std::advance(next_file_of_same_read_iter, pas_options.my_read_list.size() - readsToProcess); align_data_ptr_iter = next_file_of_same_read_iter; } } } return true; } /*****************************************************************************/ void write_bad_tile_files(const PAS_Options& pas_options, const Tile_Mgr& tile_mgr, const Quality_Type_List& quality_type_list, const unsigned int num_reads) { unsigned int lane_num(0); Tile_Mgr::Tile_Num_List missing_tile_num_list; tile_mgr.get_missing_tile_num_list(lane_num, missing_tile_num_list); BOOST_FOREACH(unsigned int tile_num, missing_tile_num_list) { BOOST_FOREACH (const Tile_Stats::Quality_Type &quality_type, quality_type_list) { std::string output_filename; get_output_filename(pas_options.my_output_dir_str, pas_options.my_output_qualifier_str, lane_num, tile_num, quality_type, output_filename); File_Buffer file_buffer(output_filename, File_Buffer::compressionNone); File_Buffer::Line_Vec line_vec; // for (unsigned int read_num(1); read_num <= num_reads; ++read_num) { const size_t end_read_index = (1 == pas_options.my_read_list.size() ? 1 : pas_options.my_read_list.size() + 1); for (size_t read_index = 0; end_read_index > read_index; ++read_index) { const unsigned int read_num = (read_index >= pas_options.my_read_list.size() ? 0 /* our_all_reads_read_num */ : pas_options.my_read_list[read_index]); line_vec.push_back(Tile_Stats::hdr_str(lane_num, tile_num, quality_type, read_num)); } if (num_reads > 1) { // All Reads : read_num 0 line_vec.push_back(Tile_Stats::hdr_str(lane_num, tile_num, quality_type, 0)); } if (!file_buffer.write_some_lines(line_vec)) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Failed to write bad tiles to " + file_buffer.file_path_str())); } } } } /*****************************************************************************/ int main(int num_args, char* arg_cstr_arr[]) { const std::string app_name(arg_cstr_arr[0]); int ret_val(ERR_OK); PAS_Options pas_options; if (pas_options.init(num_args, arg_cstr_arr) != CommandLine::Options::RUN) { return ERR_ARGS; } Tile_Mgr tile_mgr(pas_options.my_tile_list_file_path_str); // Create a list of data ptrs from the file list (e.g. one per read). Align_Data_Ptr_List align_data_ptr_list; BOOST_FOREACH(std::string &input_path_str, pas_options.my_input_path_str_list) { if(is_export_gzipped(input_path_str)) { pas_options.my_input_compr = File_Buffer::compressionGzip; } boost::shared_ptr align_data_ptr( new Export_File_Data(input_path_str, pas_options.my_input_compr)); align_data_ptr_list.push_back(align_data_ptr); } // Process the tiles one at a time (combining reads if necessary). cs::AlignStatsXmlWriter score_xml_writer(get_score_xml_file_path(pas_options)); cs::AlignStatsXmlWriter rescore_xml_writer(get_rescore_xml_file_path(pas_options)); if (!process_tiles(align_data_ptr_list, tile_mgr, pas_options, score_xml_writer, rescore_xml_writer)) { ret_val = ERR_PROCESSING; } else { score_xml_writer.close(); rescore_xml_writer.close(); Quality_Type_List quality_type_list; quality_type_list.push_back(Tile_Stats::QUALITY_RAW); quality_type_list.push_back(Tile_Stats::QUALITY_FILTERED); // Write null files for each unrepresented tile. const unsigned int num_reads(pas_options.my_read_list.size()); write_bad_tile_files(pas_options, tile_mgr, quality_type_list, num_reads); } return ret_val; } /*****************************************************************************/