// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2010 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file compressXPair.cpp /// /// This is the complete logic for compressXPair -- a specialized /// utility to compress paired export lines into zlib streams during /// bin/sort, but with the data encoded such that unix sort can still /// operate on the file. /// /// \author Chris Saunders /// #include "blt_util/blt_exception.hh" #include "boost/program_options.hpp" #include "boost/utility.hpp" /* NOTE(review): the six bare #include directives that follow have lost their header names in this copy, and text between angle brackets is missing throughout the file (template arguments, parts of loop bodies); restore it from the original file before building. */ #include #include #include #include #include #include /* Stream used for diagnostics; bound to std::cerr. */ std::ostream& log_os(std::cerr); /* topbit is bit 7 (value 128) of a byte; topmask is initialised from its complement. */ const uint8_t topbit(1<<7); const uint8_t topmask(static_cast(~topbit)); /* code8: writes an encoding of in[0..in_size) to out; out_size is reset to 0 on entry and is incremented as bytes are stored in out. Input bytes that take the else branch are copied to out unchanged. NOTE(review): most of the loop body is missing from this copy, so the exact byte layout cannot be confirmed here. */ // encode 8-bit array such that problematic ascii characters are removed // removed ranges are [0-31] and [127] // static void code8(const uint8_t* in, const unsigned in_size, uint8_t* out, unsigned& out_size) { uint8_t* code_ptr(out); unsigned code_num(8); out_size=0; for(unsigned i(0);i> code_num); code_num++; } else { out[out_size++] = in[i]; } } } /* uncode8: described by the author as the inverse of code8; takes the same (in, in_size, out, out_size) arguments and resets out_size to 0 on entry. NOTE(review): in this copy the text jumps from the head of uncode8's loop into the body of a different routine (apparently shift8to7, whose inverse is named in the comment further on), so uncode8's own body is not visible. */ // inverse(code8) // static void uncode8(const uint8_t* in, const unsigned in_size, uint8_t* out, unsigned& out_size) { const uint8_t* code_ptr(out); unsigned code_num(8); out_size=0; for(unsigned i(0);i> (s+1)) | remain) | topbit; remain = (in[i] << (6-s)) & 0x7f; if(s==6) { out[out_size++] = remain | topbit; remain = 0; } s = (s+1) % 7; } if(s!=0) { out[out_size++] = remain | topbit; } } /* shift7to8: takes the same (in, in_size, out, out_size) arguments; s and val carry state from one input byte to the next. NOTE(review): the loop body is only partly visible in this copy. */ // inverse(shift8to7) // static void shift7to8(const uint8_t* in, const unsigned in_size, uint8_t* out, unsigned& out_size) { unsigned s(0); uint8_t val(0); 
for(unsigned i(0);i> (7-s)); } val = (in[i] << (s+1)) & 0xff; s = (s+1) % 8; } } #endif /* NOTE(review): the #if matching the #endif above is not visible in this copy. */ /* byte_cruncher: wraps one zlib deflate stream. The constructor calls deflateInit at Z_BEST_SPEED and throws blt_exception unless it returns Z_OK; the destructor calls deflateEnd. Privately inherits boost::noncopyable. */ struct byte_cruncher : private boost::noncopyable { byte_cruncher() { static const int level(Z_BEST_SPEED); // (Z_DEFAULT_COMPRESSION); _strm.zalloc = Z_NULL; _strm.zfree = Z_NULL; _strm.opaque = Z_NULL; const int ret(deflateInit(&_strm, level)); if (ret != Z_OK) { throw blt_exception("ERROR: zlib deflate initialize error\n"); } } ~byte_cruncher() { (void)deflateEnd(&_strm); } /* crunchit: resets the stream, then compresses in[0..in_size) into out (capacity out_alloc_size) with a single deflate() call using Z_FINISH. out_size receives out_alloc_size minus the remaining avail_out, i.e. the number of bytes produced. Throws blt_exception if deflateReset fails or deflate returns Z_STREAM_ERROR. NOTE(review): a completely filled output buffer is only caught by the assert, which is inactive when NDEBUG is defined. */ void crunchit(const uint8_t* in, const unsigned in_size, uint8_t* out, const unsigned out_alloc_size, unsigned& out_size) { int ret(deflateReset(&_strm)); if (ret != Z_OK) { throw blt_exception("ERROR: zlib deflate reset error\n"); } static const int flush(Z_FINISH); _strm.avail_in = in_size; _strm.next_in = const_cast(in); // const-cast for zlib compatibility _strm.avail_out = out_alloc_size; _strm.next_out = out; ret = deflate(&_strm, flush); if(ret == Z_STREAM_ERROR){ throw blt_exception("ERROR: zlib deflate() error\n"); } assert(_strm.avail_out != 0); out_size = (out_alloc_size - _strm.avail_out); } private: z_stream _strm; }; /* byte_uncruncher: wraps one zlib inflate stream. The constructor calls inflateInit and throws blt_exception unless it returns Z_OK; the destructor calls inflateEnd. Privately inherits boost::noncopyable. */ struct byte_uncruncher : private boost::noncopyable { byte_uncruncher() { _strm.zalloc = Z_NULL; _strm.zfree = Z_NULL; _strm.opaque = Z_NULL; _strm.avail_in = 0; _strm.next_in = Z_NULL; const int ret(inflateInit(&_strm)); if (ret != Z_OK) { throw blt_exception("ERROR: zlib inflate initialize error\n"); } } ~byte_uncruncher() { (void)inflateEnd(&_strm); } /* uncrunchit: resets the stream, then decompresses in[0..in_size) into out (capacity out_alloc_size) with a single inflate() call using Z_NO_FLUSH. Throws blt_exception if inflateReset fails or inflate returns Z_NEED_DICT, Z_DATA_ERROR or Z_MEM_ERROR. out_size receives out_alloc_size minus the remaining avail_out. NOTE(review): a result other than Z_STREAM_END and a completely filled output buffer are only checked with assert. */ void uncrunchit(const uint8_t* in, const unsigned in_size, uint8_t* out, const unsigned out_alloc_size, unsigned& out_size) { int ret(inflateReset(&_strm)); if (ret != Z_OK) { throw blt_exception("ERROR: zlib inflate reset error\n"); } _strm.avail_in = in_size; _strm.next_in = const_cast(in); // const-cast for zlib compatibility _strm.avail_out = out_alloc_size; _strm.next_out = out; ret = inflate(&_strm, Z_NO_FLUSH); assert(ret != Z_STREAM_ERROR); /* state not clobbered */ switch (ret) { case Z_NEED_DICT: case Z_DATA_ERROR: case 
Z_MEM_ERROR: throw blt_exception("ERROR: zlib inflate() error\n"); } assert(_strm.avail_out != 0); assert(ret == Z_STREAM_END); out_size = (out_alloc_size - _strm.avail_out); } private: z_stream _strm; }; /* compress_string: deflates in[0..in_size) into tmp (capacity tmp_alloc_size) with bc.crunchit, then passes the deflated bytes through code8 into out; out_size is set by code8. No capacity is passed for out. */ // 'tmp' and 'out' buffers are allocated relative to 'in' such that // they have all expected data expansion built-in (short of some // pathological data-expansion case in zlib). // static void compress_string(byte_cruncher& bc, const uint8_t* in, const unsigned in_size, uint8_t* tmp, const unsigned tmp_alloc_size, uint8_t* out, unsigned& out_size) { unsigned tmp_size(0); bc.crunchit(in,in_size,tmp,tmp_alloc_size,tmp_size); code8(tmp,tmp_size,out,out_size); } /* uncompress_string: reverses compress_string -- uncode8 from in into tmp, then buc.uncrunchit from tmp into out (capacity out_alloc_size). NOTE(review): no capacity is passed for tmp, so it has to be sized correctly by the caller. */ static void uncompress_string(byte_uncruncher& buc, const uint8_t* in, const unsigned in_size, uint8_t* tmp, uint8_t* out, const unsigned out_alloc_size, unsigned& out_size) { unsigned tmp_size(0); uncode8(in,in_size,tmp,tmp_size); buc.uncrunchit(tmp,tmp_size,out,out_alloc_size,out_size); } /* Field separator used for both parsing and output. */ static const char sep('\t'); /* tab_line_parser: reads one line at a time from an istream into a heap buffer of buf_size chars and splits it in place at each sep; the word[] entries point into that buffer and are overwritten by the next parse_line() call. A max_word of 0, or one above MAX_WORD_COUNT, is replaced by MAX_WORD_COUNT. NOTE(review): the struct owns _buf through a raw pointer and declares no copy constructor or assignment operator. */ struct tab_line_parser { tab_line_parser(std::istream& is, const unsigned buf_size, const unsigned max_word=0) : _is(is), _line_no(0), _n_word(0) , _buf_size(buf_size) , _max_word(max_word) , _buf(new char[buf_size]) { if((0==_max_word) or (MAX_WORD_COUNT < _max_word)){ _max_word=MAX_WORD_COUNT; } } ~tab_line_parser() { if(NULL!=_buf) { delete [] _buf; _buf=NULL;} } unsigned n_word() const { return _n_word; } /* dump: writes the current line number and the parsed words, re-joined with tabs, to os. */ void dump(std::ostream& os) const { os << "\tline_no: " << _line_no << "\n"; os << "\tline: '"; for(unsigned i(0);i<_n_word;++i) { if(i) os << '\t'; os << word[i]; } os << "'\n"; } /* parse_line: reads the next line and splits it into at most _max_word words (once that limit is reached the final word keeps any remaining tabs). Throws blt_exception if the line fills the buffer or if the read fails for a reason other than end of file. NOTE(review): strlen(_buf) is evaluated before the stream state is tested, and assert(len) rejects empty lines in builds with asserts enabled. */ // returns false for regular end of input: bool parse_line() { _n_word=0; _line_no++; _is.getline(_buf,_buf_size); const unsigned len(strlen(_buf)); if((len+1) >= _buf_size){ std::ostringstream oss; oss << "ERROR: input to compressXPair exceeds buffer size on line_no: " << _line_no << "\n\n"; throw blt_exception(oss.str().c_str()); } if(not _is) { if(_is.eof()) { return false; } // normal eof: std::ostringstream 
oss; oss << "ERROR: Unexpected read failure in parse_line().\n"; throw blt_exception(oss.str().c_str()); } if(NULL == _buf) return false; assert(len); // do a low-level tab parse: { char* p(_buf); word[0]=p; unsigned i(1); while(i<_max_word){ if((*p == '\n') or (*p == '\0')) break; if (*p == sep) { *p = '\0'; word[i++] = p+1; } ++p; } _n_word=i; } return true; } /* Upper bound on the number of words kept per line; word[] is public, so callers read the parsed fields directly. */ enum { MAX_WORD_COUNT = 50 }; char* word[MAX_WORD_COUNT]; private: std::istream& _is; unsigned _line_no; unsigned _n_word; unsigned _buf_size; unsigned _max_word; char* _buf; }; /* compress_stream: reads tab-separated lines from data_is through a parser with a 4096-char line buffer, requires exactly 46 columns per line (blt_exception otherwise), compresses one segment of each line with compress_string and writes the uncompressed segment, a tab, the encoded segment and a newline to data_os. NOTE(review): the code that selects nocompress_segment and compress_segment is missing from this copy (text lost after the head of the inner for loop). NOTE(review): out_buf is written as a NUL-terminated C string, which presumably relies on code8 never emitting a zero byte -- confirm against the full code8 body. */ static void compress_stream(std::istream& data_is, std::ostream& data_os) { static const unsigned in_buf_size(4096); static const unsigned ZLIB_INFO_SIZE(6); static const unsigned tmp_buf_size(in_buf_size+ZLIB_INFO_SIZE); // max size of line buf after zlib compression static const unsigned out_buf_size(((tmp_buf_size*8)+9)/7); // max size of zlib line after 8to7 bit conversion uint8_t tmp_buf[tmp_buf_size]; // zlib output char out_buf[out_buf_size]; // 8to7 output tab_line_parser dparse(data_is,in_buf_size); byte_cruncher bc; // step one is to parse the uncompressed and compressed strings out of the data-line: while(dparse.parse_line()) { static const unsigned col_count(46); if(dparse.n_word()!=col_count){ std::ostringstream oss; oss << "ERROR: unexpected number of columns in paired export line:\n\n"; dparse.dump(oss); throw blt_exception(oss.str().c_str()); } for(unsigned i(1);(i+1)(compress_segment)); uint8_t* post_compressed(reinterpret_cast(out_buf)); unsigned post_size(0); compress_string(bc,pre_compressed,pre_size,tmp_buf,tmp_buf_size,post_compressed,post_size); assert(post_size!=0); out_buf[post_size] = '\0'; data_os << nocompress_segment << sep << out_buf << '\n'; } } /* get_filename: maps a one-character read-type code ('n', 'o' or 'a') to an output file name; any other value throws blt_exception, so the trailing return NULL is never reached. */ const char* get_filename(const char a) { switch(a) { case 'n': return "unsort.txt"; case 'o': return "unsort_orph.txt"; case 'a': return "unsort_anom.txt"; } std::ostringstream oss; oss << "ERROR: Invalid read file type: " << a << "\n"; throw 
blt_exception(oss.str().c_str()); return NULL; } /* ltpair: strict ordering for (string, char) keys -- compares the second member first and uses strcmp on the first member only when the second members are equal. */ struct ltpair { bool operator()(const std::pair s1, const std::pair s2) const { if(s1.second == s2.second) { return (strcmp(s1.first, s2.first) < 0); } return (s1.second < s2.second); } }; /* split_stream_handler: lazily opens one output file per (name, read-type char) key under root_path and keeps the streams in a map ordered by ltpair. The map keys hold private heap copies of the key string; the destructor frees both the copies and the streams. NOTE(review): _root_path is stored as a reference, so the string passed to the constructor has to outlive the handler. */ struct split_stream_handler { split_stream_handler(const std::string& root_path) : _root_path(root_path) {} ~split_stream_handler() { map_t::iterator i(_streams.begin()); const map_t::iterator i_end(_streams.end()); for(;i!=i_end;++i){ delete [] i->first.first; // key string delete i->second; // ostream } } /* get_stream: returns the stream for key, creating it on first use at root_path/key.first/get_filename(key.second). The new entry is inserted into the map before the file is opened, so it is still released by the destructor when the open fails and blt_exception is thrown. */ std::ostream& get_stream(std::pair key) { map_t::iterator i(_streams.find(key)); if(i==_streams.end()){ const std::string file_path(_root_path + "/" + key.first + "/" + get_filename(key.second)); std::ofstream* osp(new std::ofstream); char* kopy(new char[strlen(key.first)+1]); strcpy(kopy,key.first); key.first=kopy; std::pair ret = _streams.insert(std::make_pair(key,osp)); i=ret.first; osp->open(file_path.c_str()); if( not (*osp) ) { std::ostringstream oss; oss << "ERROR: failed to open file '" << file_path << "'\n"; throw blt_exception(oss.str().c_str()); } } return *(i->second); } private: const std::string& _root_path; typedef std::map,std::ostream*,ltpair> map_t; map_t _streams; }; /* compress_split_stream: variant of compress_stream for 4-column input. Column 0 and the first character of column 1 select the output file (via split_stream_handler), column 2 is written as-is and column 3 is written in compressed, encoded form. Because the parser is limited to 4 words, any further tabs stay inside column 3. Throws blt_exception when a line does not yield 4 columns. */ static void compress_split_stream(std::istream& data_is, const std::string& split_path){ static const unsigned in_buf_size(4096); static const unsigned ZLIB_INFO_SIZE(6); static const unsigned tmp_buf_size(in_buf_size+ZLIB_INFO_SIZE); // max size of line buf after zlib compression static const unsigned out_buf_size(((tmp_buf_size*8)+9)/7); // max size of zlib line after 8to7 bit conversion uint8_t tmp_buf[tmp_buf_size]; // zlib output char out_buf[out_buf_size]; // 8to7 output tab_line_parser dparse(data_is,in_buf_size,4); byte_cruncher bc; split_stream_handler streams(split_path); while(dparse.parse_line()) { static const unsigned col_count(4); if(dparse.n_word()!=col_count){ std::ostringstream oss; oss << "ERROR: unexecpted number of columns in 
paired export line:\n\n"; dparse.dump(oss); throw blt_exception(oss.str().c_str()); } const char* nocompress_segment(dparse.word[2]); const char* compress_segment(dparse.word[3]); // finished parsing line -- now run zlib compression const unsigned pre_size(strlen(compress_segment)); const uint8_t* pre_compressed(reinterpret_cast(compress_segment)); uint8_t* post_compressed(reinterpret_cast(out_buf)); unsigned post_size(0); compress_string(bc,pre_compressed,pre_size,tmp_buf,tmp_buf_size,post_compressed,post_size); assert(post_size!=0); out_buf[post_size] = '\0'; std::ostream& data_os(streams.get_stream(std::make_pair(dparse.word[0],*(dparse.word[1])))); data_os << nocompress_segment << sep << out_buf << '\n'; } } /* uncompress_stream: inverse of compress_stream. Each input line is expected to hold 2 tab-separated columns (checked with assert only); column 0 is copied through and column 1 is decoded with uncompress_string into out_buf (4096 chars), then both are written to data_os separated by a tab. */ static void uncompress_stream(std::istream& data_is, std::ostream& data_os) { static const unsigned out_buf_size(4096); static const unsigned ZLIB_INFO_SIZE(6); static const unsigned tmp_buf_size(out_buf_size+ZLIB_INFO_SIZE); // max size of line buf after zlib compression static const unsigned in_buf_size(((tmp_buf_size*8)+9)/7); // max size of zlib line before 7to8 bit conversion uint8_t tmp_buf[tmp_buf_size]; char out_buf[out_buf_size]; tab_line_parser dparse(data_is,in_buf_size); byte_uncruncher buc; // step one is to parse the uncompressed and compressed strings out of the data-line: while(dparse.parse_line()) { static const unsigned col_count(2); assert(dparse.n_word()==col_count); const char* nocompress_segment(dparse.word[0]); const char* compress_segment(dparse.word[1]); // finished parsing line -- now run zlib decompression const unsigned pre_size(strlen(compress_segment)); const uint8_t* pre_compressed(reinterpret_cast(compress_segment)); uint8_t* post_compressed(reinterpret_cast(out_buf)); unsigned post_size(0); uncompress_string(buc,pre_compressed,pre_size,tmp_buf,post_compressed,out_buf_size,post_size); assert(post_size!=0); out_buf[post_size] = '\0'; data_os << nocompress_segment << sep << out_buf << "\n"; } } /* open_ifstream: opens filename on ifs; on failure writes a message to log_os and terminates the process with exit(EXIT_FAILURE). */ static void 
open_ifstream(std::ifstream& ifs, const char* filename){ ifs.open(filename); if( ! ifs ){ log_os << "ERROR:: Can't open file: " << filename << "\n"; exit(EXIT_FAILURE); } } /* try_main: parses the command line with boost::program_options (output file, split-mode path, input file, unbuffered, decompress, help), rejects split mode combined with an output file or with decompression, selects the input and output streams (an empty name or "-" means stdin/stdout) and runs compress_split_stream, compress_stream or uncompress_stream. The usage text is printed and the process exits with EXIT_FAILURE both for the help option and for option parse errors. NOTE(review): the start of the option setup is missing from this copy (text lost after the head of the cmdline loop). NOTE(review): the output ofstream is opened without checking for failure, unlike the input file. */ static void try_main(int argc,char* argv[]){ //const time_t start_time(time(0)); //const char* progname(argv[0]); static const char progname[] = "compressXPair"; std::string cmdline; for(int i(0);i(&output_file),"output file (default: stdout)") ("split-mode,s", po::value(&split_path),"split output to multiple files rooted at path -- cannot be combined with regular output file and must be used for compression mode only") ("input-file,i", po::value(&input_file),"input file (default: stdin)") ("unbuffered,u", "force unbuffered input/output streams") ("decompress,d", "decompress"); po::options_description help("help"); help.add_options() ("help,h","print this message"); po::options_description visible("options"); visible.add(req).add(help); bool po_parse_fail(false); po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, visible), vm); po::notify(vm); } catch(const boost::program_options::error& e) { log_os << "ERROR: Exception thrown by option parser: " << e.what() << "\n"; po_parse_fail=true; } if (vm.count("help") or po_parse_fail) { log_os << "\n" << progname << " compresses export pair files such that unix sort can still be applied to the output\n\n"; log_os << "usage: " << progname << " [options] \n\n"; log_os << visible << "\n"; exit(EXIT_FAILURE); } const bool is_compress(vm.count("decompress")==0); const bool is_unbuffered(vm.count("unbuffered")); if(not split_path.empty()){ if(not output_file.empty()){ log_os << "ERROR: cannot specify both out_file and split path\n"; exit(EXIT_FAILURE); } if(not is_compress) { log_os << "ERROR: cannot use split path output with decompression\n"; exit(EXIT_FAILURE); } } // setup input/output data streams: static const char stream_file[] = "-"; std::ifstream data_isf; std::istream* data_isp(NULL); if(input_file.empty() or (input_file == 
stream_file)) { data_isp = &std::cin; } else { open_ifstream(data_isf,input_file.c_str()); data_isp = &data_isf; } /* the unbuffered option is applied by calling pubsetbuf(0,0) on the selected stream's buffer */ if(is_unbuffered) { data_isp->rdbuf()->pubsetbuf(0,0); } const bool is_split_mode(not split_path.empty()); if(is_split_mode) { compress_split_stream(*data_isp,split_path); } else { std::ofstream data_osf; std::ostream* data_osp(NULL); if(output_file.empty() or (output_file == stream_file)) { data_osp = &std::cout; } else { data_osf.open(output_file.c_str()); data_osp = &data_osf; } if(is_unbuffered) { data_osp->rdbuf()->pubsetbuf(0,0); } // get down to bitness... if(is_compress) { compress_stream(*data_isp,*data_osp); } else { uncompress_stream(*data_isp,*data_osp); } } } static void dump_cl(int argc, char* argv[], std::ostream& os) { os << "cmdline:"; for(int i(0);i