/* * Copyright (c) 2013-2015, 2019 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger * Institute nor the names of its contributors may be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * A cut down version of cram_dump.c to accumulate only size * information per data series. This is much faster than cram_dump as * it does not require uncompression of data blocks. */ #ifdef HAVE_CONFIG_H #include "io_lib_config.h" #endif #include #include #include #include #include #include // Variable sized integer function pointers. varint_vec vv; // Accumulate per {data_series, content_id} combination. void ParseMap(cram_block_compression_hdr *hdr, cram_map **ma, char *data, HashTable *ds_h) { int i; uintptr_t k; for (i = 0; i < CRAM_MAP_HASH; i++) { cram_map *m; for (m = ma[i]; m; m = m->next) { // Crude, only works with single byte ITF8 values if (m->encoding == E_EXTERNAL || m->encoding == E_VARINT_UNSIGNED || m->encoding == E_VARINT_SIGNED || m->encoding == E_BYTE_ARRAY_STOP || m->encoding == E_BYTE_ARRAY_LEN) { HashData hd; if (m->encoding == E_VARINT_UNSIGNED || m->encoding == E_VARINT_SIGNED) hd.i = (unsigned char)data[m->offset]; else hd.i = (unsigned char)data[m->offset + m->size-1]; k = (m->key << 8) | hd.i; cram_codec *c = cram_decoder_init(hdr, m->encoding, data + m->offset, m->size, E_BYTE_ARRAY, 0, &vv); int id1, id2; if (c) { id1 = cram_codec_to_id(c, &id2); c->free(c); if (id1 >= 0) { hd.i = id1; HashTableAdd(ds_h, (char *)k, 4, hd, NULL); } if (id2 >= 0) { hd.i = id2; HashTableAdd(ds_h, (char *)k, 4, hd, NULL); } } else { HashTableAdd(ds_h, (char *)k, 4, hd, NULL); } } } } } void print_sizes(HashTable *bsize_h, HashTable *ds_h, HashTable *dc_h, int bmax) { intptr_t id; for (id = -1; id <= bmax; id++) { intptr_t k; HashItem *hi; HashIter *iter; if (!(hi = HashTableSearch(bsize_h, (char *)id, sizeof(id)))) continue; k = (intptr_t) hi->key; if (k == -1) { printf("Block CORE , total size %11"PRId64"\n", hi->data.i); continue; } printf("Block content_id %7d, total size %11"PRId64" ", (int) k, hi->data.i); struct { int id; enum cram_block_method method; } id_type = {k, 0}; // Reorders cram block method enum to print order enum cram_block_method methods[] = { GZIP, BZIP2, LZMA, RANS0, RANS1, RANS_PR0, RANS_PR1, RANS_PR64, RANS_PR9, RANS_PR128, RANS_PR129, RANS_PR192, RANS_PR193, FQZ, NAME_TOK3, NAME_TOKA, BSC, ZSTD, ARITH_PR0, ARITH_PR1, ARITH_PR64, ARITH_PR9, ARITH_PR128, ARITH_PR129, ARITH_PR192, ARITH_PR193, }; int count[1+sizeof(methods)/sizeof(*methods)] = {0}; int i; for (i = 0; i < sizeof(methods)/sizeof(*methods); i++) { id_type.method = methods[i]; if ((hi = HashTableSearch(dc_h, (char *)&id_type, sizeof(id_type)))) { count[0] += hi->data.i; count[i+1] += hi->data.i; } } for (i = 1; i <= sizeof(methods)/sizeof(*methods); i++) { printf("%s%s%c%s", (count[i]+0.01)/(count[0]+0.01) >= 0.50 ? "\033[7m":"", (count[i]+0.01)/(count[0]+0.01) >= 0.10 ? "\033[4m":"", " gblrR01458923fnNBz01458923"[count[i]?i:0], "\033[0m"); } iter = HashTableIterCreate(); while ((hi = HashTableIterNext(ds_h, iter))) { int c; char buf[5]; int x = 4; if (hi->data.i != k) continue; c = ((uintptr_t) hi->key)>>8; buf[x--] = 0; while(c & 0xff) { buf[x--] = c; c >>= 8; } printf(" %s", &buf[x+1]); } putchar('\n'); HashTableIterDestroy(iter); } } int process_sizes(cram_fd *fd, HashTable *bsize_h, HashTable *ds_h, HashTable *dc_h, int *bmax) { cram_container *c; off_t pos, pos2, hpos; pos = CRAM_IO_TELLO(fd); while ((c = cram_read_container(fd))) { int j; if (fd->err) { perror("Cram container read"); return 1; } hpos = CRAM_IO_TELLO(fd); if (!c->length) { pos = CRAM_IO_TELLO(fd); continue; } if (!(c->comp_hdr_block = cram_read_block(fd))) return 1; assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); if (!c->comp_hdr) return 1; ParseMap(c->comp_hdr, c->comp_hdr->rec_encoding_map, (char *)c->comp_hdr_block->data, ds_h); ParseMap(c->comp_hdr, c->comp_hdr->tag_encoding_map, (char *)c->comp_hdr_block->data, ds_h); for (j = 0; j < c->num_landmarks; j++) { cram_slice *s; int id; pos2 = CRAM_IO_TELLO(fd); assert(pos2 - pos - c->offset == c->landmark[j]); s = cram_read_slice(fd); // Accumulate per block content_id for (id = 0; id < s->hdr->num_blocks; id++) { HashItem *hi; intptr_t k = s->block[id]->content_type == CORE ? -1 : s->block[id]->content_id; hi = HashTableSearch(bsize_h, (char *)k, sizeof(k)); if (hi) { hi->data.i += s->block[id]->comp_size; } else { HashData hd; hd.i = s->block[id]->comp_size; HashTableAdd(bsize_h, (char *)k, sizeof(k), hd, NULL); } // WARNING: scuppered by having high content_id values. if (*bmax < s->block[id]->content_id) *bmax = s->block[id]->content_id; } // Hack for rans1 for (id = 0; id < s->hdr->num_blocks; id++) { if (s->block[id]->comp_size >= 2 && s->block[id]->orig_method == RANS0 && s->block[id]->data[0] != 0) s->block[id]->orig_method = RANS1; } // Hack for ransPR* for (id = 0; id < s->hdr->num_blocks; id++) { if (s->block[id]->comp_size >= 2 && s->block[id]->orig_method == RANS_PR0 && s->block[id]->data[0] != 0) { // Assumption: PR1 to PR193 are consecutive s->block[id]->orig_method = RANS_PR1-1; s->block[id]->orig_method += s->block[id]->data[0]&0x01; s->block[id]->orig_method += 2*((s->block[id]->data[0]&0x40)>0); s->block[id]->orig_method += 4*((s->block[id]->data[0]&0x80)>0); } } // Hack for arithPR* for (id = 0; id < s->hdr->num_blocks; id++) { if (s->block[id]->comp_size >= 2 && s->block[id]->orig_method == ARITH_PR0 && s->block[id]->data[0] != 0) { // Assumption: PR1 to PR193 are consecutive s->block[id]->orig_method = ARITH_PR1-1; s->block[id]->orig_method += s->block[id]->data[0]&0x01; s->block[id]->orig_method += 2*((s->block[id]->data[0]&0x40)>0); s->block[id]->orig_method += 4*((s->block[id]->data[0]&0x80)>0); } } // Accumulate per {id, compression_method} combo for (id = 0; id < s->hdr->num_blocks; id++) { HashData hd; HashItem *hi; cram_block *b = s->block[id]; struct { int id; enum cram_block_method method; } id_type = {b->content_id, b->orig_method}; hd.i = 0; hi = HashTableAdd(dc_h, (char *)&id_type, sizeof(id_type), hd, NULL); hi->data.i++; int t, m; char fields[1024], *fp = fields; for (m = 0; m < 2; m++) { cram_map **ma = m ? c->comp_hdr->tag_encoding_map : c->comp_hdr->rec_encoding_map; for (t = 0; t < CRAM_MAP_HASH; t++) { cram_map *m; unsigned char *data = c->comp_hdr_block->data; for (m = ma[t]; m; m = m->next) { if (m->encoding != E_EXTERNAL && m->encoding != E_VARINT_UNSIGNED && m->encoding != E_VARINT_SIGNED && m->encoding != E_BYTE_ARRAY_STOP && m->encoding != E_BYTE_ARRAY_LEN) continue; if (data[m->offset + m->size-1] != b->content_id) continue; if ((m->key>>16)&0xff) *fp++ = (m->key>>16)&0xff; *fp++ = (m->key>> 8)&0xff; *fp++ = (m->key>> 0)&0xff; *fp++ = ' '; } } } *fp++ = 0; } cram_free_slice(s); } pos = CRAM_IO_TELLO(fd); assert(pos == hpos + c->length); cram_free_container(c); } return 0; } int main(int argc, char **argv) { cram_fd *fd; HashTable *bsize_h; HashTable *ds_h; // content_id to data-series lookup. HashTable *dc_h; // content_id to data-compression lookup int bmax = 0; bsize_h = HashTableCreate(128, HASH_DYNAMIC_SIZE| HASH_NONVOLATILE_KEYS | HASH_INT_KEYS); ds_h = HashTableCreate(128, HASH_DYNAMIC_SIZE| HASH_NONVOLATILE_KEYS | HASH_INT_KEYS); dc_h = HashTableCreate(128, HASH_DYNAMIC_SIZE); if (argc < 2) { fprintf(stderr, "Usage: cram_size filename.cram\n"); return 1; } if (NULL == (fd = cram_open(argv[1], "rb"))) { fprintf(stderr, "Error opening CRAM file '%s'.\n", argv[1]); return 1; } cram_init_varint(&vv, fd->file_def->major_version); if (0 != process_sizes(fd, bsize_h, ds_h, dc_h, &bmax)) return 1; cram_close(fd); print_sizes(bsize_h, ds_h, dc_h, bmax); HashTableDestroy(bsize_h, 0); HashTableDestroy(ds_h, 0); HashTableDestroy(dc_h, 0); return 0; }