/* * Copyright (c) 2013 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger * Institute nor the names of its contributors may be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013 * * Support for CRAM index format: foo.cram.crai */ /* * The index is a gzipped tab-delimited text file with one line per slice. * The columns are: * 1: reference number (0 to N-1, as per BAM ref_id) * 2: reference position of 1st read in slice (1..?) * 3: number of reads in slice * 4: offset of container start (relative to end of SAM header, so 1st * container is offset 0). * 5: slice number within container (ie which landmark). * * In memory, we hold this in a nested containment list. Each list element is * a cram_index struct. Each element in turn can contain its own list of * cram_index structs. * * Any start..end range which is entirely contained within another (and * earlier as it is sorted) range will be held within it. This ensures that * the outer list will never have containments and we can safely do a * binary search to find the first range which overlaps any given coordinate. */ #ifdef HAVE_CONFIG_H #include "io_lib_config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "io_lib/cram.h" #include "io_lib/os.h" #include "io_lib/zfio.h" #if 0 static void dump_index_(cram_index *e, int level) { int i, n; n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); for (i = 0; i < e->nslice; i++) { dump_index_(&e->e[i], level+1); } } static void dump_index(cram_fd *fd) { int i; for (i = 0; i < fd->index_sz; i++) { dump_index_(&fd->index[i], 0); } } #endif typedef char * (*fgets_functions)(char * s, int size, void * fp); static char * zfgets_func(char * s, int size, void * fp) { return zfgets(s,size,(zfp *)fp); } #if defined(CRAM_IO_CUSTOM_BUFFERING) static char * cram_io_input_buffer_fgets_func(char * s, int size, void * fp) { return cram_io_input_buffer_fgets(s,size,fp); } #endif static int cram_index_load_private(cram_fd *fd, void * fp, fgets_functions fgets_func) { char line[1024]; cram_index *idx = NULL; cram_index **idx_stack = NULL, *ep, e; int idx_stack_alloc = 0, idx_stack_ptr = 0; fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); if (!fd->index) return -1; idx = &fd->index[0]; idx->refid = -1; idx->start = INT_MIN; idx->end = INT_MAX; idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); idx_stack[idx_stack_ptr] = idx; while (fgets_func(line, 1024, fp)) { /* 1.1 layout */ char *cp = line; errno = 0; e.refid = strtol (cp, &cp, 10); e.start = strtol (cp, &cp, 10); e.end = strtol (cp, &cp, 10); e.offset = strtoll(cp, &cp, 10); e.slice = strtol (cp, &cp, 10); e.len = strtol (cp, &cp, 10); if (errno == EINVAL || errno == ERANGE) { free(idx_stack); return -1; } e.end += e.start-1; //printf("%d/%d..%d\n", e.refid, e.start, e.end); if (e.refid < -1) { free(idx_stack); fprintf(stderr, "Malformed index file, refid %d\n", e.refid); return -1; } if (e.refid != idx->refid) { if (fd->index_sz < e.refid+2) { size_t index_end = fd->index_sz * sizeof(*fd->index); fd->index_sz = e.refid+2; fd->index = realloc(fd->index, fd->index_sz * sizeof(*fd->index)); memset(((char *)fd->index) + index_end, 0, fd->index_sz * sizeof(*fd->index) - index_end); } idx = &fd->index[e.refid+1]; idx->refid = e.refid; idx->start = INT_MIN; idx->end = INT_MAX; idx->nslice = idx->nalloc = 0; idx->e = NULL; idx_stack[(idx_stack_ptr = 0)] = idx; } while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) { idx = idx_stack[--idx_stack_ptr]; } // Now contains, so append if (idx->nslice+1 >= idx->nalloc) { idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16; idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e)); } e.nalloc = e.nslice = 0; e.e = NULL; *(ep = &idx->e[idx->nslice++]) = e; idx = ep; if (++idx_stack_ptr >= idx_stack_alloc) { idx_stack_alloc *= 2; idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack)); } idx_stack[idx_stack_ptr] = idx; } free(idx_stack); // dump_index(fd); return 0; } #if defined(CRAM_IO_CUSTOM_BUFFERING) /* * Loads a CRAM .crai index into memory. * * Returns 0 for success * -1 for failure */ int cram_index_load_via_callbacks( cram_fd *fd, char const * fn, cram_io_allocate_read_input_t callback_allocate_function, cram_io_deallocate_read_input_t callback_deallocate_function ) { cram_fd * input = NULL; int r = -1; static char const * indexsuffix = ".crai"; char * indexfn = NULL; size_t const fnsize = strlen(fn); size_t const suffixsize = strlen(indexsuffix); size_t const indexfnsize = fnsize+suffixsize+1; if ( !(indexfn = (char *)malloc(indexfnsize)) ) { r = -1; goto cleanup; } memcpy(indexfn, fn, fnsize); memcpy(indexfn+fnsize,indexsuffix,suffixsize); indexfn[fnsize+suffixsize] = 0; if ( ! (input = cram_io_open_by_callbacks(indexfn,callback_allocate_function,callback_deallocate_function,32*1024,1/* decompress */)) ) { r = -1; goto cleanup; } r = cram_index_load_private(fd,input,cram_io_input_buffer_fgets_func); cleanup: if ( input ) { cram_io_close(input,NULL); input = NULL; } if ( indexfn ) { free(indexfn); indexfn = NULL; } return r; } #endif /* * Loads a CRAM .crai index into memory. * * Returns 0 for success * -1 for failure */ int cram_index_load(cram_fd *fd, char const *fn) { zfp *fp = NULL; char fn2[PATH_MAX]; int r = -1; /* Check if already loaded */ if (fd->index) return 0; /* copy filename */ sprintf(fn2, "%s.crai", fn); /* open index file */ if (!(fp = zfopen(fn2, "r"))) { perror(fn2); return -1; } r = cram_index_load_private(fd,fp,zfgets_func); zfclose(fp); return r; } static void cram_index_free_recurse(cram_index *e) { if (e->e) { int i; for (i = 0; i < e->nslice; i++) { cram_index_free_recurse(&e->e[i]); } free(e->e); } } void cram_index_free(cram_fd *fd) { int i; if (!fd->index) return; for (i = 0; i < fd->index_sz; i++) { cram_index_free_recurse(&fd->index[i]); } free(fd->index); fd->index = NULL; } /* * Searches the index for the first slice overlapping a reference ID * and position, or one immediately preceding it if none is found in * the index to overlap this position. (Our index may have missing * entries, but we require at least one per reference.) * * If the index finds multiple slices overlapping this position we * return the first one only. Subsequent calls should specifying * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * * Returns the cram_index pointer on sucess * NULL on failure */ cram_index *cram_index_query(cram_fd *fd, int refid, int pos, cram_index *from) { int i, j, k; cram_index *e; if (refid+1 < 0 || refid+1 >= fd->index_sz) return NULL; if (!from) from = &fd->index[refid+1]; // Ref with nothing aligned against it. if (!from->e) return NULL; // This sequence is covered by the index, so binary search to find // the optimal starting block. i = 0, j = fd->index[refid+1].nslice-1; for (k = j/2; k != i; k = (j-i)/2 + i) { if (from->e[k].refid > refid) { j = k; continue; } if (from->e[k].refid < refid) { i = k; continue; } if (from->e[k].start >= pos) { j = k; continue; } if (from->e[k].start < pos) { i = k; continue; } } // i==j or i==j-1. Check if j is better. if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid) i = j; /* The above found *a* bin overlapping, but not necessarily the first */ while (i > 0 && from->e[i-1].end >= pos) i--; /* We may be one bin before the optimum, so check */ while (i+1 < from->nslice && (from->e[i].refid < refid || from->e[i].end < pos)) i++; e = &from->e[i]; return e; } /* * Seek within a cram file. * * Returns 0 on success * -1 on failure */ int cram_seek(cram_fd *fd, off_t offset, int whence) { char buf[65536]; fd->ooc = 0; if (CRAM_IO_SEEK(fd, offset, whence) == 0) return 0; if (!(whence == SEEK_CUR && offset >= 0)) return -1; /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */ while (offset > 0) { int len = MIN(65536, offset); if (len != CRAM_IO_READ(buf, 1, len, fd)) return -1; offset -= len; } return 0; } /* * Skips to a container overlapping the start coordinate listed in * cram_range. * * In theory we call cram_index_query multiple times, once per slice * overlapping the range. However slices may be absent from the index * which makes this problematic. Instead we find the left-most slice * and then read from then on, skipping decoding of slices and/or * whole containers when they don't overlap the specified cram_range. * * Returns 0 on success * -1 on failure */ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { cram_index *e; // Ideally use an index, so see if we have one. if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { if (0 != cram_seek(fd, e->offset, SEEK_SET)) if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) return -1; } else { fprintf(stderr, "Unknown reference ID. Missing from index?\n"); return -1; } if (fd->ctr) { cram_free_container(fd->ctr); fd->ctr = NULL; fd->ctr_mt = NULL; fd->ooc = 0; fd->eof = 0; } return 0; } /* * A specialised form of cram_index_build (below) that deals with slices * having multiple references in this (ref_id -2). In this scenario we * decode the slice to look at the RI data series instead. * * Returns 0 on success * -1 on failure */ static int cram_index_build_multiref(cram_fd *fd, cram_container *c, cram_slice *s, zfp *fp, off_t cpos, int32_t landmark, int sz) { int i, ref = -2, ref_start = 0, ref_end; char buf[1024]; if (0 != cram_decode_slice(fd, c, s, fd->header)) return -1; ref_end = INT_MIN; for (i = 0; i < s->hdr->num_records; i++) { if (s->crecs[i].ref_id == ref) { if (ref_end < s->crecs[i].aend) ref_end = s->crecs[i].aend; continue; } if (ref != -2) { sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); zfputs(buf, fp); } ref = s->crecs[i].ref_id; ref_start = s->crecs[i].apos; ref_end = s->crecs[i].aend; } if (ref != -2) { sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); zfputs(buf, fp); } return 0; } /* * Builds an index file. * * fd is a newly opened cram file that we wish to index. * fn_base is the filename of the associated CRAM file. Internally we * add ".crai" to this to get the index filename. * * Returns 0 on success * -1 on failure */ int cram_index_build(cram_fd *fd, const char *fn_base) { cram_container *c; off_t cpos, spos, hpos; zfp *fp; char fn_idx[PATH_MAX]; int seekable; size_t len; if ((len=strlen(fn_base)) > PATH_MAX-6) return -1; if (len >= 5 && strcmp(&fn_base[len-5], ".crai") == 0) strcpy(fn_idx, fn_base); else sprintf(fn_idx, "%s.crai", fn_base); if (!(fp = zfopen(fn_idx, "wz"))) { perror(fn_idx); return -1; } cpos = CRAM_IO_TELLO(fd); if (cpos >= 0) { seekable = 1; } else { seekable = 0; cpos = fd->first_container; } while ((c = cram_read_container(fd))) { int j; if (fd->err) { perror("Cram container read"); return -1; } if (seekable) { hpos = CRAM_IO_TELLO(fd); assert(hpos == cpos + c->offset); } else { hpos = cpos + c->offset; } if (!(c->comp_hdr_block = cram_read_block(fd))) return -1; assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); if (!c->comp_hdr) return -1; // 2.0 format for (j = 0; j < c->num_landmarks; j++) { char buf[1024]; cram_slice *s; int sz; if (seekable) { spos = CRAM_IO_TELLO(fd); assert(spos - cpos - c->offset == c->landmark[j]); } else { spos = cpos + c->offset + c->landmark[j]; } if (!(s = cram_read_slice(fd))) { zfclose(fp); return -1; } if (seekable) { sz = (int)(CRAM_IO_TELLO(fd) - spos); } else { sz = j+1 < c->num_landmarks ? c->landmark[j+1] - c->landmark[j] : c->length - c->landmark[c->num_landmarks-1]; } if (s->hdr->ref_seq_id == -2) { cram_index_build_multiref(fd, c, s, fp, cpos, c->landmark[j], sz); } else { sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_span, (int64_t)cpos, c->landmark[j], sz); zfputs(buf, fp); } cram_free_slice(s); } if (seekable) { cpos = CRAM_IO_TELLO(fd); assert(cpos == hpos + c->length); } else { cpos = hpos + c->length; } cram_free_container(c); } if (fd->err) { zfclose(fp); return -1; } return (zfclose(fp) >= 0) ? 0 : -1; }