/* * Copyright (c) 2003-2008, 2013 Genome Research Ltd. * Author(s): James Bonfield, Andrew Whitwham * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger * Institute nor the names of its contributors may be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Author(s): James Bonfield * * Copyright (c) 2001 MEDICAL RESEARCH COUNCIL * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1 Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2 Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF * MOLECULAR BIOLOGY nor the names of its contributors may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H # include "io_lib_config.h" #endif #if !(defined(_MSC_VER) || defined(__MINGW32__)) # define TRACE_ARCHIVE # ifndef HAVE_LIBCURL # define USE_WGET # endif #endif #include #include #include #include #include #include #include "io_lib/os.h" #include "io_lib/xalloc.h" #ifdef TRACE_ARCHIVE # include # include # include # include # include #endif #ifdef USE_WGET # include #endif #ifndef PATH_MAX # define PATH_MAX 1024 #endif #ifdef HAVE_LIBCURL # include #endif #include "io_lib/open_trace_file.h" #include "io_lib/misc.h" #include "io_lib/tar_format.h" #include "io_lib/hash_table.h" #ifndef SAMTOOLS # include "io_lib/compress.h" # include "io_lib/sff.h" # include "io_lib/srf.h" #endif /* * Supported compression extensions. See the magics array in compress.c for * the full structure. */ static char *magics[] = {"", ".bz", ".gz", ".Z", ".z", ".bz2", ".sz"}; /* * Tokenises the search path splitting on colons (unix) or semicolons * (windows). * We also explicitly add a "./" to the end of the search path * * Returns: A new search path with items separated by nul chars. Two nul * chars in a row represent the end of the tokenised path. * Returns NULL for a failure. * * The returned data has been malloced. It is up to the caller to free this * memory. */ char *tokenise_search_path(char *searchpath) { char *newsearch; unsigned int i, j; size_t len; #ifdef _WIN32 char path_sep = ';'; #else char path_sep = ':'; #endif if (!searchpath) searchpath=""; newsearch = (char *)malloc((len = strlen(searchpath))+5); if (!newsearch) return NULL; for (i = 0, j = 0; i < len; i++) { /* "::" => ":". Used for escaping colons in http://foo */ if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') { newsearch[j++] = ':'; i++; continue; } /* Handle http:// and ftp:// too without :: */ if (path_sep == ':') { if ((i == 0 || (i > 0 && searchpath[i-1] == ':')) && (!strncmp(&searchpath[i], "http:", 5) || !strncmp(&searchpath[i], "https:", 6) || !strncmp(&searchpath[i], "ftp:", 4) || !strncmp(&searchpath[i], "|http:", 6) || !strncmp(&searchpath[i], "|https:", 7) || !strncmp(&searchpath[i], "|ftp:", 5) || !strncmp(&searchpath[i], "URL=http:", 9) || !strncmp(&searchpath[i], "URL=https:",10)|| !strncmp(&searchpath[i], "URL=ftp:", 8))) { do { newsearch[j++] = searchpath[i]; } while (i 100) return NULL; /* Search the .index file */ sprintf(path, "%s.index", tarname); if (file_exists(path)) { FILE *fpind = fopen(path, "r"); char *cp; int tmp_off; int found = 0; if (fpind) { while (fgets(path, PATH_MAX+100, fpind)) { if ((cp = strchr(path, '\n'))) *cp = 0; tmp_off = strtol(path, &cp, 10); while (isspace(*cp)) cp++; if (strncmp(cp, file, name_len) == 0) { int i; for (i = 0; i < num_magics; i++) { if (strcmp(&cp[name_len], magics[i]) == 0) { offset = tmp_off; found = 1; break; } } if (found) break; } } fclose(fpind); /* Not in index */ if (!found) return NULL; } } if (NULL == (fp = fopen(tarname, "rb"))) return NULL; /* * Search through the tar file (starting from index position) looking * for our filename. If there was no index then we start from position 0. */ fseek(fp, offset, SEEK_SET); while(fread(&blk, sizeof(blk), 1, fp) == 1) { if (!blk.header.name[0]) break; size = strtol(blk.header.size, NULL, 8); /* start with the same name... */ if (strncmp(blk.header.name, file, name_len) == 0) { char *data; int i; /* ... but does it end with a known compression extension? */ for (i = 0; i < num_magics; i++) { if (strcmp(&blk.header.name[name_len], magics[i]) == 0) { break; } } /* ... apparently not? continue then */ if (i == num_magics) continue; /* Found it - copy out the data to an mFILE */ if (NULL == (data = (char *)malloc(size))) return NULL; if (size != fread(data, 1, size, fp)) { free(data); return NULL; } return mfcreate(data, size); } fseek(fp, TBLOCK*((size+TBLOCK-1)/TBLOCK), SEEK_CUR); } fclose(fp); return NULL; } /* * Reads a hash file to look for a filename. The hash file contains the * (relative) pathname for the file it is an index for along with the * positions and sizes of each file contained within it. The file format * of the archive itself is irrelevant provided that the data is not * internally compressed in some manner specific to that archive. * * Return mFILE pointer if found * NULL if not */ static mFILE *find_file_hash(char *file, char *hashfile) { size_t size; static HashFile *hf = NULL; static char hf_name[1024]; char *data; /* Cache an open HashFile for fast accesing */ if (strcmp(hashfile, hf_name) != 0) { if (hf) HashFileDestroy(hf); hf = HashFileOpen(hashfile); if (!hf) return NULL; strcpy(hf_name, hashfile); } /* Search */ if (NULL == (data = HashFileExtract(hf, file, &size))) return NULL; /* Found, so copy the contents to a fake FILE pointer */ return mfcreate(data, size); } #ifndef SAMTOOLS /* * Extracts a single trace from an SRF file. * * Return mFILE pointer if found * NULL if not */ static mFILE *find_file_srf(char *tname, char *srffile) { srf_t *srf; uint64_t cpos, hpos, dpos; mFILE *mf = NULL; char *cp; if (NULL == (srf = srf_open(srffile, "r"))) return NULL; if (NULL != (cp = strrchr(tname, '/'))) tname = cp+1; if (0 == srf_find_trace(srf, tname, &cpos, &hpos, &dpos)) { char *data = malloc(srf->th.trace_hdr_size + srf->tb.trace_size); if (!data) { srf_destroy(srf, 1); return NULL; } memcpy(data, srf->th.trace_hdr, srf->th.trace_hdr_size); memcpy(data + srf->th.trace_hdr_size, srf->tb.trace, srf->tb.trace_size); mf = mfcreate(data, srf->th.trace_hdr_size + srf->tb.trace_size); } srf_destroy(srf, 1); return mf; } #endif #ifdef TRACE_ARCHIVE /* * Searches for file in the ensembl trace archive pointed to by arcname. * If it finds it, it copies it out and returns a file pointer to the * temporary file, otherwise we return NULL. * * Arcname has the form address:port, eg "titan/22100" * * Returns mFILE pointer if found * NULL if not. */ #define RDBUFSZ 8192 static mFILE *find_file_archive(char *file, char *arcname) { char server[1024], *cp; int port; struct hostent *host; struct sockaddr_in saddr; int s = 0; char msg[1024]; ssize_t msg_len; char buf[RDBUFSZ]; mFILE *fpout; int block_count; /* Split arc name into server and port */ if (!(cp = strchr(arcname, '/'))) return NULL; strncpy(server, arcname, 1023); server[MIN(1023,cp-arcname)] = 0; port = atoi(cp+1); /* Make and connect socket */ if (NULL == (host = gethostbyname(server))) { perror("gethostbyname()"); return NULL; } saddr.sin_port = htons(port); saddr.sin_family = host->h_addrtype; memcpy(&saddr.sin_addr,host->h_addr_list[0], host->h_length); if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) { perror("socket()"); return NULL; } if (connect(s, (struct sockaddr *)&saddr, sizeof(saddr)) == -1) { perror("connect()"); return NULL; } /* The minimal message to send down is "--scf tracename" */ sprintf(msg, "--scf %.*s\n", 1000, file); msg_len = strlen(msg); if (send(s, msg, msg_len, 0) != msg_len) { /* * partial request sent, but requests are short so if this * happens it's unlikely we'll cure it by sending multiple * fragments. */ /* close(s); */ return NULL; } /* * Create a fake FILE (mFILE) and write to it. */ fpout = mfcreate(NULL, 0); /* * Read the data back, in multiple blocks if necessary and write it * to our temporary file. We use a blocking read with a low timeout to * prevent locking up the application indefinitely. */ { struct timeval tv = {0, 10000}; setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)); } errno = 0; block_count = 200; while ((msg_len = read(s, buf, RDBUFSZ)) > 0 || (errno == EWOULDBLOCK && --block_count)) { errno = 0; if (msg_len > 0) mfwrite(buf, 1, msg_len, fpout); } close(s); if (!block_count) { mfclose(fpout); return NULL; } mrewind(fpout); return fpout; } #endif #ifndef SAMTOOLS #ifdef USE_WGET /* NB: non-reentrant due to reuse of handle */ mFILE *find_file_url(char *file, char *url) { char buf[8192], *cp; mFILE *fp; int pid; int maxlen = 8190 - strlen(file); char *fname = tempnam(NULL, NULL); int status = 0; /* Expand %s for the trace name */ for (cp = buf; *url && cp - buf < maxlen; url++) { if (*url == '%' && *(url+1) == 's') { url++; cp += strlen(strcpy(cp, file)); } else { *cp++ = *url; } } *cp++ = 0; /* Execute wget */ if ((pid = fork())) { waitpid(pid, &status, 0); } else { execlp("wget", "wget", "-q", "-O", fname, buf, NULL); } /* Return a filepointer to the result (if it exists) */ fp = (!status && file_size(fname) != 0) ? mfopen(fname, "rb") : NULL; remove(fname); free(fname); return fp; } #endif #define STATIC_CURL #ifdef HAVE_LIBCURL mFILE *find_file_url(char *file, char *url) { char buf[8192], *cp; mFILE *mf = NULL, *headers = NULL; int maxlen = 8190 - strlen(file); #ifdef STATIC_CURL static CURL *handle = NULL; static int curl_init = 0; #else CURL *handle = NULL; static int curl_init = 0; #endif char errbuf[CURL_ERROR_SIZE]; *errbuf = 0; #ifdef STATIC_CURL if (!curl_init) { if (curl_global_init(CURL_GLOBAL_ALL)) return NULL; if (NULL == (handle = curl_easy_init())) goto error; curl_init = 1; } #else if (!curl_init) { if (curl_global_init(CURL_GLOBAL_ALL)) return NULL; curl_init = 1; } if (NULL == (handle = curl_easy_init())) goto error; #endif /* Expand %s for the trace name */ for (cp = buf; *url && cp - buf < maxlen; url++) { if (*url == '%' && *(url+1) == 's') { url++; cp += strlen(strcpy(cp, file)); } else { *cp++ = *url; } } *cp++ = 0; /* Setup the curl */ if (NULL == (mf = mfcreate(NULL, 0)) || NULL == (headers = mfcreate(NULL, 0))) return NULL; if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 60L)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, (curl_write_callback)mfwrite)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION, (curl_write_callback)mfwrite)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf)) goto error; if (0 != curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1)) goto error; const curl_version_info_data *info = curl_version_info(CURLVERSION_NOW); char user_agent[1000]; sprintf(user_agent, "io_lib/" PACKAGE_VERSION " libcurl/%.900s", info->version); if (0 != curl_easy_setopt(handle, CURLOPT_USERAGENT, user_agent)) goto error; /* Fetch! */ if (0 != curl_easy_perform(handle)) goto error; /* Report errors is approproate. 404 is silent as it may have just been * a search via RAWDATA path, everything else is worth reporting. */ { float version; int response; char nul = 0; mfwrite(&nul, 1, 1, headers); if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) { if (response != 200 && response != 301 && response != 404) { fprintf(stderr, "%.*s\n", (int)headers->size, headers->data); goto error; } } } if (mftell(mf) == 0) goto error; mfdestroy(headers); #ifndef STATIC_CURL curl_easy_cleanup(handle); #endif mrewind(mf); return mf; error: #ifndef STATIC_CURL if (handle) curl_easy_cleanup(handle); #endif if (mf) mfdestroy(mf); if (headers) mfdestroy(headers); if (*errbuf) fprintf(stderr, "CURL ERROR: %s\n", errbuf); return NULL; } #endif #if !defined(USE_WGET) && !defined(HAVE_LIBCURL) mFILE *find_file_url(char *file, char *url) { return NULL; } #endif /* * Takes an SFF file in 'data' and edits the header to ensure * that it has no index listed and only claims to contain a single entry. * This isn't strictly necessary for the sff/sff.c reading code, but it is * the 'Right Thing' to do. * * Returns an mFILE on success or NULL on failure. */ static mFILE *sff_single(char *data, size_t size) { *(uint64_t *)(data+8) = be_int8(0); /* index offset */ *(uint32_t *)(data+16) = be_int4(0); /* index size */ *(uint32_t *)(data+20) = be_int4(1); /* number of reads */ return mfcreate(data, size); } /* Hash (.hsh) format index searching for SFF files */ static mFILE *sff_hash_query(char *sff, char *entry, FILE *fp) { static HashFile *hf = NULL; static char sff_copy[1024]; char *data; size_t size; /* Cache an open HashFile for fast accessing */ if (strcmp(sff, sff_copy) != 0) { if (hf) { hf->afp = NULL; hf->hfp = NULL; /* will be closed by our parent */ HashFileDestroy(hf); } fseek(fp, -4, SEEK_CUR); if (NULL == (hf = HashFileFopen(fp))) return NULL; strcpy(sff_copy, sff); } data = HashFileExtract(hf, entry, &size); return data ? sff_single(data, size) : NULL; } /* * getuint4_255 * * A function to convert a 4-byte TVF/SFF value into an integer, where * the bytes are base 255 numbers. This is used to store the index offsets. */ static unsigned int getuint4_255(unsigned char *b) { return ((unsigned int) b[0]) * 255 * 255 * 255 + ((unsigned int) b[1]) * 255 * 255 + ((unsigned int) b[2]) * 255 + ((unsigned int) b[3]); } /* * 454 sorted format (.srt) index searching for SFF files. * Uses a binary search. * This function and getuint4_255 above are taken with permission * from 454's getsff.c with the following licence: * * Copyright (c)[2001-2005] 454 Life Sciences Corporation. All Rights Reserved. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * * IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE. * * Permission to use, copy, modify and distribute this software and its * documentation for any purpose is hereby granted without fee, provided * that this copyright and notice appears in all copies. */ static mFILE *sff_sorted_query(char *sff, char *accno, FILE *fp, uint32_t index_length) { static unsigned char *index; static char sff_copy[1024]; unsigned char *us; uint32_t start, end; uint32_t offset = 0; char *data = NULL; static char chdr[1024]; static int chdrlen = 0, nflows = 0; union { char c1[1024]; uint16_t c2[512]; uint32_t c4[256]; } rhdr; int rhdrlen; int nbases, dlen; int bytes_per_flow = 2; /* Cache index if we're querying the same SFF file */ if (strcmp(sff_copy, sff) != 0) { if (index) xfree(index); if (NULL == (index = (unsigned char *)xmalloc(index_length))) return NULL; if (index_length != fread(index, 1, index_length, fp)) { xfree(index); return NULL; } strcpy(sff_copy, sff); /* Read the common header too - minimal decoding necessary */ fseek(fp, 0, SEEK_SET); if (31 != fread(chdr, 1, 31, fp)) return NULL; chdrlen = be_int2(*(uint16_t *)(chdr+24)); nflows = be_int2(*(uint16_t *)(chdr+28)); if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp)) return NULL; } /* * Perform a binary search of the index, stopping when the search * region becomes relatively small. This assumes that no accession * number is near 200 characters. */ start = 0; end = index_length; while (end - start > 200) { uint32_t mid; int val; mid = (start + end) / 2; /* * From the byte midpoint, scan backwards to the beginning of the * index record that covers that byte midpoint. */ while (mid > start && index[mid-1] != 255) { mid--; } val = strcmp(accno, (char *)(index+mid)); if (val == 0) { break; } else if (val < 0) { end = mid; } else { start = mid; } } /* * Scan through the small search region, looking for the accno. */ while (start < end) { if (strcmp(accno, (char *)(index+start)) == 0) { /* * If the accno is found, skip the accno characters, * then get the record offset. */ for (us=index+start; *us; us++,start++) ; us++; start++; offset = getuint4_255(us); if (us[4] != 255) { return NULL; } /* * The original getsff.c here computed the record size by * looking at the next index item and comparing it's offset to * this one, or the end of file position if this is the last * item. This has two problems: * 1: It means the index itself cannot be added to the end of * the file. * 2: It means that we cannot simply add an index to a SFF * file without also reordering all of the items within it. * * We solve this by reading the read header to work out the * object size instead. */ break; } /* * Skip to the beginning of the next index element. */ while (start < end && index[start] != 255) { start++; } start++; } /* * Now offset indicates the position of the SFF entry. Read and decode * header to get data length. Then read this too. */ fseek(fp, offset, SEEK_SET); if (16 != fread(rhdr.c1, 1, 16, fp)) return NULL; rhdrlen = be_int2(rhdr.c2[0]); nbases = be_int4(rhdr.c4[1]); if (rhdrlen-16 != fread(rhdr.c1+16, 1, rhdrlen-16, fp)) return NULL; dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; /* Built up the fake SFF entry */ if (NULL == (data = (char *)xmalloc(chdrlen + rhdrlen + dlen))) return NULL; memcpy(data, chdr, chdrlen); memcpy(data + chdrlen, rhdr.c1, rhdrlen); if (dlen != fread(data + chdrlen + rhdrlen, 1, dlen, fp)) { xfree(data); return NULL; } /* Convert to mFILE */ return sff_single(data, chdrlen + rhdrlen + dlen); } /* * This returns an mFILE containing an SFF entry. * * This does the minimal decoding necessary to skip through the SFF * container to find an entry. In this respect it is a semi-duplication * of sff/sff.[ch], but implemented for efficiency. * * Having found an entry it packs the common header, the read specific * header and the read data into a single block of memory and returns this * as an mFILE. In essence it produces a single-read SFF archive. This * is then decoded by the normal sff parsing code representing a small * amount of redundancy, but one which is swamped by the I/O time. */ static mFILE *find_file_sff(char *entry, char *sff) { static FILE *fp = NULL; static char sff_copy[1024]; union { char c1[65536]; uint16_t c2[32768]; uint32_t c4[16384]; uint64_t c8[8192]; } chdr, rhdr; /* generous, but worst case */ uint32_t nflows, chdrlen, rhdrlen = 0, dlen = 0, magic; uint64_t file_pos; static uint64_t index_offset = 0; static uint32_t index_length = 0; static char index_format[8]; uint32_t nreads, i; size_t entry_len = strlen(entry); int bytes_per_flow = 2; char *fake_file; /* * Check cached information so rapid queries to the same archive are * fast. * ASSUMPTION: we won't externally replace the sff file with another of * the same name. */ if (strcmp(sff, sff_copy) == 0) { if (memcmp(index_format, ".hsh1.00", 8) == 0) { return sff_hash_query(sff, entry, fp); } else if (memcmp(index_format, ".srt1.00", 8) == 0 || memcmp(index_format, ".mft1.00", 8) == 0) { return sff_sorted_query(sff, entry, fp, index_length-8); } } if (fp) fclose(fp); strcpy(sff_copy, sff); *index_format = 0; /* Read the common header */ if (NULL == (fp = fopen(sff, "rb"))) return NULL; if (31 != fread(chdr.c1, 1, 31, fp)) return NULL; /* Check magic & vers: TODO */ magic = be_int4(chdr.c4[0]); if (magic != SFF_MAGIC) return NULL; if (memcmp(chdr.c1+4, SFF_VERSION, 4) != 0) return NULL; /* If we have an index, use it, otherwise search linearly */ index_offset = be_int8(chdr.c8[1]); index_length = be_int4(chdr.c4[4]); if (index_length != 0) { long orig_pos = ftell(fp); fseek(fp, index_offset, SEEK_SET); if (8 != fread(index_format, 1, 8, fp)) return NULL; if (memcmp(index_format, ".hsh1.00", 8) == 0) { /* HASH index v1.00 */ return sff_hash_query(sff, entry, fp); } else if (memcmp(index_format, ".srt1.00", 8) == 0 || memcmp(index_format, ".mft1.00", 8) == 0) { /* 454 sorted v1.00 */ return sff_sorted_query(sff, entry, fp, index_length-8); } else { /* Unknown index: revert back to a slow linear scan */ fseek(fp, orig_pos, SEEK_SET); } } nreads = be_int4(chdr.c4[5]); chdrlen = be_int2(chdr.c2[12]); //nkey = be_int2(chdr.c2[13]); nflows = be_int2(chdr.c2[14]); /* Read the remainder of the header */ if (chdrlen-31 != fread(chdr.c1+31, 1, chdrlen-31, fp)) return NULL; file_pos = chdrlen; /* Loop until we find the correct entry */ for (i = 0; i < nreads; i++) { uint16_t name_len; uint32_t nbases; /* Index could be between common header and first read - skip */ if (file_pos == index_offset) { fseek(fp, index_length, SEEK_CUR); file_pos += index_length; } /* Read 16 bytes to get name length */ if (16 != fread(rhdr.c1, 1, 16, fp)) return NULL; rhdrlen = be_int2(rhdr.c2[0]); name_len = be_int2(rhdr.c2[1]); nbases = be_int4(rhdr.c4[1]); /* Read the rest of the header */ if (rhdrlen-16 != fread(rhdr.c1+16, 1, rhdrlen-16, fp)) return NULL; file_pos += rhdrlen; dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; if (name_len == entry_len && 0 == memcmp(rhdr.c1+16, entry, entry_len)) break; /* This is not the read you are looking for... */ fseek(fp, dlen, SEEK_CUR); } if (i == nreads) { /* Not found */ return NULL; } /* * Although we've decoded some bits already, we take the more modular * approach of packing the sections together and passing the entire * data structure off as a single-read SFF file to be decoded fully * by the sff reading code. */ if (NULL == (fake_file = (char *)xmalloc(chdrlen + rhdrlen + dlen))) return NULL; memcpy(fake_file, chdr.c1, chdrlen); memcpy(fake_file+chdrlen, rhdr.c1, rhdrlen); if (dlen != fread(fake_file+chdrlen+rhdrlen, 1, dlen, fp)) { xfree(fake_file); return NULL; } /* Convert to an mFILE and return */ return sff_single(fake_file, chdrlen+rhdrlen+dlen); } #endif /* * Searches for file in the directory 'dirname'. If it finds it, it opens * it. This also searches for compressed versions of the file in dirname * too. * * Returns mFILE pointer if found * NULL if not */ static mFILE *find_file_dir(char *file, char *dirname) { char path[PATH_MAX+1], path2[PATH_MAX+1]; size_t len = strlen(dirname); char *cp; if (dirname[len-1] == '/') len--; /* Special case for "./" or absolute filenames */ if (*file == '/' || (len==1 && *dirname == '.')) { sprintf(path, "%s", file); } else { /* Handle %[0-9]*s expansions, if required */ char *path_end = path; *path = 0; while ((cp = strchr(dirname, '%'))) { char *endp; long l = strtol(cp+1, &endp, 10); if (*endp != 's') { strncpy(path_end, dirname, (endp+1)-dirname); path_end += (endp+1)-dirname; dirname = endp+1; continue; } strncpy(path_end, dirname, cp-dirname); path_end += cp-dirname; if (l) { strncpy(path_end, file, l); path_end += MIN(strlen(file), l); file += MIN(strlen(file), l); } else { strcpy(path_end, file); path_end += strlen(file); file += strlen(file); } len -= (endp+1) - dirname; dirname = endp+1; } strncpy(path_end, dirname, len); path_end += MIN(strlen(dirname), len); *path_end = 0; if (*file) { *path_end++ = '/'; strcpy(path_end, file); } //fprintf(stderr, "*PATH=\"%s\"\n", path); } if (is_file(path)) { return mfopen(path, "rbm"); } /* * Given a pathname /a/b/c if a/b is a file and not a directory then * we'd get an ENOTDIR error. Instead we assume that a/b is an archive * and we attempt to work out what type by reading the first and last * bits of the file. */ if ((cp = strrchr(file, '/'))) { strcpy(path2, path); /* path contains / too as it's from file */ *strrchr(path2, '/') = 0; if (is_file(path2)) { /* Open the archive to test for magic numbers */ char magic[8]; FILE *fp; enum archive_type_t { NONE, HASH, TAR, SFF, SRF } type = NONE; if (NULL == (fp = fopen(path2, "rb"))) return NULL; memcpy(magic, "\0\0\0\0\0\0", 4); if (4 != fread(magic, 1, 4, fp)) return NULL; /* .hsh or .sff at start */ if (memcmp(magic, ".hsh", 4) == 0) type = HASH; else if (memcmp(magic, ".sff", 4) == 0) type = SFF; /* Or .hsh or Ihsh at the end */ if (NONE == type) { fseek(fp, -16, SEEK_END); if (8 != fread(magic, 1, 8, fp)) return NULL; if (memcmp(magic+4, ".hsh", 4) == 0) type = HASH; else if (memcmp(magic, "Ihsh", 4) == 0) type = SRF; } /* or ustar 257 bytes in to indicate un-hashed tar */ if (NONE == type) { fseek(fp, 257, SEEK_SET); if (5 != fread(magic, 1, 5, fp)) return NULL; if (memcmp(magic, "ustar", 5) == 0) type = TAR; } fclose(fp); switch (type) { case HASH: return find_file_hash(cp+1, path2); case TAR: return find_file_tar(cp+1, path2, 0); #ifndef SAMTOOLS case SFF: return find_file_sff(cp+1, path2); case SRF: return find_file_srf(cp+1, path2); #endif default: case NONE: break; } return NULL; } } return NULL; } /* * ------------------------------------------------------------------------ * Public functions below. */ /* * Opens a trace file named 'file'. This is initially looked for as a * pathname relative to a file named "relative_to". This may (for * example) be the name of an experiment file referencing the trace * file. In this case by passing relative_to as the experiment file * filename the trace file will be picked up in the same directory as * the experiment file. Relative_to may be supplied as NULL. * * 'file' is looked for at relative_to, then the current directory, and then * all of the locations listed in 'path' (which is a colon separated list). * If 'path' is NULL it uses the RAWDATA environment variable instead. * * Returns a mFILE pointer when found. * NULL otherwise. */ mFILE *open_path_mfile(char *file, char *path, char *relative_to) { char *newsearch; char *ele; mFILE *fp; /* Use path first */ if (!path) path = getenv("RAWDATA"); if (NULL == (newsearch = tokenise_search_path(path))) return NULL; /* * Step through the search path testing out each component. * We now look through each path element treating some prefixes as * special, otherwise we treat the element as a directory. */ for (ele = newsearch; *ele; ele += strlen(ele)+1) { int i; char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"}; for (i = 0; i < 6; i++) { char file2[1024]; char *ele2; /* * '|' prefixing a path component indicates that we do not * wish to perform the compression extension searching in that * location. */ if (*ele == '|') { ele2 = ele+1; } else { ele2 = ele; } sprintf(file2, "%s%s", file, suffix[i]); if (0 == strncmp(ele2, "TAR=", 4)) { if ((fp = find_file_tar(file2, ele2+4, 0))) { free(newsearch); return fp; } } else if (0 == strncmp(ele2, "HASH=", 5)) { if ((fp = find_file_hash(file2, ele2+5))) { free(newsearch); return fp; } #ifdef TRACE_ARCHIVE } else if (0 == strncmp(ele2, "ARC=", 4)) { if ((fp = find_file_archive(file2, ele2+4))) { free(newsearch); return fp; } #endif #ifndef SAMTOOLS #if defined(USE_WGET) || defined(HAVE_LIBCURL) } else if (0 == strncmp(ele2, "URL=", 4)) { if ((fp = find_file_url(file2, ele2+4))) { free(newsearch); return fp; } } else if (!strncmp(ele2, "http:", 5) || !strncmp(ele2, "ftp:", 4) || !strncmp(ele2, "https:", 6)) { /* ftp/http compression best done via other means */ if (i == 0 && (fp = find_file_url(file2, ele2))) { free(newsearch); return fp; } #endif } else if (0 == strncmp(ele2, "SFF=", 4)) { if ((fp = find_file_sff(file2, ele2+4))) { free(newsearch); return fp; } } else if (0 == strncmp(ele2, "SRF=", 4)) { if ((fp = find_file_srf(file2, ele2+4))) { free(newsearch); return fp; } #endif } else { if ((fp = find_file_dir(file2, ele2))) { free(newsearch); return fp; } } if (*ele == '|') break; } } free(newsearch); /* Look in the same location as the incoming 'relative_to' filename */ if (relative_to) { char *cp; char relative_path[PATH_MAX+1]; strcpy(relative_path, relative_to); if ((cp = strrchr(relative_path, '/'))) *cp = 0; if ((fp = find_file_dir(file, relative_path))) return fp; } return NULL; } FILE *open_path_file(char *file, char *path, char *relative_to) { mFILE *mf = open_path_mfile(file, path, relative_to); FILE *fp; if (!mf) return NULL; if (mf->fp) return mf->fp; /* Secure temporary file generation */ #ifdef _WIN32 if (NULL == (fp = tmpfile_win())) #else if (NULL == (fp = tmpfile())) #endif return NULL; /* Copy the data */ fwrite(mf->data, 1, mf->size, fp); rewind(fp); mfclose(mf); return fp; } static char *exp_path = NULL; static char *trace_path = NULL; void iolib_set_trace_path(char *path) { trace_path = path; } char *iolib_get_trace_path(void) { return trace_path; } void iolib_set_exp_path (char *path) { exp_path = path; } char *iolib_get_exp_path (void) { return exp_path; } /* * Trace file functions: uses TRACE_PATH environment variable. */ mFILE *open_trace_mfile(char *file, char *rel_to) { return open_path_mfile(file, trace_path ? trace_path : getenv("TRACE_PATH"), rel_to); } FILE *open_trace_file(char *file, char *rel_to) { return open_path_file(file, trace_path ? trace_path : getenv("TRACE_PATH"), rel_to); } /* * Trace file functions: uses EXP_PATH environment variable. */ mFILE *open_exp_mfile(char *file, char *relative_to) { return open_path_mfile(file, exp_path ? exp_path : getenv("EXP_PATH"), relative_to); } FILE *open_exp_file(char *file, char *relative_to) { return open_path_file(file, exp_path ? exp_path : getenv("EXP_PATH"), relative_to); }