/* phase.c -- phase subcommand. Copyright (C) 2011 Broad Institute. Copyright (C) 2013-2016 Genome Research Ltd. Author: Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include "htslib/hts.h" #include "htslib/sam.h" #include "htslib/kstring.h" #include "sam_opts.h" #include "samtools.h" #include "htslib/hts_os.h" #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) #define MAX_VARS 256 #define FLIP_PENALTY 2 #define FLIP_THRES 4 #define MASK_THRES 3 #define FLAG_FIX_CHIMERA 0x1 #define FLAG_LIST_EXCL 0x4 #define FLAG_DROP_AMBI 0x8 typedef struct { // configurations, initialized in the main function int flag, k, min_baseQ, min_varLOD, max_depth; // other global variables int vpos_shift; samFile* fp; bam_hdr_t* fp_hdr; char *pre; char *out_name[3]; samFile* out[3]; bam_hdr_t* out_hdr[3]; // alignment queue int n, m; bam1_t **b; } phaseg_t; typedef struct { int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! int vpos, beg, end; uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; uint32_t in:16, out:16; // in-phase and out-phase } frag_t, *frag_p; #define rseq_lt(a,b) ((a)->vpos < (b)->vpos) #include "htslib/khash.h" KHASH_SET_INIT_INT64(set64) KHASH_MAP_INIT_INT64(64, frag_t) typedef khash_t(64) nseq_t; #include "htslib/ksort.h" KSORT_INIT(rseq, frag_p, rseq_lt) static inline uint64_t X31_hash_string(const char *s) { uint64_t h = *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } static void count1(int l, const uint8_t *seq, int *cnt) { int i, j, n_ambi; uint32_t z, x; if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases if (seq[i] == 0) ++n_ambi; if (l - n_ambi <= 1) return; // only one SNP for (x = 0; x < 1u<>j&1; ++j; } z = z<<1 | c; } ++cnt[z]; } } static int **count_all(int l, int vpos, nseq_t *hash) { khint_t k; int i, j, **cnt; uint8_t *seq; seq = calloc(l, 1); cnt = calloc(vpos, sizeof(int*)); for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<vpos >= vpos || f->single) continue; // out of region; or singleton if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right f->single = 1; continue; } for (j = 1; j < f->vlen; ++j) { for (i = 0; i < l; ++i) seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)]; count1(l, seq, cnt[f->vpos + j]); } } } free(seq); return cnt; } // phasing static int8_t *dynaprog(int l, int vpos, int **w) { int *f[2], *curr, *prev, max, i; int8_t **b, *h = 0; uint32_t x, z = 1u<<(l-1), mask = (1u<>1; y1 = xc>>1; c0 = prev[y0] + wi[x] + wi[xc]; c1 = prev[y1] + wi[x] + wi[xc]; if (c0 > c1) bi[x] = 0, curr[x] = c0; else bi[x] = 1, curr[x] = c1; } tmp = prev; prev = curr; curr = tmp; // swap } { // backtrack uint32_t max_x = 0; int which = 0; h = calloc(vpos, 1); for (x = 0, max = 0, max_x = 0; x < z; ++x) if (prev[x] > max) max = prev[x], max_x = x; for (i = vpos - 1, x = max_x; i >= 0; --i) { h[i] = which? (~x&1) : (x&1); which = b[i][x]? !which : which; x = b[i][x]? (~x&mask)>>1 : x>>1; } } // free for (i = 0; i < vpos; ++i) free(b[i]); free(f[0]); free(f[1]); free(b); return h; } // phase each fragment static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) { khint_t k; uint64_t *pcnt; uint32_t *left, *rght, max; left = rght = 0; max = 0; pcnt = calloc(vpos, 8); for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { int i, c[2]; frag_t *f = &kh_val(hash, k); if (f->vpos >= vpos) continue; // get the phase c[0] = c[1] = 0; for (i = 0; i < f->vlen; ++i) { if (f->seq[i] == 0) continue; ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; } f->phase = c[0] > c[1]? 0 : 1; f->in = c[f->phase]; f->out = c[1 - f->phase]; f->phased = f->in == f->out? 0 : 1; f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0; // fix chimera f->flip = 0; if (flip && c[0] >= 3 && c[1] >= 3) { int sum[2], m, mi, md; if (f->vlen > max) { // enlarge the array max = f->vlen; kroundup32(max); left = realloc(left, max * 4); rght = realloc(rght, max * 4); } for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts if (f->seq[i]) { int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; ++sum[c == path[f->vpos + i]? 0 : 1]; } left[i] = sum[1]<<16 | sum[0]; } for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts if (f->seq[i]) { int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; ++sum[c == path[f->vpos + i]? 0 : 1]; } rght[i] = sum[1]<<16 | sum[0]; } // find the best flip point for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) { int a[2]; a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY; a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY; if (a[0] > a[1]) { if (a[0] > m) m = a[0], md = 0, mi = i; } else { if (a[1] > m) m = a[1], md = 1, mi = i; } } if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip f->flip = 1; if (md == 0) { // flip the tail for (i = mi + 1; i < f->vlen; ++i) if (f->seq[i] == 1) f->seq[i] = 2; else if (f->seq[i] == 2) f->seq[i] = 1; } else { // flip the head for (i = 0; i <= mi; ++i) if (f->seq[i] == 1) f->seq[i] = 2; else if (f->seq[i] == 2) f->seq[i] = 1; } } } // update pcnt[] if (!f->single) { for (i = 0; i < f->vlen; ++i) { int c; if (f->seq[i] == 0) continue; c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; if (c == path[f->vpos + i]) { if (f->phase == 0) ++pcnt[f->vpos + i]; else pcnt[f->vpos + i] += 1ull<<32; } else { if (f->phase == 0) pcnt[f->vpos + i] += 1<<16; else pcnt[f->vpos + i] += 1ull<<48; } } } } } free(left); free(rght); return pcnt; } static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n) { int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; uint64_t *list = 0; for (i = 0; i < vpos; ++i) { uint64_t x = pcnt[i]; int c[4], pre = score, s; c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); if (c[3] > c[2]) s += c[3] - c[2]; if (c[1] > c[0]) s += c[1] - c[0]; score += s; if (score < 0) score = 0; if (pre == 0 && score > 0) beg = i; // change from zero to non-zero if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { if (n == m) { m = m? m<<1 : 4; list = realloc(list, m * 8); } list[n++] = (uint64_t)beg<<32 | max_i; i = max_i; // reset i to max_i score = 0; } else if (score > max) max = score, max_i = i; if (score == 0) max = 0; } *_n = n; return list; } // trim heading and tailing ambiguous bases; mark deleted and remove sequence static int clean_seqs(int vpos, nseq_t *hash) { khint_t k; int ret = 0; for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { frag_t *f = &kh_val(hash, k); int beg, end, i; if (f->vpos >= vpos) { ret = 1; continue; } for (i = 0; i < f->vlen; ++i) if (f->seq[i] != 0) break; beg = i; for (i = f->vlen - 1; i >= 0; --i) if (f->seq[i] != 0) break; end = i + 1; if (end - beg <= 0) kh_del(64, hash, k); else { if (beg != 0) memmove(f->seq, f->seq + beg, end - beg); f->vpos += beg; f->vlen = end - beg; f->single = f->vlen == 1? 1 : 0; } } } return ret; } static int dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) { int i, is_flip, drop_ambi; drop_ambi = g->flag & FLAG_DROP_AMBI; is_flip = (drand48() < 0.5); for (i = 0; i < g->n; ++i) { int end, which; uint64_t key; khint_t k; bam1_t *b = g->b[i]; key = X31_hash_string(bam_get_qname(b)); end = bam_endpos(b); if (end > min_pos) break; k = kh_get(64, hash, key); if (k == kh_end(hash)) which = 3; else { frag_t *f = &kh_val(hash, k); if (f->ambig) which = drop_ambi? 2 : 3; else if (f->phased && f->flip) which = 2; else if (f->phased == 0) which = 3; else { // phased and not flipped char c = 'Y'; which = f->phase; bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c); } if (which < 2 && is_flip) which = 1 - which; // increase the randomness } if (which == 3) which = (drand48() < 0.5); if (sam_write1(g->out[which], g->out_hdr[which], b) < 0) { print_error_errno("phase", "error writing to '%s'", g->out_name[which]); return -1; } bam_destroy1(b); g->b[i] = 0; } memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); g->n -= i; return 0; } static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) { int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos; khint_t k; frag_t **seqs; int8_t *path, *sitemask; uint64_t *pcnt, *regmask; if (vpos == 0) return 0; i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos min_pos = i? cns[vpos]>>32 : 0x7fffffff; if (vpos == 1) { printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { frag_t *f = &kh_val(hash, k); if (f->vpos) continue; f->flip = 0; if (f->seq[0] == 0) f->phased = 0; else f->phased = 1, f->phase = f->seq[0] - 1; } } if (dump_aln(g, min_pos, hash) < 0) return -1; ++g->vpos_shift; return 1; } { // phase int **cnt; uint64_t *mask; printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); sitemask = calloc(vpos, 1); cnt = count_all(g->k, vpos, hash); path = dynaprog(g->k, vpos, cnt); for (i = 0; i < vpos; ++i) free(cnt[i]); free(cnt); pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking mask = genmask(vpos, pcnt, &n_masked); regmask = calloc(n_masked, 8); for (i = 0; i < n_masked; ++i) { regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j) sitemask[j] = 1; } free(mask); if (g->flag & FLAG_FIX_CHIMERA) { free(pcnt); pcnt = fragphase(vpos, path, hash, 1); } } for (i = 0; i < n_masked; ++i) printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); for (i = 0; i < vpos; ++i) { uint64_t x = pcnt[i]; int8_t c[2]; c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); } free(path); free(pcnt); free(regmask); free(sitemask); seqs = calloc(n_seqs, sizeof(frag_t*)); for (k = 0, i = 0; k < kh_end(hash); ++k) if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single) seqs[i++] = &kh_val(hash, k); n_seqs = i; ks_introsort_rseq(n_seqs, seqs); for (i = 0; i < n_seqs; ++i) { frag_t *f = seqs[i]; printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); for (j = 0; j < f->vlen; ++j) { uint32_t c = cns[f->vpos + j]; if (f->seq[j] == 0) putchar('N'); else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]); } printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); } free(seqs); printf("//\n"); fflush(stdout); g->vpos_shift += vpos; if (dump_aln(g, min_pos, hash) < 0) return -1; return vpos; } static void update_vpos(int vpos, nseq_t *hash) { khint_t k; for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { frag_t *f = &kh_val(hash, k); if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it else f->vpos -= vpos; } } } static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement { return hash; } static int readaln(void *data, bam1_t *b) { phaseg_t *g = (phaseg_t*)data; int ret; while (1) { ret = sam_read1(g->fp, g->fp_hdr, b); if (ret < 0) break; if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; if ( g->pre ) { if (g->n == g->m) { g->m = g->m? g->m<<1 : 16; g->b = realloc(g->b, g->m * sizeof(bam1_t*)); } g->b[g->n++] = bam_dup1(b); } break; } return ret; } static khash_t(set64) *loadpos(const char *fn, bam_hdr_t *h) { gzFile fp; kstream_t *ks; int ret, dret; kstring_t *str; khash_t(set64) *hash; hash = kh_init(set64); str = calloc(1, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int tid = bam_name2id(h, str->s); if (tid >= 0 && dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) >= 0) { uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); kh_put(set64, hash, x, &ret); } else break; } if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (dret < 0) break; } ks_destroy(ks); gzclose(fp); free(str->s); free(str); return hash; } static int gl2cns(float q[16]) { int i, j, min_ij; float min, min2; min = min2 = 1e30; min_ij = -1; for (i = 0; i < 4; ++i) { for (j = i; j < 4; ++j) { if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j]; else if (q[i<<2|j] < min2) min2 = q[i<<2|j]; } } return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; } static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat *fmt) { kstring_t s = { 0, 0, NULL }; ksprintf(&s, "%s.%s.%s", g->pre, middle, hts_format_file_extension(fmt)); g->out_name[c] = ks_release(&s); g->out[c] = sam_open_format(g->out_name[c], "wb", fmt); if (! g->out[c]) { print_error_errno("phase", "Failed to open output file '%s'", g->out_name[c]); return -1; } g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); return -1; } return 0; } int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; const bam_pileup1_t *plp; bam_plp_t iter; nseq_t *seqs; uint64_t *cns = 0; phaseg_t g; char *fn_list = 0; khash_t(set64) *set = 0; errmod_t *em; uint16_t *bases; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), { NULL, 0, NULL, 0 } }; // FIXME Leaks galore in the case of error returns memset(&g, 0, sizeof(phaseg_t)); g.flag = FLAG_FIX_CHIMERA; g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; while ((c = getopt_long(argc, argv, "Q:eFq:k:b:l:D:A:", lopts, NULL)) >= 0) { switch (c) { case 'D': g.max_depth = atoi(optarg); break; case 'q': g.min_varLOD = atoi(optarg); break; case 'Q': g.min_baseQ = atoi(optarg); break; case 'k': g.k = atoi(optarg); break; case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; case 'e': g.flag |= FLAG_LIST_EXCL; break; case 'A': g.flag |= FLAG_DROP_AMBI; break; case 'b': g.pre = strdup(optarg); break; case 'l': fn_list = strdup(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage=1; break; } if (usage) break; } if (usage || argc == optind) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools phase [options] \n\n"); fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); fprintf(stderr, " -F do not attempt to fix chimeras\n"); fprintf(stderr, " -A drop reads with ambiguous phase\n"); // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); fprintf(stderr, "\n"); sam_global_opt_help(stderr, "-....-"); return 1; } g.fp = sam_open_format(argv[optind], "r", &ga.in); if (!g.fp) { print_error_errno("phase", "Couldn't open '%s'", argv[optind]); return 1; } g.fp_hdr = sam_hdr_read(g.fp); if (g.fp_hdr == NULL) { fprintf(stderr, "[%s] Failed to read header for '%s'\n", __func__, argv[optind]); return 1; } if (fn_list) { // read the list of sites to phase set = loadpos(fn_list, g.fp_hdr); free(fn_list); } else g.flag &= ~FLAG_LIST_EXCL; if (g.pre) { // open BAMs to write if (ga.out.format == unknown_format) ga.out.format = bam; // default via "wb". // Open each output file g.out[0..2], dupping and writing the header if (start_output(&g, 0, "0", &ga.out) < 0 || start_output(&g, 1, "1", &ga.out) < 0 || start_output(&g, 2, "chimera", &ga.out) < 0) return 1; } iter = bam_plp_init(readaln, &g); g.vpos_shift = 0; seqs = kh_init(64); em = errmod_init(1. - 0.83); bases = calloc(g.max_depth, 2); printf("CC\n"); printf("CC\tDescriptions:\nCC\n"); printf("CC\t CC comments\n"); printf("CC\t PS start of a phase set\n"); printf("CC\t FL filtered region\n"); printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); printf("CC\t EV supporting reads; SAM format\n"); printf("CC\t // end of a phase set\nCC\n"); printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); printf("CC\t FL chr filterStart filterEnd\n"); printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); printf("CC\nCC\n"); fflush(stdout); while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { int i, k, c, tmp, dophase = 1, in_set = 0; float q[16]; if (tid < 0) break; if (tid != lasttid) { // change of chromosome g.vpos_shift = 0; if (lasttid >= 0) { seqs = shrink_hash(seqs); if (phase(&g, g.fp_hdr->target_name[lasttid], vpos, cns, seqs) < 0) { return 1; } update_vpos(0x7fffffff, seqs); } lasttid = tid; vpos = 0; } if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1; if (n > g.max_depth) continue; // do not proceed if the depth is too high // fill the bases array and check if there is a variant for (i = k = 0; i < n; ++i) { const bam_pileup1_t *p = plp + i; uint8_t *seq; int q, baseQ, b; if (p->is_del || p->is_refskip) continue; baseQ = bam_get_qual(p->b)[p->qpos]; if (baseQ < g.min_baseQ) continue; seq = bam_get_seq(p->b); b = seq_nt16_int[bam_seqi(seq, p->qpos)]; if (b > 3) continue; q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; if (q < 4) q = 4; if (q > 63) q = 63; bases[k++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; } if (k == 0) continue; errmod_cal(em, k, 4, bases, q); // compute genotype likelihood c = gl2cns(q); // get the consensus // tell if to proceed if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant // add the variant if (vpos == max_vpos) { max_vpos = max_vpos? max_vpos<<1 : 128; cns = realloc(cns, max_vpos * 8); } cns[vpos] = (uint64_t)pos<<32 | c; for (i = 0; i < n; ++i) { const bam_pileup1_t *p = plp + i; uint64_t key; khint_t k; uint8_t *seq = bam_get_seq(p->b); frag_t *f; if (p->is_del || p->is_refskip) continue; if (p->b->core.qual == 0) continue; // get the base code c = seq_nt16_int[bam_seqi(seq, p->qpos)]; if (c == (cns[vpos]&3)) c = 1; else if (c == (cns[vpos]>>16&3)) c = 2; else c = 0; // write to seqs key = X31_hash_string(bam_get_qname(p->b)); k = kh_put(64, seqs, key, &tmp); f = &kh_val(seqs, k); if (tmp == 0) { // present in the hash table if (vpos - f->vpos + 1 < MAX_VARS) { f->vlen = vpos - f->vpos + 1; f->seq[f->vlen-1] = c; f->end = bam_endpos(p->b); } dophase = 0; } else { // absent memset(f->seq, 0, MAX_VARS); f->beg = p->b->core.pos; f->end = bam_endpos(p->b); f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0; } } if (dophase) { seqs = shrink_hash(seqs); if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { return 1; } update_vpos(vpos, seqs); cns[0] = cns[vpos]; vpos = 0; } ++vpos; } if (tid >= 0) { if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { return 1; } } bam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); kh_destroy(64, seqs); kh_destroy(set64, set); free(cns); errmod_destroy(em); free(bases); if (g.pre) { int res = 0; for (c = 0; c <= 2; ++c) { if (sam_close(g.out[c]) < 0) { fprintf(stderr, "[%s] error on closing '%s'\n", __func__, g.out_name[c]); res = 1; } bam_hdr_destroy(g.out_hdr[c]); free(g.out_name[c]); } free(g.pre); free(g.b); if (res) return 1; } sam_global_args_free(&ga); return 0; }