#!/usr/bin/env python # # Copyright 2017, Daehwan Kim # # This file is part of HISAT-genotype. # # HISAT-genotype is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HISAT-genotype is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with HISAT-genotype. If not, see . # import sys, os, subprocess, re import math import random from copy import deepcopy from datetime import datetime ################################################## # Sequence processing routines ################################################## """ """ def reverse_complement(seq): comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} rc_seq = "" for s in reversed(seq): if s in comp_table: rc_seq += comp_table[s] else: rc_seq += s return rc_seq """ """ def read_genome(genome_file): chr_dic, chr_names, chr_full_names = {}, [], [] chr_name, chr_full_name, sequence = "", "", "" for line in genome_file: if line.startswith(">"): if chr_name and sequence: chr_dic[chr_name] = sequence chr_names.append(chr_name) chr_full_name = line.strip()[1:] chr_name = line.strip().split()[0][1:] chr_full_names.append(chr_full_name) sequence = "" else: sequence += line.strip() if chr_name and sequence: chr_dic[chr_name] = sequence chr_names.append(chr_name) chr_full_names.append(chr_full_name) return chr_dic, chr_names, chr_full_names ################################################## # Alleles, variants, haplotypes, etc. ################################################## """ """ def read_allele_sequences(fname): allele_seqs = {} allele_name, sequence = "", "" for line in open(fname): if line.startswith(">"): if allele_name != "" and allele_name not in allele_seqs: allele_seqs[allele_name] = sequence allele_name = line.strip()[1:] sequence = "" else: sequence += line.strip() if allele_name != "" and allele_name not in allele_seqs: allele_seqs[allele_name] = sequence return allele_seqs """ """ def read_variants(fname): allele_vars = {} for line in open(fname): var_id, type, allele_name, left, data = line.strip().split() left = int(left) if type == "deletion": data = int(data) if allele_name not in allele_vars: allele_vars[allele_name] = [] allele_vars[allele_name].append([left, type, data, var_id]) return allele_vars """ """ def read_haplotypes(fname): allele_haplotypes = {} for line in open(fname): haplotype_id, allele_name, left, right, vars = line.strip().split() vars = vars.split(',') left, right = int(left), int(right) if allele_name not in allele_haplotypes: allele_haplotypes[allele_name] = [] allele_haplotypes[allele_name].append([left, right, vars]) return allele_haplotypes """ """ def read_links(fname): links = [] for line in open(fname): var_id, allele_names = line.strip().split('\t') links.append([var_id, allele_names]) return links """ Compare two variants """ def compare_vars(a, b): a_pos, a_type, a_data = a[:3] b_pos, b_type, b_data = b[:3] if a_pos != b_pos: return a_pos - b_pos if a_type != b_type: if a_type == 'I': return -1 elif b_type == 'I': return 1 if a_type == 'S': return -1 else: return 1 if a_data < b_data: return -1 elif a_data > b_data: return 1 else: return 0 """ """ def lower_bound(Var_list, pos): low, high = 0, len(Var_list) while low < high: m = (low + high) / 2 m_pos = Var_list[m][0] if m_pos < pos: low = m + 1 elif m_pos > pos: high = m else: assert m_pos == pos while m > 0: if Var_list[m-1][0] < pos: break m -= 1 return m return low """ """ def check_files(fnames): for fname in fnames: if not os.path.exists(fname): return False return True ################################################## # Database releated routines ################################################## """ Download GRCh38 human reference and HISAT2 indexes """ def download_genome_and_index(): HISAT2_fnames = ["grch38", "genome.fa", "genome.fa.fai"] if not check_files(HISAT2_fnames): os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz") os.system("hisat2-inspect grch38/genome > genome.fa") os.system("samtools faidx genome.fa") """ """ def clone_hisatgenotype_database(): os.system("git clone https://github.com/infphilo/hisatgenotype_db.git") # Check out one particular revision just to have the same data across multiple computers # revision = "d3b559b34b96ff9e7f0d97476222d8e4cdee63ad" # Revision on November 16, 2016 # os.system("cd IMGTHLA; git checkout %s; cd .." % revision) """ """ def extract_database_if_not_exists(base, locus_list, inter_gap = 30, intra_gap = 50, partial = True, verbose = False): fnames = [base + "_backbone.fa", base + "_sequences.fa", base + ".locus", base + ".snp", base + ".index.snp", base + ".haplotype", base + ".link", base + ".partial"] if check_files(fnames): return extract_cmd = ["hisatgenotype_extract_vars.py"] extract_cmd += ["--base", base] if len(locus_list) > 0: extract_cmd += ["--locus-list", ','.join(locus_list)] if not partial: extract_cmd += ["--no-partial"] if base == "codis": extract_cmd += ["--whole-haplotype"] else: extract_cmd += ["--inter-gap", str(inter_gap), "--intra-gap", str(intra_gap)] if base == "hla": extract_cmd += ["--min-var-freq", "0.1"] if base == "codis": extract_cmd += ["--leftshift"] # DK - debugging purposes # extract_cmd += ["--ext-seq", "300"] if verbose: print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) proc.communicate() if not check_files(fnames): print >> sys.stderr, "Error: hisatgenotype_extract_vars failed!" sys.exit(1) """ """ def build_index_if_not_exists(base, aligner, index_type, threads = 1, verbose = False): if aligner == "hisat2": # Build HISAT2 graph indexes based on the above information if index_type == "graph": hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base, i+1) for i in range(8)] if not check_files(hisat2_graph_index_fnames): build_cmd = ["hisat2-build", "-p", str(threads), "--snp", "%s.index.snp" % base, "--haplotype", "%s.haplotype" % base, "%s_backbone.fa" % base, "%s.graph" % base] if verbose: print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) proc.communicate() if not check_files(hisat2_graph_index_fnames): print >> sys.stderr, "Error: indexing HLA failed! Perhaps, you may have forgotten to build hisat2 executables?" sys.exit(1) # Build HISAT2 linear indexes based on the above information else: assert index_type == "linear" hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base, i+1) for i in range(8)] if not check_files(hisat2_linear_index_fnames): build_cmd = ["hisat2-build", "%s_backbone.fa,%s_sequences.fa" % (base, base), "%s.linear" % base] proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) proc.communicate() if not check_files(hisat2_linear_index_fnames): print >> sys.stderr, "Error: indexing HLA failed!" sys.exit(1) else: # Build Bowtie2 indexes based on the above information assert aligner == "bowtie2" and index_type == "linear" bowtie2_index_fnames = ["%s.%d.bt2" % (base, i+1) for i in range(4)] bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base, i+1) for i in range(2)] if not tcheck_files(bowtie2_index_fnames): build_cmd = ["bowtie2-build", "%s_backbone.fa,%s_sequences.fa" % (base, base), base] proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w')) proc.communicate() if not check_files(bowtie2_index_fnames): print >> sys.stderr, "Error: indexing HLA failed!" sys.exit(1) ################################################## # Read simulation and alignment ################################################## """ Simulate reads from alleles with headers (>) filled with mapping information. For an example, see hisat2_test_HLA_genotyping.py. """ def simulate_reads(seq_dic, # seq_dic["A"]["A*24:36N"] = "ACGTCCG ..." base_fname, # hla, codis, cyp, or so on allele_list, # ["A*32:29", "B*07:02:01"] Vars, # Vars["A"]["hv326"] = ["single", 604, "C"] Links, simulate_interval = 1, read_len = 100, frag_len = 250, perbase_errorrate = 0.0, perbase_snprate = 0.0, skip_fragment_regions = []): reads_1, reads_2 = [], [] num_pairs = [] for allele_names in allele_list: gene = allele_names[0].split('*')[0] num_pairs.append([]) # Introduce SNPs into allele sequences def introduce_snps(seq): seq = list(seq) for i in range(len(seq)): if random.random() * 100 < perbase_snprate: if seq[i] == 'A': alt_bases = ['C', 'G', 'T'] elif seq[i] == 'C': alt_bases = ['A', 'G', 'T'] elif seq[i] == 'G': alt_bases = ['A', 'C', 'T'] else: assert seq[i] == 'T' alt_bases = ['A', 'C', 'G'] random.shuffle(alt_bases) alt_base = alt_bases[0] seq[i] = alt_base seq = ''.join(seq) return seq # Simulate reads from two alleles def simulate_reads_impl(seq, seq_map, ex_seq_map, ex_seq, ex_desc, simulate_interval = 1, read_len = 100, frag_len = 250, perbase_errorrate = 0.0, skip_fragment_regions = []): # Introduce sequencing errors def introduce_seq_err(read_seq, pos): read_seq = list(read_seq) for i in range(read_len): map_pos = seq_map[pos + i] if ex_desc[map_pos] != "": continue if random.random() * 100 < perbase_errorrate: if read_seq[i] == 'A': alt_bases = ['C', 'G', 'T'] elif read_seq[i] == 'C': alt_bases = ['A', 'G', 'T'] elif read_seq[i] == 'G': alt_bases = ['A', 'C', 'T'] else: assert read_seq[i] == 'T' alt_bases = ['A', 'C', 'G'] random.shuffle(alt_bases) alt_base = alt_bases[0] read_seq[i] = alt_base read_seq = ''.join(read_seq) return read_seq # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266 def get_info(read_seq, pos): info = "%d_" % (seq_map[pos] + 1) total_match, match, sub_match = 0, 0, 0 var_str = "" ins_len, ins_var = 0, "" for i in range(pos, pos + read_len): map_i = ex_seq_map[i] assert ex_seq[map_i] != 'D' total_match += 1 match += 1 if ex_seq[map_i] == 'I': if ins_var != "": assert ins_var == ex_desc[map_i] ins_var = ex_desc[map_i] ins_len += 1 elif ins_var != "": if var_str != "": var_str += ',' var_str += ("%s|I|%s" % (sub_match, ins_var)) ins_len, ins_var = 0, "" sub_match = 0 if ex_seq[map_i] != 'I': if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]: if var_str != "": var_str += ',' var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown")) sub_match = 0 else: sub_match += 1 if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D': assert match > 0 info += ("%dM" % match) match = 0 del_len = 1 while map_i + 1 + del_len < len(ex_seq): if ex_seq[map_i + 1 + del_len] != 'D': break del_len += 1 info += ("%dD" % del_len) if var_str != "": var_str += ',' var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1])) sub_match = 0 assert match > 0 info += ("%dM" % match) assert total_match == read_len if var_str: info += "_" info += var_str return info comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} reads_1, reads_2 = [], [] for i in range(0, len(seq) - frag_len + 1, simulate_interval): if len(skip_fragment_regions) > 0: skip = False for skip_left, skip_right in skip_fragment_regions: if i <= skip_right and i + frag_len > skip_left: skip = True break if skip: continue pos1 = i seq1 = seq[pos1:pos1+read_len] if perbase_errorrate > 0.0: seq1 = introduce_seq_err(seq1, pos1) info1 = get_info(seq1, pos1) reads_1.append([seq1, info1]) pos2 = i + frag_len - read_len seq2 = seq[pos2:pos2+read_len] if perbase_errorrate > 0.0: seq2 = introduce_seq_err(seq2, pos2) info2 = get_info(seq2, pos2) tmp_read_2 = reversed(seq2) read_2 = "" for s in tmp_read_2: if s in comp_table: read_2 += comp_table[s] else: read_2 += s reads_2.append([read_2, info2]) return reads_1, reads_2 # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01'] for allele_name in allele_names: allele_seq = seq_dic[gene][allele_name] backbone_seq = seq_dic[gene]["%s*BACKBONE" % gene] allele_ex_seq = list(backbone_seq) allele_ex_desc = [''] * len(allele_ex_seq) allele_seq_map = [i for i in range(len(allele_seq))] allele_ex_seq_map = [i for i in range(len(allele_seq))] if perbase_snprate > 0: HLA_seq = introduce_snps(allele_seq) # Extract variants included in each allele var_ids = [] for var_id, allele_list in Links.items(): if allele_name in allele_list: var_ids.append(var_id) def var_cmp(a, b): assert a.startswith("hv") and b.startswith("hv") return int(a[2:]) - int(b[2:]) var_ids = sorted(var_ids, cmp=var_cmp) # Build annotated sequence for the allele w.r.t backbone sequence add_pos = 0 for var_id in var_ids: var_type, var_pos, var_data = Vars[gene][var_id] var_pos += add_pos if var_type == "single": allele_ex_seq[var_pos] = var_data allele_ex_desc[var_pos] = var_id elif var_type == "deletion": del_len = int(var_data) assert var_pos + del_len <= len(allele_ex_seq) allele_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len allele_ex_desc[var_pos:var_pos+del_len] = [var_id] * del_len else: assert var_type == "insertion" ins_len = len(var_data) allele_ex_seq = allele_ex_seq[:var_pos] + (['I'] * ins_len) + allele_ex_seq[var_pos:] allele_ex_desc = allele_ex_desc[:var_pos] + ([var_id] * ins_len) + allele_ex_desc[var_pos:] add_pos += ins_len allele_ex_seq = ''.join(allele_ex_seq) assert len(backbone_seq) + add_pos == len(allele_ex_seq) # Build mapping from the allele to the annotated sequence prev_j, minus_pos = 0, 0 for i in range(len(allele_seq)): for j in range(prev_j, len(allele_ex_seq)): if allele_ex_seq[j] != 'D': if allele_ex_seq[j] == 'I': minus_pos += 1 break allele_seq_map[i] = j - minus_pos allele_ex_seq_map[i] = j prev_j = j + 1 # DK - debugging purposes """ for t in range(0, len(allele_ex_seq), 100): print t, allele_ex_seq[t:t+100] print t, '-'.join(allele_ex_desc[t:t+100]) print t, allele_seq_map[t:t+100] print "allele_seq length:", len(allele_seq) print len(allele_ex_seq), "vs.", len(seq_dic[gene]["A*BACKBONE"]), "vs.", len(allele_seq_map) print allele_ex_seq[1943:1946] print allele_ex_desc[1943:1946] sys.exit(1) """ tmp_reads_1, tmp_reads_2 = simulate_reads_impl(allele_seq, allele_seq_map, allele_ex_seq_map, allele_ex_seq, allele_ex_desc, simulate_interval, read_len, frag_len, perbase_errorrate, skip_fragment_regions) reads_1 += tmp_reads_1 reads_2 += tmp_reads_2 num_pairs[-1].append(len(tmp_reads_1)) # Write reads into a FASTA file def write_reads(reads, idx): read_file = open('%s_input_%d.fa' % (base_fname, idx), 'w') for read_i in range(len(reads)): query_name = "%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1]) if len(query_name) > 254: query_name = query_name[:254] print >> read_file, ">%s" % query_name print >> read_file, reads[read_i][0] read_file.close() write_reads(reads_1, 1) write_reads(reads_2, 2) return num_pairs """ Align reads, and sort the alignments into a BAM file """ def align_reads(aligner, simulation, index_name, index_type, base_fname, read_fname, fastq, threads, out_fname, verbose): if aligner == "hisat2": aligner_cmd = [aligner, "--mm"] if not simulation: aligner_cmd += ["--no-unal"] DNA = True if DNA: aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment aligner_cmd += ["-X", "1000"] # max fragment length if index_type == "linear": aligner_cmd += ["-k", "10"] else: aligner_cmd += ["--max-altstried", "64"] aligner_cmd += ["--haplotype"] if base_fname == "codis": aligner_cmd += ["--enable-codis"] elif aligner == "bowtie2": aligner_cmd = [aligner, "--no-unal", "-k", "10"] else: assert False aligner_cmd += ["-x", index_name] assert len(read_fname) in [1,2] aligner_cmd += ["-p", str(threads)] if not fastq: aligner_cmd += ["-f"] if len(read_fname) == 1: aligner_cmd += ["-U", read_fname[0]] else: aligner_cmd += ["-1", "%s" % read_fname[0], "-2", "%s" % read_fname[1]] if verbose >= 1: print >> sys.stderr, ' '.join(aligner_cmd) align_proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) sambam_cmd = ["samtools", "view", "-bS", "-"] sambam_proc = subprocess.Popen(sambam_cmd, stdin=align_proc.stdout, stdout=open(out_fname + ".unsorted", 'w'), stderr=open("/dev/null", 'w')) sambam_proc.communicate() if index_type == "graph": bamsort_cmd = ["samtools", "sort", out_fname + ".unsorted", "-o", out_fname] bamsort_proc = subprocess.Popen(bamsort_cmd, stderr=open("/dev/null", 'w')) bamsort_proc.communicate() bamindex_cmd = ["samtools", "index", out_fname] bamindex_proc = subprocess.Popen(bamindex_cmd, stderr=open("/dev/null", 'w')) bamindex_proc.communicate() os.system("rm %s" % (out_fname + ".unsorted")) """ HISAT-genotype's mpileup """ def get_mpileup(alignview_cmd, ref_seq, base_locus, vars, allow_discordant): ref_seq_len = len(ref_seq) mpileup = [] for i in range(ref_seq_len): mpileup.append([[], {}]) proc = subprocess.Popen(alignview_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) prev_pos = -1 cigar_re = re.compile('\d+\w') for line in proc.stdout: line = line.strip() cols = line.split() read_id, flag, _, pos, _, cigar_str = cols[:6] read_seq = cols[9] flag, pos = int(flag), int(pos) # Unalined? if flag & 0x4 != 0: continue pos -= (base_locus + 1) if pos < 0: continue # Concordantly mapped? if flag & 0x2 != 0: concordant = True else: concordant = False if not allow_discordant and not concordant: continue read_pos, left_pos = 0, pos right_pos = left_pos cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] for i in range(len(cigars)): cigar_op, length = cigars[i] if cigar_op in "MD": for j in range(length): if cigar_op == 'M': read_nt = read_seq[read_pos + j] else: read_nt = 'D' if right_pos + j < len(mpileup): if read_nt not in mpileup[right_pos + j][1]: mpileup[right_pos + j][1][read_nt] = 1 else: mpileup[right_pos + j][1][read_nt] += 1 if cigar_op in "MND": right_pos += length if cigar_op in "MIS": read_pos += length # Choose representative bases or 'D' for i in range(len(mpileup)): nt_dic = mpileup[i][1] num_nt = sum(nt_dic.values()) nt_set = [] if num_nt >= 20: for nt, count in nt_dic.items(): if nt not in "ACGT": continue if count >= num_nt * 0.2 or count >= 7: nt_set.append(nt) mpileup[i][0] = nt_set # Sort variants var_list = [[] for i in range(len(mpileup))] for var_id, value in vars.items(): var_type, var_pos, var_data = value assert var_pos < len(var_list) var_list[var_pos].append([var_id, var_type, var_data]) # Assign known or unknown variants skip_i, prev_del_var_id = -1, "" for i in range(len(mpileup)): nt_dic = mpileup[i][1] ref_nt = ref_seq[i] new_nt_dic = {} for nt, count in nt_dic.items(): var_id = "" if nt == 'D': if i <= skip_i: assert prev_del_var_id != "" var_id = prev_del_var_id else: for var_id_, var_type, var_data in var_list[i]: if var_type != "deletion": continue del_len = int(var_data) del_exist = True for j in range(i + 1, i + del_len): assert j < len(mpileup) nt_dic2 = mpileup[j][1] if 'D' not in nt_dic2: del_exist = False break if del_exist: var_id = var_id_ prev_del_var_id = var_id skip_i = i + del_len - 1 break elif nt != 'N' and nt != ref_nt: assert nt in "ACGT" id = "unknown" for var_id_, var_type, var_data in var_list[i]: if var_type != "single": continue if nt == var_data: var_id = var_id_ break new_nt_dic[nt] = [count, var_id] mpileup[i][1] = new_nt_dic return mpileup """ """ def get_pair_interdist(alignview_cmd, simulation, verbose): bamview_proc = subprocess.Popen(alignview_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting alignview_proc = subprocess.Popen(sort_read_cmd, stdin=bamview_proc.stdout, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) dist_list = [] prev_read_id = None cigar_re = re.compile('\d+\w') reads = [] for line in alignview_proc.stdout: line = line.strip() cols = line.split() read_id, flag, _, pos, _, cigar_str = cols[:6] read_seq = cols[9] flag, pos = int(flag), int(pos) # Unalined? if flag & 0x4 != 0: continue if simulation: read_id = read_id.split('|')[0] # Concordantly mapped? if flag & 0x2 != 0: concordant = True else: concordant = False NH, YT = sys.maxint, "" for i in range(11, len(cols)): col = cols[i] if col.startswith("NH"): NH = int(col[5:]) elif col.startswith("YT"): YT = col[5:] if NH > 1 or YT != "CP": continue if prev_read_id != None and read_id != prev_read_id: if len(reads) == 2: left1, right1 = reads[0] left2, right2 = reads[1] if left1 <= left2: dist = left2 - right1 - 1 else: dist = left1 - right2 - 1 dist_list.append(dist) reads = [] left_pos = right_pos = pos cigars = cigar_re.findall(cigar_str) cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] for i in range(len(cigars)): cigar_op, length = cigars[i] if cigar_op in "MND": right_pos += length reads.append([left_pos, right_pos - 1]) prev_read_id = read_id dist_list = sorted(dist_list) dist_avg = sum(dist_list) / max(1, len(dist_list)) if len(dist_list) > 0: dist_median = dist_list[len(dist_list)/2] else: dist_median = -1 return dist_median ################################################## # Statistical routines ################################################## """ """ def prob_diff(prob1, prob2): diff = 0.0 for allele in prob1.keys(): if allele in prob2: diff += abs(prob1[allele] - prob2[allele]) else: diff += prob1[allele] return diff """ """ def Gene_prob_cmp(a, b): if a[1] != b[1]: if a[1] < b[1]: return 1 else: return -1 assert a[0] != b[0] if a[0] < b[0]: return -1 else: return 1 """ """ def single_abundance(Gene_cmpt, Gene_length, exonic = False): def normalize(prob): total = sum(prob.values()) for allele, mass in prob.items(): prob[allele] = mass / total def normalize_len(prob, length): total = 0 for allele, mass in prob.items(): assert allele in length total += (mass / length[allele]) for allele, mass in prob.items(): assert allele in length prob[allele] = mass / length[allele] / total Gene_prob, Gene_prob_next = {}, {} for cmpt, count in Gene_cmpt.items(): alleles = cmpt.split('-') for allele in alleles: if allele not in Gene_prob: Gene_prob[allele] = 0.0 Gene_prob[allele] += (float(count) / len(alleles)) if exonic: normalize(Gene_prob) else: normalize_len(Gene_prob, Gene_length) def next_prob(Gene_cmpt, Gene_prob, Gene_length): Gene_prob_next = {} for cmpt, count in Gene_cmpt.items(): alleles = cmpt.split('-') alleles_prob = 0.0 for allele in alleles: if allele not in Gene_prob: continue alleles_prob += Gene_prob[allele] if alleles_prob <= 0.0: continue for allele in alleles: if allele not in Gene_prob: continue if allele not in Gene_prob_next: Gene_prob_next[allele] = 0.0 Gene_prob_next[allele] += (float(count) * Gene_prob[allele] / alleles_prob) if exonic: normalize(Gene_prob_next) else: normalize_len(Gene_prob_next, Gene_length) return Gene_prob_next def select_alleles(Gene_prob): if len(Gene_prob) == 0: return Gene_prob Gene_prob2 = {} max_prob = max(Gene_prob.values()) for allele, prob in Gene_prob.items(): if prob >= max_prob / 10.0: Gene_prob2[allele] = prob return Gene_prob2 fast_EM = True diff, iter = 1.0, 0 while diff > 0.0001 and iter < 1000: Gene_prob_next = next_prob(Gene_cmpt, Gene_prob, Gene_length) if fast_EM: # Accelerated version of EM - SQUAREM iteration # Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335-353 (2008) # Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html Gene_prob_next2 = next_prob(Gene_cmpt, Gene_prob_next, Gene_length) sum_squared_r, sum_squared_v = 0.0, 0.0 p_r, p_v = {}, {} for a in Gene_prob.keys(): p_r[a] = Gene_prob_next[a] - Gene_prob[a] sum_squared_r += (p_r[a] * p_r[a]) p_v[a] = Gene_prob_next2[a] - Gene_prob_next[a] - p_r[a] sum_squared_v += (p_v[a] * p_v[a]) if sum_squared_v > 0.0: gamma = -math.sqrt(sum_squared_r / sum_squared_v) for a in Gene_prob.keys(): Gene_prob_next2[a] = max(0.0, Gene_prob[a] - 2 * gamma * p_r[a] + gamma * gamma * p_v[a]); Gene_prob_next = next_prob(Gene_cmpt, Gene_prob_next2, Gene_length) diff = prob_diff(Gene_prob, Gene_prob_next) Gene_prob = Gene_prob_next # Accelerate convergence if iter >= 10: Gene_prob = select_alleles(Gene_prob) # DK - debugging purposes if iter % 10 == 0 and False: print >> sys.stderr, "iter", iter for allele, prob in Gene_prob.items(): if prob >= 0.01: print >> sys.stderr, "\t", iter, allele, prob iter += 1 Gene_prob = select_alleles(Gene_prob) if exonic: normalize(Gene_prob) else: normalize_len(Gene_prob, Gene_length) Gene_prob = [[allele, prob] for allele, prob in Gene_prob.items()] Gene_prob = sorted(Gene_prob, cmp=Gene_prob_cmp) return Gene_prob ################################################## # Realignment, alternative alignments ################################################## """ Identify alternative haplotypes insertions are not considered... INPUT: see the function's parameters below OUPUT: 529-hv8-hv22-606: set(['529-hv13-570', '529-hv4-hv18-590', '529-hv2-hv16-582']) 529-hv3-hv17-598: set(['529-hv6-hv21-hv26-610']) """ def get_alternatives(ref_seq, # GATAACTAGATACATGAGATAGATTTGATAGATAGATAGATACATACATACATACATACATACAGGATAGATAACTAGG... allele_vars, # {'VWA*20(22)': ['hv231', 'hv245'], "VWA*16(18')": ['hv235', 'hv250', 'hv256'], ...} Vars, # {'hv241': ['deletion', 529, '52'], 'hv240': ['deletion', 529, '48'], ... } Var_list, # [[529, 'hv230'], [529, 'hv231'], [529, 'hv232'], [529, 'hv233'], ...] verbose): haplotype_alts_left, haplotype_alts_right = {}, {} second_order_haplotypes = set() for allele_name, vars in allele_vars.items(): for v in range(len(vars) - 1): ht = vars[v] + "-" + vars[v+1] second_order_haplotypes.add(ht) rev_Var_list = [] for _, var_id in Var_list: var_type, var_pos, var_data = Vars[var_id] if var_type == "deletion": var_pos = var_pos + int(var_data) - 1 elif var_type == "insertion": var_pos += 1 rev_Var_list.append([var_pos, var_id]) rev_Var_list = sorted(rev_Var_list, cmp=lambda a, b: a[0] - b[0]) def nextbases(haplotype, left = True, exclude_list = []): if left: pos = int(haplotype[0]) - 1 else: pos = haplotype[-1] + 1 if pos < 0 or pos >= len(ref_seq): return [] if left: bases = [[[pos] + haplotype[1:], ref_seq[pos]]] prev_id = None if len(haplotype) > 2: prev_id = haplotype[1] var_i = lower_bound(rev_Var_list, pos + 1) for var_j in reversed(range(0, var_i)): _, var_id = rev_Var_list[var_j] var_type, var_pos, var_data = Vars[var_id] if var_type == "deletion": if var_pos == 0: continue var_pos = var_pos + int(var_data) - 1 if var_pos > pos: continue if var_pos < pos: break if var_id in exclude_list: continue if prev_id: second_ht = var_id + "-" + prev_id if second_ht not in second_order_haplotypes: continue if var_type == "single": bases.append([[var_pos, var_id] + haplotype[1:], var_data]) elif var_type == "deletion": bases2 = nextbases([var_pos - int(var_data) + 1, var_id] + haplotype[1:], left, exclude_list) bases += bases2 else: assert var_type == "insertion" else: bases = [[haplotype[:-1] + [pos], ref_seq[pos]]] prev_id = None if len(haplotype) > 2: prev_id = haplotype[-2] var_i = lower_bound(Var_list, pos) for var_j in range(var_i, len(Var_list)): _, var_id = Var_list[var_j] var_type, var_pos, var_data = Vars[var_id] if var_pos < pos: continue if var_pos > pos: break if var_id in exclude_list: continue if prev_id: second_ht = prev_id + "-" + var_id if second_ht not in second_order_haplotypes: continue if var_type == "single": bases.append([haplotype[:-1] + [var_id, var_pos], var_data]) elif var_type == "deletion": bases2 = nextbases(haplotype[:-1] + [var_id, var_pos + int(var_data) - 1], left, exclude_list) bases += bases2 else: assert var_type == "insertion" return bases def get_haplotype_seq(haplotype): seq = "" pos = int(haplotype[0]) for i in range(1, len(haplotype) - 1): var_id = haplotype[i] var_type, var_pos, var_data = Vars[var_id] if pos < var_pos: seq += ref_seq[pos:var_pos] if var_type == "single": seq += var_data pos = var_pos + 1 elif var_type == "deletion": pos = var_pos + int(var_data) else: assert var_type == "insertion" seq += var_data pos = var_pos last_pos = int(haplotype[-1]) + 1 assert pos <= last_pos if pos < last_pos: seq += ref_seq[pos:last_pos] return seq def get_alternative_recur(var_orig_id, haplotype, haplotype_alt, left = True, dep = 0): bases1 = nextbases(haplotype, left) bases2 = nextbases(haplotype_alt, left, [var_orig_id]) # exclude found = False for base1 in bases1: next_haplotype, bp = base1 for base2 in bases2: next_haplotype_alt, bp2 = base2 if bp != bp2: continue # Todo: implement a routine to handle haplotypes ending with the same coordinate if left: left1, left2 = int(next_haplotype[0]), int(next_haplotype_alt[0]) if left1 == left2: continue else: right1, right2 = int(next_haplotype[-1]), int(next_haplotype_alt[-1]) if right1 == right2: continue found = True get_alternative_recur(var_orig_id, next_haplotype, next_haplotype_alt, left, dep + 1) if dep > 0: if not found: def to_haplotype_str(haplotype): if len(haplotype) <= 2: haplotype = "%d-%d" % (haplotype[0], haplotype[1]) else: haplotype = "%d-%s-%d" % (haplotype[0], '-'.join(haplotype[1:-1]), haplotype[-1]) return haplotype haplotype, haplotype_alt = to_haplotype_str(haplotype), to_haplotype_str(haplotype_alt) haplotype_alts = haplotype_alts_left if left else haplotype_alts_right if haplotype not in haplotype_alts: haplotype_alts[haplotype] = set() haplotype_alts[haplotype].add(haplotype_alt) if haplotype_alt not in haplotype_alts: haplotype_alts[haplotype_alt] = set() haplotype_alts[haplotype_alt].add(haplotype) # Search alternative haplotypes in both left and right directions for var_i in range(len(Var_list)): _, var_id = Var_list[var_i] var_type, var_pos, var_data = Vars[var_id] if var_pos == 0: continue if var_type != "deletion": continue del_len = int(var_data) if var_pos + del_len >= len(ref_seq): continue # Left direction get_alternative_recur(var_id, [var_pos, var_id, var_pos + del_len - 1], [var_pos + del_len, var_pos + del_len - 1]) # Right direction get_alternative_recur(var_id, [var_pos, var_id, var_pos + del_len - 1], [var_pos, var_pos - 1], False) # Print alternative haplotypes / Sanity check def print_haplotype_alts(haplotype_alts): for haplotype, haplotype_set in haplotype_alts.items(): if verbose: print "\t%s:" % haplotype, haplotype_set haplotype_seq = get_haplotype_seq(haplotype.split('-')) for haplotype_alt in haplotype_set: haplotype_alt_seq = get_haplotype_seq(haplotype_alt.split('-')) assert haplotype_seq == haplotype_alt_seq if verbose: print "number of left haplotypes:", len(haplotype_alts_left) print_haplotype_alts(haplotype_alts_left) if verbose: print "number of right haplotypes:", len(haplotype_alts_right) print_haplotype_alts(haplotype_alts_right) return haplotype_alts_left, haplotype_alts_right """ Identify ambigious differences that may account for other alleles, given a list of differences (cmp_list) between a read and a potential allele """ def identify_ambigious_diffs(ref_seq, Vars, Alts_left, Alts_right, Alts_left_list, Alts_right_list, cmp_list, verbose, debug = False): cmp_left, cmp_right = 0, len(cmp_list) - 1 left, right = cmp_list[0][1], cmp_list[-1][1] + cmp_list[-1][2] - 1 left_alt_set, right_alt_set = set(), set() def get_haplotype_and_seq(cmp_list): ht, seq = [], "" for i in range(len(cmp_list)): cmp_i = cmp_list[i] type, pos, length = cmp_i[:3] if len(cmp_i) <= 3: var_id = "" else: var_id = cmp_i[3] if type == "match": seq += ref_seq[pos:pos+length] elif type == "mismatch": seq += ref_seq[pos] elif type == "insertion": None # seq += data else: assert type == "deletion" if var_id != "" and var_id != "unknown": ht.append(var_id) return ht, seq # Left direction found = False for i in reversed(range(len(cmp_list))): i_found = False cmp_i = cmp_list[i] type, cur_left, length = cmp_i[:3] var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" # DK - debugging purpose if type in ["mismatch", "deletion", "insertion"]: if not var_id.startswith("hv"): continue if type in ["match", "deletion"]: cur_right = cur_left + length - 1 else: cur_right = cur_left cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[:i+1]) if len(cur_ht) == 0: cur_ht_str = str(left) else: cur_ht_str = "%d-%s" % (left, '-'.join(cur_ht)) ht_i = lower_bound(Alts_left_list, cur_right + 1) for ht_j in reversed(range(0, min(ht_i + 1, len(Alts_left_list)))): ht_pos, ht = Alts_left_list[ht_j] if ht_pos < cur_left: break if ht_pos > cur_right: continue if len(cur_ht) > 0: if ht.find('-'.join(cur_ht)) == -1: continue ht = ht.split('-')[:-1] if len(cur_ht) + 1 == len(ht): ht_pos = int(ht[0]) else: var_id2 = ht[len(ht) - len(cur_ht) - 1] ht_type, ht_pos, ht_data = Vars[var_id2] if ht_type == "deletion": ht_pos = ht_pos + int(ht_data) - 1 if left < ht_pos: continue i_found = True if debug: print cmp_list[:i+1] print "\t", cur_ht, "vs", Alts_left_list[ht_j], ht_pos _, rep_ht = Alts_left_list[ht_j] if debug: print "DK1:", cmp_i, cmp_list print "DK2:", rep_ht, Alts_left[rep_ht] print "DK3:", left, right, ht_pos for alt_ht_str in Alts_left[rep_ht]: alt_ht = alt_ht_str.split('-') alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) assert alt_ht_right <= cur_right seq_pos = cur_right - alt_ht_right cur_pos = alt_ht_right part_alt_ht = [] alt_ht = alt_ht[1:-1] for var_id_ in reversed(alt_ht): var_type_, var_pos_, var_data_ = Vars[var_id_] if var_type_ == "deletion": del_len = int(var_data_) var_pos_ = var_pos_ + del_len - 1 assert var_pos_ <= cur_pos next_seq_pos = seq_pos + (cur_pos - var_pos_) if next_seq_pos >= len(cur_seq): break if var_type_ == "single": next_seq_pos += 1 next_cur_pos = var_pos_ - 1 elif var_type_ == "deletion": next_cur_pos = var_pos_ - del_len else: assert var_type_ == "insertion" assert False part_alt_ht.insert(0, var_id_) if next_seq_pos >= len(cur_seq): break seq_pos, cur_pos = next_seq_pos, next_cur_pos if len(part_alt_ht) > 0: seq_left = len(cur_seq) - seq_pos - 1 part_alt_ht_str = "" if found: var_id_list = [] for j in range(i + 1, cmp_left): cmp_j = cmp_list[j] if cmp_j[0] in ["mismatch", "deletion", "insertion"]: var_id_ = cmp_j[3] if var_id_.startswith("hv"): var_id_list.append(var_id_) if len(var_id_list) > 0: part_alt_ht_str = '-' + '-'.join(var_id_list) part_alt_ht_str = ("%d-%s" % (cur_pos - seq_left, '-'.join(part_alt_ht))) + part_alt_ht_str left_alt_set.add(part_alt_ht_str) if debug: print "\t\t", cur_left, alt_ht_str if i_found: if not found: cmp_left = i + 1 left_alt_set.add(cur_ht_str) found = True if not found: left_alt_set.add(str(left)) # Right direction found = False for i in range(0, len(cmp_list)): i_found = False cmp_i = cmp_list[i] type, cur_left, length = cmp_i[:3] var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" # DK - debugging purpose if type in ["mismatch", "deletion", "insertion"]: if not var_id.startswith("hv"): continue if type in ["match", "deletion"]: cur_right = cur_left + length - 1 else: cur_right = cur_left cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[i:]) if len(cur_ht) == 0: cur_ht_str = str(right) else: cur_ht_str = "%s-%d" % ('-'.join(cur_ht), right) ht_i = lower_bound(Alts_right_list, cur_left) for ht_j in range(ht_i, len(Alts_right_list)): ht_pos, ht = Alts_right_list[ht_j] if ht_pos > cur_right: break if ht_pos < cur_left: continue if len(cur_ht) > 0: if ht.find('-'.join(cur_ht)) == -1: continue ht = ht.split('-')[1:] if len(cur_ht) + 1 == len(ht): ht_pos = int(ht[-1]) else: var_id2 = ht[len(cur_ht)] _, ht_pos, _ = Vars[var_id2] if right > ht_pos: continue i_found = True _, rep_ht = Alts_right_list[ht_j] if debug: print "DK1:", cmp_i, cmp_list print "DK2:", rep_ht, Alts_right[rep_ht] print "DK3:", left, right, ht_pos for alt_ht_str in Alts_right[rep_ht]: alt_ht = alt_ht_str.split('-') alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) assert cur_left <= alt_ht_left seq_pos = alt_ht_left - cur_left cur_pos = alt_ht_left part_alt_ht = [] alt_ht = alt_ht[1:-1] for var_id_ in alt_ht: var_type_, var_pos_, var_data_ = Vars[var_id_] assert var_pos_ >= cur_pos next_seq_pos = seq_pos + (var_pos_ - cur_pos) if next_seq_pos >= len(cur_seq): break if var_type_ == "single": next_seq_pos += 1 next_cur_pos = var_pos_ + 1 elif var_type_ == "deletion": next_cur_pos = var_pos_ + int(var_data_) else: assert var_type_ == "insertion" assert False part_alt_ht.append(var_id_) if next_seq_pos >= len(cur_seq): break seq_pos, cur_pos = next_seq_pos, next_cur_pos if len(part_alt_ht) > 0: seq_left = len(cur_seq) - seq_pos - 1 assert seq_left >= 0 part_alt_ht_str = "" if found: var_id_list = [] for j in range(cmp_right + 1, i): cmp_j = cmp_list[j] if cmp_j[0] in ["mismatch", "deletion", "insertion"]: var_id_ = cmp_j[3] if var_id_.startswith("hv"): var_id_list.append(var_id_) if len(var_id_list) > 0: part_alt_ht_str = '-'.join(var_id_list) + '-' part_alt_ht_str += ("%s-%d" % ('-'.join(part_alt_ht), cur_pos + seq_left)) right_alt_set.add(part_alt_ht_str) if i_found: if not found: cmp_right = i - 1 right_alt_set.add(cur_ht_str) found = True if not found: right_alt_set.add(str(right)) if cmp_right < cmp_left: cmp_left = 0 left_alt_set = set([str(left)]) # Sanity check ht_set_ = set() for ht in left_alt_set: ht = '-'.join(ht.split('-')[1:]) if ht == "": continue if ht in ht_set_: print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ # DK - debugging purposes print "DK: cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) print "DK: cmp_list:", cmp_list print "DK: left_alt_set:", left_alt_set, "right_alt_set:", right_alt_set assert False ht_set_.add(ht) for ht in right_alt_set: ht = '-'.join(ht.split('-')[:-1]) if ht == "": continue if ht in ht_set_: print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ assert False ht_set_.add(ht) if debug: print "cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) print "left alt set:", left_alt_set print "right alt set:", right_alt_set return cmp_left, cmp_right, list(left_alt_set), list(right_alt_set)