#!/usr/bin/env python3 # # Copyright 2016, Daehwan Kim # # This file is part of HISAT 2. # # HISAT 2 is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HISAT 2 is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with HISAT 2. If not, see . # import sys, os, subprocess from argparse import ArgumentParser, FileType digit2str = [str(i) for i in range(10)] """ """ def read_genome(genome_file): chr_dic = {} chr_name, sequence = "", "" for line in genome_file: if line.startswith(">"): if chr_name and sequence: chr_dic[chr_name] = sequence chr_name = line.strip().split()[0][1:] sequence = "" else: sequence += line.strip() if chr_name and sequence: chr_dic[chr_name] = sequence return chr_dic """ Compare two variants [chr, pos, type, data, dic] """ def compare_vars(a, b): a_chr, a_pos, a_type, a_data = a[:4] b_chr, b_pos, b_type, b_data = b[:4] assert a_chr == b_chr if a_pos != b_pos: return a_pos - b_pos if a_type != b_type: if a_type == 'I': return -1 elif b_type == 'I': return 1 if a_type == 'S': return -1 else: return 1 if a_data < b_data: return -1 elif a_data > b_data: return 1 else: return 0 """ """ def compatible_vars(a, b): a_chr, a_pos, a_type, a_data = a[:4] b_chr, b_pos, b_type, b_data = b[:4] assert a_chr == b_chr assert a_pos <= b_pos if a_pos == b_pos: return False if a_type == 'D': if b_pos <= a_pos + a_data: return False return True """ Given a VCF line, the function reports a list of variants [pos, type, data] type: 'S' for single nucleotide polymorphism, 'D' for deletion, and 'I' for insertion """ def extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID): chr_seq = chr_dic[chr] vars = [] assert ',' not in ref_allele alt_alleles = alt_alleles.split(',') for a in range(len(alt_alleles)): alt_allele = alt_alleles[a] if 'N' in alt_allele: continue ref_allele2, pos2 = ref_allele, pos min_len = min(len(ref_allele2), len(alt_allele)) assert min_len >= 1 if min_len > 1: ref_allele2 = ref_allele2[min_len - 1:] alt_allele = alt_allele[min_len - 1:] pos2 += (min_len - 1) type, data = '', '' if len(ref_allele2) == 1 and len(alt_allele) == 1: type = 'S' data = alt_allele assert ref_allele2 != alt_allele if chr_seq[pos2] != ref_allele2: continue elif len(ref_allele2) == 1: assert len(alt_allele) > 1 type = 'I' data = alt_allele[1:] if len(data) > 32: continue if chr_seq[pos] != ref_allele2: continue elif len(alt_allele) == 1: assert len(ref_allele2) > 1 type = 'D' data = len(ref_allele2) - 1 if chr_seq[pos2:pos2+data+1] != ref_allele2: continue else: assert False varID2 = varID if len(alt_alleles) > 1: varID2 = "%s.%d" % (varID, a) vars.append([chr, pos2, type, data, {"id":varID, "id2":varID2}]) return vars """ """ def generate_haplotypes(snp_file, haplotype_file, vars, inter_gap, intra_gap, num_genomes, num_haplotypes): assert len(vars) > 0 # Sort variants and remove redundant variants vars = sorted(vars, cmp=compare_vars) tmp_vars = [] v = 0 while v < len(vars): var = vars[v] for v2 in range(v + 1, len(vars)): var2 = vars[v2] if compare_vars(var, var2) == 0: v += 1 if "CLNSIG" not in var[4]: if "CLNSIG" in var2[4]: var[4]["CLNSIG"] = var2[4]["CLNSIG"] if "genotype" not in var[4]: if "genotype" in var2[4]: var[4]["genotype"] = var2[4]["genotype"] else: assert compare_vars(var, var2) < 0 break tmp_vars.append(var) v += 1 vars = tmp_vars # Write SNPs into a file (.snp) for var in vars: chr, pos, type, data, var_dic = var varID = var_dic["id2"] if type == 'S': type = "single" elif type == 'D': type = "deletion" else: assert type == 'I' type = "insertion" print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \ (varID, type, chr, pos, data) # variant compatibility vars_cmpt = [-1 for i in range(len(vars))] for v in range(len(vars)): var_chr, var_pos, var_type, var_data = vars[v][:4] if var_type == 'D': var_pos += (var_data - 1) for v2 in range(v + 1, len(vars)): if vars_cmpt[v2] >= 0: continue var2_chr, var2_pos, var2_type = vars[v2][:3] assert var_chr == var2_chr if var_type == 'D' and var2_type == 'D': if var_pos + 1 < var2_pos: break else: if var_pos < var2_pos: break vars_cmpt[v2] = v # Assign genotypes for those missing genotypes genotypes_list = [] if num_genomes > 0: max_genotype_num = 1 for v in range(len(vars)): var = vars[v] var_dic = var[4] if "genotype" not in var_dic: used = [True, True] + [False for i in range(8)] if vars_cmpt[v] >= 0: v2 = v - 1 while v2 >= vars_cmpt[v]: var2 = vars[v2] if not compatible_vars(var2, var): var2_dic = var2[4] assert "genotype" in var2_dic genotype_num = int(var2_dic["genotype"][0]) used[genotype_num] = True v2 -= 1 assert False in used for i in range(len(used)): if not used[i]: var_dic["genotype"] = ("%d" % i) * (num_genomes * 2) if i > max_genotype_num: max_genotype_num = i break genotypes_list.append(var_dic["genotype"]) num_chromosomes = len(genotypes_list[0]) # daehwan - for debugging purposes """ for v in range(len(vars)): var = vars[v] var_chr, var_pos, var_type, var_data, var_dic = var print v, var_chr, var_pos, var_type, var_data, var_dic["id"], var_dic["id2"], if "CLNSIG" in var_dic: print "CLNSIG:", var_dic["CLNSIG"], if "genotype" in var_dic: print var_dic["genotype"][:50], print """ # genotypes_list looks like # Var0: 000001000 # Var1: 010000000 # Var2: 001100000 # Var3: 222222222 # Get haplotypes from genotypes_list haplotypes = set() cnv_genotypes = ["" for i in range(num_chromosomes)] for genotypes in genotypes_list: for i in range(len(genotypes)): genotype = genotypes[i] cnv_genotypes[i] += genotype cnv_genotypes = set(cnv_genotypes) for raw_haplotype in cnv_genotypes: for num in range(1, max_genotype_num + 1): num_str = str(num) if num_str not in raw_haplotype: continue haplotype = "" for i in range(len(raw_haplotype)): if raw_haplotype[i] == num_str: if haplotype == "": haplotype = str(i) else: haplotype += ("#%d" % i) assert haplotype != "" haplotypes.add(haplotype) else: for v in range(len(vars)): var = vars[v] var_dic = var[4] used = [False for i in range(100)] if vars_cmpt[v] >= 0: v2 = v - 1 while v2 >= vars_cmpt[v]: var2 = vars[v2] if not compatible_vars(var2, var): var2_dic = var2[4] assert "genotype" in var2_dic genotype_num = var2_dic["genotype"] used[genotype_num] = True v2 -= 1 assert False in used for i in range(len(used)): if not used[i]: var_dic["genotype"] = i break genotypes_list.append(var_dic["genotype"]) # genotypes_list looks like # Var0: 0 # Var1: 0 # Var2: 1 # Var3: 2 # Get haplotypes from genotypes_list max_genotype_num = max(genotypes_list) haplotypes = ["" for i in range(max_genotype_num + 1)] for i in range(len(genotypes_list)): num = genotypes_list[i] if haplotypes[num] == "": haplotypes[num] = str(i) else: haplotypes[num] += ("#%d" % i) haplotypes = set(haplotypes) # haplotypes look like # '8#10#12#23', '8#12#23', '5#8#12#23#30' # Split some haplotypes that include large gaps inside def split_haplotypes(haplotypes): split_haplotypes = set() for haplotype in haplotypes: haplotype = haplotype.split('#') assert len(haplotype) > 0 if len(haplotype) == 1: split_haplotypes.add(haplotype[0]) continue prev_s, s = 0, 1 while s < len(haplotype): _, prev_locus, prev_type, prev_data, _ = vars[int(haplotype[s-1])] _, locus, type, data, _ = vars[int(haplotype[s])] prev_locus, locus = int(prev_locus), int(locus) if prev_type == 'D': prev_locus += (int(prev_data) - 1) if prev_locus + intra_gap < locus: split_haplotypes.add('#'.join(haplotype[prev_s:s])) prev_s = s s += 1 if s == len(haplotype): split_haplotypes.add('#'.join(haplotype[prev_s:s])) return split_haplotypes haplotypes2 = split_haplotypes(haplotypes) def cmp_haplotype(a, b): a = a.split('#') _, a1_locus, _, _, _ = vars[int(a[0])] _, a2_locus, a2_type, a2_data, _ = vars[int(a[-1])] a_begin, a_end = int(a1_locus), int(a2_locus) if a2_type == 'D': a_end += (int(a2_data) - 1) b = b.split('#') _, b1_locus, _, _, _ = vars[int(b[0])] _, b2_locus, b2_type, b2_data, _ = vars[int(b[-1])] b_begin, b_end = int(b1_locus), int(b2_locus) if b2_type == 'D': b_end += (int(b2_data) - 1) if a_begin != b_begin: return a_begin - b_begin return a_end - b_end haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype) # daehwan - for debugging purposes """ dis = prev_locus - locus print "\n[%d, %d]: %d haplotypes" % (i, j, len(haplotypes)), dis if len(cur_vars) in range(0, 1000): # print "vars:", sorted(list(cur_vars), cmp=cmp_varKey print "num:", len(haplotypes) for haplotype in haplotypes: print haplotype.split('#') print "\nnum:", len(haplotypes2) for haplotype in haplotypes2: print haplotype.split('#') """ # Write haplotypes for h_i in range(len(haplotypes)): h = haplotypes[h_i].split('#') chr, h1_locus, _, _, _ = vars[int(h[0])] _, h2_locus, h2_type, h2_data, _ = vars[int(h[-1])] h_begin, h_end = int(h1_locus), int(h2_locus) if h2_type == 'D': h_end += (int(h2_data) - 1) assert h_begin <= h_end h_new_begin = h_begin for h_j in reversed(range(0, h_i)): hc = haplotypes[h_j].split('#') _, hc_begin, hc_type, hc_data, _ = vars[int(hc[-1])] hc_begin = int(hc_begin) hc_end = hc_begin if hc_type == 'D': hc_end += (int(hc_data) - 1) if hc_end + inter_gap < h_begin: break if h_new_begin > hc_end: h_new_begin = hc_end assert h_new_begin <= h_begin h_add = [] for id in h: var_dic = vars[int(id)][4] h_add.append(var_dic["id2"]) print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \ (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)) num_haplotypes += 1 return num_haplotypes """ """ def main(genome_file, VCF_fnames, base_fname, inter_gap, intra_gap, only_rs, reference_type, genotype_vcf, genotype_gene_list, extra_files, verbose): # Load genomic sequences chr_dic = read_genome(genome_file) # GRCh38 - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/GRCh38_positions # ALL.chr22.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP_no_SVs.vcf.gz # GRCh37 - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502 # ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz # List of variants (e.g. ClinVar database) genotype_var_list = {} # List of genomic regions to be processed genotype_ranges = {} if genotype_vcf != "": var_set = set() assert len(genotype_gene_list) > 0 if genotype_vcf.endswith(".gz"): vcf_cmd = ["gzip", "-cd", genotype_vcf] else: vcf_cmd = ["cat", genotype_vcf] vcf_proc = subprocess.Popen(vcf_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) for line in vcf_proc.stdout: if line.startswith("#"): continue chr, pos, varID, ref_allele, alt_alleles, qual, filter, info = line.strip().split('\t') pos = int(pos) - 1 if chr not in chr_dic: continue gene = None for g in genotype_gene_list: if info.find(g) != -1: gene = g break if not gene: continue CLNSIG = -1 for item in info.split(';'): if not item.startswith("CLNSIG"): continue try: key, value = item.split('=') CLNSIG = int(value) except ValueError: continue if CLNSIG not in [4, 5]: continue if CLNSIG == 4: CLNSIG = "Likely pathogenic" else: CLNSIG = "Pathogenic" vars = extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID) if len(vars) == 0: continue if chr not in genotype_var_list: genotype_var_list[chr] = [] genotype_ranges[chr] = {} if gene not in genotype_ranges[chr]: genotype_ranges[chr][gene] = [len(chr_dic[chr]), -1] for var in vars: var_chr, var_pos, var_ref_allele, var_alt_allele = var[:4] var_str = "%s-%d-%s-%s" % (var_chr, var_pos, var_ref_allele, var_alt_allele) if var_str in var_set: continue var[4]["CLNSIG"] = CLNSIG var[4]["gene"] = gene genotype_var_list[chr].append(var) if var_pos < genotype_ranges[chr][gene][0]: genotype_ranges[chr][gene][0] = var_pos if var_pos > genotype_ranges[chr][gene][1]: genotype_ranges[chr][gene][1] = var_pos var_set.add(var_str) print >> sys.stderr, "Number of variants in %s is:" % (genotype_vcf) for chr, vars in genotype_var_list.items(): vars = sorted(vars, cmp=compare_vars) print >> sys.stderr, "\tChromosome %s: %d variants" % (chr, len(vars)) for chr, gene_ranges in genotype_ranges.items(): for gene, value in gene_ranges.items(): gene_ranges[gene] = [value[0] - 100, value[1] + 100] value = genotype_ranges[chr][gene] if verbose: print >> sys.stderr, "%s\t%s\t%d-%d" % (chr, gene, value[0], value[1]) if extra_files or True: clnsig_file = open("%s.clnsig" % base_fname, 'w') for chr, vars in genotype_var_list.items(): for var in vars: varID = var[4]["id2"] CLNSIG = var[4]["CLNSIG"] gene = var[4]["gene"] print >> clnsig_file, "%s\t%s\t%s" % (varID, gene, CLNSIG) clnsig_file.close() SNP_file = open("%s.snp" % base_fname, 'w') haplotype_file = open("%s.haplotype" % base_fname, 'w') # Write reference information and backbone sequences into files if extra_files: ref_file = open("%s.ref" % base_fname, 'w') for chr, gene_ranges in genotype_ranges.items(): for gene, value in gene_ranges.items(): left, right = value if reference_type == "gene": left, right = 0, right - left print >> ref_file, "%s\t%s\t%d\t%d" % (gene, chr, left, right) ref_file.close() if reference_type == "gene": backbone_file = open("%s_backbone.fa" % base_fname, 'w') for chr, gene_ranges in genotype_ranges.items(): for gene, value in gene_ranges.items(): left, right = value left, right = 0, right - left print >> backbone_file, ">%s" % (gene) backbone_seq = chr_dic[chr][value[0]:value[1]+1] for s in range(0, len(backbone_seq), 60): print >> backbone_file, backbone_seq[s:s+60] backbone_file.close() elif reference_type == "chromosome": first = True for chr in genotype_ranges.keys(): if first: os.system("samtools faidx genome.fa %s > %s_backbone.fa" % (chr, base_fname)) first = False else: os.system("samtools faidx genome.fa %s >> %s_backbone.fa" % (chr, base_fname)) else: assert reference_type == "genome" os.system("cp genome.fa %s_backbone.fa" % base_fname) num_haplotypes = 0 num_unassigned = 0 for VCF_fname in VCF_fnames: empty_VCF_file = False if VCF_fname == "/dev/null" or \ not os.path.exists(VCF_fname): empty_VCF_file = True if reference_type != "genome" and \ len(genotype_gene_list) > 0: continue if not empty_VCF_file: if VCF_fname.endswith(".gz"): vcf_cmd = ["gzip", "-cd", VCF_fname] else: vcf_cmd = ["cat", VCF_fname] vcf_proc = subprocess.Popen(vcf_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) genomeIDs = [] vars, genotypes_list = [], [] prev_varID, prev_chr, prev_pos = "", "", -1 num_lines = 0 for line in vcf_proc.stdout: num_lines += 1 if line.startswith("##"): continue fields = line.strip().split('\t') chr, pos, varID, ref_allele, alt_alleles, qual, filter, info = fields[:8] if prev_chr != chr: curr_right = -1 if len(fields) >= 9: format = fields[8] genotypes = [] if len(fields) >= 10: genotypes = fields[9:] if line.startswith("#"): genomeIDs = genotypes num_genomes = len(genomeIDs) continue assert len(genotypes) == len(genomeIDs) if only_rs and not varID.startswith("rs"): continue if ';' in varID: continue if varID == prev_varID: continue if chr not in chr_dic: continue chr_seq = chr_dic[chr] chr_genotype_vars = [] chr_genotype_ranges = {} if len(genotype_gene_list) > 0: assert chr in genotype_var_list chr_genotype_vars = genotype_var_list[chr] assert chr in genotype_ranges chr_genotype_ranges = genotype_ranges[chr] pos = int(pos) - 1 offset = 0 gene = None if num_lines % 10000 == 1: print >> sys.stderr, "\t%s:%d\r" % (chr, pos), if chr_genotype_ranges: skip = True for gene_, range_ in chr_genotype_ranges.items(): if pos > range_[0] and pos < range_[1]: skip = False break if skip: continue if len(vars) == 0: for var in chr_genotype_vars: var_chr, var_pos, var_type, var_data, var_dic = var if var_pos < range_[0]: continue if var_pos > range_[1]: break if reference_type == "gene": var_pos -= range_[0] vars.append([gene_, var_pos, var_type, var_data, var_dic]) curr_right = range_[1] if reference_type == "gene": offset = range_[0] gene = gene_ if pos == prev_pos: continue if len(vars) > 0 and \ (curr_right + inter_gap < pos or prev_chr != chr): num_haplotypes = generate_haplotypes(SNP_file, haplotype_file, vars, inter_gap, intra_gap, num_genomes, num_haplotypes) vars = [] def add_vars(pos, offset, gene, varID, ref_allele, alt_alleles, vars, genotypes): tmp_vars = extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID) max_right = -1 for v in range(len(tmp_vars)): var = tmp_vars[v] _, pos2, type, data = var[:4] cnv_genotypes = [] for genotype in genotypes: P1, P2 = genotype[0], genotype[2] if P1 == digit2str[v + 1]: cnv_genotypes.append('1') else: cnv_genotypes.append('0') if P2 == digit2str[v + 1]: cnv_genotypes.append('1') else: cnv_genotypes.append('0') # Skip SNPs not present in a given population (e.g. 2,504 genomes in 1000 Genomes Project) if cnv_genotypes != [] and \ '1' not in cnv_genotypes: continue tmp_varID = var[4]["id2"] var_dic = {"id":varID, "id2":tmp_varID, "genotype":''.join(cnv_genotypes)} if reference_type == "gene": vars.append([gene, pos2 - offset, type, data, var_dic]) else: vars.append([chr, pos2, type, data, var_dic]) right = pos2 if type == 'D': right += (int(data) - 1) if max_right < right: max_right = right return max_right right = add_vars(pos, offset, gene, varID, ref_allele, alt_alleles, vars, genotypes) if curr_right < right: curr_right = right prev_varID = varID prev_chr = chr prev_pos = pos if len(vars) > 0: num_haplotypes = generate_haplotypes(SNP_file, haplotype_file, vars, inter_gap, intra_gap, num_genomes, num_haplotypes) vars = [] else: for chr in genotype_var_list.keys(): chr_seq = chr_dic[chr] chr_genotype_vars = genotype_var_list[chr] curr_right = -1 vars = [] for var in chr_genotype_vars: var_chr, var_pos, var_type, var_data, var_dic = var num_genomes = 0 if len(vars) > 0 and curr_right + inter_gap < var_pos: num_haplotypes = generate_haplotypes(SNP_file, haplotype_file, vars, inter_gap, intra_gap, num_genomes, num_haplotypes) vars = [] vars.append([var_chr, var_pos, var_type, var_data, var_dic]) curr_right = var_pos if var_type == 'D': curr_right += (var_data - 1) if len(vars) > 0: num_haplotypes = generate_haplotypes(SNP_file, haplotype_file, vars, inter_gap, intra_gap, num_genomes, num_haplotypes) vars = [] SNP_file.close() haplotype_file.close() if genotype_vcf != "": clnsig_file.close() if __name__ == '__main__': parser = ArgumentParser( description='Extract SNPs and haplotypes from VCF files') parser.add_argument('genome_file', nargs='?', type=FileType('r'), help='input genome file (e.g. genome.fa)') parser.add_argument('VCF_fnames', nargs='?', type=str, help='A comma-seperated VCF files (plain text or gzipped file is accepted: GRCh38_dbSNP_no_SVs.vcf or GRCh38_dbSNP_no_SVs.vcf.gz') parser.add_argument("base_fname", nargs='?', type=str, help="base filename for SNPs and haplotypes") parser.add_argument("--reference-type", dest="reference_type", type=str, default="genome", help="Reference type: gene, chromosome, and genome (default: genome)") parser.add_argument("--inter-gap", dest="inter_gap", type=int, default=30, help="Maximum distance for variants to be in the same haplotype (default: 30)") parser.add_argument("--intra-gap", dest="intra_gap", type=int, default=50, help="Break a haplotype into several haplotypes (default: 50)") parser.add_argument('--non-rs', dest='only_rs', action='store_false', help='Allow SNP IDs not beginning with rs') parser.add_argument('--genotype-vcf', dest='genotype_vcf', type=str, default="", help='VCF file name for genotyping (default: empty)') parser.add_argument('--genotype-gene-list', dest='genotype_gene_list', type=str, default="", help='A comma-separated list of genes to be genotyped (default: empty)') parser.add_argument('--extra-files', dest='extra_files', action='store_true', help='Output extra files such as _backbone.fa and .ref') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='also print some statistics to stderr') args = parser.parse_args() if not args.genome_file or \ not args.VCF_fnames or \ not args.base_fname: parser.print_help() exit(1) args.VCF_fnames = args.VCF_fnames.split(',') if args.genotype_vcf != "": if args.genotype_gene_list == "": genes = set() if args.genotype_vcf.endswith(".gz"): vcf_cmd = ["gzip", "-cd", args.genotype_vcf] else: vcf_cmd = ["cat", args.genotype_vcf] vcf_proc = subprocess.Popen(vcf_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) for line in vcf_proc.stdout: if line.startswith("#"): continue info = line.strip().split()[-1] if info.find("GENEINFO=") == -1: continue gene = info.split("GENEINFO=")[1] gene = gene.split(':')[0] genes.add(gene) args.genotype_gene_list = list(genes) else: args.genotype_gene_list = args.genotype_gene_list.split(',') if len(args.genotype_gene_list) == 0: print >> sys.stderr, "Error: please specify --genotype-gene-list." sys.exit(1) else: args.genotype_gene_list = [] main(args.genome_file, args.VCF_fnames, args.base_fname, args.inter_gap, args.intra_gap, args.only_rs, args.reference_type, args.genotype_vcf, args.genotype_gene_list, args.extra_files, args.verbose)