#!/usr/bin/env python # # Copyright 2015, Daehwan Kim # # This file is part of HISAT 2. # # HISAT 2 is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HISAT 2 is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with HISAT 2. If not, see . # import sys, os, subprocess, re import inspect import random from argparse import ArgumentParser, FileType # Gold Standard (experimentally verified, a lot of literature, ...) gold_allele_info = { "NA12877" : {"A" : ["03:01", "02:01"], "B" : ["15:01", "44:02"], "C" : ["05:01", "03:04"], "DQA1" : ["03:03", "03:01"], "DQB1" : ["03:02", "03:01"], "DRB1" : ["04:03", "04:03"]}, "NA12878" : {"A" : ["01:01", "11:01"], "B" : ["08:01", "56:01"], "C" : ["01:02", "07:01"], "DQA1" : ["05:01", "01:01"], "DQB1" : ["02:01", "05:01"], "DRB1" : ["03:01", "01:01"]}, "NA12879" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]}, "NA12880" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]}, "NA12881" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["05:01", "01:02"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["04:03", "01:01"]}, "NA12882" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["04:03", "01:01"]}, "NA12883" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["01:02", "05:01"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["01:01", "04:03"]}, "NA12884" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:03"]}, "NA12885" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["05:01", "07:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["03:01", "02:01"], "DRB1" : ["03:01", "04:03"]}, "NA12886" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["07:01", "05:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["02:01", "03:01"], "DRB1" : ["03:01", "04:03"]}, "NA12887" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]}, "NA12888" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["07:01", "03:04"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:03"]}, "NA12889" : {"A" : ["03:01", "03:01"], "B" : ["07:02", "44:02"], "C" : ["05:01", "07:02"], "DQA1" : ["03:03", "01:02"], "DQB1" : ["03:01", "06:02"], "DRB1" : ["15:01", "04:03"]}, "NA12890" : {"A" : ["03:01", "02:01"], "B" : ["44:03", "15:01"], "C" : ["16:01", "03:04"], "DQA1" : ["03:01", "02:01"], "DQB1" : ["03:02", "02:02"], "DRB1" : ["04:03", "07:01"]}, "NA12891" : {"A" : ["24:02", "01:01"], "B" : ["08:01", "07:02"], "C" : ["07:02", "07:01"], "DQA1" : ["05:01", "01:02"], "DQB1" : ["06:02", "02:01"], "DRB1" : ["03:01", "15:01"]}, "NA12892" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "04:01"], "DQA1" : ["01:01", "01:01"], "DQB1" : ["05:01", "05:01"], "DRB1" : ["01:01", "01:02"]}, "NA12893" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:03"]} } # CEPH pedigree (17 family members) pedigree = { "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]}, "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]}, "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]}, "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]}, "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, } """ """ def test_HLA_genotyping(reference_type, hla_list, aligners, query_genomes, exclude_allele_list, num_mismatch, verbose): # Current script directory curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping)) ex_path = os.path.dirname(curr_script) if not os.path.exists("illumina/HLA"): print >> sys.stderr, "Error: illumina/HLA data is needed (please send an email to infphilo@gmail.com for getting the data)" sys.exit(1) num_test, num_success = 0, 0 for genome in sorted(gold_allele_info.keys()): if not genome in query_genomes: continue genes = gold_allele_info[genome] read_fname_1, read_fname_2 = "illumina/HLA/%s.fished_1.fq" % genome, "illumina/HLA/%s.fished_2.fq" % genome if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2): continue print >> sys.stderr, genome cmd_aligners = ['.'.join(aligners[i]) for i in range(len(aligners))] test_hla_script = os.path.join(ex_path, "hisat2_test_HLA_genotyping.py") for gene in sorted(genes.keys()): if not gene in hla_list: continue alleles = genes[gene] print >> sys.stderr, "\t%s - %s" % (gene, ' / '.join(alleles)) test_hla_cmd = [test_hla_script, "--reference-type", reference_type, "--hla-list", gene, "--aligner-list", ','.join(cmd_aligners), "--reads", "%s,%s" % (read_fname_1, read_fname_2), "--best-alleles", "--exclude-allele-list", ','.join(exclude_allele_list), "--num-mismatch", str(num_mismatch)] if verbose: print >> sys.stderr, ' '.join(test_hla_cmd) proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) num_test += 2 test_alleles = set() for line in proc.stdout: print "\t\t", line, model, allele = line.split()[:2] if model != "SingleModel": continue allele = allele.split('*')[1] allele = ':'.join(allele.split(':')[:2]) test_alleles.add(allele) proc.communicate() for allele in alleles: if allele in test_alleles: num_success += 1 print >> sys.stderr, "%d/%d (%.2f%%)" % (num_success, num_test, num_success * 100.0 / num_test) """ """ if __name__ == '__main__': parser = ArgumentParser( description='test HLA genotyping for Platinum Genomes') parser.add_argument("--reference-type", dest="reference_type", type=str, default="gene", help="Reference type: gene, chromosome, and genome (default: gene)") parser.add_argument("--hla-list", dest="hla_list", type=str, default="A,B,C,DQA1,DQB1,DRB1", help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)") parser.add_argument("--aligner-list", dest="aligners", type=str, default="hisat2.graph", help="A comma-separated list of aligners (default: hisat2.graph)") genomes_default = ','.join(gold_allele_info.keys()) parser.add_argument("--genome-list", dest="genome_list", type=str, default=genomes_default, help="A comma-separated list of genomes (default: %s)" % genomes_default) parser.add_argument("--exclude-allele-list", dest="exclude_allele_list", type=str, default="", help="A comma-separated list of allleles to be excluded") parser.add_argument("--num-mismatch", dest="num_mismatch", type=int, default=0, help="Maximum number of mismatches per read alignment to be considered (default: 0)") parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='also print some statistics to stderr') args = parser.parse_args() if not args.reference_type in ["gene", "chromosome", "genome"]: print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) sys.exit(1) args.hla_list = args.hla_list.split(',') if args.aligners == "": print >> sys.stderr, "Error: --aligners must be non-empty." sys.exit(1) args.aligners = args.aligners.split(',') for i in range(len(args.aligners)): args.aligners[i] = args.aligners[i].split('.') args.genome_list = args.genome_list.split(',') args.exclude_allele_list = args.exclude_allele_list.split(',') test_HLA_genotyping(args.reference_type, args.hla_list, args.aligners, args.genome_list, args.exclude_allele_list, args.num_mismatch, args.verbose)