#!/usr/bin/env python # # Manta - Structural Variant and Indel Caller # Copyright (c) 2013-2016 Illumina, Inc. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # import sys from os import path from os.path import exists, abspath, dirname, basename, splitext, join def check_genotype(probandGT, fatherGT, motherGT): isConsistent = False fatherGTItems = fatherGT.split('/') motherGTItems = motherGT.split('/') for it1 in fatherGTItems: for it2 in motherGTItems: temp = [it1, it2] temp.sort() GT = temp[0]+'/'+temp[1] if GT == probandGT: isConsistent = True break return isConsistent def process_vcf(vcfFile, probandID, fatherID, motherID): vcfFile = abspath(vcfFile) dataDir = dirname(vcfFile) filePrefix = splitext(basename(vcfFile))[0] outFile = join(dataDir, filePrefix+".de_novo.vcf") statsFile = join(dataDir, filePrefix+".de_novo.stats.txt") fpOut = open(outFile, 'wb') fpStats = open(statsFile, 'wb') countPassed = 0 countFiltered = 0 consistencyDict = {} # parser isInfoAdded = False isIxFound = False colNameLine = "" probandIx = -1 fatherIx = -1 motherIx = -1 fpVcf = open(vcfFile, 'rb') for line in fpVcf: if line[0] == '#': if not(isInfoAdded) and (line[:8] == "##FORMAT"): fpOut.write("##INFO=\n") isInfoAdded = True fpOut.write(line) colNameLine = line continue elif not(isIxFound): # parse format line to get the columns of proband & parents tokens = colNameLine.split() for ix in xrange(len(tokens)): if tokens[ix] == probandID: probandIx = ix elif tokens[ix] == fatherID: fatherIx = ix elif tokens[ix] == motherID: motherIx = ix wrongID = "" if probandIx == -1: wrongID = probandID if fatherIx == -1: wrongID += (',%s' % fatherID) if motherIx == -1: wrongID += (',%s' % motherID) if wrongID: errMsg = ('The sample ID %s does not exist in the vcf.' % wrongID) sys.stderr.write(errMsg + '\nProgram exits.') sys.exit(1) tokens = line.split() info = tokens[7] format = tokens[8] items = format.split(':') GTix = -1 for ix in xrange(len(items)): if items[ix] == "GT": GTix = ix items = tokens[probandIx].split(':') probandGT = items[GTix] items = tokens[fatherIx].split(':') fatherGT = items[GTix] items = tokens[motherIx].split(':') motherGT = items[GTix] isConsistent = check_genotype(probandGT, fatherGT, motherGT) if not(isConsistent): info += ";DQ=60" # stats filter = tokens[6] if filter.upper() == "PASS": countPassed += 1 else: countFiltered += 1 GTstring = probandGT + '-' + fatherGT + '-' + motherGT if not(GTstring in consistencyDict): consistencyDict[GTstring] = 0 consistencyDict[GTstring] += 1 else: info += ";DQ=0" newLine = "" for i in xrange(7): newLine += tokens[i] + "\t" newLine += info for i in xrange(8, len(tokens)): newLine += "\t" + tokens[i] fpOut.write(newLine+"\n") fpVcf.close() fpOut.close() fpStats.write("# of passed SVs: %s\n" % (countPassed)) fpStats.write("# of filtered SVs: %s\n" % (countFiltered)) fpStats.write("probandGT-fatherGT-motherGT\tcounts\n") genotypes = consistencyDict.keys() genotypes.sort() for gt in genotypes: fpStats.write("%s\t%s\n" % (gt, consistencyDict[gt])) fpStats.close() if __name__=='__main__': usage = "denovo_scoring.py \n" if len(sys.argv) <= 4: sys.stderr.write(usage) sys.exit(1) vcfFile = sys.argv[1] probandID = sys.argv[2] fatherID = sys.argv[3] motherID = sys.argv[4] if not(exists(vcfFile)): errMsg = ('The file %s does not exist.' % vcfFile) sys.stderr.write(errMsg + '\nProgram exits.') sys.exit(1) process_vcf(vcfFile, probandID, fatherID, motherID)