#! /usr/bin/env python # # Manta - Structural Variant and Indel Caller # Copyright (c) 2013-2016 Illumina, Inc. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # """ filter vcf to remove overlapping diploid calls which can't be resolved to two haplotypes """ import sys import re from os.path import exists, isfile from optparse import OptionParser def getKeyVal(string,key) : match=re.search("%s=([^;\t]*);?" % (key) ,string) if match is None : return None return match.group(1); VCF_CHROM = 0 VCF_POS = 1 VCF_REF = 3 VCF_ALT = 4 VCF_QUAL = 5 VCF_FILTER = 6 VCF_INFO = 7 VCF_FORMAT = 8 VCF_SAMPLE = 9 class VcfRecord : def __init__(self, line) : #self.line = line w = line.strip().split('\t') self.chrom = w[VCF_CHROM] self.pos = int(w[VCF_POS]) self.isPass = (w[VCF_FILTER] == "PASS") self.end = self.pos+len(w[VCF_REF])-1 val = getKeyVal(w[VCF_INFO],"END") if val is not None : self.end = int(val) self.svLen = None val = getKeyVal(w[VCF_INFO],"SVLEN") if val is not None : self.svLen = int(val) self.svType = getKeyVal(w[VCF_INFO],"SVTYPE") fmt = w[VCF_FORMAT] gtIx = fmt.split(':').index("GT") self.gtType = [] for sample in w[VCF_SAMPLE:] : gt = sample.split(':')[gtIx] t = gt.split('/') self.gtType.append(int(t[0]) + int(t[1])) def getOptions(): usage = "usage: %prog [options] vcf > filtered_vcf" parser = OptionParser(usage=usage) (options,args) = parser.parse_args() if len(args) != 1 : parser.print_help() sys.exit(2) # validate input: if not isfile(args[0]) : raise Exception("Can't find input vcf file: " + args[0]) return (options,args) def process_block(recordBlock, nextPos, filteredSites): # sys.stderr.write("processing a block with %s sites...\n" % len(recordBlock)) while (len(recordBlock) > 0): target = recordBlock[0] targetEnd = target.end # when a new target's end is larger than # the pos of the next site to be read, # we need to read in more sites if targetEnd > nextPos: break targetLen = -1 if target.svLen is not None: targetLen = abs(target.svLen) targetType = target.svType ploidySum = [] for gtPloidy in target.gtType : ploidySum.append(gtPloidy) overlapIds = [0] for ix in xrange(1, len(recordBlock)): record = recordBlock[ix] pos = record.pos svLen = -1 if record.svLen is not None: svLen = abs(record.svLen) svType = record.svType # collecting stacked sites # with the same type and similar size if pos < targetEnd: if ( # (svType == targetType) and (svLen < 2*targetLen) and (svLen > 0.5*targetLen)): for (sampleIndex, gtPloidy) in enumerate(record.gtType) : ploidySum[sampleIndex] += gtPloidy overlapIds.append(ix) else: break overlapIds.reverse() isAnomPloidy = False for psum in ploidySum : if psum > 2 : isAnomPloidy = True if isAnomPloidy: # sites to be filtered due to ploidity for i in overlapIds: site = recordBlock.pop(i) chrm = site.chrom pos = site.pos end = site.end if not(chrm in filteredSites): filteredSites[chrm] = {} filteredSites[chrm][(pos, end)] = True else: # sites to be kept for i in overlapIds: recordBlock.pop(i) def find_stacked_variants(vcfFile): filteredSites = {} recordBlock = [] maxEnd = -1 count = 0 for line in open(vcfFile): if line[0] == "#": continue record = VcfRecord(line) chrm = record.chrom pos = record.pos svType = record.svType count += 1 # ignore filtered records isPassed = record.isPass if not(isPassed): continue # consider DEL & DUP only if (svType != "DEL") and (svType != "DUP"): continue end = record.end # set up the first target site if (len(recordBlock) == 0): targetChrm = chrm targetEnd = end else: targetChrm = recordBlock[0].chrom targetEnd = recordBlock[0].end # keep reading into the block until exceeding the target's end if (chrm == targetChrm) and (pos < targetEnd): recordBlock.append(record) maxEnd = max(maxEnd, end) else: nextPos = pos if (chrm != targetChrm): nextPos = maxEnd + 1 maxEnd = -1 # process the block until pos < the new target's end process_block(recordBlock, nextPos, filteredSites) recordBlock.append(record) maxEnd = max(maxEnd, end) # process the last block process_block(recordBlock, maxEnd+1, filteredSites) sys.stderr.write("Processed %s sites in the vcf.\n" % count) numFiltered = 0 for c in filteredSites: numFiltered += len(filteredSites[c]) sys.stderr.write("Filtered %s sites due to ploidy.\n" % numFiltered) sys.stderr.write("Filtered sites: %s\n" % filteredSites) return filteredSites def check_filtered_sites(site, filteredSites): chrm = site.chrom pos = site.pos end = site.end return ((chrm in filteredSites) and ((pos, end) in filteredSites[chrm])) def filter_variants(vcfFile, filteredSites): isHeaderAdded = False filterHeadline = "##FILTER=\n" vcfOut = sys.stdout for line in open(vcfFile): if line[0] != '#': site = VcfRecord(line) # only filter on DEL & DUP for now if (site.isPass and ((site.svType == "DEL") or (site.svType == "DUP"))): isFiltered = check_filtered_sites(site, filteredSites) if isFiltered: w = line.strip().split('\t') # add the "Ploidy" filter w[VCF_FILTER] = "Ploidy" line = "\t".join(w)+"\n" elif not(isHeaderAdded) and (line[:8] == "##FILTER"): vcfOut.write(filterHeadline) isHeaderAdded = True vcfOut.write(line) if __name__=='__main__': # Command-line args (options,args) = getOptions() vcfFile = args[0] filteredSites = find_stacked_variants(vcfFile) filter_variants(vcfFile, filteredSites)