#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); do_consensus($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: cat ref.fa | vcf-consensus [OPTIONS] in.vcf.gz > out.fa\n", "Options:\n", " -h, -?, --help This help message.\n", " -H, --haplotype Apply only variants for the given haplotype (1,2)\n", " -i, --iupac-codes Apply variants in the form of IUPAC ambiguity codes\n", " -s, --sample If not given, all variants are applied\n", "Examples:\n", " # Get the consensus for one region. The fasta header lines are then expected\n", " # in the form \">chr:from-to\".\n", " samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n", "\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-s' || $arg eq '--sample' ) { $$opts{sample}=shift(@ARGV); next; } if ( $arg eq '-i' || $arg eq '--iupac-codes' ) { $$opts{iupac}=1; next; } if ( $arg eq '-H' || $arg eq '--haplotype' ) { $$opts{haplotype}=shift(@ARGV); next; } if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( $$opts{iupac} ) { delete($$opts{iupac}); $$opts{iupac}{CT} = 'Y'; $$opts{iupac}{TC} = 'Y'; $$opts{iupac}{AG} = 'R'; $$opts{iupac}{GA} = 'R'; $$opts{iupac}{AT} = 'W'; $$opts{iupac}{TA} = 'W'; $$opts{iupac}{GC} = 'S'; $$opts{iupac}{CG} = 'S'; $$opts{iupac}{TG} = 'K'; $$opts{iupac}{GT} = 'K'; $$opts{iupac}{CA} = 'M'; $$opts{iupac}{AC} = 'M'; $$opts{iupac}{AA} = 'A'; $$opts{iupac}{CC} = 'C'; $$opts{iupac}{GG} = 'G'; $$opts{iupac}{TT} = 'T'; } if ( exists($$opts{haplotype}) && !exists($$opts{sample}) ) { error("Expected -s option with -H.\n"); } return $opts; } sub do_consensus { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcf_file}); $vcf->parse_header; if ( exists($$opts{sample}) ) { if ( !exists($$vcf{has_column}{$$opts{sample}}) ) { error("No such sample: $$opts{sample}"); } $$opts{vcf} = $vcf; $$opts{sample_col} = $$vcf{has_column}{$$opts{sample}}; } my $chrs = $vcf->get_chromosomes(); my %chrs = map { $_=>0 } @$chrs; my ($chr,$vcf_pos,$warned,$vcf_line); while (my $line=) { if ( $line=~/^>([^:\s]+)/ ) { $chr = $1; for my $line (@{$$vcf{buffer}}) { apply_variant($opts,$line); } flush_fa_buffer($opts,0); my $rest = $'; if ( $rest=~/^:(\d+)-\d+$/ ) { print STDERR "Looks as fasta file snippet, the sequence $chr starts at position $1\n"; $$opts{fa_pos} = $1; } else { $$opts{fa_pos} = 1; } $$opts{fa_idx} = 0; $$opts{fa_frz} = 0; if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; } my $region = $$opts{fa_pos} > 1 ? "$chr:$$opts{fa_pos}" : $chr; $vcf->open(region=>$region); print $line; next; } chomp($line); if ( !$$opts{case_known} ) { if ( uc($line) eq $line ) { $$opts{case_known} = 'u'; } elsif ( lc($line) eq $line ) { $$opts{case_known} = 'l'; } else { $$opts{case_known} = 'u'; } } $$opts{fa_buf} .= $line; $$opts{fa_len} += length($line); while ( defined($vcf_line = $vcf->next_data_array()) ) { # can the beginning of the buffer be printed? if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1] ) { $vcf->_unread_line($vcf_line); flush_fa_buffer($opts,60); last; } # is the buffer long enough? if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1]+length($$vcf_line[3]) ) { $vcf->_unread_line($vcf_line); last; } apply_variant($opts,$vcf_line); } if ( !defined $vcf_line ) { flush_fa_buffer($opts,60); } } flush_fa_buffer($opts,0); for my $chr (keys %chrs) { if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); } } } sub flush_fa_buffer { my ($opts,$len) = @_; while ( $$opts{fa_len} && $$opts{fa_len}>=60 ) { print substr($$opts{fa_buf},0,60,''), "\n"; $$opts{fa_len} -= 60; $$opts{fa_pos} += 60 - $$opts{fa_idx}; $$opts{fa_idx} = 0; } if ( $len or !$$opts{fa_len} ) { return; } print $$opts{fa_buf},"\n"; $$opts{fa_pos} += $$opts{fa_len}-$$opts{fa_idx}; $$opts{fa_len} = 0; $$opts{fa_buf} = ''; $$opts{fa_idx} = 0; } sub apply_variant { my ($opts,$vline) = @_; if ( $$vline[4] eq '.' ) { return; } my $hap = exists($$opts{haplotype}) ? $$opts{haplotype} : 0; my $alt; if ( !exists($$opts{sample_col}) ) { # No sample requested, applying all sites, first ALT my $idx; $alt = ($idx=index($$vline[4],','))==-1 ? $$vline[4] : substr($$vline[4],0,$idx); if ( exists($$opts{iupac}) && length($$vline[3])==1 && length($alt)==1 ) { $alt = uc($$vline[3].$alt); if ( !exists($$opts{iupac}{$alt}) ) { error("No IUPAC code for \"$alt\"\n"); } $alt = $$opts{iupac}{$alt}; } } else { my $igt = $$opts{vcf}->get_tag_index($$vline[8],'GT',':'); if ( $igt==-1 ) { return; } my $gt = $$opts{vcf}->get_field($$vline[$$opts{sample_col}-1],$igt); my @als = $$opts{vcf}->split_gt($gt); if ( $hap ) { # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect if ( $hap <= @als && $als[$hap-1] ne '0' ) { $alt = $$opts{vcf}->get_field($$vline[4],$als[$hap-1]-1,','); } } else { if ( exists($$opts{iupac}) && length($$vline[3])==1 ) # only for SNPs and with -i { my @alts; for my $al (@als) { if ( $al eq '.' ) { last; } if ( $al eq '0' ) { push @alts,uc($$vline[3]); } else { $alt = $$opts{vcf}->get_field($$vline[4],$al-1,','); push @alts, uc($alt); if ( length($alt)!=1 ) { last; } } } if ( @alts==2 ) { if ( !exists($$opts{iupac}{$alts[0].$alts[1]}) ) { error("No IUPAC code for \"$alts[0]/$alts[1]\"\n"); } $alt = $$opts{iupac}{$alts[0].$alts[1]}; } elsif ( length($alts[0])==1 ) { if ( !exists($$opts{iupac}{$alts[0].$alts[0]}) ) { error("No IUPAC code for \"$alts[0]/$alts[0]\"\n"); } $alt = $$opts{iupac}{$alts[0].$alts[0]}; } } else { for my $al (@als) { if ( $al eq '0' or $al eq '.' ) { next; } $alt = $$opts{vcf}->get_field($$vline[4],$al-1,','); last; } } } if ( !defined $alt or $alt eq $$vline[3] ) { return; } } if ( $$vline[1] <= $$opts{fa_frz} ) { print STDERR "Note: Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n"; return; } my $pos = $$vline[1] - $$opts{fa_pos} + $$opts{fa_idx}; if ( $pos<0 or $pos>=$$opts{fa_len} ) { error("FIXME: $$vline[0]:$$vline[1] .. $$opts{fa_pos},$pos,$$opts{fa_len},$$opts{fa_frz}\n"); } # Sanity check my $ref_len = length($$vline[3]); if ( $$vline[3] ne uc(substr($$opts{fa_buf},$pos,$ref_len)) ) { error(sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n", substr($$opts{fa_buf},$pos,$ref_len), substr($$opts{fa_buf},$pos+1,$ref_len+5), $$vline[3], $$opts{fa_frz}?$$opts{fa_frz}:0); } if ( $$opts{case_known} eq 'l' ) { $alt = lc($alt); } my $alt_len = length($alt); substr($$opts{fa_buf},$pos,$ref_len,$alt); $$opts{fa_len} += $alt_len - $ref_len; $$opts{fa_pos} += $ref_len; # position with respect to the original reference sequence $$opts{fa_idx} += $alt_len; # position in the modified sequence $$opts{fa_frz} = $$vline[1] + $ref_len - 1; # freeze changes until this position }