#!/usr/bin/env perl # # Copyright (C) 2012-2014,2018 Genome Research Ltd. # # Author: Petr Danecek # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. use strict; use warnings; use Carp; my $opts = parse_params(); parse_bamcheck($opts); if ( @{$$opts{bamcheck}} > 1 ) { merge_bamcheck($opts); exit; } plot_qualities($opts); plot_acgt_cycles($opts); plot_gc($opts); plot_gc_depth($opts); plot_isize($opts); plot_coverage($opts); plot_mismatches_per_cycle($opts); plot_indel_dist($opts); plot_indel_cycles($opts); create_html($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Parses output of samtools stats (former bamcheck) and calls gnuplot to create graphs.\n", "Usage: plot-bamstats [OPTIONS] file.bam.bc\n", " plot-bamstats -p outdir/ file.bam.bc\n", "Options:\n", " -k, --keep-files Do not remove temporary files.\n", " -l, --log-y Set the Y axis scale of the Insert Size plot to log 10.\n", " -m, --merge Merge multiple bamstats files and output to STDOUT.\n", " -p, --prefix The output files prefix, add a slash to create new directory.\n", " -r, --ref-stats Optional reference stats file with expected GC content (created with -s).\n", " -s, --do-ref-stats Calculate reference sequence GC for later use with -r\n", " -t, --targets Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; my $opts = { args => join(' ',$0,@ARGV), # for merging sections => [ { id=>'SN', header=> "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n", }, { id=>'FFQ', header=> "# First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.\n" . "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n", }, { id=>'LFQ', header=> "# Last Fragment Qualities. Use `grep ^LFQ | cut -f 2-` to extract this part.\n" . "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n", }, { id=>'MPC', header=> "# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n" . "# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n" . "# is the number of N's and the rest is the number of mismatches\n", }, { id=>'GCF', header=>"# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n", }, { id=>'GCL', header=>"# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n", }, { id=>'GCC', header=>"# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n", }, { id => 'FBC', header=>"# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n", }, { id => 'LBC', header=>"# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n", }, { id=>'IS', header=>"# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n", }, { id=>'RL', header=>"# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n", }, { id=>'FRL', header=>"# Read lengths - first fragments. Use `grep ^FRL | cut -f 2-` to extract this part. The columns are: read length, count\n", }, { id=>'LRL', header=>"# Read lengths - last fragments. Use `grep ^LRL | cut -f 2-` to extract this part. The columns are: read length, count\n", }, { id=>'ID', header=>"# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n", }, { id=>'IC', header=>"# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n", }, { id=>'COV', header=>"# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n", }, # { # id=>'GCD', # header=>"# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n", # }, ], # for merging merge_keys => { 'sum' => [ 'raw total sequences:', 'filtered sequences:', 'sequences:', '1st fragments:', 'last fragments:', 'reads mapped:', 'reads mapped and paired:', 'reads unmapped:', 'reads properly paired:', 'reads paired:', 'reads duplicated:', 'reads MQ0:', 'reads QC failed:', 'non-primary alignments:', 'total length:', 'total first fragment length:', 'total last fragment length:', 'bases mapped:', 'bases mapped (cigar):', 'bases trimmed:', 'bases duplicated:', 'mismatches:', 'inward oriented pairs:', 'outward oriented pairs:', 'pairs with other orientation:', 'pairs on different chromosomes:', ], 'min' => [ 'is sorted:', ], 'max' => [ 'maximum length:', ], }, }; for my $key (keys %{$$opts{merge_keys}}) { for my $val (@{$$opts{merge_keys}{$key}}) { $$opts{hmerge}{$val} = $key; } } while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } if ( $arg eq '-l' || $arg eq '--log-y' ) { $$opts{y_axis_log10}=1; next; } if ( $arg eq '-m' || $arg eq '--merge' ) { $$opts{merge}=1; next; } if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-' || -e $arg ) { push @{$$opts{bamcheck}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } if ( !exists($$opts{bamcheck}) ) { error("No samtools stats file?\n") } if ( !exists($$opts{prefix}) ) { if ( !$$opts{merge} ) { error("Expected -p parameter.\n") } $$opts{prefix} = './'; } elsif ( $$opts{merge} ) { error("Only one of -p or -m should be given.\n"); } if ( $$opts{merge} ) { if ( @{$$opts{bamcheck}} < 2 ) { error("Nothing to merge\n") } } else { if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } } return $opts; } # Creates GC stats for either the whole reference or only on target regions for exome QC sub do_ref_stats { my ($opts) = @_; my %targets = (); if ( exists($$opts{targets}) ) { my ($prev_chr,$prev_pos); open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); while (my $line=<$fh>) { if ( $line=~/^#/ ) { next; } my ($chr,$from,$to) = split(/\s+/,$line); chomp($to); push @{$targets{$chr}}, $from,$to; if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } $prev_pos = $from; } close($fh); } my $nlen = 0; my %lens = (); my %gc_counts = (); my ($skip_chr,$pos,$ireg,$regions); open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); while (my $line=<$fh>) { if ( $line=~/^>/ ) { if ( !scalar %targets ) { next; } if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } undef $skip_chr; $pos = 0; $ireg = 0; $regions = $targets{$1}; next; } if ( defined $skip_chr ) { next; } # Only $_len sized lines are considered and no chopping for target regions. chomp($line); my $len = length($line); $lens{$len}++; $nlen++; if ( scalar %targets ) { while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } $pos += $len; if ( $ireg==@$regions ) { next; } if ( $pos < $$regions[$ireg] ) { next; } } my $gc_count = 0; for (my $i=0; $i<$len; $i++) { my $base = substr($line,$i,1); if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } } $gc_counts{$gc_count}++; } # Calculate median length my $n = 0; my $median_len = 0; for my $len (sort { $a<=>$b } keys %lens) { $n += $lens{$len}; if ( $n >= $nlen ) { $median_len = $len; last; } } if ( !$median_len ) { error("FIXME: median_len=$median_len??\n"); } print "# Generated by $$opts{args}\n"; print "# The columns are: GC content bin, normalized frequency\n"; my $max; for my $count (values %gc_counts) { if ( !defined $max or $count>$max ) { $max=$count; } } for my $gc (sort {$a<=>$b} keys %gc_counts) { if ( $gc==0 ) { next; } printf "%f\t%f\n", $gc*100./$median_len, $gc_counts{$gc}/$max; } } sub plot { my ($cmdfile) = @_; my $cmd = "gnuplot $cmdfile"; system($cmd); if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } } sub open_file { my ($file) = @_; my $fh; if ( $file eq '-' ) { $fh = *STDIN; } elsif ( $file=~/\.gz$/i ) { open($fh, "gunzip -c $file |") or error("gunzip -c $file: $!"); } else { open($fh,'<',$file) or error("$file: $!"); } return $fh; } sub parse_bamcheck1 { my ($opts, $i) = @_; my $file = $$opts{bamcheck}[$i]; print STDERR "Parsing samtools stats output: $file\n" unless !$$opts{verbose}; my $fh = open_file($file); my $line = <$fh>; if ( !($line=~/^# This file was produced by (\S+)/) ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); } if ( $1 ne 'plot-bamstats' && $1 ne 'samtools' ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); } my %dat; my $sn_order = $$opts{sn_order} || []; my %sn_seen = map { $_ => 1 } @$sn_order; while ($line=<$fh>) { if ( $line=~/^#/ ) { next; } my @items = split(/\t/,$line); chomp($items[-1]); if ( $items[0] eq 'SN' ) { $dat{SN}{$items[1]} = $items[2]; if (!exists($sn_seen{$items[1]})) { push(@$sn_order, $items[1]); $sn_seen{$items[1]} = 1; } next; } push @{$dat{$items[0]}}, [splice(@items,1)]; } close($fh) unless $file eq '-'; $$opts{sn_order} = $sn_order; # Check sanity if ( !exists($dat{'SN'}{'sequences:'}) or !$dat{'SN'}{'sequences:'} ) { error("Sanity check failed: no sequences found by samtools stats??\n"); } my $nseq_new = $dat{'SN'}{'sequences:'}; my $nseq_ori = exists($$opts{dat}{'SN'}) ? $$opts{dat}{'SN'}{'sequences:'} : 0; my %add = map { $_ => 1 } (qw(FFQ LFQ MPC GCF GCL IS ID IC RL FRL LRL)); for my $a (keys %dat) { if ( !exists($$opts{dat}{$a}) ) { $$opts{dat}{$a} = $dat{$a}; next; } # first bamcheck file if ( $a eq 'SN' ) { for my $b (keys %{$dat{$a}}) { if ( exists($$opts{hmerge}{$b}) ) { if ( $$opts{hmerge}{$b} eq 'sum' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b}; next; } if ( $$opts{hmerge}{$b} eq 'min' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} < $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; } if ( $$opts{hmerge}{$b} eq 'max' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} > $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; } } } next; } if ( $add{$a} ) { add_to_matrix($$opts{dat}{$a}, $dat{$a}, 0); next; } if ( $a eq 'COV' ) { merge_coverage($$opts{dat}{$a}, $dat{$a}); next; } if ( $a eq 'GCC' || $a eq 'FBC' || $a eq 'LBC' ) { merge_gcc($nseq_ori, $$opts{dat}{$a}, $nseq_new, $dat{$a}); next; } warn("Not processed: $a\n"); } if ( $i==0 ) { return; } $$opts{dat}{SN}{'error rate:'} = sprintf "%e", get_value($opts,'SN','mismatches:') / get_value($opts,'SN','bases mapped (cigar):'); update_average_length($opts); update_average_quality($opts); update_average_isize($opts); } sub parse_bamcheck { my ($opts) = @_; for (my $i=0; $i<@{$$opts{bamcheck}}; $i++) { parse_bamcheck1($opts,$i); } # Determine the default title, for now use the first BAM name: # 5446_6/5446_6.bam.bc.gp -> 5446_6 # test.aaa.png -> test.aaa if ( !($$opts{bamcheck}[0]=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$$opts{bamcheck}[0]]\n"); } $$opts{title} = $1; } sub add_to_matrix { my ($dst,$src,$key) = @_; my $id = 0; my $is = 0; while ($is<@$src) { while ( $id<@$dst && $$src[$is][$key] > $$dst[$id][$key] ) { $id++; } if ( $id<@$dst && $$src[$is][$key] == $$dst[$id][$key] ) { for (my $j=0; $j<@{$$src[$is]}; $j++) { if ( $j==$key ) { next; } $$dst[$id][$j] += $$src[$is][$j]; } } else { splice(@$dst,$id,0,$$src[$is]); } $is++; } } sub rebin_values { my ($vals,$bin_size,$col) = @_; error("pd3\@sanger todo: rebin_values\n"); } sub merge_coverage { my ($dst,$src) = @_; if ( !($$dst[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$dst[0][0]\n"); } my $dst_bin = $2 - $1 + 1; if ( !($$src[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$src[0][0]\n"); } my $src_bin = $2 - $1 + 1; my (@dst, @src); for my $row (@$dst) { splice(@$row,0,1); push @dst,$row; } for my $row (@$src) { splice(@$row,0,1); push @src,$row; } my $dst_out = pop(@dst); my $src_out = pop(@src); my $bin_size = $dst_bin; if ( $src_bin > $dst_bin ) { my $vals = rebin_values(\@dst,$src_bin,0); @dst = @$vals; $bin_size = $src_bin; } elsif ( $src_bin < $dst_bin ) { my $vals = rebin_values(\@src, $dst_bin, 0); @src = @$vals; } add_to_matrix(\@dst, \@src, 0); for my $row (@dst) { unshift(@$row, sprintf("[%d-%d]", $$row[0],$$row[0]+$bin_size-1)); } push @dst, [sprintf("[%d<]", $dst[-1][1]), $dst[-1][1], $$dst_out[1]+$$src_out[1] ]; @$dst = @dst; } sub merge_gcc { my ($ndst,$dst,$nsrc,$src) = @_; if ( @$dst != @$src ) { error("todo: improve me\n"); } for (my $i=0; $i<@$dst; $i++) { if ( $$dst[$i][0] ne $$src[$i][0] ) { error("todo: improve me\n"); } for (my $j=1; $j<@{$$dst[$i]}; $j++) { $$dst[$i][$j] = sprintf("%.2f", ($$dst[$i][$j]*$ndst + $$src[$i][$j]*$nsrc) / ($ndst + $nsrc)); } } } sub merge_gcd { # todo } sub get_value { my ($opts, $id, $key) = @_; if ( !exists($$opts{dat}{$id}) ) { return undef; } if ( !exists($$opts{dat}{$id}{$key}) ) { return undef} return $$opts{dat}{$id}{$key}; } sub get_values { my ($opts, $id) = @_; if ( !exists($$opts{dat}{$id}) ) { return (); } # todo: add version sanity check here return (@{$$opts{dat}{$id}}); } sub update_average_length { my ($opts) = @_; my @vals = get_values($opts,'RL'); my $sum = 0; my $avg = 0; for my $val (@vals) { $sum += $$val[1]; } for my $val (@vals) { $avg += $$val[0]*$$val[1] / $sum; } $$opts{dat}{SN}{'average length:'} = sprintf "%.1f", $avg; } sub update_average_quality { my ($opts) = @_; my @vals; push @vals, get_values($opts, 'FFQ'), get_values($opts,'LFQ'); my $qsum = 0; for my $row (@vals) { for (my $i=1; $i<@$row; $i++) { $qsum += $$row[$i] } } my $qavg = 0; for my $row (@vals) { for (my $i=1; $i<@$row; $i++) { $qavg += ($i-1)*$$row[$i] / $qsum; } } $$opts{dat}{SN}{'average quality:'} = sprintf "%.1f", $qavg; } sub update_average_isize { my ($opts) = @_; my @vals = get_values($opts, 'IS'); my $sum = 0; for my $row (@vals) { $sum += $$row[1]; } my $avg = 0; for my $row (@vals) { $avg += $$row[0]*$$row[1] / $sum; } my $dev = 0; for my $row (@vals) { $dev += ($avg - $$row[0])*($avg - $$row[0]) * $$row[1] / $sum; } $$opts{dat}{SN}{'insert size average:'} = sprintf "%.1f", $avg; $$opts{dat}{SN}{'insert size standard deviation:'} = sprintf "%.1f", sqrt($dev); } sub get_defaults { my ($opts,$img_fname,%args) = @_; if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } # Determine the gnuplot script file name my $gp_file = $img_fname; $gp_file =~ s{\.[^.]+$}{.gp}; if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } my $title = $$opts{title}; my $dir = $gp_file; $dir =~ s{/[^/]+$}{}; if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } my $wh = exists($args{wh}) ? $args{wh} : '600,400'; open(my $fh,'>',$gp_file) or error("$gp_file: $!"); return { title => $title, gp => $gp_file, img => $img_fname, fh => $fh, terminal => qq[set terminal png size $wh truecolor], grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', }; } sub percentile { my ($p,@vals) = @_; my $N = 0; for my $val (@vals) { $N += $val; } my $n = $p*($N+1)/100.; my $k = int($n); my $d = $n-$k; if ( $k<=0 ) { return 0; } if ( $k>=$N ) { return scalar @vals-1; } my $cnt; for (my $i=0; $i<@vals; $i++) { $cnt += $vals[$i]; if ( $cnt>=$k ) { return $i; } } error("FIXME: this should not happen [percentile]\n"); } sub plot_qualities { my ($opts) = @_; my @ffq = get_values($opts, 'FFQ'); if ( !@ffq ) { return; } my $yrange = @{$ffq[0]} > 50 ? @{$ffq[0]} : 50; my $is_paired = get_value($opts,'SN','reads paired:'); # Average quality per cycle, forward and reverse reads in one plot my $args = get_defaults($opts,"$$opts{prefix}quals.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set ylabel "Average Quality" set xlabel "Cycle" set yrange [0:$yrange] set title "$$args{title}" noenhanced plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ ]; my (@fp75,@fp50,@fmean); my (@lp75,@lp50,@lmean); my ($fmax,$fmax_qual,$fmax_cycle); my ($lmax,$lmax_qual,$lmax_cycle); for my $cycle (@ffq) { my $sum=0; my $n=0; for (my $iqual=1; $iqual<@$cycle; $iqual++) { $sum += $$cycle[$iqual]*$iqual; $n += $$cycle[$iqual]; if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } } my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); if ( !$n ) { next; } push @fp75, "$$cycle[0]\t$p25\t$p75\n"; push @fp50, "$$cycle[0]\t$p50\n"; push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; printf $fh $fmean[-1]; } print $fh "end\n"; my @lfq = (); if ( $is_paired ) { @lfq = get_values($opts, 'LFQ'); for my $cycle (@lfq) { my $sum=0; my $n=0; for (my $iqual=1; $iqual<@$cycle; $iqual++) { $sum += $$cycle[$iqual]*$iqual; $n += $$cycle[$iqual]; if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } } my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); if ( !$n ) { next; } push @lp75, "$$cycle[0]\t$p25\t$p75\n"; push @lp50, "$$cycle[0]\t$p50\n"; push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; printf $fh $lmean[-1]; } print $fh "end\n"; } close($fh); plot($$args{gp}); # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>$is_paired ? '700,500' : '600,400'); $fh = $$args{fh}; my $pos_size = $is_paired ? " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.1,0.1; set size 0.4,0.8" : ''; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set multiplot $pos_size set yrange [0:$yrange] set ylabel "Quality" set xlabel "Cycle (fwd reads)" plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' ]; print $fh join('',@fp75),"end\n"; print $fh join('',@fp50),"end\n"; print $fh join('',@fmean),"end\n"; if ( $is_paired ) { print $fh qq[ set origin 0.55,0.1 set size 0.4,0.8 unset ytics set y2tics mirror set yrange [0:$yrange] unset ylabel set xlabel "Cycle (rev reads)" set label "$$args{title}" at screen 0.5,0.95 center noenhanced plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' ]; print $fh join('',@lp75),"end\n"; print $fh join('',@lp50),"end\n"; print $fh join('',@lmean),"end\n"; } close($fh); plot($$args{gp}); # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>$is_paired ? '600,600' : '600,400'); $fh = $$args{fh}; my $nquals = @{$ffq[0]}-1; my $ncycles = @ffq; $pos_size = $is_paired ? " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.15,0.52; set size 0.8,0.4" : ''; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set multiplot $pos_size set title "$$args{title}" noenhanced set ylabel "Frequency (fwd reads)" set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax unset xlabel set xrange [0:$nquals] set format x "" plot '-' using 1:2:3 with lines linecolor variable title '' ]; for my $cycle (0..$#ffq) { for (my $iqual=1; $iqual<$nquals; $iqual++) { printf $fh "%d\t%d\t%d\n", $iqual, $ffq[$cycle]->[$iqual], $cycle + 1; } print $fh "\n"; } print $fh "end\n"; if ( $is_paired ) { print $fh qq[ set origin 0.15,0.1 set size 0.8,0.4 unset title unset format set xtics set xlabel "Quality" unset label set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax set ylabel "Frequency (rev reads)" plot '-' using 1:2:3 with lines linecolor variable title '' ]; for my $cycle (0..$#lfq) { for (my $iqual=1; $iqual<$nquals; $iqual++) { printf $fh "%d\t%d\t%d\n", $iqual, $lfq[$cycle]->[$iqual], $cycle + 1; } print $fh "\n"; } print $fh "end\n"; } close($fh); plot($$args{gp}); # Heatmap qualitites $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); $fh = $$args{fh}; my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax; my @ytics; for my $cycle (@ffq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } my $ytics = join(',', @ytics); $pos_size = $is_paired ? " set origin 0,0.46\n set size 0.95,0.6" : ''; print $fh qq[ $$args{terminal} set output "$$args{img}" unset key unset colorbox set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) set cbrange [0:$max] set yrange [0:$ncycles] set xrange [0:$nquals] set view map set multiplot set rmargin 0 set lmargin 0 set tmargin 0 set bmargin 0 $pos_size set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" set ylabel "Cycle (fwd reads)" offset character -1,0 unset ytics set ytics ($ytics) unset xtics set title "$$args{title}" noenhanced splot '-' matrix with image ]; for my $cycle (@ffq) { for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } print $fh "\n"; } print $fh "\nend\n"; if ( $is_paired ) { @ytics = (); for my $cycle (@lfq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } $ytics = join(',', @ytics); print $fh qq[ set origin 0,0.03 set size 0.95,0.6 set ylabel "Cycle (rev reads)" offset character -1,0 set xlabel "Base Quality" unset title unset ytics set ytics ($ytics) set xrange [0:$nquals] set xtics set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 set cblabel "Number of bases" splot '-' matrix with image ]; for my $cycle (@lfq) { for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } print $fh "\n"; } print $fh "\nend\n"; } close($fh); plot($$args{gp}); } sub plot_acgt_cycles { my ($opts) = @_; my @gcc = get_values($opts, 'GCC'); if ( !@gcc ) { return; } my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set style line 1 linecolor rgb "green" set style line 2 linecolor rgb "red" set style line 3 linecolor rgb "black" set style line 4 linecolor rgb "blue" set style increment user set ylabel "Base content [%]" set xlabel "Read Cycle" set yrange [0:100] set title "$$args{title}" noenhanced plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' ]; for my $base (1..4) { for my $cycle (@gcc) { print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; } print $fh "end\n"; } close($fh); plot($$args{gp}); } sub plot_gc { my ($opts) = @_; my $is_paired = get_value($opts,'SN','reads paired:'); my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); my $fh = $$args{fh}; my ($gcl_max,$gcf_max,$lmax,$fmax); my @gcf = get_values($opts, 'GCF'); my @gcl = get_values($opts, 'GCL'); for my $gc (@gcf) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } for my $gc (@gcl) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set title "$$args{title}" noenhanced set ylabel "Normalized Frequency" set xlabel "GC Content [%]" set yrange [0:1.1] set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 plot ] . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') . q['-' smooth csplines with lines lc 1 title 'First fragments' ] . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') . q[ ]; if ( exists($$opts{ref_stats}) ) { open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); while (my $line=<$ref>) { print $fh $line } close($ref); print $fh "end\n"; } for my $cycle (@gcf) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } print $fh "end\n"; if ( $is_paired ) { for my $cycle (@gcl) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } print $fh "end\n"; } close($fh); plot($$args{gp}); } sub plot_gc_depth { my ($opts) = @_; my @gcd = get_values($opts,'GCD'); if ( @gcd <= 1 ) { return; } # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); for my $gc (@gcd) { for my $tic (@tics) { my $diff = abs($$gc[0]-$$tic{gc}); if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } } } my @x2tics; for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } my $x2tics = join(',',@x2tics); my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set ylabel "Mapped depth" set xlabel "Percentile of mapped sequence ordered by GC content" set x2label "GC Content [%]" set title "$$args{title}" noenhanced set x2tics ($x2tics) set xtics nomirror set xrange [0.1:99.9] plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' ]; for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n"; close($fh); plot($$args{gp}); } sub plot_isize { my ($opts) = @_; my $is_paired = get_value($opts,'SN','reads paired:'); my @isizes = get_values($opts, 'IS'); if ( !$is_paired or !@isizes ) { return; } my ($isize_max,$isize_cnt); for my $isize (@isizes) { if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } } my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set rmargin 5 set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt set ylabel "Number of pairs" set xlabel "Insert Size" set title "$$args{title}" noenhanced]; print $fh qq[ set logscale y 10] if exists $$opts{y_axis_log10}; print $fh qq[ plot \\ '-' with lines lc rgb 'black' title 'All pairs', \\ '-' with lines title 'Inward', \\ '-' with lines title 'Outward', \\ '-' with lines title 'Other' ]; for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; close($fh); plot($$args{gp}); } sub plot_coverage { my ($opts) = @_; my @cov = get_values($opts, 'COV'); if ( !@cov ) { return; } my @vals; for my $cov (@cov) { push @vals,$$cov[2]; } my $i = percentile(99.8,@vals); my $p99 = $cov[$i][1]; my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set ylabel "Number of mapped bases" set xlabel "Coverage" set log y set style fill solid border -1 set title "$$args{title}" noenhanced set xrange [:$p99] plot '-' with lines notitle ]; for my $cov (@cov) { if ( $$cov[2]==0 ) { next; } print $fh "$$cov[1]\t$$cov[2]\n"; } print $fh "end\n"; close($fh); plot($$args{gp}); } sub plot_mismatches_per_cycle { my ($opts) = @_; my @mpc = get_values($opts, 'MPC'); if ( !@mpc ) { return; } my $nquals = @{$mpc[0]} - 2; my $ncycles = @mpc; my ($style,$with); if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set style line 1 linecolor rgb "#e40000" set style line 2 linecolor rgb "#ff9f00" set style line 3 linecolor rgb "#bbbb00" set style line 4 linecolor rgb "#4ebd68" set style line 5 linecolor rgb "#0061ff" set style increment user set key left top $style set ylabel "Number of mismatches" set xlabel "Read Cycle" set style fill solid border -1 set title "$$args{title}" noenhanced set xrange [-1:$ncycles] plot '-' $with ti 'Base Quality>30', \\ '-' $with ti '30>=Q>20', \\ '-' $with ti '20>=Q>10', \\ '-' $with ti '10>=Q', \\ '-' $with ti "N's" ]; for my $cycle (@mpc) { my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } print $fh "$sum\n"; } print $fh "end\n"; for my $cycle (@mpc) { my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } print $fh "$sum\n"; } print $fh "end\n"; for my $cycle (@mpc) { my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } print $fh "$sum\n"; } print $fh "end\n"; for my $cycle (@mpc) { my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } print $fh "$sum\n"; } print $fh "end\n"; for my $cycle (@mpc) { print $fh "$$cycle[1]\n"; } print $fh "end\n"; close($fh); plot($$args{gp}); } sub plot_indel_dist { my ($opts) = @_; my @indels = get_values($opts, 'ID'); if ( !@indels ) { return; } my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set style line 1 linetype 1 linecolor rgb "red" set style line 2 linetype 2 linecolor rgb "black" set style line 3 linetype 3 linecolor rgb "green" set style increment user set ylabel "Indel count [log]" set xlabel "Indel length" set y2label "Insertions/Deletions ratio" set log y set y2tics nomirror set ytics nomirror set title "$$args{title}" noenhanced plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" ]; for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; for my $len (@indels) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; close($fh); plot($$args{gp}); } sub plot_indel_cycles { my ($opts) = @_; my @indels = get_values($opts, 'IC'); if ( !@indels ) { return; } my $is_paired = get_value($opts,'SN','reads paired:'); my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); my $fh = $$args{fh}; print $fh qq[ $$args{terminal} set output "$$args{img}" $$args{grid} set style line 1 linetype 1 linecolor rgb "red" set style line 2 linetype 2 linecolor rgb "black" set style line 3 linetype 3 linecolor rgb "green" set style line 4 linetype 4 linecolor rgb "blue" set style increment user set ylabel "Indel count" set xlabel "Read Cycle" set title "$$args{title}" noenhanced ]; if ( $is_paired ) { print $fh qq[plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'\n]; for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; for my $len (@indels) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; } else { print $fh qq[plot '-' w l ti 'Insertions', '' w l ti 'Deletions'\n]; for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; } close($fh); plot($$args{gp}); } sub merge_bamcheck { my ($opts) = @_; my $fh = *STDOUT; if ( !$$opts{merge} ) { open($fh,'>',"$$opts{prefix}merge.bchk") or error("$$opts{prefix}merge.bchk: $!\n"); } print $fh "# This file was produced by plot-bamstats and can be plotted using plot-bamstats\n# The command line was $$opts{args}\n"; for my $sec (@{$$opts{sections}}) { my $sid = $$sec{id}; if ( !exists($$opts{dat}{$sid}) ) { next; } print $fh $$sec{header}; if ($sid eq 'SN') { for my $key (@{$$opts{sn_order}}) { if (exists($$opts{dat}{$sid}{$key})) { print "$sid\t$key\t$$opts{dat}{$sid}{$key}\n"; } } next; } elsif ( ref($$opts{dat}{$sid}) eq 'HASH' ) { for my $key (keys %{$$opts{dat}{$sid}}) { print "$sid\t$key\t$$opts{dat}{$sid}{$key}\n"; } next; } elsif ( ref($$opts{dat}{$sid}) eq 'ARRAY' ) { for my $rec (@{$$opts{dat}{$sid}}) { print $fh "$sid\t", join("\t",@$rec), "\n"; } } # # if ( $sid eq 'ID' ) # { # print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}"; # # output summary numbers here # for my $id (keys %{$$opts{dat}}) # { # if ( exists($$opts{exp}{$id}) ) { next; } # for my $key (keys %{$$opts{dat}{$id}}) # { # print $fh "SN\t$id\t$key\t$$opts{dat}{$id}{$key}\n"; # } # } # } } } sub bignum { my ($num) = @_; if ( !defined $num ) { return 0; } my $out = ''; my $slen = length($num); for (my $i=0; $i<$slen; $i++) { $out .= substr($num,$i,1); if ( $i+1<$slen && ($slen-$i-1)%3==0 ) { $out .= ','; } } return $out; } sub create_html { my ($opts) = @_; my ($fname,$prefix); if ( $$opts{prefix}=~m{/$} ) { $fname = "$$opts{prefix}index.html"; $prefix = ''; } else { $prefix = $$opts{prefix}; $prefix =~ s{^.*/}{}; $fname = $$opts{prefix}; $fname =~ s/-$/.html/; } open(my $fh,'>',$fname) or error("$fname: $!"); print $fh q[ ]; my %imgs = ( 'insert-size' => 'Insert size', 'gc-content' => 'GC content', 'acgt-cycles' => 'Per-base sequence content', 'mism-per-cycle' => 'Mismatches per cycle', 'quals' => 'Quality per cycle', 'quals2' => 'Quality per cycle', 'quals3' => 'Quality per cycle', 'quals-hm' => 'Quality per cycle', 'indel-cycles' => 'Indels per cycle', 'indel-dist' => 'Indel lengths', 'gc-depth' => 'Mapped depth vs GC', # I think this may be broken: 'coverage' => '', ); my @imgs = (qw(insert-size gc-content acgt-cycles mism-per-cycle quals quals2 quals3 quals-hm indel-cycles indel-dist gc-depth)); for (my $i=0; $i<@imgs; $i++) { if ( $i % 3 == 0 ) { # new row if ( $i>0 ) { print $fh "\n"; } print $fh ""; } if ( -e "$$opts{prefix}$imgs[$i].png" ) { my $desc = $imgs{$imgs[$i]}; print $fh qq[

\n]; } else { print $fh "

\n"; } } my $reads_total = get_value($opts,'SN','raw total sequences:'); my $reads_filtered = get_value($opts,'SN','filtered sequences:'); my $percent_filtered = sprintf "(%.1f%%)", $reads_filtered * 100. / $reads_total; my $reads_mapped = get_value($opts,'SN','reads mapped:'); my $percent_mapped = sprintf "(%.1f%%)", $reads_mapped * 100. / ($reads_total - $reads_filtered); my $reads_dup = get_value($opts,'SN','reads duplicated:'); my $percent_dup = sprintf "(%.1f%%)", $reads_dup * 100. / ($reads_total - $reads_filtered); my $reads_mq0 = get_value($opts,'SN','reads MQ0:'); my $percent_mq0 = sprintf "(%.1f%%)", ($reads_mapped ? $reads_mq0 * 100. / $reads_mapped : 0); my $reads_nonprim = get_value($opts,'SN','non-primary alignments:'); my $read_avglen = get_value($opts,'SN','average length:'); my $bases_total = get_value($opts,'SN','total length:'); my $bases_mapped = get_value($opts,'SN','bases mapped (cigar):'); my $bpercent_mapped = sprintf "(%.1f%%)", $bases_mapped * 100. / $bases_total; my $error_rate = sprintf "%.2f%%", 100.*get_value($opts,'SN','error rate:'); $reads_total = bignum($reads_total); $reads_filtered = bignum($reads_filtered); $reads_mapped = bignum($reads_mapped); $reads_dup = bignum($reads_dup); $reads_mq0 = bignum($reads_mq0); $reads_nonprim = bignum($reads_nonprim); $bases_total = bignum($bases_total); $bases_mapped = bignum($bases_mapped); print $fh qq[

Reads

total:	$reads_total
filtered:	$reads_filtered	$percent_filtered
non-primary:	$reads_nonprim
duplicated:	$reads_dup	$percent_dup
mapped:	$reads_mapped	$percent_mapped
zero MQ:	$reads_mq0	$percent_mq0
avg read length:	$read_avglen

Bases

total:	$bases_total
mapped:	$bases_mapped	$bpercent_mapped
error rate:	$error_rate

]; close($fh); }