#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # # This file is part of the Kraken taxonomic sequence classification system. # # Kraken is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Kraken is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Kraken. If not, see . # Reports a summary of Kraken's results, in a MetaPhlAn-like format use strict; use warnings; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN_DIR = "/home/ugene/kraken"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN_DIR/classify") { use Cwd 'abs_path'; $KRAKEN_DIR = dirname abs_path($0); } require "$KRAKEN_DIR/krakenlib.pm"; my $show_zeros = 0; my $header_line = 0; my $intermediate = 0; my $db_prefix; my @RANK_CODES = qw/D K P C O F G S/; GetOptions( "help" => \&display_help, "version" => \&display_version, "show-zeros" => \$show_zeros, "header-line" => \$header_line, "intermediate-ranks" => \$intermediate, "db=s" => \$db_prefix, ); eval { $db_prefix = krakenlib::find_db($db_prefix); }; if ($@) { die "$PROG: $@"; } sub usage { my $exit_code = @_ ? shift : 64; my $default_db = "none"; eval { $default_db = '"' . krakenlib::find_db() . '"'; }; print STDERR <<__EOF__; Usage: $PROG [--db KRAKEN_DB_NAME] [options] Options: --db NAME Name of Kraken database (default: $default_db) --show-zeros Display taxa even if they lack a read in any sample --header-line Display a header line indicating sample IDs (sample IDs are the filenames) --intermediate-ranks Display taxa not at the standard ranks with x__ prefix __EOF__ exit $exit_code; } sub display_help { usage(0); } sub display_version { print "Kraken version 1.0\n"; print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n"; exit 0; } my (%child_lists, %name_map, %rank_map); load_taxonomy($db_prefix); my @file_data; my %hit_taxa; for my $file (@ARGV) { open FILE, "<", $file or die "$PROG: can't open $file: $!\n"; my %taxo_counts; while () { my @fields = split; $taxo_counts{$fields[2]}++; } my %clade_counts = %taxo_counts; dfs_summation(1, \%clade_counts); for (keys %clade_counts) { if ($clade_counts{$_}) { $hit_taxa{$_}++ } } push @file_data, \%clade_counts; } if ($intermediate) { push @RANK_CODES, "X"; } my %output_lines; $output_lines{$_} = "" for @RANK_CODES; if ($header_line) { print "#Sample ID\t" . join("\t", @ARGV) . "\n"; } dfs_report(1); print $output_lines{$_} for @RANK_CODES; sub dfs_report { my $node = shift; my $name = shift; if (! $show_zeros) { return unless $hit_taxa{$node}; } my $code = rank_code($node); if ($code ne "-" || $intermediate) { $code = "X" if $code eq "-"; if (defined $name) { $name .= "|" } else { $name = ""; } $name .= lc($code) . "__" . sanitize_name($node); my $output = $name; for my $file (@file_data) { $output .= "\t" . ($file->{$node} || 0); } $output .= "\n"; $output_lines{$code} .= $output; } my $children = $child_lists{$node}; if ($children) { for my $child (@$children) { dfs_report($child, $name); } } } sub sanitize_name { my $node = shift; my $name = $name_map{$node}; $name =~ tr/|.//d; $name =~ tr/ /_/; return $name; } sub rank_code { my $node = shift; my $rank = $rank_map{$node}; for ($rank) { $_ eq "species" and return "S"; $_ eq "genus" and return "G"; $_ eq "family" and return "F"; $_ eq "order" and return "O"; $_ eq "class" and return "C"; $_ eq "phylum" and return "P"; $_ eq "superkingdom" and return "D"; $_ eq "kingdom" and return "K"; } return "-"; } sub dfs_summation { my $node = shift; my $count_ref = shift; my $children = $child_lists{$node}; if ($children) { for my $child (@$children) { dfs_summation($child, $count_ref); $count_ref->{$node} += ($count_ref->{$child} || 0); } } } sub load_taxonomy { my $prefix = shift; open NAMES, "<", "$prefix/taxonomy/names.dmp" or die "$PROG: can't open names file: $!\n"; while () { chomp; s/\t\|$//; my @fields = split /\t\|\t/; my ($node_id, $name, $type) = @fields[0,1,3]; if ($type eq "scientific name") { $name_map{$node_id} = $name; } } close NAMES; open NODES, "<", "$prefix/taxonomy/nodes.dmp" or die "$PROG: can't open nodes file: $!\n"; while () { chomp; my @fields = split /\t\|\t/; my ($node_id, $parent_id, $rank) = @fields[0,1,2]; if ($node_id == 1) { $parent_id = 0; } $child_lists{$parent_id} ||= []; push @{ $child_lists{$parent_id} }, $node_id; $rank_map{$node_id} = $rank; } close NODES; }