#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # # This file is part of the Kraken taxonomic sequence classification system. # # Kraken is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Kraken is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Kraken. If not, see . # Reports a summary of Kraken's results. use strict; use warnings; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN_DIR = "/home/ugene/kraken"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN_DIR/classify") { use Cwd 'abs_path'; $KRAKEN_DIR = dirname abs_path($0); } require "$KRAKEN_DIR/krakenlib.pm"; my $show_zeros = 0; my $db_prefix; GetOptions( "help" => \&display_help, "version" => \&display_version, "show-zeros" => \$show_zeros, "db=s" => \$db_prefix, ); eval { $db_prefix = krakenlib::find_db($db_prefix); }; if ($@) { die "$PROG: $@"; } sub usage { my $exit_code = @_ ? shift : 64; print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--show-zeros] \n"; my $default_db; eval { $default_db = krakenlib::find_db(); }; if (defined $default_db) { print STDERR "\n Default database is \"$default_db\"\n"; } exit $exit_code; } sub display_help { usage(0); } sub display_version { print "Kraken version 1.0\n"; print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n"; exit 0; } my (%child_lists, %name_map, %rank_map); load_taxonomy($db_prefix); my %taxo_counts; my $seq_count = 0; $taxo_counts{0} = 0; while (<>) { my @fields = split; $taxo_counts{$fields[2]}++; $seq_count++; } my $classified_count = $seq_count - $taxo_counts{0}; my %clade_counts = %taxo_counts; dfs_summation(1); for (keys %name_map) { $clade_counts{$_} ||= 0; } my $unclassified_percent = 100; if ($seq_count) { $unclassified_percent = $clade_counts{0} * 100 / $seq_count; } printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n", $unclassified_percent, $clade_counts{0}, $taxo_counts{0}, "U", 0, "", "unclassified"; dfs_report(1, 0); sub dfs_report { my $node = shift; my $depth = shift; if (! $clade_counts{$node} && ! $show_zeros) { return; } printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n", ($clade_counts{$node} || 0) * 100 / $seq_count, ($clade_counts{$node} || 0), ($taxo_counts{$node} || 0), rank_code($rank_map{$node}), $node, " " x $depth, $name_map{$node}; my $children = $child_lists{$node}; if ($children) { my @sorted_children = sort { $clade_counts{$b} <=> $clade_counts{$a} } @$children; for my $child (@sorted_children) { dfs_report($child, $depth + 1); } } } sub rank_code { my $rank = shift; for ($rank) { $_ eq "species" and return "S"; $_ eq "genus" and return "G"; $_ eq "family" and return "F"; $_ eq "order" and return "O"; $_ eq "class" and return "C"; $_ eq "phylum" and return "P"; $_ eq "kingdom" and return "K"; $_ eq "superkingdom" and return "D"; } return "-"; } sub dfs_summation { my $node = shift; my $children = $child_lists{$node}; if ($children) { for my $child (@$children) { dfs_summation($child); $clade_counts{$node} += ($clade_counts{$child} || 0); } } } sub load_taxonomy { my $prefix = shift; open NAMES, "<", "$prefix/taxonomy/names.dmp" or die "$PROG: can't open names file: $!\n"; while () { chomp; s/\t\|$//; my @fields = split /\t\|\t/; my ($node_id, $name, $type) = @fields[0,1,3]; if ($type eq "scientific name") { $name_map{$node_id} = $name; } } close NAMES; open NODES, "<", "$prefix/taxonomy/nodes.dmp" or die "$PROG: can't open nodes file: $!\n"; while () { chomp; my @fields = split /\t\|\t/; my ($node_id, $parent_id, $rank) = @fields[0,1,2]; if ($node_id == 1) { $parent_id = 0; } $child_lists{$parent_id} ||= []; push @{ $child_lists{$parent_id} }, $node_id; $rank_map{$node_id} = $rank; } close NODES; }