#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # # This file is part of the Kraken taxonomic sequence classification system. # # Kraken is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Kraken is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Kraken. If not, see . # For each classified read, prints sequence ID and full taxonomy use strict; use warnings; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN_DIR = "/home/ugene/kraken"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN_DIR/classify") { use Cwd 'abs_path'; $KRAKEN_DIR = dirname abs_path($0); } require "$KRAKEN_DIR/krakenlib.pm"; my $db_prefix; my $mpa_format = 0; GetOptions( "help" => \&display_help, "version" => \&display_version, "db=s" => \$db_prefix, "mpa-format" => \$mpa_format ); eval { $db_prefix = krakenlib::find_db($db_prefix); }; if ($@) { die "$PROG: $@"; } sub usage { my $exit_code = @_ ? shift : 64; print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--mpa-format] \n"; my $default_db; eval { $default_db = krakenlib::find_db(); }; if (defined $default_db) { print STDERR "\n Default database is \"$default_db\"\n"; } exit $exit_code; } sub display_help { usage(0); } sub display_version { print "Kraken version 1.0\n"; print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n"; exit 0; } my (%parent_map, %name_map, %rank_map); load_taxonomy($db_prefix); my %known_taxonomy_strings; while (<>) { next unless /^C/; chomp; my @fields = split; my ($seqid, $taxid) = @fields[1,2]; my $taxonomy_str = get_taxonomy_str($taxid); print "$seqid\t$taxonomy_str\n"; } sub get_taxonomy_str { my $taxid = shift; if (! exists $known_taxonomy_strings{$taxid}) { my @nodes; while ($taxid > 0) { if ($mpa_format) { my $rank_code = rank_code($rank_map{$taxid}); my $name = $name_map{$taxid}; $name =~ tr/ /_/; unshift @nodes, lc($rank_code) . "__" . $name if $rank_code ne "-"; } else { unshift @nodes, $name_map{$taxid}; } $taxid = $parent_map{$taxid}; } if ($mpa_format) { $known_taxonomy_strings{$taxid} = @nodes ? join("|", @nodes) : "root"; } else { $known_taxonomy_strings{$taxid} = join(";", @nodes); } } return $known_taxonomy_strings{$taxid}; } sub rank_code { my $rank = shift; for ($rank) { $_ eq "species" and return "S"; $_ eq "genus" and return "G"; $_ eq "family" and return "F"; $_ eq "order" and return "O"; $_ eq "class" and return "C"; $_ eq "phylum" and return "P"; $_ eq "kingdom" and return "K"; $_ eq "superkingdom" and return "D"; } return "-"; } sub load_taxonomy { my $prefix = shift; open NAMES, "<", "$prefix/taxonomy/names.dmp" or die "$PROG: can't open names file: $!\n"; while () { chomp; s/\t\|$//; my @fields = split /\t\|\t/; my ($node_id, $name, $type) = @fields[0,1,3]; if ($type eq "scientific name") { $name_map{$node_id} = $name; } } close NAMES; open NODES, "<", "$prefix/taxonomy/nodes.dmp" or die "$PROG: can't open nodes file: $!\n"; while () { chomp; my @fields = split /\t\|\t/; my ($node_id, $parent_id, $rank) = @fields[0,1,2]; if ($node_id == 1) { $parent_id = 0; } $parent_map{$node_id} = $parent_id; $rank_map{$node_id} = $rank; } close NODES; }