#!/usr/bin/env perl
# $Id: update_blastdb.pl 663543 2023-02-23 15:35:33Z ivanov $
# ===========================================================================
#
#                            PUBLIC DOMAIN NOTICE
#               National Center for Biotechnology Information
#
#  This software/database is a "United States Government Work" under the
#  terms of the United States Copyright Act.  It was written as part of
#  the author's official duties as a United States Government employee and
#  thus cannot be copyrighted.  This software/database is freely available
#  to the public for use. The National Library of Medicine and the U.S.
#  Government have not placed any restriction on its use or reproduction.
#
#  Although all reasonable efforts have been taken to ensure the accuracy
#  and reliability of the software and data, the NLM and the U.S.
#  Government do not and cannot warrant the performance or results that
#  may be obtained by using this software or data. The NLM and the U.S.
#  Government disclaim all warranties, express or implied, including
#  warranties of performance, merchantability or fitness for any particular
#  purpose.
#
#  Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author:  Christiam Camacho
#
# File Description:
#   Script to download the pre-formatted BLAST databases.
#
# ===========================================================================

use strict;
use warnings;
use Net::FTP;
use Getopt::Long;
use Pod::Usage;
use File::stat;
use Digest::MD5;
use Archive::Tar;
use File::Temp;
use JSON::PP;

use constant NCBI_FTP => "ftp.ncbi.nlm.nih.gov";
use constant BLAST_DB_DIR => "/blast/db";
use constant USER => "anonymous";
use constant PASSWORD => "anonymous";
use constant DEBUG => 0;
use constant MAX_DOWNLOAD_ATTEMPTS => 3;
use constant EXIT_FAILURE => 1;
use constant LEGACY_EXIT_FAILURE => 2;

use constant AWS_URL => "http://s3.amazonaws.com";
use constant AMI_URL => "http://169.254.169.254/latest/meta-data/local-hostname";
use constant AWS_BUCKET => "ncbi-blast-databases";

use constant GCS_URL => "https://storage.googleapis.com";
use constant GCP_URL => "http://metadata.google.internal/computeMetadata/v1/instance/id";
use constant GCP_BUCKET => "blast-db";

# TODO: deprecate this in the next release 2.14.x
#use constant BLASTDB_MANIFEST => "blastdb-manifest.json";
use constant BLASTDB_MANIFEST_VERSION => "1.0";

use constant BLASTDB_METADATA => "blastdb-metadata-1-1.json";
use constant BLASTDB_METADATA_VERSION => "1.1";

# Process command line options
my $opt_verbose = 1;        # verbosity level; values above 3 enable extra diagnostics
my $opt_quiet = 0;          # suppress output; overrides --verbose
my $opt_force_download = 0;     
my $opt_help = 0;
my $opt_passive = 1;        # passive FTP; disable with "--passive no"
my $opt_blastdb_ver = undef;    # BLASTDB format version (4 or 5); undef = server default
my $opt_timeout = 120;      # FTP connection timeout in seconds
my $opt_showall = undef;    # list available DBs; optional value 'tsv' or 'pretty'
my $opt_show_version = 0;
my $opt_decompress = 0;     # extract downloaded archives, then delete them
my $opt_source;             # data source: ncbi, aws or gcp (auto-detected when unset)
my $opt_legacy_exit_code = 0;   # use pre-BLAST+ 2.10.0 exit codes (NCBI downloads only)
my $opt_nt = &get_num_cores();  # parallelism for cloud downloads
my $opt_gcp_prj = undef;    # GCP project, for requester-pays access via gsutil
my $result = GetOptions("verbose+"          =>  \$opt_verbose,
                        "quiet"             =>  \$opt_quiet,
                        "force"             =>  \$opt_force_download,
                        "passive:s"         =>  \$opt_passive,
                        "timeout=i"         =>  \$opt_timeout,
                        "showall:s"         =>  \$opt_showall,
                        "version"           =>  \$opt_show_version,
                        "blastdb_version:i" =>  \$opt_blastdb_ver,
                        "decompress"        =>  \$opt_decompress,
                        "source=s"          =>  \$opt_source,
                        "gcp-project=s"     =>  \$opt_gcp_prj,
                        "num_threads=i"     =>  \$opt_nt,
                        "legacy_exit_code"  =>  \$opt_legacy_exit_code,
                        "help"              =>  \$opt_help);
$opt_verbose = 0 if $opt_quiet;     # --quiet takes precedence over --verbose
die "Failed to parse command line options\n" unless $result;
pod2usage({-exitval => 0, -verbose => 2}) if $opt_help;
# --passive takes an optional string value; only '1' or 'no' are accepted here
if (length($opt_passive) and ($opt_passive !~ /1|no/i)) {
    pod2usage({-exitval => 1, -verbose => 0,
            -msg => "Invalid value for passive option: '$opt_passive'"});
}
# Nothing to do unless databases were named or --showall/--version was given
pod2usage({-exitval => 0, -verbose => 2}) unless (scalar @ARGV or 
                                                  defined($opt_showall) or
                                                  $opt_show_version);
if (defined $opt_blastdb_ver) {
    pod2usage({-exitval => 1, -verbose => 0, 
               -msg => "Invalid BLAST database version: $opt_blastdb_ver. Supported values: 4, 5"}) 
        unless ($opt_blastdb_ver == 4 or $opt_blastdb_ver == 5);
}
pod2usage({-exitval => 1, -verbose => 0, -msg => "Invalid number of threads"}) 
    if ($opt_nt <= 0);
# Normalize $opt_passive to a boolean suitable for Net::FTP
if (length($opt_passive) and $opt_passive =~ /n|no/i) {
    $opt_passive = 0;
} else {
    $opt_passive = 1;
}
my $exit_code = 0;
$|++;   # autoflush STDOUT so progress messages appear promptly

if ($opt_show_version) {
    # Extract the numeric revision from the keyword-expanded string
    my $revision = '$Revision: 663543 $';
    $revision =~ s/\$Revision: | \$//g;
    print "$0 version $revision\n";
    exit($exit_code);
}
# Locate the external helper utilities (any of these may end up undef)
my $curl = &get_curl_path();
my $gsutil = &get_gsutil_path();
my $gcloud = &get_gcloud_path();

my $location = "NCBI";
# If provided, the source takes precedence over any attempts to determine the closest location
if (defined($opt_source)) {
    if ($opt_source =~ /^ncbi/i) {
        $location = "NCBI";
    } elsif ($opt_source =~ /^gc/i) {
        $location = "GCP";
    } elsif ($opt_source =~ /^aws/i) {
        $location = "AWS";
    }
} else {
    # Try to auto-detect whether we're on the cloud by probing the
    # provider-specific instance metadata servers
    if (defined($curl)) {
        my $tmpfile = File::Temp->new();
        my $gcp_cmd = "$curl --connect-timeout 3 --retry 3 --retry-max-time 30 -sfo $tmpfile -H 'Metadata-Flavor: Google' " . GCP_URL;
        my $aws_cmd = "$curl --connect-timeout 3 --retry 3 --retry-max-time 30 -sfo /dev/null " . AMI_URL;
        print "$gcp_cmd\n" if DEBUG;
        if (system($gcp_cmd) == 0) { 
            # status not always reliable.  Check that curl output is all digits.
            my $tmpfile_content = do { local $/; <$tmpfile>};
            print "curl output $tmpfile_content\n" if DEBUG;
            $location = "GCP" if ($tmpfile_content =~ m/^(\d+)$/);
        } elsif (DEBUG) {
            # Consult https://ec.haxx.se/usingcurl/usingcurl-returns
            print "curl to GCP metadata server returned ", $?>>8, "\n";
        }

        print "$aws_cmd\n" if DEBUG;
        if (system($aws_cmd) == 0) {
            $location = "AWS";
        } elsif (DEBUG) {
            # Consult https://ec.haxx.se/usingcurl/usingcurl-returns
            print "curl to AWS metadata server returned ", $?>>8, "\n";
        }
        print "Location is $location\n" if DEBUG;
    }
}
# Cloud downloads require curl (and, for requester-pays GCP, gsutil + gcloud)
if ($location =~ /aws/i and not defined $curl) {
    print "Error: $0 depends on curl to fetch data from cloud storage, please install this utility to access this data source.\n";
    exit(EXIT_FAILURE);
}
if ($location =~ /gcp/i and defined($opt_gcp_prj) and (not defined $gsutil or not defined $gcloud)) {
    print "Error: when providing a GCP project, $0 depends on gsutil and gcloud to fetch data from cloud storage, please install these utilities to access this data source.\n";
    exit(EXIT_FAILURE);
}
my $gcp_prj = $opt_gcp_prj;
#my $gcp_prj = ($location =~ /gcp/i) ? &get_gcp_project() : undef;
#if ($location =~ /gcp/i and not defined $gcp_prj) {
#    print "Error: $0 depends on gcloud being configured to fetch data from cloud storage, please configure it per the instructions in https://cloud.google.com/sdk/docs/initializing .\n";
#    exit(EXIT_FAILURE);
#}

# FTP connection handle; remains undef on macOS and for cloud sources
my $ftp;

# Sanity-check the BLASTDB metadata/manifest JSON fetched from $url.
# Accepts either the legacy hash-style manifest (whose "version" key must
# match BLASTDB_MANIFEST_VERSION) or the newer array-of-records format;
# exits the program with an error message on any validation failure.
sub validate_metadata_file
{
    my ($json, $url) = @_;
    my $metadata = decode_json($json);
    my $type = ref($metadata);
    if ($type eq 'HASH') {
        my $version_ok = exists($$metadata{version})
            && $$metadata{version} eq BLASTDB_MANIFEST_VERSION;
        unless ($version_ok) {
            print STDERR "ERROR: Invalid version in manifest file $url, please report to blast-help\@ncbi.nlm.nih.gov\n";
            exit(EXIT_FAILURE);
        }
    } elsif ($type ne 'ARRAY') {
        print STDERR "ERROR: Invalid metadata format in $url, please report to blast-help\@ncbi.nlm.nih.gov\n";
        exit(EXIT_FAILURE);
    }
}

# Print the databases listed in a version-1.0 (hash-style) metadata file.
# Output format is controlled by the global $opt_showall: 'tsv', 'pretty',
# or anything else for a plain list of database names.
sub showall_from_metadata_file
{
    my ($json, $url) = @_;
    &validate_metadata_file($json, $url);
    my $metadata = decode_json($json);
    my $header_pending = 1;
    for my $dbname (sort keys %$metadata) {
        next if ($dbname =~ /^version$/);   # skip the format-version entry
        my $entry = $$metadata{$dbname};
        if ($opt_showall =~ /tsv/i) {
            printf("%s\t%s\t%9.4f\t%s\n", $dbname, $$entry{description},
                   $$entry{size}, $$entry{last_updated});
        } elsif ($opt_showall =~ /pretty/i) {
            if ($header_pending) {
                printf("%-60s %-120s %-11s %15s\n", "BLASTDB",
                       "DESCRIPTION", "SIZE (GB)", "LAST_UPDATED");
                $header_pending = 0;
            }
            printf("%-60s %-120s %9.4f %15s\n", $dbname, $$entry{description},
                   $$entry{size}, $$entry{last_updated});
        } else {
            print "$dbname\n";
        }
    }
}

# Display metadata from version 1.1 of BLASTDB metadata files
# Display metadata from version 1.1 of BLASTDB metadata files (an array of
# per-database records). Honors the same $opt_showall formats as
# showall_from_metadata_file: 'tsv', 'pretty', or plain database names.
sub showall_from_metadata_file_1_1($$)
{
    my ($json, $url) = @_;
    &validate_metadata_file($json, $url);
    my $metadata = decode_json($json);
    my $header_pending = 1;
    for my $entry (sort @$metadata) {
        next if ($$entry{version} ne BLASTDB_METADATA_VERSION);
        my $gb_total = sprintf("%.4f", $$entry{'bytes-total'} * 1e-9);
        my $last_updated = $$entry{'last-updated'};
        if ($opt_showall =~ /tsv/i) {
            printf("%s\t%s\t%9.4f\t%s\n", $$entry{dbname}, $$entry{description},
                   $gb_total, $last_updated);
        } elsif ($opt_showall =~ /pretty/i) {
            if ($header_pending) {
                printf("%-60s %-120s %-11s %15s\n", "BLASTDB",
                       "DESCRIPTION", "SIZE (GB)", "LAST_UPDATED");
                $header_pending = 0;
            }
            $last_updated =~ s/T.*//;   # keep only the date part of the timestamp
            printf("%-60s %-120s %9.4f %15s\n", $$entry{dbname}, $$entry{description},
                   $gb_total, $last_updated);
        } else {
            print "$$entry{dbname}\n";
        }
    }
}

# Extract the list of downloadable archive URLs from a version-1.1 metadata
# document, converting any ftp:// URL to https:// on the way out.
sub get_files_from_json_metadata_1_1($$)
{
    my ($json, $url) = @_;
    &validate_metadata_file($json, $url);
    my $metadata = decode_json($json);
    my @retval;
    for my $entry (sort @$metadata) {
        next if ($$entry{version} ne BLASTDB_METADATA_VERSION);
        for my $file (@{$$entry{files}}) {
            (my $fixed = $file) =~ s,ftp://,https://,;
            push @retval, $fixed;
        }
    }
    return @retval;
}

# Main driver: download from a cloud provider (GCP/AWS), or from NCBI over
# FTP (or HTTPS on macOS, where Net::FTP is not used).
if ($location ne "NCBI") {
    die "Only BLASTDB version 5 is supported at GCP and AWS\n" if (defined $opt_blastdb_ver and $opt_blastdb_ver != 5);
    my $latest_dir = &get_latest_dir($location);
    my ($json, $url) = &get_blastdb_metadata($location, $latest_dir);
    unless (length($json)) {
        print STDERR "ERROR: Missing manifest file $url, please report to blast-help\@ncbi.nlm.nih.gov\n";
        exit(EXIT_FAILURE);
    }
    print "Connected to $location\n" if $opt_verbose;
    print "Metadata source $url\n" if ($opt_verbose > 3);
    &validate_metadata_file($json, $url);
    my $metadata = decode_json($json);
    if (defined($opt_showall)) {
        &showall_from_metadata_file_1_1($json, $url);
    } else {
        &ensure_available_disk_space($json, $url);
        # Collect the file URLs for every requested database; abort if any
        # requested database is absent from the metadata
        my @files2download;
        for my $requested_db (@ARGV) {
            my $found = 0;
            foreach my $dbm (sort @$metadata) {
                if ($$dbm{'dbname'} eq $requested_db) {
                    push @files2download, @{$$dbm{files}};
                    $found = 1;
                    last;
                }
            }
            if (not $found) {
                print STDERR "ERROR: $requested_db does not exist in $location ($latest_dir)\n";
                my $exit_code = ($opt_legacy_exit_code == 1 ? LEGACY_EXIT_FAILURE : EXIT_FAILURE);
                exit $exit_code;
            }
        }
        if (@files2download) {
            my $gsutil = &get_gsutil_path();
            my $awscli = &get_awscli_path();
            my $cmd;
            my $fh = File::Temp->new();
            if ($location eq "GCP" and defined($gcp_prj)) {
                # Requester-pays download via gsutil, optionally parallelized
                $cmd = "$gsutil -u $gcp_prj ";
                if ($opt_nt > 1) {
                    $cmd .= "-m -q ";
                    $cmd .= "-o 'GSUtil:parallel_thread_count=1' -o 'GSUtil:parallel_process_count=$opt_nt' ";
                    $cmd .= "cp ";
                } else {
                    $cmd .= "-q cp ";
                }
                $cmd .= join(" ", @files2download) . " .";
            } elsif ($location eq "AWS" and defined ($awscli)) {
                # https://registry.opendata.aws/ncbi-blast-databases/#usageexamples
                my $aws_cmd = "$awscli s3 cp --no-sign-request ";
                $aws_cmd .= "--only-show-errors " unless $opt_verbose >= 3;
                print $fh join("\n", @files2download);
                $cmd = "/usr/bin/xargs -P $opt_nt -n 1 -I{}";
                $cmd .= " -t" if $opt_verbose > 3;
                $cmd .= " $aws_cmd {} .";
                $cmd .= " <$fh " ;
            } else { # fall back to curl
                # Rewrite gs:// and s3:// URIs into plain HTTP(S) URLs
                my $url = $location eq "AWS" ? AWS_URL : GCS_URL;
                s,gs://,$url/, foreach (@files2download);
                s,s3://,$url/, foreach (@files2download);
                if ($opt_nt > 1 and -f "/usr/bin/xargs") {
                    print $fh join("\n", @files2download);
                    $cmd = "/usr/bin/xargs -P $opt_nt -n 1";
                    $cmd .= " -t" if $opt_verbose > 3;
                    $cmd .= " $curl -sSOR";
                    $cmd .= " <$fh " ;
                } else {
                    $cmd = "$curl -sSR --remote-name-all ";
                    $cmd .= join(" " , @files2download);
                }
            }
            print "$cmd\n" if $opt_verbose > 3;
            unless (system($cmd) == 0) {
                print STDERR "Failed to run '$cmd': $!!\n";
                # BUGFIX: this code runs at file scope, not inside a sub, so
                # the previous 'return EXIT_FAILURE;' was a fatal "Can't
                # return outside a subroutine" error; exit instead.
                exit(EXIT_FAILURE);
            }
        }
    }

} else {
    # Connect and download files
    $ftp = &connect_to_ftp();
    my ($json, $url) = &get_blastdb_metadata($location, '', $ftp);
    unless (length($json)) {
        print STDERR "ERROR: Missing manifest file $url, please report to blast-help\@ncbi.nlm.nih.gov\n";
        exit(EXIT_FAILURE);
    }
    print "Metadata source $url\n" if ($opt_verbose > 3);

    if (defined $opt_showall) {
        &showall_from_metadata_file_1_1($json, $url)
    } else {
        &ensure_available_disk_space($json, $url);
        my @files = sort(&get_files_to_download($ftp, $json, $url));
        my @files2decompress;
        $exit_code = &download(\@files, \@files2decompress);
        if ($exit_code == 1) {
            foreach (@files2decompress) {
                $exit_code = &decompress($_);
                last if ($exit_code != 1);
            }
        }
        unless ($opt_legacy_exit_code) {
            # Map the "successful downloads" code (1) to 0 for modern callers
            $exit_code = ($exit_code == 1 ? 0 : $exit_code);
        }
    }
    $ftp->quit() unless not defined($ftp);
}

exit($exit_code);

# Connects to NCBI ftp server
# Connects to the NCBI FTP server, logs in anonymously and changes into the
# BLAST database directory (or its version-specific subdirectory).
# Returns the Net::FTP handle, or undef on macOS where Net::FTP is not used.
sub connect_to_ftp
{
    return undef if ($^O =~ /darwin/);  # Net::FTP appears unreliable on Mac
    my %ftp_opts;
    $ftp_opts{'Passive'} = 1 if $opt_passive;
    $ftp_opts{'Timeout'} = $opt_timeout if ($opt_timeout >= 0);
    $ftp_opts{'Debug'}   = 1 if ($opt_verbose > 1);
    # BUGFIX: Net::FTP->new reports failures via $@ (not $!); subsequent
    # command failures are described by $ftp->message
    my $ftp = Net::FTP->new(NCBI_FTP, %ftp_opts)
        or die "Failed to connect to " . NCBI_FTP . ": $@\n";
    $ftp->login(USER, PASSWORD) 
        or die "Failed to login to " . NCBI_FTP . ": " . $ftp->message . "\n";
    my $ftp_path = BLAST_DB_DIR;
    $ftp_path .= "/v$opt_blastdb_ver" if (defined $opt_blastdb_ver);
    # Fail early if the directory change does not succeed; otherwise all
    # subsequent listings/downloads would fail confusingly
    $ftp->cwd($ftp_path)
        or die "Failed to change directory to $ftp_path: " . $ftp->message . "\n";
    $ftp->binary();
    if ($opt_verbose) {
        if (defined $opt_blastdb_ver) {
            print "Connected to $location; downloading BLASTDBv$opt_blastdb_ver\n";
        } else {
            print "Connected to $location\n";
        }
    }
    return $ftp;
}

# Gets the list of available databases on NCBI FTP site
# Gets the de-duplicated list of database names available on the NCBI FTP
# site, derived from the *.tar.gz archives in the current FTP directory.
sub get_available_databases
{
    my %seen;
    my @retval;
    for my $file ($ftp->ls()) {
        next unless ($file =~ /\.tar\.gz$/);
        my $db_name = &extract_db_name($file);
        push @retval, $db_name unless ($seen{$db_name}++);
    }
    return @retval;
}

# Returns the last modified date for a file on the NCBI FTP or the number of
# seconds since epoch at the time of invocation of this function
# Returns the last modified date for a file on the NCBI FTP or the number of
# seconds since epoch at the time of invocation of this function (i.e.: when
# the date cannot be determined, the file is treated as just-updated).
sub get_last_modified_date_from_ncbi_ftp
{
    my $file = shift;
    my $retval = time();
    if (defined($ftp)) {
        $retval = $ftp->mdtm($file);
    } else {
        # No FTP connection (e.g.: macOS): query the HTTPS server via curl
        use Time::Local;
        # 0-based month numbers as expected by Time::Local::timegm.
        # BUGFIX: Aug previously mapped to 8 (colliding with Sep); the
        # correct 0-based index for August is 7.
        my %month2int = (Jan=>0, Feb=>1, Mar=>2, Apr=>3, May=>4, Jun=>5,
            Jul=>6, Aug=>7, Sep=>8, Oct=>9, Nov=>10, Dec=>11);
        my $cmd = "$curl --user " . USER . ":" . PASSWORD . " -sI $file | grep ^Last-Modified";
        chomp(my $date_str = `$cmd`);
        if ($date_str =~ /Last-Modified:\s+\w+, (\d+) (\w+) (\d+) (\d+):(\d+):(\d+) GMT/) {
            # Sample output: Wed, 07 Dec 2022 10:37:08 GMT
            my $mday = int($1);
            my $mon = $month2int{$2};
            my $year = int($3) - 1900;  # timegm also accepts year-1900 offsets
            my $hour = int($4);
            my $min = int($5);
            my $sec = int($6);
            $retval = Time::Local::timegm($sec, $min, $hour, $mday, $mon, $year);
            print "$file $retval\n" if DEBUG;
        }
    }
    return $retval;
}

# This function exits the program if not enough disk space is available for the
# selected BLASTDBs
# This function exits the program if not enough disk space is available for
# the BLASTDBs selected on the command line.
sub ensure_available_disk_space
{
    my ($json, $url) = @_;
    my $space_available = &get_available_disk_space;
    print "Available disk space in bytes: $space_available\n" if ($opt_verbose > 3);
    # 0 means "could not determine"; skip the check rather than refuse
    return unless $space_available;

    my $space_needed = 0;
    for my $requested_db (@ARGV) {
        my $bytes = &get_database_size_from_metadata_1_1($requested_db, $json, $url);
        $space_needed += $bytes if ($bytes > 0);
    }
    print "Needed disk space in bytes: $space_needed\n" if ($opt_verbose > 3);
    return if ($space_needed <= $space_available);

    my $msg = "ERROR: Need $space_needed bytes and only ";
    $msg .= "$space_available bytes are available\n";
    print STDERR $msg;
    exit($opt_legacy_exit_code == 1 ? LEGACY_EXIT_FAILURE : EXIT_FAILURE);
}

# Returns the available disk space in bytes of the current working directory
# Not supported in windows
# Returns the available disk space in bytes for the current working
# directory, or 0 when it cannot be determined (always 0 on Windows, where
# df is not available).
sub get_available_disk_space
{
    return 0 if ($^O =~ /mswin/i);

    my $BLK_SIZE = 512;
    # GNU df and BSD (macOS) df spell the 512-byte-block option differently
    my $cmd = ($^O =~ /darwin/i)
        ? "df -P -b ."
        : "df -P --block-size $BLK_SIZE .";
    my $retval = 0;
    for my $line (`$cmd 2>/dev/null`) {
        chomp $line;
        next if ($line =~ /^Filesystem/);   # skip the header row
        my @fields = split(' ', $line);
        # POSIX df output has 6 columns; the 4th is the available block count
        $retval = $fields[3] * $BLK_SIZE if (scalar(@fields) == 6);
    }
    print STDERR "WARNING: unable to compute available disk space\n" unless $retval;
    return $retval;
}

# Looks up the total size in bytes of the named database in a version-1.1
# metadata document. Returns -1 (and prints a warning) when not found.
sub get_database_size_from_metadata_1_1
{
    my ($db, $json, $url) = @_;
    &validate_metadata_file($json, $url);
    my $metadata = decode_json($json);
    my $retval = -1;
    for my $entry (sort @$metadata) {
        next unless ($$entry{'dbname'} eq $db);
        $retval = $$entry{'bytes-total'};
        last;
    }
    print STDERR "Warning: No BLASTDB metadata for $db\n" if ($retval == -1);
    return $retval;
}

# Obtains the list of files to download
sub get_files_to_download($$$)
{
    my $ftp = shift;
    my $json = shift;
    my $url = shift;
    my @blast_db_files = (defined($ftp) 
        ? $ftp->ls() 
        : &get_files_from_json_metadata_1_1($json, $url));
    my @retval = ();

    if ($opt_verbose > 2) {
        print "Found the following files on ftp site:\n";
        print "$_\n" for (@blast_db_files);
    }

    for my $requested_db (@ARGV) {
        for my $file (@blast_db_files) {
            next unless ($file =~ /\.tar\.gz$/);    
            if (defined($ftp) and $file =~ /^$requested_db\..*/) {
                push @retval, $file;
            } elsif ($file =~ /\/$requested_db\..*/) {
                # for Mac, which no longer relies on Net::FTP
                push @retval, $file;
            }
        }
    }

    if ($opt_verbose) {
        for my $requested_db (@ARGV) {
            unless (grep(/$requested_db/, @retval)) {
                print STDERR "$requested_db not found, skipping.\n" 
            }
        }
    }

    return @retval;
}

# Download the requested files only if their checksum files are missing or if
# these (or the archives) are newer in the FTP site. Returns 0 if no files were
# downloaded, 1 if at least one file was downloaded (so that this can be the
# application's exit code)
# Download the requested files only if their checksum files are missing or if
# these (or the archives) are newer in the FTP site. Returns 0 if no files were
# downloaded, 1 if at least one file was downloaded (so that this can be the
# application's exit code), or EXIT_FAILURE after repeated corrupt downloads.
# Arguments: arrayref of files to download, arrayref that receives the files
# to decompress (when --decompress is in effect).
sub download($$)
{
    my @requested_dbs = @ARGV;
    my @files2download = @{$_[0]};
    my $files2decompress = $_[1];
    my $retval = 0;

    for my $file (@files2download) {

        my $attempts = 0;   # Download attempts for this file
        # Announce a multi-volume database once, when its first volume is seen
        if ($opt_verbose and &is_multivolume_db($file) and $file =~ /\.00\./) {
            my $db_name = &extract_db_name($file);
            my $nvol = &get_num_volumes($db_name, @files2download);
            print "Downloading $db_name (" . $nvol . " volumes) ...\n" unless ($opt_quiet);
        }

        # We preserve the checksum files as evidence of the downloaded archive
        my $checksum_file = "$file.md5";
        my $rmt_checksum_file_mtime = &get_last_modified_date_from_ncbi_ftp($checksum_file);
        my $rmt_file_mtime = &get_last_modified_date_from_ncbi_ftp($file);
        my $lcl_checksum_file_mtime = (-e &rm_protocol($checksum_file)
            ? stat(&rm_protocol($checksum_file))->mtime : 0);
        print "RMT checksum file mtime $rmt_checksum_file_mtime\n" if DEBUG;
        print "LCL checksum file mtime $lcl_checksum_file_mtime\n" if DEBUG;
        my $update_available = ($lcl_checksum_file_mtime < $rmt_checksum_file_mtime);
        # BUGFIX: strip the protocol prefix before the -e test (as is already
        # done for the stat call) so this also works when $file is an HTTPS URL
        if (-e &rm_protocol($file) and (stat(&rm_protocol($file))->mtime < $rmt_file_mtime)) {
            $update_available = 1;
        }

download_file:
        if ($opt_force_download or $update_available) {
            print "Downloading $file..." if $opt_verbose;
            if (defined($ftp)) {
                # Download errors will be checked later when reading checksum files
                $ftp->get($file);
                $ftp->get($checksum_file);
            } else {
                my $cmd = "$curl --user " . USER . ":" . PASSWORD . " -sSR ";
                $cmd .= "--remote-name-all $file $file.md5";
                print "$cmd\n" if $opt_verbose > 3;
                system($cmd);
            }
            my $rmt_digest = &read_md5_file($checksum_file);
            my $lcl_digest = &compute_md5_checksum($file);
            print "\nRMT $file Digest $rmt_digest" if (DEBUG);
            print "\nLCL $file Digest $lcl_digest\n" if (DEBUG);
            if ($lcl_digest ne $rmt_digest) {
                # BUGFIX: this used to call &trim_ftp_prefix, which is not
                # defined anywhere in this script (a runtime error on the
                # first corrupt download); &rm_protocol is the intended helper
                unlink &rm_protocol($file), &rm_protocol($checksum_file);
                if (++$attempts >= MAX_DOWNLOAD_ATTEMPTS) {
                    print STDERR "too many failures, aborting download!\n";
                    return EXIT_FAILURE;
                } else {
                    print "corrupt download, trying again.\n";
                    goto download_file;
                }
            }
            push @$files2decompress, $file if ($opt_decompress);
            print " [OK]\n" if $opt_verbose;
            $retval = 1 if ($retval == 0);
        } else {
            if ($opt_decompress and -f $file) {
                push @$files2decompress, $file;
                $retval = 1;
            } else {
                my $msg = ($opt_decompress 
                           ? "The contents of $file are up to date in your system." 
                           : "$file is up to date.");
                print "$msg\n" if $opt_verbose;
            }
        }
    }
    return $retval;
}

# Try to decompress using /bin/tar as Archive::Tar is known to be slower (as
# it's pure perl)
# Try to decompress using /bin/tar as Archive::Tar is known to be slower (as
# it's pure perl). Returns a true value on success, false otherwise.
sub _decompress_impl($)
{
    my $file = shift;
    if ($^O eq "cygwin") {
        local $ENV{PATH} = "/bin:/usr/bin";
        return 1 if (system("tar -zxf $file 2>/dev/null") == 0);
    }
    unless ($^O =~ /mswin/i) {
        local $ENV{PATH} = "/bin:/usr/bin";
        my $cmd = "gzip -cd $file 2>/dev/null | tar xf - 2>/dev/null";
        print "$cmd\n" if $opt_verbose > 3;
        return 1 if (system($cmd) == 0);
    }
    # Portable (but slow) pure-perl fallback
    return Archive::Tar->extract_archive($file, 1);
}

# Decompresses the file passed as its argument
# Returns 1 on success, and 2 on failure, printing an error to STDERR
sub decompress($)
{
    my $file = shift;
    $file = &rm_protocol($file);
    print "Decompressing $file ..." unless ($opt_quiet);
    my $succeeded = &_decompress_impl($file);
    unless ($succeeded) {
        my $msg = "Failed to decompress $file ($Archive::Tar::error), ";
        $msg .= "please do so manually.";
        print STDERR "$msg\n";
        return EXIT_FAILURE;
    }
    print "rm $file\n" if $opt_verbose > 3;
    unlink $file;   # Clean up archive, but preserve the checksum file
    print " [OK]\n" unless ($opt_quiet);
    return 1;
}

# Computes the MD5 hex digest of the (local copy of the) given file.
# Returns the string "N/A" if the file cannot be opened.
sub compute_md5_checksum($)
{
    my $file = shift;
    my $digest = "N/A";
    $file = &rm_protocol($file);
    # Lexical filehandle + 3-argument open (the bareword/2-arg form is
    # discouraged); binary mode so the digest matches the remote one
    if (open(my $fh, '<', $file)) {
        binmode($fh);
        $digest = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);
    }
    return $digest;
}

# Removes the protocol prefix from the file name passed in
# Removes the protocol prefix (https://<NCBI host><BLASTDB dir>/) from the
# file name passed in, yielding the local file name; non-HTTPS names are
# returned unchanged.
sub rm_protocol($)
{
    my $retval = shift;
    if ($retval =~ /^https:/) {
        my $prefix = "https://" . NCBI_FTP . BLAST_DB_DIR . "/";
        # \Q..\E escapes the regex metacharacters in the prefix ('.', '/');
        # anchored at the start, which the guard above guarantees
        $retval =~ s/^\Q$prefix\E//;
    }
    return $retval;
}

# Reads the MD5 digest (the first whitespace-separated token on the first
# line) from the given checksum file. Returns the empty string when the file
# is missing, unreadable or empty.
sub read_md5_file($)
{
    my $md5file = shift;
    $md5file = &rm_protocol($md5file);
    return '' unless (-f $md5file);
    # Lexical filehandle + 3-argument open; treat an unreadable file the same
    # as a missing one
    open(my $fh, '<', $md5file) or return '';
    my $line = <$fh>;
    close($fh);
    return '' unless (defined $line);
    my @fields = split(' ', $line);
    return $fields[0];
}

# Determine if a given pre-formatted BLAST database file is part of a
# multi-volume database
sub is_multivolume_db
{
    my $file = shift;
    return 1 if ($file =~ /\.\d{2,3}\.tar\.gz$/);
    return 0;
}

# Extracts the database name from the pre-formatted BLAST database archive file
# name
sub extract_db_name
{
    my $file = shift;
    my $retval = "";
    if (&is_multivolume_db($file)) {
        $retval = $1 if ($file =~ m/(.*)\.\d{2,3}\.tar\.gz$/);
    } else {
        $retval = $1 if ($file =~ m/(.*)\.tar\.gz$/);
    }
    return $retval;
}

# Returns the number of volumes for a BLAST database given the file name of a
# pre-formatted BLAST database and the list of all databases to download
sub get_num_volumes
{
    my $db = shift;
    my $retval = 0;
    foreach (@_) {
        if (/$db/) {
            if (/.*\.(\d{2,3})\.tar\.gz$/) {
                $retval = int($1) if (int($1) > $retval);
            }
        }
    }
    return $retval + 1;
}

# Retrieves the name of the 'subdirectory' where the latest BLASTDBs reside
sub get_latest_dir
{
    my $source = shift;
    my ($retval, $url, $cmd);
    if ($source eq "AWS") {
        $url = AWS_URL . "/" . AWS_BUCKET . "/latest-dir";
        $cmd = "$curl -s $url";
    } else {
        if (defined($gcp_prj)) {
            $url = 'gs://' . GCP_BUCKET . "/latest-dir";
            $cmd = "$gsutil -u $gcp_prj cat $url";
        } else {
            $url = GCS_URL . "/" . GCP_BUCKET . "/latest-dir";
            $cmd = "$curl -s $url";
        }
    }
    print "$cmd\n" if DEBUG;
    chomp($retval = `$cmd`);
    unless (length($retval)) {
        print STDERR "ERROR: Missing file $url, please try again or report to blast-help\@ncbi.nlm.nih.gov\n";
        exit(EXIT_FAILURE);
    }
    print "$source latest-dir: '$retval'\n" if DEBUG;
    return $retval;
}

# Fetches the JSON text containing the BLASTDB metadata
sub get_blastdb_metadata
{
    my $source = shift;
    my $latest_dir = shift;
    my $ftp = shift;
    my ($url, $cmd);
    my $retval;

    if ($source eq "AWS") {
        $url = AWS_URL . "/" . AWS_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
        $cmd = "curl -sf $url";
    } elsif ($source eq "GCP") {
        if (defined($gcp_prj)) {
            $url = 'gs://' . GCP_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
            $cmd = "$gsutil -u $gcp_prj cat $url";
        } else {
            $url = GCS_URL . "/" . GCP_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
            $cmd = "curl -sf $url";
        }
    } else {
        $url = 'https://' . NCBI_FTP . "/blast/db/" . BLASTDB_METADATA;
        $cmd = "curl -sf $url";
    }
    if (defined $ftp) {
        my $tmpfh = File::Temp->new();
        print "Downloading " . BLASTDB_METADATA . " to $tmpfh\n" if ($opt_verbose > 3);
        $ftp->get(BLASTDB_METADATA, $tmpfh);
        $tmpfh->seek(0, 0);
        $retval = do {
            local $/ = undef;
            <$tmpfh>;
        };
    } else {
        print "$cmd\n" if DEBUG;
        chomp($retval = `$cmd`);
    }
    return ($retval, $url);
}

# Returns the path to the gsutil utility or undef if it is not found
# Returns the path to the gsutil utility or undef if it is not found
# (always undef on Windows).
sub get_gsutil_path
{
    return undef if ($^O =~ /mswin/i);
    for my $dir (qw(/google/google-cloud-sdk/bin /usr/local/bin /usr/bin /snap/bin)) {
        return "$dir/gsutil" if (-f "$dir/gsutil");
    }
    # Fall back to whatever is on the PATH
    chomp(my $found = `which gsutil`);
    return (-f $found) ? $found : undef;
}

# Returns the path to the gcloud utility or undef if it is not found
# (always undef on Windows).
sub get_gcloud_path
{
    return undef if ($^O =~ /mswin/i);
    for my $dir (qw(/google/google-cloud-sdk/bin /usr/local/bin /usr/bin /snap/bin)) {
        return "$dir/gcloud" if (-f "$dir/gcloud");
    }
    # Fall back to whatever is on the PATH
    chomp(my $found = `which gcloud`);
    return (-f $found) ? $found : undef;
}

# Returns the GCP project configured in gcloud, or undef on Windows.
# NOTE: currently unused (its caller near the top of the script is commented
# out); assumes gcloud is installed — get_gcloud_path may return undef.
sub get_gcp_project
{
    return undef if ($^O =~ /mswin/i);
    my $gcloud = &get_gcloud_path();
    chomp(my $project = `$gcloud config get-value project`);
    return $project;
}

# Returns the path to the aws CLI utility or undef if it is not found
sub get_awscli_path
{
    return undef if ($^O =~ /mswin/i);
    foreach (qw(/usr/local/bin /usr/bin)) {
        my $path = "$_/aws";
        return $path if (-f $path);
    }
    chomp(my $retval = `which aws`);
    return $retval if (-f $retval);
    return undef;
}

# Returns the number of cores, or 1 if unknown
sub get_num_cores
{
    my $retval = 1;
    if ($^O =~ /linux/i) {
        open my $fh, "/proc/cpuinfo" or return $retval;
        $retval = scalar(map /^processor/, <$fh>);
        close($fh);
    } elsif ($^O =~ /darwin/i) {
        chomp($retval = `/usr/sbin/sysctl -n hw.ncpu`);
    }
    return $retval;
}

# Returns the path to the curl utility, or undef if it is not found
sub get_curl_path
{
    foreach (qw(/usr/local/bin /usr/bin)) {
        my $path = "$_/curl";
        return $path if (-f $path);
    }
    if ($^O =~ /mswin/i) {
        my $retval = `where curl`;
        if (defined $retval) {
            chomp($retval);
            return $retval if (-f $retval);
        }
    }
    return undef;
}

__END__

=head1 NAME

B<update_blastdb.pl> - Download pre-formatted BLAST databases

=head1 SYNOPSIS

update_blastdb.pl [options] blastdb ...

=head1 OPTIONS

=over 2

=item B<--source>

Location to download BLAST databases from (default: auto-detect closest location).
Supported values: ncbi, aws, or gcp.

=item B<--decompress>

Downloads, decompresses the archives in the current working directory, and
deletes the downloaded archive to save disk space, while preserving the
archive checksum files (default: false).

=item B<--showall>

Show all available pre-formatted BLAST databases (default: false). The output
of this option lists the database names which should be used when
requesting downloads or updates using this script.

It accepts the optional arguments: 'tsv' and 'pretty' to produce tab-separated values
and a human-readable format respectively. These parameters elicit the display of
additional metadata if this is available to the program.
This metadata is displayed in columnar format; the columns represent:

name, description, size in gigabytes, date of last update (YYYY-MM-DD format).

=item B<--blastdb_version>

Specify which BLAST database version to download (default: 5).
Supported values: 4, 5

=item B<--passive>

Use passive FTP, useful when behind a firewall or working in the cloud (default: true).
To disable passive FTP, configure this option as follows: --passive no

=item B<--timeout>

Timeout on connection to NCBI (default: 120 seconds).

=item B<--force>

Force download even if there is an archive already in the local directory
(default: false).

=item B<--verbose>

Increment verbosity level (default: 1). Repeat this option multiple times to
increase the verbosity level; levels above 3 enable additional diagnostic output.

=item B<--quiet>

Produce no output (default: false). Overrides the B<--verbose> option.

=item B<--version>

Prints this script's version. Overrides all other options.

=item B<--num_threads>

Sets the number of threads to utilize to perform downloads in parallel when data comes from the cloud.
Defaults to use all available CPUs on the machine (Linux and macOS only).

=item B<--legacy_exit_code>

Enables exit codes from prior to version 581818 (the BLAST+ 2.10.0 release), for
downloads from NCBI only. This option is meant to be used by legacy applications that rely
on these exit codes:
0 for successful operations that result in no downloads, 1 for successful
downloads, and 2 for errors.

=back

=head1 DESCRIPTION

This script will download the pre-formatted BLAST databases requested in the
command line from the NCBI ftp site.

=head1 EXIT CODES

This script returns 0 on successful operations and non-zero on errors.

=head1 DEPENDENCIES

This script depends on curl for retrieval from cloud service providers.

=head1 BUGS

Please report them to <blast-help@ncbi.nlm.nih.gov>

=head1 COPYRIGHT

See PUBLIC DOMAIN NOTICE included at the top of this script.

=cut
