#!/usr/bin/env perl # # Copyright 2011, Ben Langmead # # This file is part of Bowtie 2. # # Bowtie 2 is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Bowtie 2 is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Bowtie 2. If not, see . # # bowtie2: # # A wrapper script for bowtie2. Provides various advantages over running # bowtie2 directly, including: # # 1. Handling compressed inputs # 2. Redirecting output to various files # 3. Output directly to bam (not currently supported) use strict; use warnings; use Getopt::Long qw(GetOptions); use File::Spec; use POSIX; use Sys::Hostname; my $host = hostname(); my ($vol,$script_path,$prog); $prog = File::Spec->rel2abs( __FILE__ ); while (-f $prog && -l $prog){ my (undef, $dir, undef) = File::Spec->splitpath($prog); $prog = File::Spec->rel2abs(readlink($prog), $dir); } ($vol,$script_path,$prog) = File::Spec->splitpath($prog); my $os_is_nix = $^O ne "MSWin32"; my $align_bin_s = $os_is_nix ? 'bowtie2-align-s' : 'bowtie2-align-s.exe'; my $build_bin = $os_is_nix ? 'bowtie2-build' : 'bowtie2-build.exe'; my $align_bin_l = $os_is_nix ? 'bowtie2-align-l' : 'bowtie2-align-l.exe'; my $align_prog_s= File::Spec->catpath($vol,$script_path,$align_bin_s); my $align_prog_l= File::Spec->catpath($vol,$script_path,$align_bin_l); my $align_prog = $align_prog_s; my $idx_ext_l = 'bt2l'; my $idx_ext_s = 'bt2'; my $idx_ext = $idx_ext_s; my %signo = (); my @signame = (); sub quote_params { my %params_2_quote = ('-S' => 1, '-U' => 1, '-1' => 1, '-2' => 1, '-x' => 1, '-b' => 1, '--interleaved' => 1, '--rg' => 1,'--rg-id' => 1, '--tab5' => 1, '--tab6' => 1, '--sam-acc' => 1, '--bam' => 1 ); my $param_list = shift; my $quoting = 0; for (my $i=0; $i[$i] = "\"".$param_list->[$i]."\""; next; } $quoting = 1 if(exists($params_2_quote{$param_list->[$i]})); } } { # Get signal info use Config; my $i = 0; for my $name (split(' ', $Config{sig_name})) { $signo{$name} = $i; $signame[$i] = $name; $i++; } } (-x "$align_prog") || Fail("Expected bowtie2 to be in same directory with bowtie2-align:\n$script_path\n"); # Get description of arguments from Bowtie 2 so that we can distinguish Bowtie # 2 args from wrapper args sub getBt2Desc($) { my $d = shift; my $cmd = "\"$align_prog\" --wrapper basic-0 --arg-desc"; open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n"); while(readline $fh) { chomp; next if /^\s*$/; my @ts = split(/\t/); $d->{$ts[0]} = $ts[1]; } close($fh); $? == 0 || Fail("Description of arguments failed!\n"); } my %desc = (); my %wrapped = ("1" => 1, "2" => 1); getBt2Desc(\%desc); # Given an option like -1, determine whether it's wrapped (i.e. should be # handled by this script rather than being passed along to Bowtie 2) sub isWrapped($) { return defined($wrapped{$_[0]}); } my $debug = 0; my $sanitized = 0; my %read_fns = (); my %read_compress = (); my $cap_out = undef; # Filename for passthrough my $no_unal = 0; my $large_idx = 0; sub handle_un_or_al { my ($opt, $value) = @_; my ($name, $type_of_compression) = $opt =~ /((?:al|un)(?:-conc|-mates)?)(?:-(gz|bz2|lz4))?/; $read_fns{$name} = $value; $read_compress{$name} = ""; $read_compress{$name} = "gzip" if defined $type_of_compression && $type_of_compression eq "gz"; $read_compress{$name} = "bzip2" if defined $type_of_compression && $type_of_compression eq "bz2"; $read_compress{$name} = "lz4" if defined $type_of_compression && $type_of_compression eq "lz4"; } my @unps = (); my @mate1s = (); my @mate2s = (); my @tab5_mates = (); my @tab6_mates = (); my @interleaved_mates = (); my @bam_files = (); my @to_delete = (); my @to_kill = (); my $temp_dir = "/tmp"; my $bam_out = 0; my $ref_str = undef; my $no_pipes = 0; my $keep = 0; my $verbose = 0; my $readpipe = undef; my $log_fName = undef; my $help = 0; # Travis' MacOS image does not like bundling_values eval { Getopt::Long::Configure("pass_through", "no_ignore_case", "no_auto_abbrev", "bundling_values"); 1; } or do { Getopt::Long::Configure("pass_through", "no_ignore_case", "no_auto_abbrev"); }; GetOptions( "1=s{1,}" => \@mate1s, "2=s{1,}" => \@mate2s, "reads|U=s{1,}" => \@unps, "tab5=s{1,}" => \@tab5_mates, "tab6=s{1,}" => \@tab6_mates, "interleaved=s{1,}" => \@interleaved_mates, "b=s{1,}" => \@bam_files, "temp-directory=s" => \$temp_dir, "bam" => \$bam_out, "no-named-pipes" => \$no_pipes, "ref-string|reference-string=s" => \$ref_str, "keep" => \$keep, "verbose" => \$verbose, "debug" => \$debug, "sanitized" => \$sanitized, "large-index" => \$large_idx, "no-unal" => \$no_unal, "un=s" => \&handle_un_or_al, "un-gz=s" => \&handle_un_or_al, "un-bz2=s" => \&handle_un_or_al, "un-lz4=s" => \&handle_un_or_al, "al=s" => \&handle_un_or_al, "al-gz=s" => \&handle_un_or_al, "al-bz2=s" => \&handle_un_or_al, "al-lz4=s" => \&handle_un_or_al, "un-conc=s" => \&handle_un_or_al, "un-conc-gz=s" => \&handle_un_or_al, "un-conc-bz2=s" => \&handle_un_or_al, "un-conc-lz4=s" => \&handle_un_or_al, "al-conc=s" => \&handle_un_or_al, "al-conc-gz=s" => \&handle_un_or_al, "al-conc-bz2=s" => \&handle_un_or_al, "al-conc-lz4=s" => \&handle_un_or_al, "un-mates=s" => \&handle_un_or_al, "un-mates-gz=s" => \&handle_un_or_al, "un-mates-bz2=s" => \&handle_un_or_al, "un-mates-lz4=s" => \&handle_un_or_al, "log-file=s" => \$log_fName, ); my @bt2_args = @ARGV; if ($verbose == 1) { push @bt2_args, "--verbose"; } # If the user asked us to redirect some reads to files, or to suppress # unaligned reads, then we need to capture the output from Bowtie 2 and pass it # through this wrapper. my $passthru = 0; if(scalar(keys %read_fns) > 0 || $no_unal) { $passthru = 1; push @bt2_args, "--passthrough"; $cap_out = "-"; for(my $i = 0; $i < scalar(@bt2_args); $i++) { next unless defined($bt2_args[$i]); my $arg = $bt2_args[$i]; if($arg eq "-S" || $arg eq "--output") { $i < scalar(@bt2_args)-1 || Fail("-S/--output takes an argument.\n"); $cap_out = $bt2_args[$i+1]; $bt2_args[$i] = undef; $bt2_args[$i+1] = undef; } } @bt2_args = grep defined, @bt2_args; } my $old_stderr; if ($log_fName) { open($old_stderr, ">&STDERR") or Fail("Cannot dup STDERR!\n"); open(STDERR, ">", $log_fName) or Fail("Cannot redirect to log file $log_fName.\n"); } sub cat_file($$) { my ($ifn, $ofh) = @_; my $ifh = undef; if($ifn =~ /\.gz$/) { open($ifh, "gzip -dc \"$ifn\" |") || Fail("Could not open gzipped read file: $ifn \n"); } elsif($ifn =~ /\.bz2/) { open($ifh, "bzip2 -dc \"$ifn\" |") || Fail("Could not open bzip2ed read file: $ifn \n"); } elsif($ifn =~ /\.lz4/) { open($ifh, "lz4 -dc \"$ifn\" |") || Fail("Could not open lz4ed read file: $ifn \n"); } else { open($ifh, $ifn) || Fail("Could not open read file: $ifn \n"); } while(readline $ifh) { print {$ofh} $_; } close($ifh); } sub write_files { my ($input_files, $output_file, $no_pipes) = @_; if (!$no_pipes) { mkfifo($output_file, 0700) || Fail("mkfifo($output_file) failed.\n"); } my $pid = 0; unless ($no_pipes) { $pid = fork(); push @to_kill, $pid if $pid; } if ($pid == 0) { open(my $ofh, ">$output_file") || Fail("Can't open '$output_file' for writing.\n"); for my $fn (@$input_files) { cat_file($fn, $ofh); close($ofh); exit 0 unless $no_pipes; } } } sub maybe_wrap_input { my ($orig_m1s, $orig_m2s, $ext) = @_; my @m1s = (); my @m2s = (); my @wrapped_m1s = (); my @wrapped_m2s = (); if (scalar(@$orig_m2s) == 0) { my $fn = "$temp_dir/${host}_$$." . $ext; for (my $i = 0; $i < scalar(@$orig_m1s); $i++) { if ($orig_m1s->[$i] !~ /\.bz2$/ && $orig_m1s->[$i] !~ /\.lz4$/) { push @m1s, $orig_m1s->[$i]; } else { push @wrapped_m1s, $orig_m1s->[$i]; } } if (scalar(@wrapped_m1s) > 0) { push @m1s, $fn; push @to_delete, $fn; write_files(\@wrapped_m1s, $fn, $no_pipes); } } else { my $fn1 = "$temp_dir/${host}_$$" . $ext . "1"; my $fn2 = "$temp_dir/${host}_$$" . $ext . "2"; for (my $i = 0; $i < scalar(@$orig_m1s); $i++) { if ($orig_m1s->[$i] !~ /\.bz2$/ && $orig_m1s->[$i] !~ /\.lz4$/ && $orig_m2s->[$i] !~ /\.bz2$/ && $orig_m2s->[$i] !~ /\.lz4$/) { push @m1s, $orig_m1s->[$i]; push @m2s, $orig_m2s->[$i]; } else { push @wrapped_m1s, $orig_m1s->[$i]; push @wrapped_m2s, $orig_m2s->[$i]; } } if (scalar(@wrapped_m1s) > 0) { push @m1s, $fn1; push @to_delete, $fn1; write_files(\@wrapped_m1s, $fn1, $no_pipes); } if (scalar(@wrapped_m2s) > 0) { push @m2s, $fn2; push @to_delete, $fn2; write_files(\@wrapped_m2s, $fn2, $no_pipes); } } @$orig_m1s = @m1s; @$orig_m2s = @m2s; } sub Info { if ($verbose) { print STDERR "(INFO): " ,@_; } } sub Error { my @msg = @_; $msg[0] = "(ERR): ".$msg[0]; printf STDERR @msg; } sub Fail { Error(@_); die("Exiting now ...\n"); } sub Extract_IndexName_From { my $index_opt = $ref_str ? '--index' : '-x'; for (my $i=0; $i<@_; $i++) { if ($_[$i] eq $index_opt){ my $idx_basename = $_[$i+1]; my @idx_filenames = glob($idx_basename . "*.bt2{,l}"); unless (@idx_filenames) { if (exists $ENV{"BOWTIE2_INDEXES"}) { @idx_filenames = glob("$ENV{'BOWTIE2_INDEXES'}/$idx_basename" . "*.bt2{,l}"); $idx_basename = $ENV{"BOWTIE2_INDEXES"} . "/" . $idx_basename; } if (!@idx_filenames) { Fail("\"" . $idx_basename . "\" does not exist or is not a Bowtie 2 index\n"); } } return $idx_basename; } } Info("Cannot find any index option (--reference-string, --ref-string or -x) in the given command line.\n"); } if(scalar(@mate2s) > 0) { # Just pass all the mate arguments along to the binary maybe_wrap_input(\@mate1s, \@mate2s, "mate"); push @bt2_args, ("-1", join(",", @mate1s)); push @bt2_args, ("-2", join(",", @mate2s)); } if(scalar(@unps) > 0) { maybe_wrap_input(\@unps, [], "unps"); push @bt2_args, ("-U", join(",", @unps)); } if (scalar(@tab5_mates) > 0) { maybe_wrap_input(\@tab5_mates, [], "tab5"); push @bt2_args, ("--tab5", join(",", @tab5_mates)); } if (scalar(@tab6_mates) > 0) { maybe_wrap_input(\@tab6_mates, [], "tab6"); push @bt2_args, ("--tab6", join(",", @tab6_mates)); } if (scalar(@interleaved_mates) > 0) { maybe_wrap_input(\@interleaved_mates, [], "int"); push @bt2_args, ("--interleaved", join(",", @interleaved_mates)); } if (scalar(@bam_files) > 0) { maybe_wrap_input(\@bam_files, [], "bam"); push @bt2_args, ("-b", join(",", @bam_files)); } if(defined($ref_str)) { my $ofn = "$temp_dir/${host}_$$.ref_str.fa"; open(my $ofh, ">$ofn") || Fail("could not open temporary fasta file '$ofn' for writing.\n"); print {$ofh} ">1\n$ref_str\n"; close($ofh); push @to_delete, $ofn; system("$build_bin $ofn $ofn") == 0 || Fail("bowtie2-build returned non-0 exit level.\n"); push @bt2_args, ("--index", "$ofn"); push @to_delete, ("$ofn.1.".$idx_ext, "$ofn.2.".$idx_ext, "$ofn.3.".$idx_ext, "$ofn.4.".$idx_ext, "$ofn.rev.1.".$idx_ext, "$ofn.rev.2.".$idx_ext); } Info("After arg handling:\n"); Info(" Binary args:\n[ @bt2_args ]\n"); my $index_name = Extract_IndexName_From(@bt2_args); if ($large_idx) { Info("Using a large index enforced by user.\n"); $align_prog = $align_prog_l; $idx_ext = $idx_ext_l; if (not -f $index_name.".1.".$idx_ext_l) { Fail("Cannot find the large index ${index_name}.1.${idx_ext_l}\n"); } Info("Using large index (${index_name}.1.${idx_ext_l}).\n"); } else { if ((-f $index_name.".1.".$idx_ext_l) && (not -f $index_name.".1.".$idx_ext_s)) { Info("Cannot find a small index but a large one seems to be present.\n"); Info("Switching to using the large index (${index_name}.1.${idx_ext_l}).\n"); $align_prog = $align_prog_l; $idx_ext = $idx_ext_l; } else { Info("Using the small index (${index_name}.1.${idx_ext_s}).\n") } } my $suffix = ""; if ($debug) { $suffix = "-debug"; } elsif($sanitized) { $suffix = "-sanitized"; } # Construct command invoking bowtie2-align quote_params(\@bt2_args); my $cmd = "\"$align_prog$suffix\" --wrapper basic-0 ".join(" ", @bt2_args); # Possibly add read input on an anonymous pipe $cmd = "$readpipe $cmd" if defined($readpipe); Info("$cmd\n"); my $ret; if(defined($cap_out)) { # Open Bowtie 2 pipe open(BT, "$cmd |") || Fail("Could not open Bowtie 2 pipe: '$cmd |'\n"); # Open output pipe my $ofh = *STDOUT; my @fhs_to_close = (); if($cap_out ne "-") { open($ofh, ">$cap_out") || Fail("Could not open output file '$cap_out' for writing.\n"); } my %read_fhs = (); for my $i ("al", "un", "al-conc", "un-conc", "un-mates") { if(defined($read_fns{$i})) { my ($vol, $base_spec_dir, $base_fname) = File::Spec->splitpath($read_fns{$i}); if (-d $read_fns{$i}) { $base_spec_dir = $read_fns{$i}; $base_fname = undef; } if($i =~ /-conc$|^un-mate$/) { # Open 2 output files, one for mate 1, one for mate 2 my ($fn1, $fn2); if ($base_fname) { ($fn1, $fn2) = ($base_fname,$base_fname); } else { ($fn1, $fn2) = ($i.'-mate',$i.'-mate'); } if($fn1 =~ /%/) { $fn1 =~ s/%/1/g; $fn2 =~ s/%/2/g; } elsif($fn1 =~ /\.[^.]*$/) { $fn1 =~ s/\.([^.]*)$/.1.$1/; $fn2 =~ s/\.([^.]*)$/.2.$1/; } else { $fn1 .= ".1"; $fn2 .= ".2"; } $fn1 = File::Spec->catpath($vol,$base_spec_dir,$fn1); $fn2 = File::Spec->catpath($vol,$base_spec_dir,$fn2); $fn1 ne $fn2 || Fail("$fn1\n$fn2\n"); my ($redir1, $redir2) = (">$fn1", ">$fn2"); $redir1 = "| gzip -c $redir1" if $read_compress{$i} eq "gzip"; $redir1 = "| bzip2 -c $redir1" if $read_compress{$i} eq "bzip2"; $redir1 = "| lz4 -c $redir1" if $read_compress{$i} eq "lz4"; $redir2 = "| gzip -c $redir2" if $read_compress{$i} eq "gzip"; $redir2 = "| bzip2 -c $redir2" if $read_compress{$i} eq "bzip2"; $redir2 = "| lz4 -c $redir2" if $read_compress{$i} eq "lz4"; open($read_fhs{$i}{1}, $redir1) || Fail("Could not open --$i mate-1 output file '$fn1'\n"); open($read_fhs{$i}{2}, $redir2) || Fail("Could not open --$i mate-2 output file '$fn2'\n"); push @fhs_to_close, $read_fhs{$i}{1}; push @fhs_to_close, $read_fhs{$i}{2}; } else { my $redir = ">".File::Spec->catpath($vol,$base_spec_dir,$i."-seqs"); if ($base_fname) { $redir = ">$read_fns{$i}"; } $redir = "| gzip -c $redir" if $read_compress{$i} eq "gzip"; $redir = "| bzip2 -c $redir" if $read_compress{$i} eq "bzip2"; $redir = "| lz4 -c $redir" if $read_compress{$i} eq "lz4"; open($read_fhs{$i}, $redir) || Fail("Could not open --$i output file '$read_fns{$i}'\n"); push @fhs_to_close, $read_fhs{$i}; } } } while() { chomp; my $filt = 0; unless(substr($_, 0, 1) eq "@") { # If we are supposed to output certain reads to files... my $tab1_i = index($_, "\t") + 1; my $tab2_i = index($_, "\t", $tab1_i); my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i); my $unal = ($fl & 4) != 0; $filt = 1 if $no_unal && $unal; if($passthru) { if(scalar(keys %read_fhs) == 0 || ($fl & 256) != 0) { # Next line is read with some whitespace escaped my $l = ; } else { my $mate1 = (($fl & 64) != 0); my $mate2 = (($fl & 128) != 0); my $unp = !$mate1 && !$mate2; my $pair = !$unp; # Next line is read with some whitespace escaped my $l = ; chomp($l); $l =~ s/%(..)/chr(hex($1))/eg; if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) { if($unal) { # Failed to align print {$read_fhs{un}} $l if defined($read_fhs{un}); } else { # Aligned print {$read_fhs{al}} $l if defined($read_fhs{al}); } } if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) { my $conc = (($fl & 2) != 0); if ($conc && $mate1) { print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"}); } elsif($conc && $mate2) { print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"}); } elsif(!$conc && $mate1) { print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"}); } elsif(!$conc && $mate2) { print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"}); } } # paired read failed to align concordantly or discordantly if (defined($read_fhs{"un-mates"}) && $pair && $unal && ($fl & 8) != 0) { if ($mate1) { print {$read_fhs{"un-mates"}{1}} $l if defined($read_fhs{"un-mates"}); } else { print {$read_fhs{"un-mates"}{2}} $l if defined($read_fhs{"un-mates"}); } } } } } print {$ofh} "$_\n" if !$filt; } for my $k (@fhs_to_close) { close($k); } close($ofh); close(BT); $ret = $?; } else { $ret = system($cmd); } if(!$keep) { for(@to_delete) { unlink($_); } } kill 'HUP', @to_kill; if ($ret == -1) { Error("Failed to execute bowtie2-align: $!\n"); exit 1; } elsif ($ret & 127) { my $signm = "(unknown)"; $signm = $signame[$ret & 127] if defined($signame[$ret & 127]); my $ad = ""; $ad = "(core dumped)" if (($ret & 128) != 0); Error("bowtie2-align died with signal %d (%s) $ad\n", ($ret & 127), $signm); exit 1; } elsif($ret != 0) { Error("bowtie2-align exited with value %d\n", ($ret >> 8)); } exit ($ret >> 8);