#
# NikoUtil.pm - Utilities
# Copyright 2007, 2008
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Lukas Geyrhofer,
# Osvaldo Zagordi,
# ETH Zurich
# This file is part of ShoRAH.
# ShoRAH is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ShoRAH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ShoRAH. If not, see .
package NikoUtil;
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
use Exporter;
$VERSION = 1.00;
@ISA = qw(Exporter);
# autoexport:
@EXPORT = qw(
trim
is_numeric
round
intrand discr_rand
concat_list
read_fasta read_fastas print_fasta
print_matrix transpose exp_matrix
where_clause_list
hamming
hamming_distance alphabet
makeDistMatrix
random
Nmin Nmax
);
# export on request:
@EXPORT_OK = qw();
sub concat_list { # concatenate lists
my @lists = @_; # list of refs. to lists to be concatenated
# concatenate all lists
my @all;
for (@lists) {
push @all, @{$_};
}
# count occurences in concatenation
my (@union, %count);
for (@all) {
$count{"@{$_}"}++;
push @union, $_ unless ($count{"@{$_}"} > 1);
}
return \@union; # multiple occurences appear only once
}
sub is_numeric { # Cookbook p. 44
my $val = shift || ''; # return FALSE if $val is FALSE
return ($val =~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/);
}
sub round { # round to nearest integer
my @input = @_;
for (@input) {
$_ = int($_ + 0.5);
}
return wantarray ? @input : $input[0];
}
sub intrand {
# random integer number between and including $a and $b
my $a = shift;
my $b = shift;
return int($a) + int(rand($b-$a+1));
}
sub discr_rand {
# random integers in given proportion
my @proportion = @_;
my $n = scalar(@proportion);
my @cum;
my $i;
my $sum = 0.0;
for $i (0 .. $n - 1) {
$sum += $proportion[$i];
}
$cum[0] = ($proportion[0] / $sum);
for $i (1 .. $n - 1) {
$cum[$i] = $cum[$i-1] + ($proportion[$i] / $sum);
}
my $r = rand();
#print "@cum ---> $r\n";
$i = 0;
while (($i < $n-1) && ($cum[$i] < $r)) {
$i++;
}
return $i;
}
sub trim { # remove white spaces to the left and right
my @input = @_;
for (@input) {
s/^\s+//;
s/\s+$//;
}
return wantarray ? @input : $input[0];
}
sub read_fasta {
my $filename = shift;
my $sep = chr(13); # CR
open FASTA, $filename
or die "Can't open fasta file $filename!\n";
my @lines = ;
close FASTA;
for my $i (0 .. $#lines) {
splice (@lines, $i, 1, (split /$sep/, $lines[$i]));
}
return (substr($lines[0], 1), join('', @lines[1..$#lines]));
}
sub read_fastas {
my $filename = shift;
my (@header, @seq);
my $i = -1;
open FASTA, $filename
or die "Can't open fasta file -- $filename!\n";
while (my $line = ) {
next if ($line =~ /^#/);
if (substr($line, 0, 1) eq ">") { # header
$i++;
$header[$i] = trim(substr($line, 1));
}
else { # sequence
die "Error - fasta file without header lines @ $line\n" if ($i == -1);
$seq[$i] .= trim($line);
}
}
close FASTA;
return (\@header, \@seq);
}
sub print_fasta {
my $head = shift;
my $seq = shift;
my $width = 60;
my @seq = split //, $seq;
my $fasta = ">$head\n";
while (scalar(@seq) > 0) {
$fasta .= join('', splice(@seq, 0, $width)) . "\n";
}
#return $fasta . "\n";
return $fasta;
}
sub print_matrix {
# print matrix (tab- and newline-separated)
my @mat = @{shift @_}; # data im $mat[][]
my $row_label = shift; # row labels == first column to print
my $col_label = shift; # column labels == first row to print
my $mat_label = shift || "\\"; # upper leftmost cell
print $mat_label, "\t" if ($row_label);
if ($col_label) {
print join("\t", @$col_label), "\n";
}
for my $i (0 .. $#mat) {
print $row_label->[$i], "\t" if ($row_label);
for my $j (0 .. $#{$mat[$i]}) {
if (defined($mat[$i][$j])) {
print $mat[$i][$j], "\t";
}
else {
print "undef\t";
}
}
print "\n";
}
}
sub transpose {
# transpose matrix
my @M = @{shift @_};
my @Mt;
for my $i (0 .. $#M) {
for my $j (0 .. $#{$M[0]}) {
$Mt[$j][$i] = $M[$i][$j];
}
}
return \@Mt;
}
sub exp_matrix {
my $mat = shift;
my $base = shift || 10;
my $emat;
for my $i (0 .. scalar(@$mat)-1) {
for my $j (0 .. scalar(@{$mat->[0]})-1) {
$emat->[$i][$j] = $base ** $mat->[$i][$j];
}
}
return $emat;
}
sub where_clause_list {
my $field = shift;
my @values = @_;
my @cond = ();
for (@values) {
if (is_numeric($_)) {
push @cond, sprintf "%s=%d", $field, $_;
}
else {
push @cond, sprintf "%s='%s'", $field, $_;
}
}
return join(' or ', @cond);
}
sub Nmax {
# Maximum of a vector containing undef's
my @input = @_;
my ($max, $argmax);
for my $i (0 .. $#input) {
if (defined($input[$i])) {
if (! defined($argmax)) {
$max = $input[$i];
$argmax = $i;
}
elsif ($input[$i] > $max) { # same action, but avoids warning
$max = $input[$i];
$argmax = $i;
}
}
}
return wantarray ? ($max, $argmax) : $max;
}
sub Nmin {
# Minimum of a vector containing undef's
my @input = @_;
my ($min, $argmin);
for my $i (0 .. $#input) {
if (defined($input[$i])) {
if (! defined($argmin)) {
$min = $input[$i];
$argmin = $i;
}
elsif ($input[$i] < $min) { # same action, but avoids warning
$min = $input[$i];
$argmin = $i;
}
}
}
return wantarray ? ($min, $argmin) : $min;
}
sub hamming_distance {
my $X = uc(shift);
my $Y = uc(shift);
my $min;
if (length($X) > length($Y)) {
$min = length($Y);
} else {
$min = length($X);
}
my $comparisons = 0;
my $dist = 0;
my ($x, $y);
for my $pos (0 .. $min - 1) {
$x = substr($X,$pos,1);
$y = substr($Y,$pos,1);
if (($x ne '.') && ($y ne '.')) {
$comparisons++;
if ($x ne $y) {
$dist++;
}
}
}
#die "Incomparable strings:\n$X\n$Y\n" if ($comparisons == 0);
if ($comparisons == 0) {
#print "Error: incomparable strings:\n$X\n$Y\n";
return 100;
}
return ($dist / $comparisons);
}
sub makeDistMatrix {
my $seqs = shift;
my @ans;
my $N = @$seqs;
foreach my $i (0..$N-1) {
foreach my $j (0..$N-1) {
$ans[$i][$j] = hamming($$seqs[$i],$$seqs[$j]);
}
#print "$i\n" if ($i % 10 == 0);
}
#print "distance matrix:\n";
#print_matrix(\@ans);
#print "\n\n";
return \@ans;
}
sub hamming {
#hamming distance, absolute, not caring about '.'
my $s = shift;
my $t = shift;
my $diff = 0;
# compute numer of differences between $s and $t
#die "ERROR in hamming $s\n$t\n" if (length($s) != length($t));
my $len = Nmin(length($s), length($t));
foreach my $i (0.. $len-1) {
if (!(substr($s,$i,1) eq substr($t,$i,1))) {
$diff++;
}
}
return $diff;
}
sub alphabet {
my $type = uc(shift);
my (@Alphabet, @Special);
if ($type eq 'AA') {
@Alphabet = qw(A C D E F G H I K L M N P Q R S T V W Y);
@Special = ("-", ".", "*");
}
elsif ($type eq 'NT') {
@Alphabet = qw(A C G T);
@Special = ("-", ".", "N");
}
elsif ($type eq '01') {
@Alphabet = qw(0 1);
@Special = ("-", ".");
}
else {
die "Unknown alphabet -- $type!\n";
}
# indices:
my %LtrIdx;
for my $k (0 .. $#Special) {
$LtrIdx{$Special[$k]} = -$k - 1;
}
for my $k (0 .. $#Alphabet) {
$LtrIdx{$Alphabet[$k]} = $k;
}
return (\@Alphabet, \@Special, \%LtrIdx);
}
sub random { # generate an integer random number
my $x = shift; # between $x
my $y = shift; # and $y (including both)
return int(rand($y - $x + 1)) + $x;
}
1;