# utilities for reading files of reads
#
# Copyright 2007, 2008
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Lukas Geyrhofer,
# Osvaldo Zagordi,
# ETH Zurich
# This file is part of ShoRAH.
# ShoRAH is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ShoRAH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ShoRAH. If not, see .
package ReadUtil;
use strict;
use NikoUtil;
use DNA;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
use Exporter;
$VERSION = 1.00;
@ISA = qw(Exporter);
# autoexport:
@EXPORT = qw(read_ReadSim
readReadFile readFastaFile
compareToCons compareReadToCons
);
# export on request:
@EXPORT_OK = qw();
sub read_ReadSim {
my $ReadSimOut = shift;
my @readList;
my ($h, $s) = read_fastas($ReadSimOut);
foreach my $i (0 .. @$s-1) {
# process $h->[$i] = read_0001|beg|2|length|92|forward|HIV
# and $s->[$i] = TCAAATCACTCTTTGGCAACGACCCCTCGTCGACAATAA
#split on '|' doesn't work well, so first substitute spaces
$h->[$i] =~ s/\|/ /g;
my @head = split ' ', $h->[$i];
die "missing header $h->[$i]\n" if (@head <= 1);
my $seqID = $head[0];
my $startpos = $head[2];
my $dir = $head[5];
my $source = $head[6];
my %read;
$read{id} = $seqID;
$read{st} = $startpos;
if ($dir eq 'forward') {
$read{seq} = $s->[$i];
} else {
$read{seq} = revComp($s->[$i]);
}
$read{len} = length($s->[$i]);
$read{end} = $startpos + $read{len} - 1;
$read{dir} = $dir;
$read{source} = $source;
push(@readList, \%read);
}
return \@readList;
}
sub readReadFile {
# read a file of reads given by
# startpos sequence
# startpos sequence
# startpos sequence
# ...
my $file = $_[0];
my @readList;
if (! defined $file) {
die "Error: No fasta file specified\n";
}
if (! -r $file) {
die "Fasta file \"$file\" not readable\n";
}
open(IN, "<$file") or die "Can't open $file: $!";
while () {
chomp;
next if (/^#/);
my ($start, $seq) = split;
my %read;
$read{st} = $start;
$read{seq} = $seq;
$read{len} = length($seq);
$read{end} = $start + $read{len} - 1;
push(@readList, \%read);
}
return \@readList;
}
sub readFastaFile {
my $file = $_[0];
my $seqID;
my $startpos;
my @readList;
my $seq = '';
my @head;
if (! defined $file) {
die "Error: No fasta file specified\n";
}
if (! -r $file) {
die "Fasta file \"$file\" not readable\n";
}
open(IN, "<$file") or die "Can't open $file: $!";
while () {
chomp;
if (/^\#/) {
next;
} elsif ($_ eq '') {
next;
} elsif (/^>/) {
# header looks like ">seqID startpos ..."
if (length($seq) > 0) {
#we've found a sequence, so store it
my %read;
$read{id} = $seqID;
$read{st} = $startpos;
$read{seq} = $seq;
$read{len} = length($seq);
$read{end} = $startpos + $read{len} - 1;
#my @tmp = @head;
#$read{qs} = \@tmp;
push(@readList, \%read);
$seq = '';
}
@head = split;
if (@head > 1) {
$seqID = shift @head; #$head[0];
$seqID =~ s/>//;
$startpos = shift @head; #$head[1];
} else {
print "Error - @head missing characters\n";
}
} else {
#a sequence line
#remove spaces at beginning and end
s/^\s+//;
s/\s+$//;
$seq .= $_;
}
}
#still have sequence in the queue, need to put the last one in
#this should be done in a better fashion...
my %read;
$read{id} = $seqID;
$read{st} = $startpos;
$read{seq} = $seq;
$read{len} = length($seq);
$read{end} = $startpos + $read{len} - 1;
# $read{qs} = \@head;
push(@readList, \%read);
close IN;
return \@readList;
}
sub compareToCons {
my $seq = shift;
my $cons = shift;
my @ans;
#die "$seq\n$cons\n" if (length($seq) != length($cons));
my $minl = Nmin(length($seq), length($cons));
#input: two strings of same length
#output: list of differences in format A90I
foreach my $i (1..$minl) {
my $c1 = substr($seq,$i-1,1);
my $c0 = substr($cons,$i-1,1);
if (!($c1 eq $c0)) {
push(@ans, $c0 . $i . $c1);
}
}
return \@ans;
}
sub compareReadToCons {
# not really right, since compareToCons
# prints in local coordinates...
my $read = shift;
my $cons = shift;
#input a read %r = {id:AHAHA seq : ACGTAT, len : 6, start : 80}
#and a string
#output: %r = {id: AHAHA, ..., muts = join(',', compareToCons(read, cons) }
#my %r;
#$r{id} = $read->{id};
#$r{len} = $read->{len};
$read->{muts} = join(',', compareToCons($read->{seq}, substr($cons, $read->{st}, $read->{len})));
}
1;