# utilities for reading files of reads # # Copyright 2007, 2008 # Niko Beerenwinkel, # Nicholas Eriksson, # Lukas Geyrhofer, # Osvaldo Zagordi, # ETH Zurich # This file is part of ShoRAH. # ShoRAH is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # ShoRAH is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with ShoRAH. If not, see . package ReadUtil; use strict; use NikoUtil; use DNA; use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); use Exporter; $VERSION = 1.00; @ISA = qw(Exporter); # autoexport: @EXPORT = qw(read_ReadSim readReadFile readFastaFile compareToCons compareReadToCons ); # export on request: @EXPORT_OK = qw(); sub read_ReadSim { my $ReadSimOut = shift; my @readList; my ($h, $s) = read_fastas($ReadSimOut); foreach my $i (0 .. @$s-1) { # process $h->[$i] = read_0001|beg|2|length|92|forward|HIV # and $s->[$i] = TCAAATCACTCTTTGGCAACGACCCCTCGTCGACAATAA #split on '|' doesn't work well, so first substitute spaces $h->[$i] =~ s/\|/ /g; my @head = split ' ', $h->[$i]; die "missing header $h->[$i]\n" if (@head <= 1); my $seqID = $head[0]; my $startpos = $head[2]; my $dir = $head[5]; my $source = $head[6]; my %read; $read{id} = $seqID; $read{st} = $startpos; if ($dir eq 'forward') { $read{seq} = $s->[$i]; } else { $read{seq} = revComp($s->[$i]); } $read{len} = length($s->[$i]); $read{end} = $startpos + $read{len} - 1; $read{dir} = $dir; $read{source} = $source; push(@readList, \%read); } return \@readList; } sub readReadFile { # read a file of reads given by # startpos sequence # startpos sequence # startpos sequence # ... my $file = $_[0]; my @readList; if (! defined $file) { die "Error: No fasta file specified\n"; } if (! -r $file) { die "Fasta file \"$file\" not readable\n"; } open(IN, "<$file") or die "Can't open $file: $!"; while () { chomp; next if (/^#/); my ($start, $seq) = split; my %read; $read{st} = $start; $read{seq} = $seq; $read{len} = length($seq); $read{end} = $start + $read{len} - 1; push(@readList, \%read); } return \@readList; } sub readFastaFile { my $file = $_[0]; my $seqID; my $startpos; my @readList; my $seq = ''; my @head; if (! defined $file) { die "Error: No fasta file specified\n"; } if (! -r $file) { die "Fasta file \"$file\" not readable\n"; } open(IN, "<$file") or die "Can't open $file: $!"; while () { chomp; if (/^\#/) { next; } elsif ($_ eq '') { next; } elsif (/^>/) { # header looks like ">seqID startpos ..." if (length($seq) > 0) { #we've found a sequence, so store it my %read; $read{id} = $seqID; $read{st} = $startpos; $read{seq} = $seq; $read{len} = length($seq); $read{end} = $startpos + $read{len} - 1; #my @tmp = @head; #$read{qs} = \@tmp; push(@readList, \%read); $seq = ''; } @head = split; if (@head > 1) { $seqID = shift @head; #$head[0]; $seqID =~ s/>//; $startpos = shift @head; #$head[1]; } else { print "Error - @head missing characters\n"; } } else { #a sequence line #remove spaces at beginning and end s/^\s+//; s/\s+$//; $seq .= $_; } } #still have sequence in the queue, need to put the last one in #this should be done in a better fashion... my %read; $read{id} = $seqID; $read{st} = $startpos; $read{seq} = $seq; $read{len} = length($seq); $read{end} = $startpos + $read{len} - 1; # $read{qs} = \@head; push(@readList, \%read); close IN; return \@readList; } sub compareToCons { my $seq = shift; my $cons = shift; my @ans; #die "$seq\n$cons\n" if (length($seq) != length($cons)); my $minl = Nmin(length($seq), length($cons)); #input: two strings of same length #output: list of differences in format A90I foreach my $i (1..$minl) { my $c1 = substr($seq,$i-1,1); my $c0 = substr($cons,$i-1,1); if (!($c1 eq $c0)) { push(@ans, $c0 . $i . $c1); } } return \@ans; } sub compareReadToCons { # not really right, since compareToCons # prints in local coordinates... my $read = shift; my $cons = shift; #input a read %r = {id:AHAHA seq : ACGTAT, len : 6, start : 80} #and a string #output: %r = {id: AHAHA, ..., muts = join(',', compareToCons(read, cons) } #my %r; #$r{id} = $read->{id}; #$r{len} = $read->{len}; $read->{muts} = join(',', compareToCons($read->{seq}, substr($cons, $read->{st}, $read->{len}))); } 1;