package PGF::Parsers::FastaParse::More;

=head1 NAME

PGF::Parsers::FastaParse::More - Perl module containing utilities to extract
entries from a fasta file.

=head1 VERSION

$Revision: 1.2 $

$Date: 2009-08-26 17:18:31 $

=head1 SYNOPSIS

Example of usage:

    use PGF::Parsers::FastaParse::More;

    my $obj = PGF::Parsers::FastaParse::More->new('myfile.fasta');

    # Get fasta entries by name. Returns an array of references to hashes
    # containing fasta entry data. An optional regular expression can be
    # passed into the method. If not specified, all entries are returned.
    #
    my @entries = $obj->getFastaEntry( '\D+\d+' );

    foreach my $entry (@entries) {
        print "Tag=$entry->{tag}\n";
        print "Name=$entry->{name}\n";
        print "Comment=$entry->{comment}\n";
        print "Sequence=$entry->{sequence}\n";
        print "Length=$entry->{length}\n";
    }

    # Group the fasta entries by library. Returns a hash where the
    # key=library name, value=reference to an array of references to
    # hashes containing fasta entry data.
    #
    my %entryByLibs = $obj->splitFastaEntryByLibrary();

    foreach my $library (keys %entryByLibs) {
        foreach my $entry ( @{ $entryByLibs{$library} } ) {
            print "Tag=$entry->{tag}\n";
            print "Name=$entry->{name}\n";
            print "Comment=$entry->{comment}\n";
            print "Sequence=$entry->{sequence}\n";
            print "Length=$entry->{length}\n";
        }
    }

    # Get all libraries found in the fasta file after splitting entries by
    # library. Returns an array of unique library names.
    #
    my @libraries = $obj->getLibrariesFromFasta;

=head1 DESCRIPTION

This module reads entries in a fasta file and provides methods to retrieve
the fasta entries. The following are the methods available for use in this
module.

=head2 Class and object methods

=head3 new( I<file>, [I<options>] )

Creates the object and reads the input fasta file. Specify the input fasta
(and path if necessary) in I<file>. Croaks if I<file> is not given or does
not exist.

The following optional criteria can be assigned by passing a reference to a
hash (I<options>) after the name of the fasta file:

    -rawFormat => 0 or 1   - specify 1 to store sequences with newlines,
                             0 to remove newlines from sequences
                             (default 0).

Example:

    my $obj = PGF::Parsers::FastaParse::More->new(
        'myfile.fasta', { -rawFormat => 1 } );

=head3 getFastaEntry( [I<regex>] )

Returns, in list context, an array of references to hashes containing the
information of the fasta entries; in scalar context, the first such
reference (or an empty string if there is none). An optional regular
expression can be specified in I<regex> to restrict the return values to
the entries whose name matches it. If I<regex> is not specified (or is an
empty string), all entries in the fasta file are returned.

Each returned reference is a hash containing the following key/value pairs:

    tag      => header line for each entry (line starting with '>' tag).
    name     => first word in header line.
    comment  => subsequent words in header line.
    sequence => sequence of entry.
    length   => length of sequence.

=head3 splitFastaEntryByLibrary( [I<path>] )

Groups the fasta entries by library and returns a hash where the
key=library name, value=reference to an array of references to hashes
containing fasta entry data. The library name is the leading run of
non-digit characters of an entry name that is followed by at least one
digit; entries whose name does not have that form are skipped.

If I<path> is given it must be an existing directory, otherwise the method
croaks. No files are written by this method.

=head3 getLibrariesFromFasta

Returns an array of the libraries found in the fasta file, in order of first
appearance. Must call splitFastaEntryByLibrary before using this method;
otherwise an empty list is returned (an empty string in scalar context).

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

S.Trong 10/17/2005 Creation

=back

=cut

use strict;
use warnings;
use Carp;
use Carp qw(cluck);
use FindBin;
use lib "$FindBin::Bin/../lib";
use PGF::Parsers::FastaParse;
use File::Basename;

#============================================================================#
# Constructor.
#   $file   - fasta file to read (required, must exist).
#   $H_args - optional hash ref; -rawFormat => 1 keeps newlines in sequences.
# Croaks when the file is missing or not specified.
sub new {
    my $self   = shift;
    my $file   = shift;
    my $H_args = shift;

    croak "file must be specified.\n" if !defined $file;

    my $rawFormat =
        ( $H_args && defined $H_args->{-rawFormat} )
        ? $H_args->{-rawFormat}
        : 0;

    # the whole file is parsed up front, so it has to exist now.
    #
    if ( !-e $file ) {
        croak "Cannot find fasta file $file.\n";
    }

    my $hash = {
        _file => $file,
        _data => _parseFile( $file, $rawFormat ),
    };

    return bless( $hash, ref($self) || $self );
}

#============================================================================#
# Reads every entry of $file through PGF::Parsers::FastaParse and returns a
# reference to an array of hash refs (name, comment, sequence, length, tag).
sub _parseFile {
    my $file      = shift;
    my $rawFormat = shift;

    my @values    = ();
    my $fasObject = PGF::Parsers::FastaParse->new($file);

    while ( $fasObject->MoreEntries ) {
        $fasObject->ReadNextEntry( -rawFormat => $rawFormat );
        push @values,
            {
            name     => $fasObject->Name,
            comment  => $fasObject->Comment,
            sequence => $fasObject->Seq,
            length   => $fasObject->Length,
            tag      => $fasObject->Tag,
            };
    }
    $fasObject->Close;

    return \@values;
}

#============================================================================#
# Groups the parsed entries by library name (leading non-digits of the entry
# name) and records the unique library names, in order of first appearance,
# for getLibrariesFromFasta.
# Returns a hash: library name => array ref of entry hash refs.
sub splitFastaEntryByLibrary {
    my $self = shift;
    my $path = shift;

    # the path is not used to write anything here, but callers passing a
    # bad directory have always been rejected, so keep validating it.
    #
    if ( defined $path && !-d $path ) {
        croak "Cannot find the specified path for creating files.\n";
    }

    my %libReads  = ();
    my @libraries = ();
    my %found     = ();

    foreach my $ref ( @{ $self->{_data} } ) {
        my $entryName = defined $ref->{name} ? $ref->{name} : '';

        # parse library from read name if format is correct.
        #
        if ( $entryName =~ /^(\D+)\d+/ ) {
            my $library = $1;
            push @{ $libReads{$library} }, $ref;
            push @libraries, $library if !$found{$library}++;
        }
    }

    $self->{_libraries} = \@libraries;

    return %libReads;
}

#============================================================================#
# Returns the library names recorded by splitFastaEntryByLibrary.
# When that method has not been called yet, returns an empty list in list
# context (previously a one-element list holding '') and '' in scalar
# context.
sub getLibrariesFromFasta {
    my $self = shift;

    return @{ $self->{_libraries} } if $self->{_libraries};
    return wantarray ? () : '';
}

#============================================================================#
# Returns the entries whose name matches the optional regular expression
# $name, or all entries when $name is undefined or empty.
# List context: all matching entry hash refs; scalar context: the first
# match, or '' when nothing matches.
sub getFastaEntry {
    my $self = shift;
    my $name = shift;

    # test definedness/length rather than truth so that the pattern '0'
    # is honoured instead of being treated as "no filter".
    #
    $name = '' if !defined $name;

    my @entries = ();

    if ( length $name ) {
        my $pattern = qr/$name/;    # compile once, not once per entry
        @entries =
            grep { ( defined $_->{name} ? $_->{name} : '' ) =~ $pattern }
            @{ $self->{_data} };
    }
    else {
        @entries = @{ $self->{_data} };
    }

    if (@entries) {
        return wantarray ? @entries : $entries[0];
    }

    return wantarray ? () : '';
}

#============================================================================#
1;