package PGF::Parsers::ParseAce; =head1 NAME ParseAce - Parse and read ace file =head1 SYNOPSYS use PGF::Parsers::ParseAce; # Parse ace file and create ace object. # $ace = PGF::Parsers::ParseAce->parse( $aceFile, { %HASH }); where, $aceFile is the name of the ace file. %HASH is an optional hash input for the following conditions: -verbose=>1 # set this to 1 if you want the module to print # the progress of the parsing to STDOUT. EXAMPLE: $ace = ParseAce->parse( 'myAceFile.ace' ); # Once you have an $ace object, you can get info from the ace file # using the following methods: # $ace->totalContigs $ace->allContigs $ace->getTags $ace->tag $ace->getReadTags $ace->contig(...) $ace->getreads(...) $ace->read(...) $ace->unpadded(...) $ace->readTagPosition(...) # Returns scalar of the total number of contigs: # $contigs = $ace->totalContigs; # Returns array of all contig numbers found in ace file (sorted in ascending order): # @contigs = $ace->allContigs; # Returns array containing block types found in the ace file. blocks here refer to anything # that starts with a tag name and is enclosed in {} brackets. example: 'CT', 'RT', ... # @blockTags = $ace->getTags; # Returns array containing the data within all blocks for a given block tag type: # @data = $ace->tag( $type ); where $type is the tag name of the block (e.g., 'CT', 'WA', ...) EXAMPLE: @data = $ace->tag('CT'); # here, each element contains the string found within a CT block, # (i.e,. consensus tag). # Returns array of read tag types found in RT{} block. example: 'matchElsewhereHighQual', # 'chimera',... # @readTags = $ace->getReadTags; # To get info for a contig: # $stream = $ace->contig( $contigName ); where $contigName is the contig number to retrieve. $contigName has to match the token in the 'CO ' line within the ace file. EXAMPLE: $stream = $ace->contig('Contig100'); # getting info for contig 100. $contigName = $stream->{number}; # contig number $sequence = $stream->{sequence}; # contig sequence $seqUnpad = $stream->{sequence_unpadded}; # contig sequence unpadded (untrimmed). $size = $stream->{size}; # contig size padded (untrimmed) $sizeUnpadded = $stream->{size_unpadded}; # contig size unpadded (untrimmed). $quality = $stream->{quality}; # contig quality scores $AF_tag = $stream->{AF_tag}; # string containing all AF tags for this contig. $BS_tag = $stream->{BS_tag}; # string containing all BS tags for this contig. $totalNumberOfReads = $stream->{totalReads}; # total number of reads in the contig # To get reads within the ace file: # # There are two methods for doing this: # # 1st method: $ace->fetchreads( %HASH ); where %HASH is a parameter used to limit the return list. The following key/value pairs are to be used and could be used in conjunction with one another: -contig=>$contigName # returns only reads within this contig. -name=>$readName # returns only reads with this name; you may specify # a regular expression in $readName. -tag=>$readTag # returns only reads containing this read tag type. # read tags correspond to RT{} tags found for each read # (e.g., 'matchElsewhereHighQual') # This prepares a data structure for fetching the attributes of each read # using the fetch method: $read = $ace->fetch( %HASH ); where %HASH is a parameter used to set the following coditions: -unpadded=>0|1 # set this to true to return unpadded and alignment # positions of the read. (default=1). Note that # converting padded to unpadded positions increases # the parsing time. The fetch method returns a reference to a hash containing the read's attributes. EXAMPLE: $ace->fetchreads( -contig=>'Contig10' ); while( $read = $ace->fetch() ) { $readName = $read->{name}; # name of read $contig = $read->{contig}; # contig read belongs in $startPos = $read->{start}; # starting location of read in contig $endPos = $read->{stop}; # ending location of read in contig $sequence = $read->{sequence}; # sequence of read $complement = $read->{complement}; # returns either "C" (complement) or "U" (uncomplement) $tag = $read->{tag}; # read tag lines $size = $read->{size}; # length of read padded $AF_tag = $read->{AF_tag}; # data of AF tag $QA_tag = $read->{QA_tag}; # data of QA tag $RD_tag = $read->{RD_tag}; # data of RD tag # the following is available only if -unpadded=>1 is specified: $sizeUnpad = $read->{size_unpadded}; # length of read unpadded $seqUnpad = $read->{sequence_unpadded}; # unpadded sequence of read $startPosUnpad = $read->{start_unpadded}; # unpadded starting location of read in contig $endPosUnpad = $read->{stop_unpadded}; # unpadded ending location of read in contig $startAlign = $read->{align_start}; # starting alignment position of read to contig. $endAlign = $read->{align_stop}; # ending alignment position of read to contig. } # 2nd method: # # This returns an array of pointers to a data structure containing the read # information. Each element corresponds to one read. See example below on how to # dereference the pointer to get values for the read. # # Caveat: because this method returns an array of read attributes, the memory # footprint may be large and could potentially cause problems if the ace # file is large. # @reads = $ace->getreads( %HASH ); where %HASH is a parameter used to limit the return list. The following key/value pairs are to be used and could be used in conjunction with one another: -contig=>$contigName # returns only reads within this contig. -name=>$readName # returns only reads with this name; you may specify # a regular expression in $readName. -tag=>$readTag # returns only reads containing this read tag type. # read tags correspond to RT{} tags found for each read # (e.g., 'matchElsewhereHighQual') -unpadded=>1 # set this to true to return unpadded and alignment # positions of the read. (takes longer to retrieve; default). -verbose=>1 # set this to 1 if you want the module to print # the progress of the parsing to STDOUT. EXAMPLES: # Get all reads in contig 100 with .x extension. # # NOTE: this returns an array of pointers to a data structure containing the read # information. Each element corresponds to one read. See example below on how to # dereference the pointer to get values for the read. # @reads = $ace->getreads( -contig=>'Contig100', -name=>'\.x', -unpadded=>1 ); # Get all reads with the read tag 'chimera' # @reads = $ace->getreads( -tag=>'chimera' ); # To get info for each read once you have a list of reads from $ace->getreads: # foreach $read (@reads) { $readName = $read->{name}; # name of read $contig = $read->{contig}; # contig read belongs in $startPos = $read->{start}; # starting location of read in contig $endPos = $read->{stop}; # ending location of read in contig $sequence = $read->{sequence}; # sequence of read $complement = $read->{complement}; # returns either "C" (complement) or "U" (uncomplement) $tag = $read->{tag}; # read tag lines $size = $read->{size}; # length of read padded $AF_tag = $read->{AF_tag}; # data of AF tag $QA_tag = $read->{QA_tag}; # data of QA tag $RD_tag = $read->{RD_tag}; # data of RD tag # the following is available only if -unpadded=>1 is specified: $sizeUnpad = $read->{size_unpadded}; # length of read unpadded $seqUnpad = $read->{sequence_unpadded}; # unpadded sequence of read $startPosUnpad = $read->{start_unpadded}; # unpadded starting location of read in contig $endPosUnpad = $read->{stop_unpadded}; # unpadded ending location of read in contig $startAlign = $read->{align_start}; # starting alignment position of read to contig. $endAlign = $read->{align_stop}; # ending alignment position of read to contig. } # To get the info for a read based on the read name: # $read = $ace->read( $readName, { %HASH } ); where $readName is the name of the read you want to get info for, %HASH is an optional hash input for the following conditions: -unpadded=>1 # set this to true to return unpadded and alignment # positions of the read. (takes longer to retrieve). -contig_sequence=>sequence # speeds up the unpadded calculations. EXAMPLE: $read = $ace->read( 'ABCD1234.x1' ); $readName = $read->{name}; $contig = $read->{contig}; $startPos = $read->{start}; $endPos = $read->{stop}; $sequence = $read->{sequence}; $complement = $read->{complement}; $tag = $read->{tag}; $size = $read->{size}; $AF_tag = $read->{AF_tag}; $QA_tag = $read->{QA_tag}; $RD_tag = $read->{RD_tag}; # The following is available only if -unpadded=>1 is specified: # $startPosUnpad = $read->{start_unpadded}; $endPosUnpad = $read->{stop_unpadded}; $seqUnpad = $read->{sequence_unpadded}; $sizeUnpad = $read->{size_unpadded}; $startAlign = $read->{align_start}; $endAlign = $read->{align_stop}; # To get unpadded positions of the read if the -unpadded parameter is not specified when # using the getreads or the read method, the attributes corresponding to the unpadded # feature can be obtained usnig the unpadded method. # $read = $ace->unpadded( $read, \%args ); where, $read = reference to hash containing read attributes obtained from the getreads or read method (required). %args = hash with the following optional key/values: (optional) -contig_sequence = padded sequence of contig (unpadded calculations run faster if this is supplied). EXAMPLES: # if used with the getreads method: # @reads = $ace->getreads( -contig=>'Contig400', -unpadded=>0 ); foreach my $read (@reads) { $read = $ace->unpadded($read); # # this adds the following hash element to the reference: # size_unpadded # sequence_unpadded # start_unpadded # stop_unpadded # align_start # align_stop } # if used with the read method: # $readObj = $ace->read( 'ABC1234.x1' ); $read = $ace->unpadded($readObj); # NOTE: The retrieval of unpadded positions is slightly faster and more efficient if the # -unpadded parameter is initially specified when using the getreads or read method. # To get unpadded positions from a read tag, use the readTagPosition method. # %attribs = $ace->readTagPosition( $tag, \%args ); where, $tag = string containing read tag. (required) (example: 'AAOZ9379.b1 matchElsewhereLowQual phrap 45 82 Thu Jul 29 10:29:54 2004') $H_args = reference to hash with the following optional key/values: (optional) -read = reference to hash containing read attributes (obtained from the getreads or read method. (speeds up processing by not having to look up these values for the read) -contig_sequence = sequence of contig the read is in (speeds up processing by not having to look up contig sequence) EXAMPLE: $tags=$read->{tag}; # # Here, $tags contain a scalar of lines present in the read tag. The lines are separated # by a newline character. # foreach $readTag ( split/\n/, $tags ) { %attribs = $ace->readTagPosition( $readTag ); $unpaddedStart=$attribs{start_unpadded}; # unpadded start position of the read tag $unpaddedStop=$attribs{stop_unpadded}; # unpadded stop position of the read tag $paddedStart=$attribs{start}; # padded start position of the read tag $paddedStop=$attribs{stop}; # padded stop position of the read tag $type=$attribs{type}; # type of read tag $read=$attribs{read}; # name of read } =head1 CAVEATS The unpadded and readTagPosition methods use a suboptimal algorithm to convert padded to unpadded read positions. This may result in a decrease in performance. For best performance in obtaining unpadded positions for reads, you should use the getreads method with the -contig=>$contig parameter. On rare occasions, the unpadded method returns incorrect unpadded positions. This issue remains to be fixed. =head1 AUTHOR Stephan Trong Lawrence Livermore National Laboratory DOE Joint Genome Institute 2800 Mitchell Drive Walnut Creek, Ca 94598 trong1@llnl.gov =version comment $Revision: 1.10 $ $Date: 2009-08-26 17:18:27 $ version 1.10: 06/2005 - S.Trong Modified conversion of padded to unpadded read positions to increase speed. -created a hash conversion table for each read position that translated the pad location on the contig to unpad location. version 1.11: 08/17/2005 - S.Trong Fixed bug in getreads when calling without any arguments. Fixed bug in getreads when calling argument with -contig and -tag combo. version 1.12: 09/07/2006 - S.Trong Improved pad to unpad conversion when calling getreads method. This method now uses the PGF::Parsers::ParseAce::Unpad module to convert padded to unpadded read positions. =cut use strict; use warnings; use vars qw( $VERSION $RealBin ); use Carp; use FindBin qw($RealBin); use lib "$RealBin/ParseAce"; use PGF::Parsers::ParseAce::Unpad; $VERSION = 1.12; sub parse { =head Creates an object that contains a set of data retrieved from the ace file. ARGUMENTS: $file = ace file (required) $H_args = reference to hash with the following optional key/values: (optional) -verbose = prints processing info to STDOUT. RETURNS: ace object containing data structure of parsed values from the ace file. =cut my ($class,$file,$args) = @_; my $self=""; $self=_parse_ace($file,$args); $self->[5] = []; # holds read list if fetchreads is called. $self->[6] = []; # holds unpadded conversion table. bless $self, $class; return $self; } sub _parse_ace { =head This subroutine parses a given ace file and stores the data in a data structure. ARGUMENTS: $file = ace file (required) $H_args = reference to hash with the following optional key/values: (optional) -verbose = prints processing info to STDOUT. RETURNS: The referent that is returned to the object is an array containing the following data structure: $self->[0]->ace file $self->[1]->[contigNumber-1] = "startingFileAddr_of_CO, startingFilleAddr_of_sequence,endingFileAddr_of_sequence, startingFileeAddr_of_quality_score,endingFileAddr_of_quality_score, startingFileAddr_of_AF_tag,endingFileAddr_of_AF_tag" startingFileAddr_of_BS_tag,endingFileAddr_of_BS_tag" $self->[2]->{readName} = "contig,startingFileAddr_of_AF_tag,endingFileAddr_of_AF_tag, startingFileAddr_of_RD_tag,endingFileAddr_RD_tag,ending_FileAddr_of_sequence FileAddr_of_QA_tag, FileAddr_of_DS_tag, startingFileAddr_of_RT_tag,readTagTypes" $self->[3]->[] = array holding all RT tag types. (e.g., 'matchElsewhereHighQual',...) $self->[4]->{} = reference to hash containing key=ace tag, value=description =cut my ($file,$args) = @_; my %args=defined $args?%$args:(); my $verbose=defined $args{-verbose}?$args{-verbose}:0; my $contig=""; my $contigSeq=""; my $cnum=""; my %contigs=(); my %readAttribs=(); my @attribs=(); my @readTagTypes=(); my %aceTags=(); my %foundThisReadTag=(); my %foundAllReadTag=(); my %foundAceTag=(); my $readName; my $fileAddr=0; croak "No ace file found.\n" if !defined $file; open _IFILE, $file or croak "Problem opening ace file $file:$!"; while ( my $line = <_IFILE> ) { if ( defined $line && $line =~ /^CO (\S+) \S+ (\S+)/ ) { $contig = $1; print "Parsing $2 reads from $contig ...\n" if $verbose; $contigs{$contig} = "$fileAddr"; $fileAddr = tell(_IFILE); $contigs{$contig} .= ",$fileAddr"; while( defined $line && $line =~ /\S/ ) { $fileAddr = tell(_IFILE); $line = <_IFILE>; } $contigs{$contig} .= ",$fileAddr"; while ( $line !~ /^BQ\s*$/ ) { $line = <_IFILE>; } $fileAddr = tell(_IFILE); $contigs{$contig} .= ",$fileAddr"; $line = <_IFILE>; while( defined $line && $line =~ /\d/ ) { $fileAddr = tell(_IFILE); $line = <_IFILE>; } $contigs{$contig} .= ",$fileAddr"; } if ( defined $line && $line =~ /^AF (\S+) \S+ \S+/ ) { $contigs{$contig} .= ",$fileAddr"; $readAttribs{$1} = $contig if !defined $readAttribs{$1}; $readAttribs{$1} .= ",$fileAddr"; $fileAddr = tell(_IFILE); $readAttribs{$1} .= ",$fileAddr"; $line = <_IFILE>; while( defined $line && $line =~ /^AF (\S+) \S+ \S+/ ) { $readAttribs{$1} = $contig if !defined $readAttribs{$1}; $readAttribs{$1} .= ",$fileAddr"; $fileAddr = tell(_IFILE); $readAttribs{$1} .= ",$fileAddr"; $line = <_IFILE>; } $contigs{$contig} .= ",$fileAddr"; } if ( defined $line && $line =~ /^BS / ) { $contigs{$contig} .= ",$fileAddr"; $line = <_IFILE>; if ( defined $line && $line =~ /^BS / ) { while( defined $line && $line =~ /^BS / ) { $fileAddr = tell(_IFILE); $line = <_IFILE>; } } else { $fileAddr = tell(_IFILE); } $contigs{$contig} .= ",$fileAddr"; } if ( defined $line && $line =~ /^RD (\S+) \S+/ ) { $readName = $1; $readAttribs{$readName} .= ",$fileAddr"; $fileAddr = tell(_IFILE); $readAttribs{$readName} .= ",$fileAddr"; while( defined $line && $line =~ /\S/ ) { $fileAddr = tell(_IFILE); $line = <_IFILE>; } $readAttribs{$readName} .= ",$fileAddr"; %foundThisReadTag = (); $fileAddr = tell(_IFILE); $line = <_IFILE>; if ( defined $line && $line =~ /^QA \-*\d+/ ) { $readAttribs{$readName} .= ",$fileAddr"; } else { $readAttribs{$readName} .= ",NULL"; } $fileAddr = tell(_IFILE); $line = <_IFILE>; if ( defined $line && $line =~ /^DS CHROMAT_FILE/ ) { $readAttribs{$readName} .= ",$fileAddr"; } else { $readAttribs{$readName} .= ",NULL"; } } if ( defined $line && $line =~ /^RT{/ ) { my $desc = ""; my $read = ""; my $tag = ""; $readAttribs{$readName} .= ",$fileAddr"; $line = <_IFILE>; if ( defined $line && $line !~ /^}/ ) { if ( $line =~ /\w/ ) { ($read, $tag) = split /\s+/, $line; if ( length $read && length $tag ) { $readAttribs{$readName} .= ",$tag"; push @readTagTypes, $tag if !$foundAllReadTag{$tag}++; } } $line = <_IFILE>; while ( defined $line && $line !~ /^}/ ) { $line = <_IFILE>; } } } elsif ( defined $line && $line =~ /^([^\{]+)\{/ ) { my $tag = $1; my $desc = ""; $line = <_IFILE>; while( defined $line && $line !~ /^}/ ) { $desc .= $line if $line =~ /\S/; $line = <_IFILE>; } chop $desc; push @{$aceTags{$tag}}, $desc; } $fileAddr = tell(_IFILE); } close _IFILE; return [ $file, \%contigs, \%readAttribs, \@readTagTypes, \%aceTags ]; } sub contig { =head This subroutine reassigns the contig attributes stored in an array to readable names stored in hash. ARGUMENTS: $contig = contig number. (required) RETURNS: The return value is a reference to a hash with the following keys: number = contig number size = size (untrimmed) size_unpadded = unpadded size (untrimmed) sequence = sequence sequence_unpadded = unpadded sequence totalReads = total number of reads in the contig quality = quality score CO_tag = line of CO tag =cut my ($self, $contig) = @_; my $seq=""; my $qual=""; my $line=""; my $ref=""; my $bsTag=""; my $afTag=""; my %attrib=(); $attrib{number}=""; $attrib{size}=""; $attrib{totalReads}=""; $attrib{sequence}=""; $attrib{sequence_unpadded}=""; $attrib{quality}=""; $attrib{size_unpadded}=""; $attrib{CO_tag}=""; $attrib{BS_tag}=""; $attrib{AF_tag}=""; return $ref if !defined $self->[1]->{$contig}; my @addrs = split /,/, $self->[1]->{$contig}; open _IFILE, $self->[0] or die "Cannot read ace file.\n"; $line = _read_line_by_addr( *_IFILE, $addrs[0], $addrs[1] ) if @addrs>3; if ( defined $line && $line =~ /^CO (\S+) (\S+) (\S+)/ ) { $attrib{CO_tag}=$line; $attrib{number}=$1 if length $1; $attrib{size}=$2 if length $1; $attrib{totalReads}=$3 if length $1; } $seq = _read_line_by_addr( *_IFILE, $addrs[1], $addrs[2] ) if @addrs>2; $seq =~ s/\n//g; $qual = _read_line_by_addr( *_IFILE, $addrs[3], $addrs[4] ) if @addrs>4; $qual =~ s/\n//g; $afTag = _read_line_by_addr( *_IFILE, $addrs[5], $addrs[6] ) if @addrs>6; $bsTag = _read_line_by_addr( *_IFILE, $addrs[7], $addrs[8] ) if @addrs>8; close _IFILE; my $seqUnpad=$seq; $seqUnpad =~ s/\*//g; $attrib{sequence}=$seq if length $seq; $attrib{sequence_unpadded}=$seqUnpad if length $seqUnpad; $attrib{quality}=$qual if length $qual; $attrib{size_unpadded}=length($seqUnpad) if length $seqUnpad; $attrib{BS_tag}=$bsTag if length $bsTag; $attrib{AF_tag}=$afTag if length $afTag; $ref = {%attrib} if %attrib; return $ref; } sub getreads { =head This subroutine returns either a scalar or array of references to a hash containing values of reads within the ace file. ARGUMENTS: %args = hash with the following keys/value pair to filter the read list (optional). If none passed, all reads will be returned. -contig = contig number -name = read name (regular expression accepted) -tag = ace tag type as defined in RT{} block. (regular expression accepted) (e.g., 'matchElswhereHighQual', 'chimera', 'G_dropout', ...) -unpadded = set to 0 to skip conversion of padded to unpadded read positions. -verbose = set to 1 to print progress to STDOUT. RETURNS: Returns a reference to a hash containing the attributes of a given read name. The returned referent is a hash with the following key/value pair: name = read name complement = read complement (C||U) contig = contig number the read belongs in start = starting location of read stop = end location of read sequence = sequence of the read size = size of read tag = RT tag values AF_tag = AF tag line RD_tag = RD tag line QA_tag = QA tag line DS_tag = DS tag line # additional key/value pair only if -unpadded is set to 1: start_unpadded = unpadded starting location of read stop_unpadded = unpadded end location of read sequence_unpadded = unpadded sequence of the read size_unpadded = unpadded size of read align_start = starting position of read aligned to consensus align_stop = ending position of read aligned to consensus =cut my ($self, %args) = @_; my @reads=(); my @wantReads=(); my %contigSeqs=(); my $ct=0; my $verbose=defined $args{-verbose}?$args{-verbose}:0; my $computeUnpadPos=defined $args{-unpadded}?$args{-unpadded}:1; delete $args{-verbose} if defined $args{-verbose}; delete $args{-unpadded} if defined $args{-unpadded}; # get reads by tag type. # if ( defined $args{-tag} ) { my @lookups = @wantReads ? @wantReads : keys %{$self->[2]}; foreach my $read ( @lookups ) { my @tags = split /,/,$self->[2]->{$read}; if ( @tags > 6 ) { @tags = @tags[0,6..$#tags]; my $contig=shift @tags; if ( defined $args{-contig} ) { next if $contig ne $args{-contig}; push @wantReads, $read if grep /^$args{-tag}$/, @tags; } else { push @wantReads, $read if grep /^$args{-tag}$/, @tags; } } } } # get reads by contig. # if ( defined $args{-contig} && !@wantReads ) { my $contig = $args{-contig}; # get reads for this contig by parsing the 'AF' tags # if ( defined $self->[1]->{$contig} ) { open _IFILE, $self->[0] or die "Cannot read ace file.\n"; my @addrs = split/,/, $self->[1]->{$contig}; if ( @addrs > 6 ) { @wantReads = map /^AF (\S+)/, split /\n/, _read_line_by_addr( *_IFILE, $addrs[5], $addrs[6] ); } close _IFILE; } } # get reads by name (regex accepted). # if ( defined $args{-name} ) { if ( @wantReads ) { @wantReads = grep /$args{-name}/, @wantReads; } elsif ( defined %{$self->[2]} ) { push @wantReads, grep /$args{-name}/, keys %{$self->[2]}; } } # get all reads if no filters defined. # if ( !%args && defined %{$self->[2]} ) { @wantReads = keys %{$self->[2]}; } print "Defining attributes for ",scalar(@wantReads)," reads ...\n" if $verbose; # If unpadded positions are specified, then we need to convert pad locs of each # read to unpad locs. This process may take considerable time if the contig and/or # number of reads are large. # if ( $computeUnpadPos ) { @reads = _computeUnpadPositions($self,\@wantReads,{-verbose=>$verbose}); $self->[6] = []; } else { foreach my $name ( @wantReads ) { push @reads, $self->read($name); } } if ( wantarray ) { return @reads; } elsif ( defined $reads[0] ) { return $reads[0]; } else { return @reads; } } sub fetchreads { =head This subroutine prepares a list of reads to be returned by the fetch method. ARGUMENTS: %args = hash with the following keys/value pair to filter the read list (optional). If none passed, all reads will be returned. -contig = contig number -name = read name (regular expression accepted) -tag = ace tag type as defined in RT{} block. (regular expression accepted) (e.g., 'matchElswhereHighQual', 'chimera', 'G_dropout', ...) =cut my ($self, %args) = @_; my @reads=(); my @wantReads=(); my %contigSeqs=(); # get reads by tag type. # if ( defined $args{-tag} ) { my @lookups = @wantReads ? @wantReads : keys %{$self->[2]}; foreach my $read ( @lookups ) { my @tags = split /,/,$self->[2]->{$read}; if ( @tags > 6 ) { @tags = @tags[0,6..$#tags]; my $contig=shift @tags; if ( defined $args{-contig} ) { next if $contig ne $args{-contig}; push @wantReads, $read if grep /^$args{-tag}$/, @tags; } else { push @wantReads, $read if grep /^$args{-tag}$/, @tags; } } } } # get reads by contig. # if ( defined $args{-contig} && !@wantReads ) { my $contig = $args{-contig}; # get reads for this contig by parsing the 'AF' tags # if ( defined $self->[1]->{$contig} ) { open _IFILE, $self->[0] or die "Cannot read ace file.\n"; my @addrs = split/,/, $self->[1]->{$contig}; if ( @addrs > 6 ) { @wantReads = map /^AF (\S+)/, split /\n/, _read_line_by_addr( *_IFILE, $addrs[5], $addrs[6] ); } close _IFILE; } } # get reads by name (regex accepted). # if ( defined $args{-name} ) { if ( @wantReads ) { @wantReads = grep /$args{-name}/, @wantReads; } elsif ( defined %{$self->[2]} ) { push @wantReads, grep /$args{-name}/, keys %{$self->[2]}; } } # get all reads if no filters defined. # if ( !%args && defined %{$self->[2]} ) { @wantReads = keys %{$self->[2]}; } $self->[5] = \@wantReads; } sub fetch { =head Returns the reference of the next read's attributes. Must call fetchreads first. =cut my $self = shift; my %args = @_; if ( !@{$self->[5]} ) { $self->[6] = []; return ''; } my $computeUnpadPos=defined $args{-unpadded}?$args{-unpadded}:1; my $readName = shift @{$self->[5]}; my $readData = {}; if ( $computeUnpadPos ) { ($readData) = _computeUnpadPositions($self,[$readName]); } else { $readData = $self->read($readName); } return $readData; } sub _computeUnpadPositions { =head Converts padded start and stop position of each read to unpadded position and returns array of read attributes including the unpadded positions as well as alignment positions. ARGUMENTS: $readlist = reference to array containing the read names to define attributes. $args = reference to hash for optional features. The following features are available: -verbose=>1 display message showing progress of the computations. RETURNS: Hash containing the following key/value pairs: start = padded start position of read tag stop = padded stop position of read tag start_unpadded = unpadded start position of read tag stop_unpadded = unpadded stop position of read tag align_start = start of alignment position of read tag align_stop = end of alignment position of read tag read = name of read with this tag type = type of read tag (e.g., 'matchElsewhereHighQual', 'chimera', ...) =cut my ($self,$readlist,$args) = @_; my @wantReads = @{$readlist}; my %args = defined $args ? %{$args} : (); my $verbose = $args{-verbose} ? 1:0; my $ct=0; my %displayed=(); my @retReads=(); my $unpadObject = PGF::Parsers::ParseAce::Unpad->new(); # first, we need to store reads by contig. # my %contigReads = (); foreach my $name ( @wantReads ) { if ( defined $self->[2]->{$name} ) { my $contig = (split /,/,$self->[2]->{$name})[0]; push @{$contigReads{$contig}}, $self->read($name); } } # Now, loop thru each contig and compute the unpadded positions for # each read within the contig. # foreach my $contig ( keys %contigReads ) { next if !@{$contigReads{$contig}}; if ( defined $self->[6][0] && $self->[6][0] eq $contig ) { $unpadObject = $self->[6][1]; } else { # get contig sequence (padded). # my $contigSeq = ""; my @addrs = split /,/,$self->[1]->{$contig}; if ( @addrs > 2 ) { open _IFILE, $self->[0] or die "Cannot read ace file.\n"; $contigSeq = _read_line_by_addr( *_IFILE, $addrs[1], $addrs[2] ); $contigSeq =~ s/\n//g; close _IFILE; } $unpadObject->use($contigSeq); $self->[6] = [$contig,$unpadObject]; } my @reads = @{$contigReads{$contig}}; print "Creating unpadded hash table for $contig ...\n" if $verbose; print "done.\n" if $verbose; # now loop thru each read and calculate unpadded positions. # foreach my $read ( @reads ) { # Store unpadded position and sequence in hash reference. # my $readStart = $read->{start}; my $readStop = $read->{stop}; my $alignStart = $read->{align_clipping_start} + $readStart - 1; my $alignStop = $read->{align_clipping_stop} + $readStart - 1; my $unpadStart = $unpadObject->padToUnpad($readStart); my $unpadStop = $unpadObject->padToUnpad($readStop); my $unpadAlignStart = $unpadObject->padToUnpad($alignStart); my $unpadAlignStop = $unpadObject->padToUnpad($alignStop); my $readSeq = $read->{sequence}; $readSeq =~ s/\*//g; $unpadAlignStart = 1 if $unpadAlignStart == -1; $unpadAlignStop = 1 if $unpadAlignStop == -1; # store attributes. $read->{sequence_unpadded}=$readSeq; $read->{size_unpadded}=length($readSeq); $read->{start_unpadded}=$unpadStart; $read->{stop_unpadded}=$unpadStop; $read->{align_start}=$unpadAlignStart; $read->{align_stop}=$unpadAlignStop; push @retReads, $read; if ( $verbose ) { my $pct=++$ct/scalar(@wantReads)*100; if ( $pct > 1 && int($pct)%10==0 && !$displayed{int($pct)}++) { print sprintf(" %3d", int($pct)),"% complete ... $ct reads processed.\n"; } } } } return @retReads; } sub readTagPosition { =head Parses a read tag and returns the padded unpadded positions of the tag and the read and type of the tag. ARGUMENTS: $tag = string containing read tag. (required) $H_args = reference to hash with the following optional key/values: (optional) -read = reference to hash containing read attributes (obtained from the getreads or read method. (speeds up processing by not having to look up these values for the read) -contig_sequence = sequence of contig the read is in (speeds up processing by not having to look up contig sequence) RETURNS: Hash containing the following key/value pairs: start = padded start position of read tag stop = padded stop position of read tag start_unpadded = unpadded start position of read tag stop_unpadded = unpadded stop position of read tag read = name of read with this tag type = type of read tag (e.g., 'matchElsewhereHighQual', 'chimera', ...) =cut my ($self, $tag, $H_args) = @_; return $tag unless ( $tag =~ /^(\S+) (\S+) \S+ (\d+) (\d+)/ ); my %attrib=(); $attrib{start}=""; $attrib{stop}=""; $attrib{start_unpadded}=""; $attrib{stop_unpadded}=""; $attrib{type}=""; $attrib{read}=""; my $read=$1; my $type=$2; my $tagStart=$3; my $tagStop=$4; my %args = defined %$H_args ? %$H_args : (); my %readAttribs = (); my $unpadStart=0; my $padStart=0; my $contigSeq=""; my $numStartPads=0; my $numStopPads=0; my $contig=0; if ( !defined $args{-read} ) { $args{-read}=$self->read($read,{-unpadded=>1}); return %attrib if !length $args{-read}; } # If reference to hash containing read attributes is passed using the -read # parameter, then get contig, padded start and unpadded start positions # from the attributes. # if ( defined $args{-read} && defined %{$args{-read}} ) { $padStart=$args{-read}->{start}; $contig=$args{-read}->{contig}; if ( length $args{-read}->{start_unpadded} ) { $unpadStart=$args{-read}->{start_unpadded}; } elsif ( defined $args{-contig_sequence} ) { my $ref=$self->unpadded( $args{-read}, {-contig_sequence=>$args{-contig_sequence}} ); $unpadStart=$ref->{start_unpadded}; } else { my $ref=$self->unpadded( $args{-read} ); $unpadStart=$ref->{start_unpadded}; } } # If contig sequence is passed using the -contig_sequence parameter, use sequence # passed for counting pads. # if ( defined $args{-contig_sequence} ) { $contigSeq=$args{-contig_sequence}; # Otherwise, need to look up contig sequence for counting pads. # } else { my $ref=$self->contig($contig); $contigSeq=$ref->{sequence}; } $unpadStart=1 if $unpadStart<1; $padStart=1 if $padStart<1; # Calculate number of pads before start of tag position. # my $seq=substr($contigSeq,$padStart-1,$tagStart-1); $numStartPads=$seq =~ tr/\*/\*/ if defined $seq; # Calculate number of pads within the tag position. # if ( $padStart+$tagStart-1 > length($contigSeq) ) { $numStopPads=0; } elsif ( $padStart+$tagStart-1+($tagStop-$tagStart) > length($contigSeq)) { my $seq=substr($contigSeq,$padStart+$tagStart-2,length($contigSeq)-($padStart+$tagStart-1)); $numStopPads=$seq =~ tr/\*/\*/ if defined $seq; } else { $seq=substr($contigSeq,$padStart+$tagStart-2,$tagStop-$tagStart+1); $numStopPads=$seq =~ tr/\*/\*/ if defined $seq; } # Compute padded start and stop positions of the tag. # my $startUnpadded=$unpadStart+$tagStart-$numStartPads-1; my $stopUnpadded=$unpadStart+$tagStop-$numStartPads-$numStopPads-1; $attrib{start}=$tagStart; $attrib{stop}=$tagStop; $attrib{type}=$type; $attrib{start_unpadded}=$startUnpadded; $attrib{stop_unpadded}=$stopUnpadded; $attrib{read}=$read; return %attrib; } sub read { =head Parameters: $name = read name $H_args = reference to hash with the following optional key/values: -unpadded = returns additional unpadded and alignment values. -contig_sequence = sequence of contig (unpadded calculations run faster if this is supplied). Returns a reference to a hash containing the attributes of a given read name. The returned referent is a hash with the following key/value pair: name = read name complement = read complement (C||U) contig = contig number the read belongs in start = starting location of read stop = end location of read sequence = sequence of the read size = size of read tag = RT tag values align_clipping_start = alignment clipping start (padded); align_clipping_stop = alignment clipping stop (padded); AF_tag = AF tag line RD_tag = RD tag lines QA_tag = QA tag line DS_tag = DS tag line Additional tags that are defined if using the -unpadded option. start_unpadded = unpadded starting location of read stop_unpadded = unpadded end location of read sequence_unpadded = unpadded sequence of the read size_unpadded = unpadded size of read align_start = starting position of read aligned to consensus align_stop = ending position of read aligned to consensus =cut my ($self, $name, $H_args) = @_; my %args = %$H_args if defined $H_args; my %attrib=(); $attrib{name}=""; $attrib{complement}=""; $attrib{sequence}=""; $attrib{size}=""; $attrib{start}=""; $attrib{stop}=""; $attrib{sequence_unpadded}=""; $attrib{size_unpadded}=""; $attrib{start_unpadded}=""; $attrib{stop_unpadded}=""; $attrib{align_start}=""; $attrib{align_stop}=""; $attrib{contig}=""; $attrib{tag}=""; $attrib{AF_tag}=""; $attrib{RD_tag}=""; $attrib{QA_tag}=""; $attrib{DS_tag}=""; $attrib{align_clipping_start}=0; $attrib{align_clipping_stop}=0; return \%attrib if !defined $self->[2]->{$name}; # Array index of addrs: # # 0-startingFileAddr_of_AF_tag, # 1-endingFileAddr_of_AF_tag, # 2-startingFileAddr_of_RD_tag, # 3-endingFileAddr_RD_tag, # 4-ending_FileAddr_of_sequence # 5-FileAddr_of_QA_tag, # 6-FileAddr_of_DS_tag, # 7-startingFileAddr_of_RT_tag, # 8-readTagTypes" my $line; my @addrs = split /,/,$self->[2]->{$name}; my $contig = shift @addrs; $attrib{contig}=$contig if defined $contig; open _IFILE, $self->[0] or die "Cannot read ace file.\n"; # get AF tag. # if ( @addrs > 1 && $addrs[0] =~ /^\d+$/ && $addrs[1] =~ /^\d+$/ ) { $line = _read_line_by_addr( *_IFILE, $addrs[0], $addrs[1] ); if ( defined $line && $line =~ /^AF (\S+) (\S+) (\S+)/ ) { chomp $line; $attrib{name}=$1; $attrib{complement}=$2; $attrib{start}=$3; $attrib{AF_tag}=$line; } } # get RD tag. # if ( @addrs > 3 && $addrs[2] =~ /^\d+$/ && $addrs[3] =~ /^\d+$/ ) { $line = _read_line_by_addr( *_IFILE, $addrs[2], $addrs[3] ); if ( defined $line && $line =~ /^RD/ ) { chomp $line; $attrib{RD_tag}=$line; } } # get QA tag. # if ( @addrs > 5 && $addrs[5] =~ /^\d+$/ ) { seek(_IFILE,$addrs[5],0); $line = <_IFILE>; if ( defined $line && $line =~ /^QA/ ) { chomp $line; $attrib{QA_tag}=$line; if ( $line =~ /^QA \-*\d+ \-*\d+ (\-*\d+) (\-*\d+)/ ) { $attrib{align_clipping_start}=$1; $attrib{align_clipping_stop}=$2; } else { } } } # get DS tag. # if ( @addrs > 6 && $addrs[6] =~ /^\d+$/ ) { seek(_IFILE,$addrs[6],0); $line = <_IFILE>; if ( defined $line && $line =~ /^DS/ ) { chomp $line; $attrib{DS_tag}=$line; } } # get sequence and optionally, unpadded positions, alignment positions and unpadded # sequence size. # if ( @addrs > 4 && $addrs[3] =~ /^\d+$/ && $addrs[4] =~ /^\d+$/ ) { $line = _read_line_by_addr( *_IFILE, $addrs[3], $addrs[4] ); if ( defined $line ) { $line =~ s/\n//g; my $readSeq=$line; $attrib{sequence}=$readSeq; $attrib{size}=length($readSeq); $attrib{stop}=length($readSeq) + $attrib{start} - 1; } } # get lines in RT{} blocks. # if ( @addrs > 7 && $addrs[7] =~ /^\d+$/ ) { my $desc=""; seek(_IFILE,$addrs[7],0); $line = <_IFILE>; while ( defined $line && $line !~ /^RD/ && $line !~ /^CO/ ) { if ( defined $line && $line =~ /^RT{/ ) { $line = <_IFILE>; while( defined $line && $line !~ /^}/ ) { $desc .= $line if ( $line =~ /\w/ ); $line = <_IFILE>; } $attrib{tag}=$desc; } $line = <_IFILE>; } } close _IFILE; # determine unpadded sequence and positions if $args{-unpadded} is specified. # # Note: this may slow the execution time, especially if the contig is large due to the manual # removal of the padded character '*' from the sequence of the contig based on location of the # read and re-calculating the positions of the read. # if ( defined $args{-unpadded} && $args{-unpadded} ) { my $ref = $self->unpadded( \%attrib, \%args ); %attrib = %$ref; } return \%attrib; } sub unpadded { =head Calculates unpadded start, stop, alignment start and alignment stop positions of a given read attribute. The argument is a reference to a hash containing the read attributes obtained by calling the getreads or read method. The return value is a reference to a hash containing the original hash elements plus the following: ARGUMENTS: $read = reference to hash containing read attributes obtained from the getreads or read method (required). $H_args = reference to hash with the following optional key/values: (optional) -contig_sequence = padded sequence of contig (unpadded calculations run faster if this is supplied). RETURNS: Reference to hash from $read plus the following key/value pairs: start_unpadded = unpadded starting location of read stop_unpadded = unpadded end location of read sequence_unpadded = unpadded sequence of the read size_unpadded = unpadded size of read align_start = starting position of read aligned to consensus align_stop = ending position of read aligned to consensus =cut my ( $self, $read, $H_args ) = @_; return $read if !defined %{$read}; my %args = defined %$H_args ? %$H_args:(); my %attrib = %{$read}; my $contigSeq = ""; if ( defined $args{-contig_sequence} && length $args{-contig_sequence} ) { $contigSeq = $args{-contig_sequence}; # get contig sequence from ace file if not present. # Need this to calculate unpadded positions of the read. # } else { my $contig = $attrib{contig}; open _IFILE, $self->[0] or die "Cannot read ace file.\n"; my ($addrContigSeq) = (split /,/,$self->[1]->{$contig})[1]; seek(_IFILE,$addrContigSeq,0); my $line = <_IFILE>; while( defined $line && $line =~ /\S/ ) { chomp $line; $contigSeq .= $line; $line = <_IFILE>; } close _IFILE; } my $readStart=$attrib{start}; my $readStop=$attrib{stop}; my $paddedStart=0; my $paddedStop=0; my $unpadStart=$attrib{start}; my $unpadStop=$readStop; my $size = $attrib{size}; $size += $readStart if $readStart<1; $readStart=1 if $readStart<1; $readStop=1 if $readStop<1; # calculate unpadded starting and ending positions of the read. # my $cutContig = substr($contigSeq,0,$readStart); $paddedStart = $cutContig =~ tr/\*/\*/ if defined $cutContig; $unpadStart = $readStart-$paddedStart if $paddedStart; $size = length($contigSeq)-$readStart+1 if $size > length($contigSeq)-$readStart+1; if ( $size > 0 ) { $cutContig = substr($contigSeq,$readStart-1,$size); $paddedStop = $cutContig =~ tr/\*/\*/ if defined $cutContig; $unpadStop = $readStop-$paddedStart-$paddedStop if $paddedStop; } else { print "could not calculate unpadded positions for ",$read->{readName},"!\n"; } my $readSeq = $attrib{sequence}; $readSeq =~ s/\*//g; my $alignClippingStart=0; my $alignClippingStop=0; my $paddedReadLength=0; if ( $attrib{QA_tag} =~ /^QA \d+ \d+ (\d+) (\d+)/ ) { $alignClippingStart=$1; $alignClippingStop=$2; } if ( $attrib{RD_tag} =~ /^RD \S+ (\d+)/ ) { $paddedReadLength=$1; } my $alignStart = $readStart + $alignClippingStart - 1; my $alignStop = $readStart + $alignClippingStop - 1; if ( $unpadStart == 0 ) { $alignStart--; $alignStop--; } elsif ( $unpadStart < 0 ) { $alignStart += $unpadStart - 1; $alignStop += $unpadStart - 1; } $cutContig = substr($contigSeq,0,$alignStart); my $numOfPads = $cutContig =~ tr/\*/\*/ if defined $cutContig; $alignStart -= $numOfPads; $cutContig = substr($contigSeq,0,$alignStop); $numOfPads = $cutContig =~ tr/\*/\*/ if defined $cutContig; $alignStop -= $numOfPads; $attrib{sequence_unpadded}=$readSeq; $attrib{size_unpadded}=length($readSeq); $attrib{start_unpadded}=$unpadStart; $attrib{stop_unpadded}=$unpadStop; $attrib{align_start}=$alignStart; $attrib{align_stop}=$alignStop; return \%attrib; } sub getTags { =head Returns all tag types that are found in the ace file. Tag types correspond to blocks enclosed in {} brackets. Examples: RT{}, CT{}, WA{}. The tag names for this example would be RT, CT, WA. ARGUMENTS: none RETURNS: Array containing all tag types. =cut my $self = shift; if ( defined %{$self->[4]} ) { return keys %{$self->[4]}; } else{ return; } } sub tag { =head This subroutine returns an array containing the data within the ace tags. Each element stores a string consisting of the data within the tag. ARGUMENTS: Array of tag types (optional). Pass in the tag type ("CT" and/or "RT") in list format to return specific ace tags you would like to retrieve (required). If no values are passed, all ace tags are returned. RETURNS: Array containig data within the ace tags. =cut my ($self, @types) = @_; my @tagValues=(); my %tags=(); @tags{@types}=(1)x@types if @types; foreach my $type ( @types ) { if ( defined @{$self->[4]->{$type}} ) { push @tagValues, @{$self->[4]->{$type}}; } } return @tagValues; } sub getReadTags { =head This subroutine returns an array containing the read tag names that are found in the RT{} block in the ace file. Examples of tag names include 'matchElswhereHighQual', 'chimera', ... ARGUMENTS: none RETURNS: Array containing read tag names found in the read tag block. =cut my $self = shift; if ( defined @{$self->[3]} ) { return @{$self->[3]}; } else { return; } } sub totalContigs { =head Returns the total number of contigs found in the ace file. ARGUMENTS: none RETURNS: scalar containing total number of contigs. =cut my $self=shift; return scalar keys %{$self->[1]}; } sub allContigs { =head Returns an array containing all contig numbers found in the ace file. ARGUMENTS: none RETURNS; Array containing all contigs sorted in ascending numerical order. =cut my $self=shift; return _sortalphanum( keys %{$self->[1]} ); } sub _read_line_by_addr { =head Returns data from a file handle for two given bytes of data. The first byte is the length up to the starting location to parse the data. The second byte is the length of the ending location to parse the data. ARGUMENT: $FH = file handle to opened ace file. (required) $addr1 = starting file address to extract data (required) $addr2 = endind file address to extract data (required) =cut my ($FH,$addr1,$addr2) = @_; my $line=""; return $line if $addr2<$addr1; seek($FH,$addr1,0); CORE::read($FH,$line,$addr2-$addr1); return $line; } sub _sortalphanum { =head Returns a list sorted alphanumerically. =cut my ( @datas ) = @_; my $formatted; my @tosort; foreach (@datas) { if ( /\d+/ ) { ($formatted = $_) =~ s/(\d+)/sprintf( "%010d", $1 )/eg; } else { $formatted = $_; } push @tosort, [ $_, $formatted ]; } my @sorted = map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [ $_->[0], $_->[1] ] } @tosort; return @sorted; } 1;