# Summary: # This module allows you to pair up reads based on the phrap extensions # (e.g., "x", "y", "b", "g"). The iteration number after the phrap # extension is used to loosely associate the read's pair but does not # have to correspond exactly to the read's pair. For instance: # ABC123.x1 will pair with ABC123.y1 if ABC123.y1 exists. # ABC123.x1 will pair with ABC123.y2 if ABC123.y1 does not exist and ABC123.y2 exists. # # Example of usage: # # my $obj = new PairReadsByPhrapExt; # # #associate .x and .y extensions as read pairs, .b and .g as read pairs. # # # $obj->phrapExtension( "x"=>"y", "b"=>"g" ); # # Note that the default delimiter between the read name and the phrap extension is '.'. # # To specify a alternative delimiter between the read name and the extension: # $ob->phrapExtension( "x"=>"y", "b"=>"g", "delimiter"=>'_' ); # # To specify multiple delimiters between the read name and the extension: # $ob->phrapExtension( "x"=>"y", "b"=>"g", "delimiters=>"['.','_'] ); # # To assign read for pairing: # $obj->addread("ABCD123.x1"); # $obj->addread("ABCD123.y1"); # $obj->addread("ABCD123.x2"); # $obj->addread("ABCD123.y2"); # $obj->addread("ABCD789.x2"); # $obj->addread("ABCD789.y3"); # # you can store a string with the addread method which would # allow you to return the mate's string value by doing the # following: # # $obj->addread("ABCD123.x1",{value=>'ABC123.x1,1,1000,Contig20'}); # $obj->addread("ABCD123.y1",{value=>'ABC123.y1,1000,2000,Contig20'}); # $obj->getReadPair("ABCD789.x1",{-retValue=>1}) returns "ABCD123.y1,1000,2000,Contig20" # # To get read pair: # # $obj->pairReads; <-- need to perform this to initialize the pairing of reads. # $obj->getReadPair("ABCD123.x1") returns "ABCD123.y1" # $obj->getReadPair("ABCD123.x2") returns "ABCD123.y2" # $obj->getReadPair("ABCD789.x2") returns "ABCD123.y3" # $obj->getReadPair("ABCD789.y3") returns "ABCD123.x2" # # To return a unique number for each pairing (pairing numbers are incremented # when links are formed between paired reads, starting with the lowest iteration # number): # $obj->computeCloneIteration; <-- performs computation of clone iteration (required). # need to run this after executing $obj->pairReads. # $obj->getPairIndex("ABCD123.x1") returns 1; # $obj->getPairIndex("ABCD123.y1") returns 1; # $obj->getPairIndex("ABCD123.x2") returns 2; # $obj->getPairIndex("ABCD123.y2") returns 2; # $obj->getPairIndex("ABCD789.x2") returns 1; # $obj->getPairIndex("ABCD789.y3") returns 1; # # # $Revision: 1.7 $ # # S.Trong 7/18/2005 #=======================================================================# package PGF::Utilities::PairReadsByPhrapExt; use strict; use Carp; use Carp qw(cluck); my $VERSION = 1.00; sub new { my ($class) = @_; my $self={}; bless $self, $class; return $self; } sub phrapExtension { =head Argument: hash containing relationship between forward and reverse read extension. Example: $obj->phrapExtension( "x"=>"y", "b"=>"g" ) designates that .x and .y are read pairs and .b and .g are read pairs. =cut my ($self,%params) = @_; my %phrapExtensions; my @delimiters = (); foreach my $param (keys %params ) { if ( $param eq 'delimiter' ) { @delimiters = $params{$param}; } elsif ( $param eq 'delimiters' ) { @delimiters = @{$params{$param}}; } else { $phrapExtensions{$param} = $params{$param}; } } # if no readname to extension delimiter is specified, set default delimiter # to '.' @delimiters = ('.') if !@delimiters; $self->{_phrapExtensionsForPairing}=\%phrapExtensions; $self->{_phrapExtensionDelimiter}=\@delimiters; } sub addread { =head Argument: name of read optional reference to hash containing the following key/value pair: -value=>$string, where $string is any string you want to store with read. =cut my ($self,$read,$args) = @_; my $validPhrapExtensions = join "|", %{$self->{_phrapExtensionsForPairing}}; my $validDelimiters = join "|", @{$self->{_phrapExtensionDelimiter}}; my ($clone) = $read =~ /^([^$validDelimiters]+)/; my $value = ""; my $phrapExtension = ""; if ( defined $args ) { $value = $$args{-value} if defined $$args{-value}; } if ( !defined $clone ) { confess "Warning: cannot determine clone name for readname $read.\n"; return; } if ( $read =~ /$clone(?:$validDelimiters)($validPhrapExtensions)\d+/ ) { $phrapExtension = $1; push @{$self->{_clone}->{$clone}->{$phrapExtension}}, $read; $self->{_readValue}->{$read}=$value if length $value; } else { $self->{_pairedReads}->{$read} = ""; } push @{$self->{_readNames}}, $read; } sub addRead{ addread(@_); } sub pairReads { my $self = shift; my %pairs = (); if ( !%{$self->{_phrapExtensionsForPairing}} ) { confess "Need to first define phrap extension using: \$obj->phrapExtension(%hash)\n"; } foreach my $clone ( keys %{$self->{_clone}} ) { foreach my $phrapExtension ( keys %{$self->{_phrapExtensionsForPairing}} ) { # separate reads into fwd and rev pairs (based on phrap extension, not # orientation to contig. my @fwdReads = (); my @revReads = (); my $pairedPhrapExtension = $self->{_phrapExtensionsForPairing}->{$phrapExtension}; if ( defined $self->{_clone}->{$clone}->{$phrapExtension}->[0] ) { @fwdReads = sort _alphanum @{$self->{_clone}->{$clone}->{$phrapExtension}}; } if ( defined $self->{_clone}->{$clone}->{$pairedPhrapExtension} ) { @revReads = sort _alphanum @{$self->{_clone}->{$clone}->{$pairedPhrapExtension}}; } my $ct = 0; my @larger = (); my @smaller = (); if ( @revReads > @fwdReads ) { @larger = @revReads; @smaller = @fwdReads; } else { @larger = @fwdReads; @smaller = @revReads; } while (@larger) { my $fwd = shift @larger; my $fwdReadName = $fwd; if ( @smaller ) { my $rev = @smaller ? shift @smaller:""; my $revReadName = $rev; $self->{_pairedReads}->{$fwdReadName}=$revReadName; $self->{_pairedReads}->{$revReadName}=$fwdReadName; } else { $self->{_pairedReads}->{$fwdReadName}=""; } } } } } sub computeCloneIteration { my $self = shift; my %cloneIterationNumber = (); my %readPairingNumber = (); foreach my $readName ( @{$self->{_readNames}} ) { next if defined $self->{_cloneIterationNumber}->{$readName}; my ($clone) = $readName =~ /^([^.+]+)\./; my $pairReadName = $self->{_pairedReads}->{$readName}; my $iterationNumber = 0; if ( defined $clone ) { $cloneIterationNumber{$clone} = 0 if !exists $cloneIterationNumber{$clone}; if ( length $pairReadName ) { $cloneIterationNumber{$clone}++; if ( !$readPairingNumber{$readName}{$pairReadName} ) { $readPairingNumber{$readName}{$pairReadName}=$cloneIterationNumber{$clone}; $readPairingNumber{$pairReadName}{$readName}=$cloneIterationNumber{$clone}; $iterationNumber = $cloneIterationNumber{$clone}; } else { $iterationNumber = $readPairingNumber{$readName}{$pairReadName}; } $self->{_cloneIterationNumber}->{$readName} = $iterationNumber; $self->{_cloneIterationNumber}->{$pairReadName} = $iterationNumber; } elsif ( length $clone ) { $cloneIterationNumber{$clone} += 1; $iterationNumber = $cloneIterationNumber{$clone}; $self->{_cloneIterationNumber}->{$readName} = $iterationNumber; } } else { $self->{_cloneIterationNumber}->{$readName} = $iterationNumber; } } } sub getReadPair { =head Argument: name of read to find pair. optional reference to hash containing the following key/value pair: -retValue=1 returns the string that associated with the read pair defined when using the addread method. =cut my ($self,$read,$args)=@_; if ( !defined $read ) { die "readname must be specified in parameter.\n"; } if ( defined $args ) { my %args = %$args; my $pair = defined $self->{_pairedReads}->{$read} ? $self->{_pairedReads}->{$read} : ""; return defined $self->{_readValue}->{$pair} ? $self->{_readValue}->{$pair} : ""; } else { return defined $self->{_pairedReads}->{$read} ? $self->{_pairedReads}->{$read} : ""; } } sub getPairIndex { my ($self,$read)=@_; if ( !defined $read ) { die "readname must be specified in parameter.\n"; } if ( exists $self->{_cloneIterationNumber}->{$read} ) { return $self->{_cloneIterationNumber}->{$read}; } else { return exists $self->{_pairedReads}->{$read} ? 0:""; } } sub _alphanum { my $aval = $a; my $bval = $b; if ( $aval =~ /\d+/ ) { $aval =~ s/(\d+)/sprintf( "%010d", $1 )/eg; } if ( $bval =~ /\d+/ ) { $bval =~ s/(\d+)/sprintf( "%010d", $1 )/eg; } if ( $aval =~ /^\-*\d+$/ && $bval =~ /^\-*\d+$/ ) { return $aval <=> $bval; } else { return $aval cmp $bval; } } #============================================================================# 1;