# Summary: 
# This module allows you to pair up reads based on the phrap extensions 
# (e.g., "x", "y", "b", "g").  The iteration number after the phrap
# extension is used to loosely associate the read's pair but does not
# have to correspond exactly to the read's pair.  For instance:
# ABC123.x1 will pair with ABC123.y1 if ABC123.y1 exists.
# ABC123.x1 will pair with ABC123.y2 if ABC123.y1 does not exist and ABC123.y2 exists.
#
# Example of usage:
#
# my $obj = new PairReadsByPhrapExt;
#
# #associate .x and .y extensions as read pairs, .b and .g as read pairs.
# #
# $obj->phrapExtension( "x"=>"y", "b"=>"g" ); 
#
# Note that the default delimiter between the read name and the phrap extension is '.'.
#
# To specify a alternative delimiter between the read name and the extension:
# $ob->phrapExtension( "x"=>"y", "b"=>"g", "delimiter"=>'_' );
#
# To specify multiple delimiters between the read name and the extension:
# $ob->phrapExtension( "x"=>"y", "b"=>"g", "delimiters=>"['.','_'] );
#
# To assign read for pairing:
# $obj->addread("ABCD123.x1");
# $obj->addread("ABCD123.y1");
# $obj->addread("ABCD123.x2");
# $obj->addread("ABCD123.y2");
# $obj->addread("ABCD789.x2");
# $obj->addread("ABCD789.y3");
#
# you can store a string with the addread method which would
# allow you to return the mate's string value by doing the 
# following:
#
# $obj->addread("ABCD123.x1",{value=>'ABC123.x1,1,1000,Contig20'});
# $obj->addread("ABCD123.y1",{value=>'ABC123.y1,1000,2000,Contig20'});
# $obj->getReadPair("ABCD789.x1",{-retValue=>1}) returns "ABCD123.y1,1000,2000,Contig20"
#
# To get read pair:
#
# $obj->pairReads;   <-- need to perform this to initialize the pairing of reads.
# $obj->getReadPair("ABCD123.x1") returns "ABCD123.y1"
# $obj->getReadPair("ABCD123.x2") returns "ABCD123.y2"
# $obj->getReadPair("ABCD789.x2") returns "ABCD123.y3"
# $obj->getReadPair("ABCD789.y3") returns "ABCD123.x2"
#
# To return a unique number for each pairing (pairing numbers are incremented
# when links are formed between paired reads, starting with the lowest iteration
# number):
# $obj->computeCloneIteration;  <-- performs computation of clone iteration (required).
#                                   need to run this after executing $obj->pairReads.
# $obj->getPairIndex("ABCD123.x1") returns 1;
# $obj->getPairIndex("ABCD123.y1") returns 1;
# $obj->getPairIndex("ABCD123.x2") returns 2;
# $obj->getPairIndex("ABCD123.y2") returns 2;
# $obj->getPairIndex("ABCD789.x2") returns 1;
# $obj->getPairIndex("ABCD789.y3") returns 1;
#
#
# $Revision: 1.7 $
#
# S.Trong 7/18/2005
#=======================================================================#

package PGF::Utilities::PairReadsByPhrapExt;
use strict;
use Carp;
use Carp qw(cluck);

my $VERSION = 1.00;

sub new {
    my ($class) = @_;
    my $self={};
    bless $self, $class;
    return $self;
}

sub phrapExtension {

=head
Argument:
  hash containing relationship between forward and reverse read
  extension.
  Example: $obj->phrapExtension( "x"=>"y", "b"=>"g" )
      designates that .x and .y are read pairs and .b and .g are read pairs.
=cut

    my ($self,%params) = @_;
    
    my %phrapExtensions;
    my @delimiters = ();
    
    foreach my $param (keys %params ) {
        if ( $param eq 'delimiter' ) {
            @delimiters = $params{$param};
        } elsif ( $param eq 'delimiters' ) {
            @delimiters = @{$params{$param}};
        } else {
            $phrapExtensions{$param} = $params{$param};
        }
    }
    
    # if no readname to extension delimiter is specified, set default delimiter
    # to '.'
    @delimiters = ('.') if !@delimiters;
    
    $self->{_phrapExtensionsForPairing}=\%phrapExtensions;
    $self->{_phrapExtensionDelimiter}=\@delimiters;
}

sub addread {

=head
Argument:
  name of read
  optional reference to hash containing the following key/value pair:
    -value=>$string, where $string is any string you want to store with
    read.
=cut

    my ($self,$read,$args) = @_;
    my $validPhrapExtensions = join "|", %{$self->{_phrapExtensionsForPairing}};
    my $validDelimiters = join "|", @{$self->{_phrapExtensionDelimiter}};
    my ($clone) = $read =~ /^([^$validDelimiters]+)/;
    my $value = "";
    my $phrapExtension = "";
    
    if ( defined $args ) {
        $value = $$args{-value} if defined $$args{-value};
    }

    if ( !defined $clone ) {
        confess "Warning: cannot determine clone name for readname $read.\n";
        return;
    }
    if ( $read =~ /$clone(?:$validDelimiters)($validPhrapExtensions)\d+/ ) {
        $phrapExtension = $1;
        push @{$self->{_clone}->{$clone}->{$phrapExtension}}, $read;
        $self->{_readValue}->{$read}=$value if length $value;
    } else {
        $self->{_pairedReads}->{$read} = "";
    }
    push @{$self->{_readNames}}, $read;
}

sub addRead{ addread(@_); }

sub pairReads {

    my $self = shift;
    my %pairs = ();

    if ( !%{$self->{_phrapExtensionsForPairing}} ) {
        confess "Need to first define phrap extension using: \$obj->phrapExtension(%hash)\n";
    }

    foreach my $clone ( keys %{$self->{_clone}} ) {
        foreach my $phrapExtension ( keys %{$self->{_phrapExtensionsForPairing}} ) {
            # separate reads into fwd and rev pairs (based on phrap extension, not
            # orientation to contig.
            my @fwdReads = ();
            my @revReads = ();
            my $pairedPhrapExtension = $self->{_phrapExtensionsForPairing}->{$phrapExtension};
            if ( defined $self->{_clone}->{$clone}->{$phrapExtension}->[0] ) {
                @fwdReads = sort _alphanum @{$self->{_clone}->{$clone}->{$phrapExtension}};
            }
            if ( defined $self->{_clone}->{$clone}->{$pairedPhrapExtension} ) {
                @revReads = sort _alphanum @{$self->{_clone}->{$clone}->{$pairedPhrapExtension}};
            }

            my $ct = 0;
            my @larger = ();
            my @smaller = ();

            if ( @revReads > @fwdReads ) {
                @larger = @revReads;
                @smaller = @fwdReads;
            } else {
                @larger = @fwdReads;
                @smaller = @revReads;
            }

            while (@larger) {
                my $fwd = shift @larger;
                my $fwdReadName = $fwd;

                if ( @smaller ) {
                    my $rev = @smaller ? shift @smaller:"";
                    my $revReadName = $rev;
                    $self->{_pairedReads}->{$fwdReadName}=$revReadName;
                    $self->{_pairedReads}->{$revReadName}=$fwdReadName;
                } else {
                    $self->{_pairedReads}->{$fwdReadName}="";
                }
            }
        }
    }

}

sub computeCloneIteration {

    my $self = shift;
    my %cloneIterationNumber = ();
    my %readPairingNumber = ();

    foreach my $readName ( @{$self->{_readNames}} ) {
        next if defined $self->{_cloneIterationNumber}->{$readName};
        my ($clone) = $readName =~ /^([^.+]+)\./;
        my $pairReadName = $self->{_pairedReads}->{$readName};
        my $iterationNumber = 0;

        if ( defined $clone ) {
            $cloneIterationNumber{$clone} = 0 if !exists $cloneIterationNumber{$clone};
            if ( length $pairReadName ) {
                    $cloneIterationNumber{$clone}++;
                if ( !$readPairingNumber{$readName}{$pairReadName} ) {
                    $readPairingNumber{$readName}{$pairReadName}=$cloneIterationNumber{$clone};
                    $readPairingNumber{$pairReadName}{$readName}=$cloneIterationNumber{$clone};
                    $iterationNumber = $cloneIterationNumber{$clone};
                } else {
                    $iterationNumber = $readPairingNumber{$readName}{$pairReadName};
                }
                $self->{_cloneIterationNumber}->{$readName} = $iterationNumber;
                $self->{_cloneIterationNumber}->{$pairReadName} = $iterationNumber;
            } elsif ( length $clone ) {
                $cloneIterationNumber{$clone} += 1;
                $iterationNumber = $cloneIterationNumber{$clone};
                $self->{_cloneIterationNumber}->{$readName} = $iterationNumber;
            }
        } else {
            $self->{_cloneIterationNumber}->{$readName} = $iterationNumber;
        }
    }

}

sub getReadPair {

=head
Argument:
  name of read to find pair.
  optional reference to hash containing the following key/value pair:
    -retValue=1 returns the string that associated with the read pair defined
    when using the addread method.
=cut

    my ($self,$read,$args)=@_;

    if ( !defined $read ) {
        die "readname must be specified in parameter.\n";
    }
    if ( defined $args ) {
        my %args = %$args;
        my $pair = defined $self->{_pairedReads}->{$read} ? $self->{_pairedReads}->{$read} : "";
        return defined $self->{_readValue}->{$pair} ? $self->{_readValue}->{$pair} : "";
    } else {
        return defined $self->{_pairedReads}->{$read} ? $self->{_pairedReads}->{$read} : "";
    }
}

sub getPairIndex {
    my ($self,$read)=@_;

    if ( !defined $read ) {
        die "readname must be specified in parameter.\n";
    }

    if ( exists $self->{_cloneIterationNumber}->{$read} ) {
        return $self->{_cloneIterationNumber}->{$read};
    } else {
        return exists $self->{_pairedReads}->{$read} ? 0:"";
    }
}

sub _alphanum {
    my $aval = $a;
    my $bval = $b;
    if ( $aval =~ /\d+/ ) {
        $aval =~ s/(\d+)/sprintf( "%010d", $1 )/eg;
    }
    if ( $bval =~ /\d+/ ) {
        $bval =~ s/(\d+)/sprintf( "%010d", $1 )/eg;
    }
    if ( $aval =~ /^\-*\d+$/ && $bval =~ /^\-*\d+$/ ) {
        return $aval <=> $bval;
    } else {
        return $aval cmp $bval;
    }
}

#============================================================================#
1;