#!/usr/bin/env perl

=head1 NAME

newblerAce2ReadPair.pl

=head1 SYNOPSIS

  newblerAce2ReadPair.pl [options] <acefile> <outputfile>
  
  Options:
  -cf <file>    File containing contig name and orientation (+/-) delimited by a
                tab, one per line (optional; use this to define contigs that need
                to be reverse complemented)
  -h            help message (optional)

=head1 DESCRIPTION

This script takes as input an ace file and generates a file containing
the information of the reads and it's mate.  The format of the output file is as
follows, where each item is separated by a tab:

  1.  read name
  2.  read start
  3.  read stop
  4.  read strand (+/-)
  5.  contig name of the read
  6.  contig length of the read
  7.  type of pairing (P-paired, U-unpaired, M-multiply placed)
  8.  read pair name
  9.  read pair start
  10. read pair stop
  11. read pair strand (+/-)
  12. contig name of the read pair 
  13. contig length of the read pair

=head1 VERSION

$Revision: 1.14 $

$Date: 2010-09-03 20:27:26 $

=head1 HISTORY

=over

=item *

S.Trong 2008/10/13 Creation

=item *

S.Trong 2009/02/20 - added support for phrap based read pairing

=back

=cut

# Includes
use strict;
use warnings;
use Carp;
use Carp qw(cluck);
use Pod::Usage;
use Getopt::Long;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Parsers::ParseAce;
use PGF::Parsers::ParseAce::Unpad;
use PGF::Newbler::PairNewblerReads;
use PGF::Utilities::PairReadsByPhrapExt;
use PGF::GapResolution::ReadName;
use vars qw( $RealBin $optHelp $optVerbose $optContigOrientFile );

#============================================================================#
# INPUT VALIDATION
#============================================================================#
if( !GetOptions(
        "h"=>\$optHelp,
        "cf=s"=>\$optContigOrientFile,
        "v"=>\$optVerbose,
    )
) {
    printHelp();
}

printHelp(2) if ( $optHelp );
printHelp() if @ARGV != 2;

#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#

# create object for pairing reads.
#
my $OBJ_NEWBLER_READ_PAIRING = PGF::Newbler::PairNewblerReads->new();
my $OBJ_PHRAP_READ_PAIRING = PGF::Utilities::PairReadsByPhrapExt->new();
   $OBJ_PHRAP_READ_PAIRING->phrapExtension( "x"=>"y", "b"=>"g" );

# get ace file and output files from arguments.
#
my ($acefile,$outfile) = @ARGV;

#============================================================================#
# INPUT VALIDATION
#============================================================================#
if ( !-s $acefile ) {
    print STDERR "The ace file you specified $acefile does not exist or is zero size.\n";
    exit 1;
}
    
if ( $optContigOrientFile && !-s $optContigOrientFile ) {
    print STDERR "The -cf file you specified $optContigOrientFile does not exist or is zero size.\n";
    exit 1;
}

#============================================================================#
# MAIN
#============================================================================#

# Parse file containing contig orientation.
#
my %contigOrientations = parseContigFile($optContigOrientFile) if
    $optContigOrientFile;
    
# Parse ace file ang get read pairing information.
#
my ($H_contigLengths, $H_readDatas) = getReadData(%contigOrientations);

# Write read pairing information to output file.
#

print STDOUT "###createOutputFile: $outfile\n";
createOutputFile($outfile, $H_contigLengths, $H_readDatas);

# sort output file by 5th (contigName) column
#
my $sortCmd = "sort -k 5 $outfile > ${outfile}.sort;  mv ${outfile}.sort $outfile";    
print STDOUT "###now sorting $outfile by 5th column...\n";
system($sortCmd);




# create output file.
#
if ( !-s $outfile ) {
    confess "Creation of output file $outfile failed.\n";
} else {
    print "Created file $outfile\n" if $optVerbose;
}

exit 0;

#============================================================================#
sub parseContigFile {
    
    my $file = shift;
    
    my %contigOrientations = ();
    
    unless ( open FH, $file ) {
        confess "ERROR: failed to open file $file: $!\n";
    }
    while (my $entry = <FH>) {
        chomp $entry;
        if ( $entry =~ /^(\S+)\t([+-])/ ) {
            $contigOrientations{$1} = $2;
        }
    }
    close FH;
    
    return %contigOrientations;
    
}

#============================================================================#
sub getReadData {
    
    my %contigOrientations = @_;
    
    # parse ace file.
    #
    print "READING ace file...\n" if $optVerbose;
    my $objAceParser = PGF::Parsers::ParseAce->parse($acefile,{-verbose=>$optVerbose});
    print "done.\n\n" if $optVerbose;

    # get all contigs.
    #
    my @contigs = $objAceParser->allContigs;
    my %reads = ();
    my %contigLengths = ();
    my %paddedContigSequences = ();
    my %unpairedNumberOfReads = ();
    
    # exit if no contigs found.
    #
    if ( !@contigs) {
        confess "No contigs found in ace file.\n";
    }
    
    # loop thru each contig to get read data.
    #
    foreach my $contig (@contigs) {
        print "Processing read data for $contig ...\n" if $optVerbose;
        
        # get contig orientation.
        #
        my $contigOrientation = exists $contigOrientations{$contig} ?
            $contigOrientations{$contig} : '+';
    
        # get contig attributes.
        #
        my $refContigData = $objAceParser->contig($contig);
    
        # check for valid contig reference.
        #
        if ( !defined $refContigData || ref($refContigData) ne 'HASH' ) {
            confess "ERROR: failed to get contig reference from ace object.\n";
        }
        if ( !exists $refContigData->{size_unpadded} ) {
            confess "ERROR: failed to get unpadded contig size from contig reference.\n";
        }
        if ( !exists $refContigData->{sequence} ) {
            confess "ERROR: failed to get unpadded contig size from contig reference.\n";
        }
        
        $contigLengths{$contig} = $refContigData->{size_unpadded};
    
        # get reads for this contig.
        # 
        $objAceParser->fetchreads( -contig=>$contig, -unpadded=>1, -verbose=>0 );
    
        # get attributes for each read.
        #
        my $ct=0;
        while( my $refReadData = $objAceParser->fetch() ) {
            
            # check for valid read reference.
            #
            if ( !defined $refReadData || ref($refReadData) ne 'HASH' ) {
                confess "ERROR: failed to get read reference from ace object.\n";
            }
            if ( !exists $refReadData->{name} ) {
                confess "ERROR: failed to get read name from read reference.\n";
            }
            if ( !exists $refReadData->{complement} ) {
                confess "ERROR: failed to get complement from read reference.\n";
            }
            if ( !exists $refReadData->{align_start} ) {
                confess "ERROR: failed to get unpadded align start from read reference.\n";
            }
            if ( !exists $refReadData->{align_stop} ) {
                confess "ERROR: failed to get unpadded align stop from read reference.\n";
            }
            
            $ct++;
            my $library = "";
            my $cloneName = "";
            my $readName = $refReadData->{name};
            my $readOrientation = $refReadData->{complement} eq 'U' ? 'F':'R';
            my $start = $refReadData->{align_start}; # unpadded align start
            my $stop = $refReadData->{align_stop};   # unpadded align stop
    
            # If contig is in the reverse orientation, change read coords.
            #
            if ( $contigOrientation eq '-' ) {
                ($start, $stop) = reverseOrient($start, $stop, $contigLengths{$contig});
                $readOrientation = $readOrientation eq 'F' ? 'R':'F';
            }
            
            my $objReadName = PGF::GapResolution::ReadName->new($readName);
            
            # Check if read is phrap based by looking for phrap extensions
            # (x,y,b,g) followed by a number. If not, assume read is from
            # newbler.
            #
            if ( $objReadName->isPhrapRead ) {
                $OBJ_PHRAP_READ_PAIRING->addRead($readName);
            } else {
                $OBJ_NEWBLER_READ_PAIRING->addRead($readName);
            }
            
            $reads{$readName} = [$contig,$start,$stop,$readOrientation];
            if ( $optVerbose && $ct%10000 == 0) {
                print "  fetching $ct reads ...\n";
            }
        }
        print "  fetching $ct reads ...\n" if $optVerbose;
    }
    
    # Perform any pairing of phrap reads.
    #
    $OBJ_PHRAP_READ_PAIRING->pairReads;
    
    # check for valid hashes.
    #
    if ( !%contigLengths ) {
        confess "ERROR: failed to create hash of contig lengths.\n";
    }
    if ( !%reads ) {
        confess "ERROR: failed to create hash of read datas.\n";
    }

    return \%contigLengths, \%reads;
    
}

#============================================================================#
sub reverseOrient {
    
    my $start = shift;
    my $stop = shift;
    my $contigLength = shift;
    
    my $newStart = $contigLength - $stop;
    my $newStop = $contigLength - $start;
    
    return $newStart, $newStop;
    
}

#============================================================================#
sub createOutputFile {
    
    my $outputFile = shift;
    my $H_contigLengths = shift;
    my $H_readDatas = shift;
    
    print "Writing to file...\n" if $optVerbose;
    open OFILE, ">$outfile" or confess "Cannot create output file.\n";
    
    foreach my $readName ( keys %$H_readDatas ) {
        my $refReadData = $H_readDatas->{$readName};
        my $readContig = $refReadData->[0];
        my $readStart = $refReadData->[1];
        my $readStop = $refReadData->[2];
        my $readStrand = $refReadData->[3] eq "F" ? "+":"-";
        my @readPairs = ();
        
        my $objReadName = PGF::GapResolution::ReadName->new($readName);
        
        # Check if read is phrap based by looking for phrap extensions
        # (x,y,b,g) followed by a number. If not, assume read is from
        # newbler.
        #
        if ( $objReadName->isPhrapRead() ) {
            my $readPair = $OBJ_PHRAP_READ_PAIRING->getReadPair($readName);
            push @readPairs, $readPair if length $readPair;
        } else {
            @readPairs = $OBJ_NEWBLER_READ_PAIRING->getReadPair($readName);
        }
        
        my $readInfo = "$readName\t".
                       "$readStart\t".
                       "$readStop\t".
                       "$readStrand\t".
                       "$readContig\t".
                       "$H_contigLengths->{$readContig}";
        
        if ( @readPairs ) {
            my $status = '';
            if ( @readPairs == 1 ) {
                $status = "P"; # status paired
            } else {
                $status = "M"; # status paired and multiply placed
            }
            foreach my $readPairName (@readPairs) {
                my $refReadPairData = $H_readDatas->{$readPairName};
                my $readPairContig = $refReadPairData->[0];
                my $readPairStart = $refReadPairData->[1];
                my $readPairStop = $refReadPairData->[2];
                my $readPairStrand = $refReadPairData->[3] eq "F" ? "+":"-";
                print OFILE "$readInfo\t$status\t".
                            "$readPairName\t".
                            "$readPairStart\t".
                            "$readPairStop\t".
                            "$readPairStrand\t".
                            "$readPairContig\t".
                            "$H_contigLengths->{$readPairContig}\n";
        
            }
        } else {
            print OFILE $readInfo."\tU\t\t\t\t\t\t\n";
        }
        
    }
    
    close OFILE;

}

#============================================================================#
sub printHelp {
    
    my $optVerbose = shift || 1;
    pod2usage(-verbose=>$optVerbose);
    exit 1;
}

#============================================================================#
