#!/usr/bin/env perl

=head1 NAME

getRepeatContig.pl

=head1 SYNOPSIS

  getRepeatContig.pl [options] <scaffinfo.txt> <readinfo.txt> <fastaOfAllContigs>
    <qualOfAllContigs> <outputContigListFile>

  Options:
  -rs <number>            Length of region from gap to look for read pairs (required)
  -contig <contig name>   Name of contig adjacent to the gap (required)
  -pos <L or R>           Position of contig (L-left, R-right) relative to gap (required)
  -min <number>           Minimun repeat contig length (required)
  -max <number>           Maximum repeat contig length (required)
  -rl <number>            Minimum number of linking reads between specified contig
                          and repeat contig.
  -od <dir>               Output directory to create the fasta and qual files (optional; default is current working directory)
  -rlog <file>            Name of file containing repeat contig reads (optional; for debugging purposes)
  -fastaExt <string>      File extension to name the repeat fasta file. Prefix is contig name (required)
  -qualExt <string>       File extension to name the repeat qual file. Prefix is contig name (required)
  -h help message (optional)

=head1 DESCRIPTION

Part of the Gap Resolution sub system, this software component identifies repeat
contigs based on the read pairing information on the specified contig.  A
fasta and qual file of each repeat contig is generated and a list of repeat
contig names are created in the file specified in <outputContigListFile>.  The
following steps are performed to identify a repeat contig for creating a fasta
and qual files.

1. Look for reads that contain pairs within the specified contig and within the
region from the start of the gap to the range specified using the -regionsize
parameter. The reads that have pairs must point towards the gap. The read pairs
are obtained from the <readinfo.txt> file and only read pairs with the status P
(paired) or M (multiply placed) are used.

2. If the read pair is in a different contig and scaffold, and there are at least
N number (specified using the -rl option) of read links present, mark the repeat
contig for consideration in creating fasta and qual files.  The repeat contig
length must be >= the minimum repeat contig length (specified using the -min option)
and <= the maximum repeat contig length (specified using the -max option).

In summary, the repeat contigs must pass the following criteria to be considered
for creating the fasta and qual files:
  a. repeat contig must be >= the minimum repeat contig length (specified using
     the -min option) and must be <= to the maximum repeat contig length (specified
     using the -max option).
  b. A minimum of N read links (specified using the -rl option) between the
     specified contig and the repeat must exist.
  c. repeat contig must be in a different scaffold than specified contig.
  d. read pairs in the specified contig must be located within a specified distance
    from the gap (using -rs option).

An <outputContigListFile> is created containing the names of the repeat contigs.

Description of inputs:

  scaffinfo.txt - scaffold info file generated by createSubProject.pl
  readinfo.txt - read pairing information file generated from newblerAce2ReadPair.pl
  fastaOfAllContigs - fasta file containing specified contig and all repeat contigs
  qualOfAllContigs - qual file containing specified contig and all repeat contigs
  outputContigListFile - name of file containing the list of repeat contigs described in step 2.

=head1 VERSION 

$Revision: 1.7 $

$Date: 2009-08-28 22:57:07 $

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

S.Trong 2008/10/29 creation

=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use Carp;
use Carp qw(cluck);
use Cwd;
use Cwd qw(abs_path);
use File::Path;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::GapResolution::ParseScaffInfo;
use PGF::Parsers::FastaParse;
use vars qw( $optHelp $optMaxRepeatContigLength $optContigPosToGap $optContigName
    $optMinNumberOfReadLinks $optMinRepeatContigLength $optOutputFastaDir
    $optRepeatContigReadsFile $optRegionSize $optRepeatFastaFileExtension
    $optRepeatQualFileExtension );

#============================================================================#
# INPUT VALIDATION
#============================================================================#
if( !GetOptions(
        "rs=s"=>\$optRegionSize,
        "pos=s"=>\$optContigPosToGap,
        "rl=s"=>\$optMinNumberOfReadLinks,
        "min=s"=>\$optMinRepeatContigLength,
        "max=s"=>\$optMaxRepeatContigLength,
        "contig=s"=>\$optContigName,
        "od=s"=>\$optOutputFastaDir,
        "rlog=s"=>\$optRepeatContigReadsFile,
        "fastaExt=s"=>\$optRepeatFastaFileExtension,
        "qualExt=s"=>\$optRepeatQualFileExtension,
        "h"=>\$optHelp,
    )
) {
    usage();
}

usage(2) if $optHelp;

#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#
my $DEBUG = 0;

if ( @ARGV != 5 ) {
    print STDERR "Required inputs are missing.\n";
    usage();
}

my ($scaffoldFile, $readInfoFile, $fastaFile, $qualFile, $outputContigListFile)
    = @ARGV;

my $outputFastaDir = $optOutputFastaDir ? $optOutputFastaDir : getcwd;

$optRepeatFastaFileExtension = '.fasta' if !$optRepeatFastaFileExtension;
$optRepeatQualFileExtension = '.fasta.qual' if !$optRepeatQualFileExtension;


#============================================================================#
# VALIDATE INPUTS
#============================================================================#

if ( !-s $scaffoldFile ) {
    print STDERR "The input scaffold file does not exist or is zero size.\n";
    exit 1;
}

if ( !-s $readInfoFile ) {
    print STDERR "The input read info file does not exist or is zero size.\n";
    exit 1;
}

if ( !-s $fastaFile ) {
    print STDERR "The input fasta file does not exist or is zero size.\n";
    exit 1;
}

if ( !-s $qualFile ) {
    print STDERR "The input fasta qual file does not exist or is zero size.\n";
    exit 1;
}

if ( !$optMaxRepeatContigLength) {
    print STDERR "The -gsize <gapSize> parameter must be specified.\n";
    usage();
}

if ( !$optContigPosToGap) {
    print STDERR "The -gpos <L or R> parameter must be specified.\n";
    usage();
}

if ( !$optMinNumberOfReadLinks ) {
    print STDERR "The -rl <minNumberOfReadLinks> parameter must be specified.\n";
    usage();
}

if ( $optContigPosToGap ne "L" && $optContigPosToGap ne "R" ) {
    print STDERR "The -gpos <L or R> parameter must be either 'L' or 'R'.\n";
    usage();
}

if ( !$optContigName ) {
    print STDERR "The -contig <contigName> parameter must be specified.\n";
    usage();
}

if ( !$optMinRepeatContigLength ) {
    print STDERR "The -cl <minRepeatContigLength> parameter must be specified.\n";
    usage();
}

if ( !$optRegionSize ) {
    print STDERR "The -rs <region size> parameter must be specified.\n";
    usage();
}

#============================================================================#

#============================================================================#
# MAIN
#============================================================================#

createDirectory($outputFastaDir) if !-e $outputFastaDir;

# Get contig length from scaffolds file.
#
my $contigLength = getContigLengthFromScaffold($scaffoldFile, $optContigName,
    $optContigPosToGap);

# Get list of repeat contigs.
#
my @repeatContigDatas = getRepeatContigs($optRegionSize, $optContigName,
    $contigLength, $readInfoFile, $optContigPosToGap, $optMinNumberOfReadLinks,
    $optMinRepeatContigLength, $optMaxRepeatContigLength);

# Filter repeat contigs that are from the same scaffold as specified contig.
#
@repeatContigDatas = filterContigsNotInSameScaffold($scaffoldFile, $optContigName,
    @repeatContigDatas);

# Print repeat contig info if debug is on.
#
printRepeatContigDatas(@repeatContigDatas) if $DEBUG;

# Create file containing repeat contig read pairing information if
# -rlog was specified.
#
createRepeatContigReadFile($optRepeatContigReadsFile, @repeatContigDatas)
    if $optRepeatContigReadsFile;
    
my @repeatContigNames = getRepeatContigNames(@repeatContigDatas);

if ( @repeatContigNames ) {
    foreach my $repeatContig (@repeatContigNames) {
        
        # Create contig fasta and qual files.
        #
        createFastaFile($fastaFile, "$outputFastaDir/${repeatContig}$optRepeatFastaFileExtension",
            $repeatContig);
        createFastaFile($qualFile, "$outputFastaDir/${repeatContig}$optRepeatQualFileExtension",
            $repeatContig);
    }

    # create file containing list of repeat contig names.
    #
    createContigListFile($outputContigListFile, @repeatContigNames);
    
} else {
    print STDERR "No repeat contigs found.\n";
}
    
exit 0;

#============================================================================#
# SUBROUTINES
#============================================================================#
sub createDirectory {

    my $dir = shift;

    if ( !-d $dir ) {
        eval { mkpath($dir, {mode=>0755}) };
        if ($@) {
            my $errMsg = "ERROR: failed to create directory $dir: $@\n";
            logError($errMsg, 1);
        }
    }
}

#============================================================================#
sub getContigLengthFromScaffold {
    
    my $scaffoldFile = shift;
    my $contig = shift;
    my $contigPosToGap = shift;
    
    my $contigLength = 0;
    my $objScaffold = PGF::GapResolution::ParseScaffInfo->new($scaffoldFile);
    my $scaffoldName = $objScaffold->getScaffoldName();
    
    if ( $contigPosToGap eq 'L' ) {
        $contigLength = $objScaffold->getLeftContigLength();
    } elsif ( $contigPosToGap eq 'R' ) {
        $contigLength = $objScaffold->getRightContigLength();
    }
        
    if ( !$contigLength ) {
        confess "ERROR: failed to get contig length for contig $contig from $scaffoldFile.\n";
    }
    
    return $contigLength;
    
}

#============================================================================#
sub getRepeatContigs {
    
    my $regionSize = shift;
    my $contig = shift;
    my $contigLenth = shift;
    my $readInfoFile = shift;
    my $contigPosToGap = shift;
    my $minNumberOfReadLinks = shift;
    my $minRepeatContigLength = shift;
    my $maxRepeatContigLength = shift;
    
    my $lineNumber = 0;
    my @repeatContigs = ();
    my %repeatContigInfos = ();
    my %repeatContigFound = ();
    my %numberOfReadPairs = ();
    my %validRepeatContig = ();
    
    # Compute region to get reads from.
    #
    my $regionStart = $contigPosToGap eq "L" ? $contigLenth - $regionSize : 1;
    my $regionStop = $contigPosToGap eq "L" ? $contigLenth : $regionSize;
    
    open FH, $readInfoFile or confess "ERROR: failed to open read info file $readInfoFile: $!\n";
    while (my $readLine = <FH>) {
        $lineNumber++;
        chomp $readLine;
        my @readDatas = split /\t/, $readLine;
        
        if ( @readDatas < 7 ) {
            confess "ERROR: line $lineNumber in file $readInfoFile is not formatted correctly.\n";
        }
        
        my $readStart = $readDatas[1];
        my $readStop = $readDatas[2];
        my $readContig = $readDatas[4];
        my $readPairingType = $readDatas[6];
        
        # skip if read is outsize region of interest.
        next if $contigPosToGap eq "L" && $readStop < $regionStart;
        next if $contigPosToGap eq "R" && $readStart > $regionStop;
        
        # skip if read's contig is not what is specified in input parameter.
        next if $readContig ne $contig;
        
        # skip if read type is not P or M.
        next if $readPairingType !~ /[PM]/;
        
        # check read pair entry.  should have 13 columns.
        #
        if ( @readDatas < 13 ) {
            confess "ERROR: line $lineNumber in file $readInfoFile is not formatted correctly. Expecting read pairing information\n";
        }
        
        my $readOrientation = $readDatas[3];
        my $readPairContig = $readDatas[11];
        my $readPairContigLength = $readDatas[12];
        
        # skip if read pairs are in the same contig.
        #
        next if $readContig eq $readPairContig;
        
        # filter read pair contigs that are less than min length or greater
        # than max length.
        #
        next if $readPairContigLength < $minRepeatContigLength;
        next if $readPairContigLength > $maxRepeatContigLength;
        
        # skip if read's orientation is such that it is pointing away from
        # the gap.
        #
        next if ( $readOrientation eq '+' && $contigPosToGap eq 'R' );
        next if ( $readOrientation eq '-' && $contigPosToGap eq 'L' );
        
        $numberOfReadPairs{$readPairContig}++;
        
        push @{$repeatContigInfos{$readPairContig}{reads}}, $readLine;
        $repeatContigInfos{$readPairContig}{contigLength} = $readPairContigLength;
        
        # If number of read pairs meets threshold, add to list.
        #
        if ( $numberOfReadPairs{$readPairContig} >= $minNumberOfReadLinks ) {
            $validRepeatContig{$readPairContig} = 1;
        }
    }
    close FH;
    
    foreach my $readPairContig ( keys %validRepeatContig ) {
        push @repeatContigs, [$readPairContig,
                              $repeatContigInfos{$readPairContig}{contigLength},
                              $repeatContigInfos{$readPairContig}{reads}
                             ];
    }
    
    return @repeatContigs;
    
}

#============================================================================#
sub filterContigsNotInSameScaffold {
    
    my $scaffoldFile = shift;
    my $contig = shift;
    my @repeatContigDatas = @_;
    
    my @filteredContigs = ();
    my %contigsInScaffold = ();
    my $objScaffold = PGF::GapResolution::ParseScaffInfo->new($scaffoldFile);
    my $scaffoldName = $objScaffold->getScaffoldName();
    my @contigsInScaffold = (
        $objScaffold->getLeftContig(),
        $objScaffold->getRightContig()
    );
    
    @contigsInScaffold{ @contigsInScaffold } = (1) x @contigsInScaffold;
    
    if ( $DEBUG ) {
        print "scaffoldName: $scaffoldName\n";
        print map "  contig: $_\n", keys %contigsInScaffold;
        print "\n";
    }

    foreach my $refContig (@repeatContigDatas) {
        my $repeatContig = $refContig->[0];
        if ( !$contigsInScaffold{$repeatContig} ) {
            push @filteredContigs, $refContig;
        }
    }
    
    return @filteredContigs;
    
}

#============================================================================#
sub printRepeatContigDatas {
    
    my @repeatContigs = @_;
    
    print "GAP CONTIG: $optContigName\n";
    print "GAP CONTIG LENGTH: $contigLength\n";
    print "MIN REPEAT CONTIG LENGTH: $optMinRepeatContigLength\n";
    print "GAP SIZE: $optMaxRepeatContigLength\n";
    print "POS OF CONTIG TO GAP: $optContigPosToGap\n";
    print "MIN READ LINKS: $optMinNumberOfReadLinks\n";
    print "\n";
    
    foreach my $refContig (@repeatContigs) {
        print "REP_CONTIG: $refContig->[0], length: $refContig->[1]\n";
        print map "-->$_\n", @{$refContig->[2]};
        print "\n";
    }
    
}
    
#============================================================================#
sub createRepeatContigReadFile {
    
    my $outputFile = shift;
    my @repeatContigs = @_;
    
    open FH, ">$outputFile" or confess "ERROR: failed to create file $outputFile: $!\n";
    print FH "GAP CONTIG: $optContigName\n";
    print FH "MIN REPEAT CONTIG LENGTH: $optMinRepeatContigLength\n";
    print FH "GAP SIZE: $optMaxRepeatContigLength\n";
    print FH "POS OF CONTIG TO GAP: $optContigPosToGap\n";
    print FH "MIN READ LINKS: $optMinNumberOfReadLinks\n";
    print FH "\n";
    
    foreach my $refContig (@repeatContigs) {
        print FH "REP_CONTIG: $refContig->[0], length: $refContig->[1]\n";
        print FH map "$_\n", @{$refContig->[2]};
        print FH "\n";
    }
    
    close FH;
    
    if ( -s $outputFile) {
        print "Created $outputFile\n";
    } else {
        warn "Failed to create $outputFile\n";
    }
    
}
    
#============================================================================#
sub getRepeatContigNames {
    
    my @repeatContigDatas = @_;
    
    my @repeatContigNames = ();
    
    foreach my $refContig (@repeatContigDatas) {
        push @repeatContigNames, $refContig->[0];
    }
    
    return @repeatContigNames;
    
}
    
#============================================================================#
sub createFastaFile {
    
    my $inputFile = shift;
    my $outputFile = shift;
    my $fastaTag = shift;
    
    my $fastaEntry = '';
    my $objFastaParser = new PGF::Parsers::FastaParse($inputFile);

    while ($objFastaParser->MoreEntries) {
        $objFastaParser->ReadNextEntry( -rawFormat=>1 );
        next if $objFastaParser->Name ne $fastaTag;
        
        my $fastaTag = $objFastaParser->Tag;
        my $seq = $objFastaParser->Seq;
        $fastaEntry = "$fastaTag\n$seq";
        last;
    }
    
    open FH, ">$outputFile" or confess "ERROR: failed to create file $outputFile: $!\n";
    print FH $fastaEntry;
    close FH;
    
    if ( -s $outputFile) {
        print "Created $outputFile\n";
    } else {
        warn "Failed to create $outputFile\n";
    }
    
}

#============================================================================#
sub createContigListFile {
    
    my $outputFile = shift;
    my @repeatContigs = @_;
    
    open FH, ">$outputFile" or confess "ERROR: failed to create file $outputFile: $!\n";
    foreach my $contig (@repeatContigs) {
        print FH "$contig\n";
    }
    close FH;
    
    if ( -s $outputFile) {
        print "Created $outputFile\n";
    } else {
        warn "Failed to create $outputFile\n";
    }
    
}

#============================================================================#
sub usage {
    my $verbose = shift || 1;
    pod2usage(-verbose=>$verbose);
    exit 1;
}

#============================================================================#
