#! /usr/bin/perl -w
=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=cut

use strict;
use Getopt::Long;

use constant FALSE => 0;
use constant TRUE  => 1;

##
## Export (sorted) file format
##
use constant MACHINE_NAME_IDX   => 0;
use constant RUN_NUM_IDX        => 1;
use constant LANE_IDX           => 2;
use constant TILE_IDX           => 3;
use constant X_CORD_IDX         => 4;
use constant Y_CORD_IDX         => 5;
use constant INDEX_STR_IDX      => 6;
use constant READ_NUM_IDX       => 7;
use constant READ_IDX           => 8;
use constant QUAL_STR_IDX       => 9;
use constant MATCH_CHR_IDX      => 10;
use constant MATCH_CONTIG_IDX   => 11;
use constant MATCH_POS_IDX      => 12;
use constant MATCH_STRAND_IDX   => 13;
use constant MATCH_DESC_IDX     => 14;
use constant SINGLE_ALN_IDX     => 15;
use constant PAIRED_ALN_IDX     => 16;
use constant PARTNER_CHR_IDX    => 17;
use constant PARTNER_CONTIG_IDX => 18;
use constant PARTNER_OFFSET_IDX => 19;
use constant PARTNER_STRAND_IDX => 20;
use constant FILTER_IDX         => 21;

##
## Contig data structure format
##
use constant CON_ID       => 0;
use constant CON_GROUP    => 1;
use constant CON_READS    => 2;
use constant CON_START    => 3;
use constant CON_END      => 4;
use constant CON_SEQ      => 5;
use constant CON_MINSTART => 6;
use constant CON_MAXEND   => 7;
use constant CON_PAIR     => 8;
use constant CON_DIST     => 9;

##
## Input variables
##
my $contigFile;
my $sortedFile;
my $outputFile;

my $sortedType = "sorted";

my $readLen = 100;
my $indelSizeLimit = 10000;

my $verbose;
my $debug;

##
## Data structures
##
my %startMap = ();  ## This data structure contains all the contig starting map positions
my %endMap   = ();  ## This data structure contains all the contig ending map positions

my @conData  = ();  ## This contains all contig information

GetOptions("c=s"       => \$contigFile,
	   "s=s"       => \$sortedFile,
	   "r=i"       => \$readLen,
	   "t=s"       => \$sortedType,
	   "l=i"       => \$indelSizeLimit,
	   "verbose|v" => \$verbose,
	   "debug|d"   => \$debug,
	   );

if (! defined($contigFile) || 
    ! defined($sortedFile) ||
    ! defined($readLen) 
    ) {
    print STDERR "[Usage]: $0 -c contigFile -s sortedFile " .
	"-r readLength [ -t sortedType sorted|shadow ] [options]\n";
    print STDERR "[Usage]: Missing arguments! Exiting...\n";
    exit(1);
}

if ($sortedType ne "sorted" && $sortedType ne "shadow") {
    print STDERR "[Usage]: SortedType (-t) must be 'sorted' or 'shadow'\n";
    print STDERR "[Usage]: Incorrect argument value! Exiting...\n";
    exit(1);
}

##
## Do some quick file manipulation: THIS SHOULD BE REPLACED
##
my $cmd = "cp $contigFile $contigFile.org";
system($cmd);
$outputFile = $contigFile . ".tmp";

open(IN, "<$contigFile")
    or die "Could not open $contigFile for reading: $!";

mssg("\# Loading all contrigs from $contigFile\n");

my $maxContigEnd;
while(<IN>) {
    s/^\s+//;
    s/\s+$//;
    next if /^(\s)*$/;
    next if /^\#.*$/;

    my $line = $_;

    if ($line =~ m/^>.*$/) {

	my ($con, $grp, $num, $start, $end);
	if ($line =~ m/^>contig=([0-9]+)\|group=([0-9]+)\|reads=([0-9]+)\|start=([-0-9]+)\|end=([-0-9]+)$/) {
	    $con   = $1;
	    $grp   = $2;
	    $num   = $3;
	    $start = $4;
	    $end   = $5;

	    my $len = abs($end - $start);

	    if ($start <= 0) {
		$start = 1;
	    }
	    if ($end <= $start) {
		$end = $start + $len;
	    }
	} else {
	    print STDERR "[ERROR]: Unable to parse contig line: $line\n";
	    print STDERR "[ERROR]: Exiting...\n";
	    exit(1);
	}

	## Store contig data
	push @conData, [ $con, $grp, $num, $start, $end ];

	if (! defined($maxContigEnd) || $maxContigEnd < $end) {
	    $maxContigEnd = $end;
	}
    } else {
	## Otherwise this must be the sequence just add it to the last conData
	my $idx = @conData - 1;
	if (! defined($conData[$idx][CON_SEQ])) {
	    $conData[$idx][CON_SEQ] = $line;
	} else {
	    $conData[$idx][CON_SEQ] .= $line;
	}
    }
}
close(IN);

mssg("\# Loaded " . @conData . " contigs, now building maps\n");

for (my $idx = 0; $idx < @conData; $idx++) {
    my $start = $conData[$idx][CON_START];
    my $end   = $conData[$idx][CON_END] - $readLen;

    $conData[$idx][CON_MINSTART] = $start;
    $conData[$idx][CON_MAXEND]   = $end;

    ## Set start maping regions
    if (! defined($startMap{$start})) {
	$startMap{$start} = $idx;
    } else {
	$startMap{$start} .= ";" . $idx;
    }

    ## Set end maping regions
    if (! defined($endMap{$end})) {
	$endMap{$end} = $idx;
    } else {
	$endMap{$end} .= ";" . $idx;
    }
}

mssg("\# Set all start and end contig maps!\n");

##
## Now process sorted file to identify all spanning pairs
##
open(IN, "<$sortedFile")
    or die "Could not open $sortedFile for reading: $!";

mssg("\# Searching sorted file: $sortedFile for maximum spanning pairs\n");

my $spanCnt = 0;
my $lineCnt = 0;
my $exrtCnt = 0;
while(<IN>) {
    s/^\s+//;
    s/\s+$//;
    next if /^(\s)*$/;
    next if /^\#.*$/;

    my $line = $_;
    my @lineArray = split(/\t/, $line);

    if ($lineCnt % 100000 == 0) {
	mssg("\# Line count: $lineCnt\n");
    }

    $lineCnt++;

    ##
    ## First fix the line if its shadow format
    ##
    if ($sortedType eq "shadow") {
	my ($rStart, $rEnd);
	$rStart = shift(@lineArray);
	$rEnd   = shift(@lineArray);
    }

    ##
    ## Skip pairs who's match position and partner offset
    ##  is not numeric
    ##
    next if (! defined($lineArray[MATCH_POS_IDX]) ||
	     ! defined($lineArray[PARTNER_OFFSET_IDX]));
    next if (length($lineArray[MATCH_POS_IDX]) == 0 ||
	     length($lineArray[PARTNER_OFFSET_IDX]) == 0);
    next if ($lineArray[MATCH_POS_IDX] !~ m/^[0-9]+$/);
    ##
    ## Since we only need to know if a pair spans it we
    ##  can only consider one of the pairs (i.e. the one
    ##  that spans with the partner down stream offset > 0).
    ##
    ## Basically this means I can skip offsets with negative
    ##  values.
    ##
    next if ($lineArray[PARTNER_OFFSET_IDX] !~ m/^[0-9]+$/);

    ##
    ## FIXME:
    ## Really we should have loaded all the reads
    ##  in the aligned shadows file so we can skip them
    ##  here, but for this first pass we'll just skip it.
    ##
    ## In theory this could be fixed by only reading the 
    ##  anomolous read pairs that Richard has added to the
    ##  shadows file.
    ##

    ##
    ## Also, I pretty sure we can only consider the forward
    ##  read of a pair. This prevents having a negative value
    ##  for PARTNER_OFFSET
    ##
    next if ($lineArray[PARTNER_OFFSET_IDX] < 0);

    ##
    ## Check to see if we have a start and end match in
    ##  the spanning range
    ##
    my %startMatch = ();
    my %endMatch   = ();
    my $spanStart  = $lineArray[MATCH_POS_IDX];
    my $spanEnd    = $lineArray[MATCH_POS_IDX] + $lineArray[PARTNER_OFFSET_IDX];
    my $spanDist   = $spanEnd - $spanStart;

    ## Saftey check for extermly large spaning reads
    if (abs($spanEnd - $spanStart) > $indelSizeLimit ||
	abs($lineArray[PARTNER_OFFSET_IDX]) > $indelSizeLimit) {
	$exrtCnt++;
	next;
    }

    ## Stop checking if we past the last contig position by 
    ##  a fair amount
    last if ($maxContigEnd + $indelSizeLimit < $spanStart);

    for (my $ii = $spanStart; $ii < $spanEnd; $ii++) {
	if (defined($startMap{$ii})) {
	    my @ids = split(/;/, $startMap{$ii});
	    foreach my $id (@ids) {
		$startMatch{$id} = 1;
	    }
	}
	if (defined($endMap{$ii})) {
	    my @ids = split(/;/, $endMap{$ii});
	    foreach my $id (@ids) {
		$endMatch{$id} = 1;
	    }
	}
    }

    ##
    ## Now check for ids that are defined in both
    ##  start match and end match
    ##
    foreach my $idx (keys %startMatch) {
	if (defined($endMatch{$idx})) {
	    ## Store the max distance pair
	    if (! defined($conData[$idx][CON_PAIR]) ||
		$conData[$idx][CON_DIST] < $spanDist) {
		$conData[$idx][CON_DIST] = $spanDist;
		$conData[$idx][CON_PAIR] = [ @lineArray ];
	    }

	    ## Now update contig min and max starts
	    if (! defined($conData[$idx][CON_MINSTART]) ||
		$conData[$idx][CON_MINSTART] > $spanStart) {
		$conData[$idx][CON_MINSTART] = $spanStart;
	    }
	    if (! defined($conData[$idx][CON_MAXEND]) ||
		$conData[$idx][CON_MAXEND] < $spanEnd) {
		$conData[$idx][CON_MAXEND] = $spanEnd;
	    }

	    ## We should be able to do some error checking here
	    if ($spanStart > $conData[$idx][CON_START]) {
		print STDERR "[ERROR]: Span start is greater than mapped contig\n";
		print STDERR "[ERROR]: $spanStart > $conData[$idx][CON_START]\n";
		print STDERR "[ERROR]: Line: $line\n";
		print STDERR "[ERROR]: ";
		foreach my $val (@{$conData[$idx]}) {
		    print STDERR "$val\t";
		}
		print STDERR "\n";
		exit(1);
	    }
	    ## Check the end
	    if ($spanEnd + $readLen < $conData[$idx][CON_END]) {
		print STDERR "[ERROR]: Span end is less than mapped contig\n";
		print STDERR "[ERROR]: $spanEnd + $readLen < $conData[$idx][CON_END]\n";
		print STDERR "[ERROR]: Line: $line\n";
		print STDERR "[ERROR]: ";
		foreach my $val (@{$conData[$idx]}) {
		    print STDERR "$val\t";
		}
		print STDERR "\n";
		exit(1);
	    }

	    if (FALSE) {
		print STDERR "[HIT]: $spanStart > $conData[$idx][CON_START]\n";
		print STDERR "[HIT]: $spanEnd + $readLen < $conData[$idx][CON_END]\n";
		print STDERR "[HIT]: Line: $line\n";
		print STDERR "[HIT]: ";
		foreach my $val (@{$conData[$idx]}) {
		    print STDERR "$val\t";
		}
		print STDERR "\n\n";
	    }

	    $spanCnt++;
	}
    }
}
close(IN);

mssg("\# Done procssing sorted file, found $spanCnt spanning reads and $exrtCnt extreme spanning reads.\n");

open(OUT, ">$outputFile")
    or die "Could not open $outputFile for writing: $!";

open(ANN, ">$outputFile.anomalous")
    or die "Could not open $outputFile.anomalous for writing: $!";

my $inCnt = 0;
for (my $idx = 0; $idx < @conData; $idx++) {
    my $start = $conData[$idx][CON_MINSTART];
    my $end   = $conData[$idx][CON_MAXEND];

    ## We'll set the start and end to the originals for default now and
    ##  update if there is a spanning pair below.
    $start = $conData[$idx][CON_START];
    $end   = $conData[$idx][CON_END];

    ## Let's reset the span distance to max pair.
    ##  NOTE: This should only happen when a pair exits,
    ##  if one doesn't then the original dist is used
    if (defined($conData[$idx][CON_PAIR])) {
	if ($debug) {
	    print OUT "\# SPAN\t";
	    foreach my $val (@{$conData[$idx][CON_PAIR]}) {
		print OUT "$val\t";
	    }
	    print OUT "\n";
	}

	my $buf = int(0.05 * ($conData[$idx][CON_END] - $conData[$idx][CON_START]));

	my $start2 = $conData[$idx][CON_PAIR][MATCH_POS_IDX] - $buf;
	my $end2   = $conData[$idx][CON_PAIR][MATCH_POS_IDX] + $conData[$idx][CON_PAIR][PARTNER_OFFSET_IDX] +
	    $readLen + $buf;

	$start = $start2;
	$end   = $end2;

	## Only switch if it bigger, actually we don't want to do this...
	if ($start2 < $start) {
	    #$start = $start2;
	}
	if ($end2 > $end) {
	    #$end = $end2;
	}
    }

    if ($conData[$idx][CON_MINSTART] == $conData[$idx][CON_START] &&
	$conData[$idx][CON_MAXEND]   == $conData[$idx][CON_END]) {

	$inCnt++;
	mssg("\# INSERT: " .
	     ">contig=$conData[$idx][CON_ID]|group=$conData[$idx][CON_GROUP]|" .
	     "reads=$conData[$idx][CON_READS]|start=$start|end=$end\n");
    }

    ##
    ## Add a final check to throw spurious (i.e. > maxInsertSize) to
    ##  anamolous file.
    ##
    if ($end - $start > $indelSizeLimit) {
	print ANN ">contig=$conData[$idx][CON_ID]|group=$conData[$idx][CON_GROUP]|" .
	    "reads=$conData[$idx][CON_READS]|start=$start|end=$end\n";
	print ANN "$conData[$idx][CON_SEQ]\n";
    } else {
	print OUT ">contig=$conData[$idx][CON_ID]|group=$conData[$idx][CON_GROUP]|" .
	    "reads=$conData[$idx][CON_READS]|start=$start|end=$end\n";
	print OUT "$conData[$idx][CON_SEQ]\n";
    }
}
close(OUT);
close(ANN);

##
## Swpa to the original: THIS SHOULD BE REPLACED
##
$cmd = "cp $outputFile $contigFile";
system("$cmd");

mssg("\# Possible insert count: $inCnt\n");

##
## Subroutines
##
sub mssg
{
    my ($line) = @_;

    if ($verbose) {
        print STDERR "$line";
    }
}

## End of file
