#!/usr/bin/env perl

=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=head1 NAME

consolidatePairs.pl

=head1 SYNOPSIS

consolidatePairs.pl <pair1> <pair2> ... <pairN> > pair.xml

=back

=head1 DESCRIPTION

This script builds a single _pair.xml file for a given lane, based on 
a set of _pais.xml produced for a set of tiles 

=head1 DIAGNOSTICS

=head2 Exit status

0: successful completion
1: abnormal completion
2: fatal error

=head2 Errors

All error messages are prefixed with "ERROR: ".

=head2 Warnings

All warning messages generated by CASAVA are prefixed with "WARNING: ".

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.

All documented features are fully implemented.

Please report problems to Illumina Technical Support (support@illumina.com)

Patches are welcome.

=head1 AUTHOR

Mauricio Varea

=cut

use warnings FATAL => 'all';
use strict;

use Carp;
use XML::Simple;
use Data::Dumper;
use List::Util qw(reduce);

unless (@ARGV)
{
    print <<"END";
Usage: $0 [--int] <pair1> [<pair2> [... <pairN>]] > pair.xml
       (output goes to stdout)
Where --int is given only if you want to round all float values to integers.
END
    exit -1;
}

my $int = 0;
if ($ARGV[0] eq '--int')
{
    $int = 1;
    shift @ARGV;
}

my $output = {};
foreach my $inputFile (@ARGV)
{
    croak "ERROR: $inputFile: file does not exist"  unless -e $inputFile;
    my $input = XMLin($inputFile, SuppressEmpty => 1, ForceArray => 1)
              or croak "ERROR: couldn't load sub-pair.xml file: $!\n";

    foreach (qw(ControlParametersUsed InsertSize Length Orientation Pairs Reads))
    {
        carp "WARNING: no '$_' element in $inputFile.\n"  unless exists $input->{$_};
    }

    if (exists $input->{ControlParametersUsed})
    {
        # take the first ControlParametersUsed...
        $output->{ControlParametersUsed} = [@{$input->{ControlParametersUsed}}]
            unless exists $output->{ControlParametersUsed};
        # ...and check that all pair.xml have the same ControlParametersUsed section
        while (my ($k,$v) = each %{$input->{ControlParametersUsed}->[0]})
        {
            $output->{ControlParametersUsed}->[0]->{$k}->[0] = $v->[0]
                   unless defined $output->{ControlParametersUsed}->[0]->{$k}->[0];
            carp "WARNING: $inputFile: found a variation in element 'ControlParametersUsed/$k' "
               . "('$v->[0]' != '$output->{ControlParametersUsed}->[0]->{$k}->[0]')\n"
                 if defined $v->[0] && defined $output->{ControlParametersUsed}->[0]->{$k}->[0]
                 && ($v->[0] ne $output->{ControlParametersUsed}->[0]->{$k}->[0]);
        }
    }

    if (exists $input->{InsertSize} && exists $input->{Pairs}->[0]->{ClustersUsedToComputeInsert})
    {
        # Total Median is not Median of Median, so a good approximation is to go for (weighted) Mean of Medians
        unless (exists $output->{InsertSize})
        {
            $output->{InsertSize} = [@{$input->{InsertSize}}];
            foreach my $stat (qw(HighSD LowSD Max Median Min))
            {
                if (exists $output->{InsertSize}->[0]->{$stat})
                {
                    $output->{InsertSize}->[0]->{$stat}->[0] *= $input->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0];
                } else {
                    $output->{InsertSize}->[0]->{$stat}->[0] = 0;
                }
            }
        } else {
            foreach my $stat (qw(HighSD LowSD Max Median Min))
            {
                $output->{InsertSize}->[0]->{$stat}->[0] += $input->{InsertSize}->[0]->{$stat}->[0]
                                                          * $input->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0]
                                                  if exists $input->{InsertSize}->[0]->{$stat};
            }
        }
    }

    if (exists $input->{Length})
    {
        # take the first Length...
        $output->{Length} = [@{$input->{Length}}]  unless exists $output->{Length};
        # ...and check that all pair.xml have the same Length section
        foreach my $r (qw(Read1 Read2))
        {
            if (exists $input->{Length}->[0]->{$r})
            {
                while (my ($k,$v) = each %{$input->{Length}->[0]->{$r}->[0]})
                {
                    $output->{Length}->[0]->{$r}->[0]->{$k}->[0] = $v->[0]
                           unless defined $output->{Length}->[0]->{$r}->[0]->{$k}->[0];
                    carp "WARNING: $inputFile: found a variation in element 'Length/$r/$k' "
                       . "('$v->[0]' != '$output->{Length}->[0]->{$r}->[0]->{$k}->[0]')\n"
                         if defined $v->[0] && defined $output->{Length}->[0]->{$r}->[0]->{$k}->[0]
                         && ($v->[0] ne $output->{Length}->[0]->{$r}->[0]->{$k}->[0]);
                }
            } else {
                carp "WARNING: $inputFile: 'Length/$r' does not exist\n";
            }
        }
    }
    
    if (exists $input->{Orientation})
    {
        # Orientation is mainly aggregation of its sub-elements
        unless (exists $output->{Orientation})
        {
            $output->{Orientation} = [@{$input->{Orientation}}];
            foreach (qw(NominalOrientationButLargeInsert NominalOrientationButSmallInsert))
            {
                $output->{Orientation}->[0]->{$_}->[0] = 0
                         unless exists $output->{Orientation}->[0]->{$_};
            }
        } else {
            foreach my $o (qw(Fm Fp Rm Rp))
            {
                if (exists $output->{Orientation}->[0]->{$o})
                {
                    $output->{Orientation}->[0]->{$o}->[0] += $input->{Orientation}->[0]->{$o}->[0]
                                                    if exists $input->{Orientation}->[0]->{$o};
                } else {
                    $output->{Orientation}->[0]->{$o}->[0] = 0;
                }
            }
            foreach my $o (qw(Fm Fp Rm Rp))
            {
                $output->{Orientation}->[0]->{Nominal}->[0] = $o
                    if ($output->{Orientation}->[0]->{ $o }->[0] 
                      > $output->{Orientation}->[0]->{ $output->{Orientation}->[0]->{Nominal}->[0] }->[0]);
            }
            $output->{Orientation}->[0]->{NominalOrientationButLargeInsert}->[0]+= 
             $input->{Orientation}->[0]->{NominalOrientationButLargeInsert}->[0]
                     if exists $input->{Orientation}->[0]->{NominalOrientationButLargeInsert};
            $output->{Orientation}->[0]->{NominalOrientationButSmallInsert}->[0]+= 
             $input->{Orientation}->[0]->{NominalOrientationButSmallInsert}->[0]
                     if exists $input->{Orientation}->[0]->{NominalOrientationButSmallInsert};
        }
    }

    if (exists $input->{Pairs})
    {
        # Pairs is mainly aggregation of its sub-elements
        unless (exists $output->{Pairs})
        {
            $output->{Pairs} = [@{$input->{Pairs}}];
            foreach (qw(ClustersPassedFiltering ClustersTotal ClustersUsedToComputeInsert))
            {
                $output->{Pairs}->[0]->{$_}->[0] = 0  unless exists $output->{Pairs}->[0]->{$_};
            }
        } else {
            $output->{Pairs}->[0]->{ClustersPassedFiltering}->[0]     += $input->{Pairs}->[0]->{ClustersPassedFiltering}->[0]
                                                               if exists $input->{Pairs}->[0]->{ClustersPassedFiltering};
            $output->{Pairs}->[0]->{ClustersTotal}->[0]               += $input->{Pairs}->[0]->{ClustersTotal}->[0]
                                                               if exists $input->{Pairs}->[0]->{ClustersTotal};
            $output->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0] += $input->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0]
                                                               if exists $input->{Pairs}->[0]->{ClustersUsedToComputeInsert};
        }
    }

    if (exists $input->{Reads})
    {
        # Reads is purely aggregation of its sub-elements
        # (not worth a recursive implementation, as we know beforehand the depth of the elements we want to deal with)
        unless (exists $output->{Reads})
        {
            $output->{Reads} = [@{$input->{Reads}}]
        } else {
            my $reads = $input->{Reads}->[0];
            foreach my $i (keys %$reads) {
                    my $ii = $reads->{$i}->[0];
                    foreach my $j (keys %$ii) {
                            my $jj = $ii->{$j}->[0];
                            foreach my $k (keys %$jj) {
                                    my $kk = $jj->{$k}->[0];
                                    if ($kk =~ /hash/i)
                                    {
                                        foreach my $l (keys %$kk)
                                        {
                                            $output->{Reads}->[0]->{$i}->[0]->{$j}->[0]->{$k}->[0]->{$l}->[0] += $kk->{$l}->[0];
                                        }
                                    } else {
                                        $output->{Reads}->[0]->{$i}->[0]->{$j}->[0]->{$k}->[0] += $kk;
                                    }
                            }
                    }            
            }
        }
    }
}

# These calculations need some totals, hence they are placed after the main loop
if (exists $output->{Pairs})
{
    if ($output->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0])
    {
        if (exists $output->{InsertSize})
        {
            foreach my $stat (qw(HighSD LowSD Max Median Min))
            {
                $output->{InsertSize}->[0]->{$stat}->[0] /= $output->{Pairs}->[0]->{ClustersUsedToComputeInsert}->[0];
                $output->{InsertSize}->[0]->{$stat}->[0]  = sprintf("%.0f", $output->{InsertSize}->[0]->{$stat}->[0])  if $int;
            }
        }

        # percentages should not get rounded
        my $fmt = "%-5.5f";  # ($int ? "%.0f" : "%-5.5f");
        if (exists $output->{Orientation})
        {
            my $pairsTotal = reduce{our $a+ our $b} map {exists $output->{Orientation}->[0]->{$_} ? 
                $output->{Orientation}->[0]->{$_}->[0] : 0} (qw(Fm Fp Rm Rp));
            $output->{Orientation}->[0]->{NominalOrientationButLargeInsertPercent}->[0] = sprintf( $fmt,
                   100.0 * $output->{Orientation}->[0]->{NominalOrientationButLargeInsert}->[0] 
                         / $pairsTotal );
            $output->{Orientation}->[0]->{NominalOrientationButSmallInsertPercent}->[0] = sprintf( $fmt,
                   100.0 * $output->{Orientation}->[0]->{NominalOrientationButSmallInsert}->[0] 
                         / $pairsTotal );
            $output->{Orientation}->[0]->{NominalOrientationPercent}->[0] = sprintf( $fmt,
                   100.0 * $output->{Orientation}->[0]->{ $output->{Orientation}->[0]->{Nominal}->[0] }->[0] 
                         / $pairsTotal );
            $output->{Pairs}->[0]->{InitialUniquePairsPercent}->[0] = sprintf( $fmt,
                   100.0 * $pairsTotal / $output->{Pairs}->[0]->{ClustersTotal}->[0] );
        }

    } else {
        carp "WARNING: 0 clusters used to compute insert\n";
    }
}

print STDOUT "<?xml version='1.0' standalone='yes'?>\n";
print STDOUT XMLout($output, RootName => "ReadPairProperties"); #, NoSort => 1);

