#!/usr/bin/env perl

=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=head1 NAME

generateVariantsStats.pl - Produce {coverage,snps,indels}.summary.txt file.

=head1 SYNOPSIS

generateVariantsStats.pl options

Options:

=over 4

=item *

--projectDir=s

Path to casava project folder

=back

=head1 DESCRIPTION

=head1 DIAGNOSTICS

=head2 Exit status

0: successful completion
1: abnormal completion
2: fatal error

=head2 Errors

All error messages are prefixed with "ERROR: ".

=head2 Warnings

All warning messages generated by CASAVA are prefixed with "WARNING: ".

=head1 CONFIGURATION AND ENVIRONMENT

=head1 DEPENDENCIES

=over 4

=item Standard perl modules

strict, warnings, Getopt::Long, Pod::Usage, File::Spec

=item External perl modules

=item CASAVA perl modules

Casava::Common::Log,
Casava::PostAlignment::Sequencing::Config,
Casava::Common::IOLib

=back

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.

All documented features are fully implemented.

Please report problems to Illumina Technical Support (support@illumina.com)

Patches are welcome.

=head1 AUTHOR

Chris Saunders

=cut

use warnings FATAL => 'all';
use strict;

use Getopt::Long;
use Pod::Usage;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use Casava::Common::Log qw(errorExit);
use Casava::PostAlignment::Sequencing::Config qw(loadConfiguration
  %CONF_APP %CONF_PROJ %chrEnds isSpliceJunctionChrom readProjectParameters);

$ENV{PATH} = $1 if $ENV{PATH} =~ /^(.*)$/;

my $projectDir = "";
my $help       = 0;
my $man        = 0;

my $result     = GetOptions(
    "projectDir=s"   => \$projectDir,
    "help|h"         => \$help,
    'man'            => \$man) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2, -input => $1) if ($man and $0 =~ /(.*)/);
pod2usage("$0: --projectDir not specified.\n") unless $projectDir;

loadConfiguration($projectDir);

#############################################################
# Configuration
my $confDir           = $CONF_APP{dirConf};
my $confFile          = $CONF_APP{projectConf};
my $buildResultsDir   = $CONF_PROJ{dirBuildParsed};
my $minQsnp           = $CONF_PROJ{variantsSummaryMinQsnp};
my $snpsFileName      = "snps.txt";
my $snpsSummaryFileName = "snps.summary.txt";

my $isNoIndels      = ((defined $CONF_PROJ{variantsNoIndels}) and $CONF_PROJ{variantsNoIndels});
my $minQindel         = $CONF_PROJ{variantsSummaryMinQindel};
my $indelsFileName    = "indels.txt";
my $indelsSummaryFileName    = "indels.summary.txt";

my $covSummaryFileName       = "coverage.summary.txt";

#############################################################
my %buildChromsBinSizes   = ();
readProjectParameters( %buildChromsBinSizes, "BUILD_BIN_SIZES", $projectDir );

my @chroms = sort { $chrEnds{$b}->{length} <=> $chrEnds{$a}->{length} } keys(%buildChromsBinSizes);

my $statsPath = File::Spec->catdir( $projectDir, "stats" );

if (not (-e $statsPath and -d $statsPath) ) {
    errorExit "ERROR: Can't find CASAVA stats directory: $statsPath\n";
}



sub getsnpType {
    my ($ref,$maxGT) = @_;

    my ($gt0,$gt1) = split(//,$maxGT);

    return "hom" if($gt0 eq $gt1);
    return "het" if(($gt0 eq $ref) or ($gt1 eq $ref));
    return "nonref_het";
}



sub getLabelIndex {
    my ($clabels,$tlabel) = @_;
    my $col = 0;
    for (@{$clabels}) {
        return $col if ($tlabel eq $_);
        ++$col;
    }
    return undef;
}



sub checkCol {
    my ($col,$label,$file) = @_;
    errorExit "ERROR: Got data line without the position of '$label' column defined first in file: $file" unless defined $col;
}

my %covChrom  = ();
for my $chrom (@chroms) {
}

my $covChromFile = File::Spec->catfile( $statsPath, $covSummaryFileName );
open( my $COFH, "> $covChromFile" )
  or errorExit "ERROR: Could not open output file: $covChromFile\n";

my $is_first=1;
for my $chrom (@chroms){
    next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );

    my $chromDir = File::Spec->catdir($buildResultsDir, $chrom);
    my $covFile = File::Spec->catfile($chromDir, $covSummaryFileName);

    next if ( ! -e $covFile );
    open( my $covFH, "$covFile" )
        or errorExit "ERROR: Could not open coverage summary file: $covFile\n";

    while(<$covFH>) {
        next if((not $is_first) and /^#/);
        print $COFH $_;
    }
    close($covFH);

    $is_first=0;
}
close($COFH);



my %snpsChrom  = ();
for my $chrom (@chroms) {
    next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );

    my $chromDir = File::Spec->catdir($buildResultsDir, $chrom);
    my $snpsFile = File::Spec->catfile($chromDir, $snpsFileName);

    $snpsChrom{$chrom}->{hom} = 0;
    $snpsChrom{$chrom}->{het} = 0;
    $snpsChrom{$chrom}->{nonref_het} = 0;

    next if ( ! -e $snpsFile );
    open( my $snpsFH, "$snpsFile" )
        or errorExit "ERROR: Could not open small_variants snp file: $snpsFile\n";

    my $refCol;
    my $qsnpCol;
    my $maxGTCol;
    my $refLabel = 'ref';
    my $qsnpLabel = 'Q(snp)';
    my $maxGTLabel = 'max_gt|poly_site';

    my $isHeader = 1;
    my $isCol = 0;
    while(my $line=<$snpsFH>) {
        chomp $line;
        if( $isHeader ) {
            if ($line =~ /^#/) {
                if ( $line =~ /^#\$ COLUMNS\s+(.*)/) {
                    errorExit "ERROR: unexpected meta-data format in file: $snpsFile\n" if($isCol);

                    my @clabels = split (/\s+/, $1);
                    $refCol = getLabelIndex(\@clabels,$refLabel);
                    $qsnpCol = getLabelIndex(\@clabels,$qsnpLabel);
                    $maxGTCol = getLabelIndex(\@clabels,$maxGTLabel);
                    $isCol = 1;
                }
                next;
            }

            checkCol($refCol,$refLabel,$snpsFile);
            checkCol($qsnpCol,$qsnpLabel,$snpsFile);
            checkCol($maxGTCol,$maxGTLabel,$snpsFile);
            $isHeader = 0;
        }

        my @s = split("\t",$line);
        my $ref = $s[$refCol];
        my $qsnp = $s[$qsnpCol];
        my $maxGT = $s[$maxGTCol];
        $snpsChrom{$chrom}->{getsnpType($ref,$maxGT)}++ if($qsnp >= $minQsnp);
    }
    close($snpsFH);
}    # for

my $snpsChromFile = File::Spec->catfile( $statsPath, $snpsSummaryFileName );
open( my $OFH, "> $snpsChromFile" )
  or errorExit "ERROR: Could not open output file: $snpsChromFile\n";

print $OFH "# ** CASAVA reference sequence snp summary **\n";
print $OFH "#\$ SUMMARY_MIN_QSNP $minQsnp\n";
print $OFH "#\$ COLUMNS seq_name hom het het_nonref\n";

for my $chrom (@chroms)
{
    next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );
    printf $OFH "%s\t%s\t%s\t%s\n",$chrom,$snpsChrom{$chrom}->{hom},$snpsChrom{$chrom}->{het},$snpsChrom{$chrom}->{nonref_het};
}
close($OFH);


exit 1 if($isNoIndels);

sub getIndelType {
    my ($desc) = @_;
    if     ($desc =~ /_CONFLICT$/) {
        return 'none';
    } elsif($desc =~ /^[0-9]+I/) {
        return 'insertion';
    } elsif($desc =~ /^[0-9]+D/) {
        return 'deletion';
    } elsif($desc =~ /^BP_/) {
        return 'breakpoint';
    } else {
        return 'none';
    }
}


my %indelChrom      = ();
foreach my $chrom (@chroms) {
    my $indelFile = File::Spec->catdir( $buildResultsDir, $chrom, $indelsFileName);

    $indelChrom{$chrom}->{insertion} = 0;
    $indelChrom{$chrom}->{deletion} = 0;
    $indelChrom{$chrom}->{breakpoint} = 0;
    $indelChrom{$chrom}->{het} = 0;
    $indelChrom{$chrom}->{hom} = 0;

    next if ( ! -e $indelFile );
    open( my $IFILE, "$indelFile" )
        or errorExit "ERROR: Could not open indel file: $indelFile\n";

    my $typeCol = undef;
    my $qindelCol = undef;
    my $maxGTCol = undef;
    my $typeLabel = 'type';
    my $qindelLabel = 'Q(indel)';
    my $maxGTLabel = 'max_gtype';

    my $isHeader = 1;
    while(<$IFILE>) {
        my $line = $_;
        chomp $line;
        if( $isHeader ) {
            if ($line =~ /^#/) {
                if ( not defined $typeCol and $line =~ /^#\$ COLUMNS(.*)/) {
                    my $cstring = $1;
                    $cstring =~ s/^\s+//;
                    my @clabels = split (/\s+/, $cstring);
                    $typeCol = getLabelIndex(\@clabels,$typeLabel);
                    $qindelCol = getLabelIndex(\@clabels,$qindelLabel);
                    $maxGTCol = getLabelIndex(\@clabels,$maxGTLabel);
                }
                next;
            }

            checkCol($typeCol,$typeLabel,$indelFile);
            checkCol($qindelCol,$qindelLabel,$indelFile);
            checkCol($maxGTCol,$maxGTLabel,$indelFile);
            $isHeader = 0;
        }

        my @indelf = split("\t",$line);
        my $type = $indelf[$typeCol];
        my $qindel = $indelf[$qindelCol];
        my $maxGT = $indelf[$maxGTCol];
        my $itype = getIndelType($type);
        if( ($itype ne 'none')  and
            ($qindel >= $minQindel) ) {
            $indelChrom{$chrom}->{$itype}++;
            $indelChrom{$chrom}->{$maxGT}++;
        }
    }
    close($IFILE);
}    # foreach

my $indelsChromFile = File::Spec->catfile( $statsPath, $indelsSummaryFileName );
open( my $IOUT, "> $indelsChromFile" )
  or errorExit "ERROR: Could not open output file: $indelsChromFile\n";

print($IOUT "# ** CASAVA reference sequence indel summary **\n");
print($IOUT "#\$ SUMMARY_MIN_QINDEL $minQindel\n");
print($IOUT "#\$ COLUMNS seq_name insertion deletion breakpoint het hom\n");

foreach my $chrom (@chroms)
{
    next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );
    printf($IOUT "%s\t%s\t%s\t%s\t%s\t%s\n",
        $chrom,
        $indelChrom{$chrom}->{insertion},
        $indelChrom{$chrom}->{deletion},
        $indelChrom{$chrom}->{breakpoint},
        $indelChrom{$chrom}->{het},
        $indelChrom{$chrom}->{hom}
        );
}
close($IOUT);


1;
