#!/usr/bin/env perl
# PROJECT: CASAVA
# MODULE:  $RCSfile: generateStats.pl,v $
# AUTHOR:  Richard Carter, Lukasz Szajkowski
#
# Copyright (c) 2008 Illumina
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).
#
# This script need lot of refactoring !!!
#
use warnings FATAL => 'all';
use strict;
use POSIX qw(strftime);

use IO::File;
use Sys::Hostname;
use Getopt::Long;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use Casava::Common::Log;
use Casava::PostAlignment::Sequencing::Config qw(loadConfiguration
  %CONF_APP %CONF_PROJ %CONF_RUN %runsConfig %chrEnds isSpliceJunctionChrom readProjectParameters);
use Casava::Common::IOLib
  qw(executeCmd genId createDirs
  createDirs file2table table2file table2fileEx);
use Casava::Common::Template;


my $argvStr           = join ' ', @ARGV;
my $cmdline           = "$0 $argvStr";

my $usage = "generateStats.pl [options]\n" 
. "\t--projectDir - Project directory\n"
. "\t--sampleRateSetSuffix - suffix to be added to the input and output file names\n"
. "\t                      for coverage simulation support"
. "\t--target     - Target which part of the script to run\n"
;
my $projectDir = "";
my $help       = 0;
my $target     = 'all';
my $sampleRateSetSuffix = '';

my $result     = GetOptions(
	'projectDir|p=s' => \$projectDir,
	'target=s'       => \$target,
	'sampleRateSetSuffix=s' => \$sampleRateSetSuffix,
	'help|h'         => \$help
);

errorExit "ERROR: Incorrect number of arguments\n$usage" if ( $projectDir eq "" );
loadConfiguration($projectDir);
#############################################################
# Configuration
my $cmd               = "";
my $timeStampFormat   = $CONF_APP{formatTimeStamp};
my $hostname          = hostname();
my $projectResultsDir = $CONF_PROJ{dirBuildExport};
my $buildResultsDir   = $CONF_PROJ{dirBuildParsed};
my $projectBinSize    = $CONF_PROJ{binSizeProject};

#############################################################
if ($help) {
	print $usage;
	exit(0);
}
printLog(
	"Running $hostname:[$cmdline] at "
	  . ( strftime $timeStampFormat, localtime ) . "\n",
	5
);
my %coverageFields = (
	position => 0,
	chrom    => 1,
	binId    => 2,
	binSize  => 3,
	positionsCovered    => 4,
	coverage => 5,
);
my %snpStatsFields = (
	'position'      => 0,
	'chrom'         => 1,
	'binId'         => 2,
	'het1Count'     => 3,
	'het2Count'     => 4,
	'otherHetCount' => 5,
	'diffCount'     => 6,
);

my %projectChromsBinSizes = ();
my %buildChromsBinSizes   = ();
readProjectParameters( %projectChromsBinSizes, "PROJECT_BIN_SIZES", $projectDir);
readProjectParameters( %buildChromsBinSizes, "BUILD_BIN_SIZES", $projectDir);

my @chromsProject = sort keys %projectChromsBinSizes;

my $statsPath = File::Spec->catdir( $projectDir, "stats" );

if ( $target eq 'configure' || $target eq 'all' ) {
	my @dirs = ('stats');
	createDirs( $projectDir, @dirs );
}
if ( $target eq 'duplicates' ) {
    my %stats = ();
    $stats{isData} = 0;
    for my $chrom (@chromsProject) {
        my $maxNumberOfBins = $projectChromsBinSizes{$chrom};
        printLog( "processing $chrom\n", 6 );
        for ( my $i = 0 ; $i < $maxNumberOfBins ; $i++ ) {
            my $binId = sprintf "%04d", $i;
            my $srcDir = File::Spec->catdir( $projectResultsDir, $chrom, $binId );
            my $dupCountFile = File::Spec->catfile( $srcDir, 'dupCount.txt' );
            $stats{$chrom}{isData} = 0;
            if ( -e $dupCountFile ) {
                printLog( "processing $dupCountFile\n", 6 );
                open( DUP, "<$dupCountFile" )
                  || errorExit "ERROR: Could not open $dupCountFile $!\n";
                while (<DUP>) {
                    if (/(\d+) reads/) {
                        $stats{$chrom}{$binId}{chrTotal} = $1;
                        $stats{$chrom}{chrTotal} += $1;
                    }    # if
                    elsif (/(\d+) were not marked/) {
                        $stats{$chrom}{$binId}{chrOk} = $1;
                        $stats{$chrom}{chrOk} += $1;
                    }    # elsif
                    elsif (/(\d+) were marked/) {
                        $stats{$chrom}{$binId}{chrDup} = $1;
                        $stats{$chrom}{chrDup} += $1;
                    }    # elsif
                }    # while
                close(DUP);
                $stats{$chrom}{$binId}{isData} = 1;
                $stats{$chrom}{isData}         = 1;
                $stats{isData}                 = 1;
            }    # if
            else {
                $stats{$chrom}{$binId}{isData}     = 0;
                $stats{$chrom}{$binId}{chrDup}     = 0;
                $stats{$chrom}{$binId}{chrOk}      = 0;
                $stats{$chrom}{$binId}{chrTotal}   = 0;

                #$stats{isData} = 0;
                printLog( "Warn: no data for $dupCountFile\n", 7 );
            }    # else
        }    # for
        # if ( $stats{$chrom}{isData} > 0 ) {
        # 	if ( defined $stats{$chrom}{chrTotal} ) {
        # 		$stats{chrTotal}   += $stats{$chrom}{chrTotal};
        # 		$stats{chrOk}      += $stats{$chrom}{chrOk};
        # 		$stats{chrDup}     += $stats{$chrom}{chrDup};
        # 	}    # if
        # }    # if
        printLog( "$chrom processed\n", 6 );
    }    # for
    printLog( "finished reading duplicates\n", 6 );
    my @table = ();
    if ( defined $stats{isData} && $stats{isData} > 0 ) {
        my @chrom_by_size = sort {$chrEnds{$b}->{length} <=> $chrEnds{$a}->{length}} keys %chrEnds;
        for my $chrom (@chrom_by_size)
        {
            my @row        = ();
            if ( defined $stats{$chrom}{chrTotal} ) {
                my $fmtChrOk    = 0;
                if ( defined $stats{$chrom}{chrOk} ) {
                    $fmtChrOk = $stats{$chrom}{chrOk};
                }    # if
                my $fmtChrDup = 0;
                if ( defined $stats{$chrom}{chrDup} ) {
                    $fmtChrDup = $stats{$chrom}{chrDup};
                }    # if
                push @row, $chrom;
                push @row, $fmtChrOk;
                push @row, $fmtChrDup;
            }    # if
            push @table, \@row;
        }    # foreach
        my $header =
          "# ** CASAVA reference sequence duplicate summary **\n" .
          "#\$ COLUMNS seq_name nonduplicate_pairs duplicate_pairs\n";
        table2fileEx( @table,
                      File::Spec->catfile( $statsPath, 'dupCount.summary.txt') , $header );
		# @table = ();
		# foreach my $chrom (@chrom_by_size)
		# {
		# 	$maxNumberOfBins = $projectChromsBinSizes{$chrom};
		# 	foreach ( my $i = 0 ; $i < $maxNumberOfBins ; $i++ ) {
		# 		my $chrTotal   = -1;
		# 		my $chrDup     = -1;
		# 		my $chrOk      = -1;
		# 		my @row        = ();
		# 		my $binId      = sprintf "%04d", $i;
		# 		printLog( "processing 2 $chrom/$binId\n", 6 );
		# 		if ( defined $stats{$chrom}{$binId}{chrTotal} ) {
		# 			my $fmtChrTotal = $stats{$chrom}{$binId}{chrTotal};
		# 			my $fmtChrOk    = 0;
		# 			if ( defined $stats{$chrom}{$binId}{chrOk} ) {
		# 				$fmtChrOk = $stats{$chrom}{$binId}{chrOk};
		# 			}
		# 			my $fmtChrDup = 0;
		# 			if ( defined $stats{$chrom}{$binId}{chrDup} ) {
		# 				$fmtChrDup = $stats{$chrom}{$binId}{chrDup};
		# 			}
		# 			push @row, $chrom;
		# 			push @row, $binId;
		# 			push @row, $fmtChrTotal;
		# 			push @row, $fmtChrOk;
		# 			push @row, $fmtChrDup;
		# 		}
		# 		push @table, \@row;
		# 	}
		# }
		# printLog( "save to dupCount.bin.txt\n", 6 );

		# table2file( @table,
		# 	File::Spec->catfile( $statsPath, 'dupCount.bin.txt' ) );
    }
    else {
        printLog( "Warn: no data for dupCount stats\n", 6 );
    }
}    # if

if ( $target eq 'coverage'  ) 
{
	my $coverageFileOut = File::Spec->catfile( $statsPath, "coverage$sampleRateSetSuffix.txt" );
	my $coverageChromFileOut = File::Spec->catfile( $statsPath, "coverage.chrom$sampleRateSetSuffix.txt" );
	my @coverageTable = ();
	my %coverage      = ();
	foreach my $chrom (@chromsProject)
	{
		my $maxNumberOfBins = $buildChromsBinSizes{$chrom};
		foreach ( my $i = 0 ; $i < $maxNumberOfBins ; $i++ ) {
			my $binId = sprintf "%04d", $i;

			#printLog( "processing $chrom/$binId\n", 5 );
			my $srcDir =
			  File::Spec->catdir( $buildResultsDir, $chrom, $binId );
			my $coverageFileIn =
			  File::Spec->catfile( $srcDir, "coverage$sampleRateSetSuffix.txt" );
			if ( -e $coverageFileIn ) {
				my @tableIn = ();
				file2table( @tableIn, $coverageFileIn );
				foreach my $rowRef (@tableIn) {

					#my $str = join ',', @{$rowRef};
					if ( $rowRef->[ $coverageFields{position} ] eq 'Total' )
					{
						$coverage{ $rowRef->[ $coverageFields{chrom} ] }
						  ->{positionsCovered} += $rowRef->[ $coverageFields{positionsCovered} ];
						$coverage{ $rowRef->[ $coverageFields{chrom} ] }
						  ->{coverage} +=
						  $rowRef->[ $coverageFields{coverage} ];
					}
					else {
					}
					push @coverageTable, \@{$rowRef};
				}    # foreach
			}    # if
		}    # foreach
	}    # foreach
	my @coverageChromTable = ();
	foreach my $chrom (
		sort { $chrEnds{$b}->{length} <=> $chrEnds{$a}->{length} }
		keys %chrEnds
	  )
	{
		next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );
		next if ( !defined $coverage{$chrom}->{positionsCovered} );
		my $refSeqFile = $chrEnds{$chrom}->{file};
		$coverage{$chrom}->{chromosomeLength} = $chrEnds{$chrom}->{length};
		$coverage{$chrom}->{knownBases} = $chrEnds{$chrom}->{knownBases};
		my @chromRow = (
			$chrom,
			$coverage{$chrom}->{positionsCovered},
			$coverage{$chrom}->{coverage},
			$coverage{$chrom}->{chromosomeLength},
			$coverage{$chrom}->{knownBases}
		);
		push @coverageChromTable, \@chromRow;
	}
	table2fileEx( @coverageChromTable, $coverageChromFileOut, "#\$ COLUMNS chrom\tpositionsCovered\tcoverage\tchromosomeLength\tknownBases\n" );
	table2fileEx( @coverageTable,      $coverageFileOut, "#\$ COLUMNS position\tchrom\tbinId\tbinSize\tpositionsCovered\tcoverage\n" );
}    # if

if ( $target eq 'snp' || $target eq 'coverageSimulation' ) {
    my $snpFileOut = File::Spec->catfile( $statsPath, "snpStats$sampleRateSetSuffix.txt" );
    my $snpChromFileOut = File::Spec->catfile( $statsPath, "snpChromStats$sampleRateSetSuffix.txt" );
    my @coverageTable = ();
    my %snpChrom      = ();
    foreach my $chrom (@chromsProject) {
    	my $maxNumberOfBins = $buildChromsBinSizes{$chrom};
    	foreach ( my $i = 0 ; $i < $maxNumberOfBins ; $i++ ) {
    		my $binId = sprintf "%04d", $i;
    
    		# printLog( "processing $chrom/$binId\n", 5 );
    		my $srcDir =
    		  File::Spec->catdir( $buildResultsDir, $chrom, $binId );
    		my $coverageFileIn =
    		  File::Spec->catfile( $srcDir, "snpStats$sampleRateSetSuffix.txt" );
    		if ( -e $coverageFileIn ) {
    			my @tableIn = ();
    			file2table( @tableIn, $coverageFileIn );
    			foreach my $rowRef (@tableIn) {
    				if ( $rowRef->[ $snpStatsFields{position} ] eq 'Total' )
    				{
    
    					#my $str = join ',', @{$rowRef};
    					#print "$str\n";
    					$snpChrom{ $rowRef->[ $snpStatsFields{chrom} ] }
    					  ->{het1Count} +=
    					  $rowRef->[ $snpStatsFields{het1Count} ];
    					$snpChrom{ $rowRef->[ $snpStatsFields{chrom} ] }
    					  ->{het2Count} +=
    					  $rowRef->[ $snpStatsFields{het2Count} ];
    					$snpChrom{ $rowRef->[ $snpStatsFields{chrom} ] }
    					  ->{otherHetCount} +=
    					  $rowRef->[ $snpStatsFields{otherHetCount} ];
    					$snpChrom{ $rowRef->[ $snpStatsFields{chrom} ] }
    					  ->{diffCount} +=
    					  $rowRef->[ $snpStatsFields{diffCount} ];
    				}
    				else {
    				}
    				push @coverageTable, \@{$rowRef};
    			}    # foreach
    		}    # if
    	}    # foreach
    }    # foreach
    my @snpChromTable = ();
    foreach my $chrom (
    	sort { $chrEnds{$b}->{length} <=> $chrEnds{$a}->{length} }
    	keys %chrEnds
      )
    {
    	next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );
    
    	next if ( !defined $snpChrom{$chrom}->{het1Count} );
    	my @chromRow         = (
    		$chrom,
    		$snpChrom{$chrom}->{het1Count},
    		$snpChrom{$chrom}->{het2Count},
    		$snpChrom{$chrom}->{otherHetCount},
    		$snpChrom{$chrom}->{diffCount},
    	);
    	push @snpChromTable, \@chromRow;
    }
    table2file( @snpChromTable, $snpChromFileOut );
    table2file( @coverageTable, $snpFileOut );
}    # if




if ( $target eq 'coverage4insert' ) {
	foreach my $chrom (@chromsProject) {
		my $srcDir = '';
		my $maxNumberOfBins = $projectChromsBinSizes{$chrom};
		printLog( "processing $chrom\n", 0 );
		my %chrCoverage;
		my $lastBasePrinted;
		my $covFile = File::Spec->catfile( $srcDir, 'sort.txt' );
		open( RES, ">>$projectResultsDir/$chrom/cover.txt" )
		  || errorExit "ERROR: Could not open $projectResultsDir $!\n";
		foreach ( my $i = 0 ; $i < $maxNumberOfBins ; $i++ ) {
			my $binId = sprintf "%04d", $i;
			$srcDir = File::Spec->catdir( $projectResultsDir, $chrom, $binId );
			my $sortFile = File::Spec->catfile( $srcDir, 'sort.txt' );

			# have a look at the normal reads
			if ( -e $sortFile ) {
				open( SRT, "<$sortFile" )
				  or errorExit "ERROR: Could not open file $sortFile $!\n";
				while (<SRT>) {
					my @temp = split /\s+/, $_;
					my ( $start, $end );
					if ( ( $temp[5] eq "F" ) && ( $temp[17] eq "R" ) ) {
						$start = $temp[4];
						$end   = $temp[16] + length( $temp[13] );
					}    # if
					elsif ( ( $temp[5] eq "R" ) && ( $temp[17] eq "F" ) ) {
						$start = $temp[16];
						$end   = $temp[4] + length( $temp[1] );
					}    #elsif
					foreach my $ele ( $start .. $end ) {
						$chrCoverage{$ele}++;
					}    # foreach
				}    # while
				close(SRT);
			}    # if
			my $thisPrintEnd =
			  ( $binId * $projectBinSize ) + ( $projectBinSize / 2 );
			foreach my $cov ( sort { $a <=> $b } keys %chrCoverage ) {
				last if ( $_ > $thisPrintEnd );
				printf RES " %10d     |  %6d\n", $cov, $chrCoverage{$cov};
				delete $chrCoverage{$cov};
				$lastBasePrinted = $cov;
			}    # foreach
		}
		foreach my $cov ( sort { $a <=> $b } keys %chrCoverage ) {
			errorExit "ERROR: Found some more coverage for $cov "
			  . "which we have already printed\n"
			  if ( $cov < $lastBasePrinted );
			printf RES " %10d     |  %6d\n", $cov, $chrCoverage{$cov};
		}    # foreach
		close(RES);
	}
}    # if

