#!/usr/bin/env perl
# PROJECT: CASAVA
# MODULE:  $RCSfile: runGrouperBin.pl,v $
# AUTHOR:  Chris Saunders
#
# Copyright 2009 Illumina, Inc
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).

use warnings FATAL => 'all';
use strict;

use Sys::Hostname;
use Carp;
use Getopt::Long;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use Casava::Common::Log;
use Casava::Common::IOLib qw(executeCmd);
use Casava::PostAlignment::Sequencing::Config qw(loadConfiguration
  %chrEnds %CONF_APP %CONF_PROJ);
use Casava::PostAlignment::Sequencing::SamLib qw(getSamtoolsBin);


my $argvStr     = join(' ', @ARGV);
my $cmdline     = "$0 $argvStr";
my $scriptName = (File::Spec->splitpath($0))[2];

my $projectDir        = "";
my $binId             = 0;
my $chrom             = "";
my $sampleRateSetSuffix = '';
my $help              = 0;

my $usage       =<<END;
$scriptName [options]
\t--projectDir   - project directory
\t--binId        - bin id
\t--chrom        - chromosome
\t-h|--help      - print this message
END


my $result            = GetOptions(
    "projectDir=s" => \$projectDir,
    "binId=s"      => \$binId,
    "chrom=s"      => \$chrom,
    "help|h"       => \$help
);

if ((not $result)  or $help) {
    errorExit "\n\n$usage";
}

errorExit "ERROR: Incorrect number of arguments\n$usage"
  unless ( $projectDir ne "" );

if(not ($binId =~ /^\d+$/)){
    errorExit "ERROR: Invalid binId: $binId\n";
}

if(@ARGV) {
    errorExit "ERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n";
}

{
    my $hostname          = hostname();
    printLog( "Running $hostname:[$cmdline]\n", 0);
}

loadConfiguration( $projectDir );


my $dirCurrentBuild  = $CONF_PROJ{dirBuildParsed};
my $chromDir         = File::Spec->catdir( $dirCurrentBuild, $chrom);
my $bamDir           = File::Spec->catdir( $chromDir, $CONF_APP{dirBam} );
my $dataSetSuffix    = $CONF_PROJ{dataSetSuffix};
my $binSize          = $CONF_PROJ{binSizeBuild};
my $casavaBinDir     = '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/bin';
my $libexecDir       = '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/libexec/CASAVA-1.8.2';
my $casavaBin        = File::Spec->catfile($casavaBinDir, 'CASAVA');
my $spanContigBin    = File::Spec->catfile($libexecDir, 'spanContigs.pl');

my $sdFlankWeight       = $CONF_PROJ{indelsSdFlankWeight};
my $spReadThresholdIndels = $CONF_PROJ{indelsSpReadThresholdIndels};
my $srasThreshold       = $CONF_PROJ{indelsSrasThreshold};
my $prasThreshold       = $CONF_PROJ{indelsPrasThreshold};
my $alignScoreThresh    = $CONF_PROJ{indelsAlignScoreThresh};
my $minGroupSize        = $CONF_PROJ{indelsMinGroupSize};
my $spReadThresholdClusters = $CONF_PROJ{indelsSpReadThresholdClusters};
my $minCoverage         = $CONF_PROJ{indelsMinCoverage};
my $minContext          = $CONF_PROJ{indelsMinContext};
my $isSaveTemp          = ((defined $CONF_PROJ{indelsSaveTempFiles}) and $CONF_PROJ{indelsSaveTempFiles});
my $isFilterUnanchored  = ((defined $CONF_PROJ{isIgnoreUnanchored}) and $CONF_PROJ{isIgnoreUnanchored});
my $grouperVerbosity  = 3;


sub reportMsg {
    my ($msg,$is_error) = @_;
    if($is_error) {
        errorExit "ERROR: $msg\n";
    } else  {
        logWarning($msg);
    }
}


sub checkNonEmptyFile {
    my ($file,$is_error) = @_;
    if        ( not -e $file ) {
        reportMsg("File \"$file\" does not exist.",$is_error);
    } elsif ( -z $file ) {
        reportMsg("File \"$file\" is empty.",$is_error);
    }
}


# if bin directory is missing, then skip this bin:
if(not -d $bamDir) {
    reportMsg("Path \"$bamDir\" is missing or does not refer to a directory.",1);
}

my $suffixStr = "";
if ( $dataSetSuffix ne "" ) {
    $suffixStr = "_$dataSetSuffix";
}


# find the bam reads file:
my $bamFileName="sorted.bam";
my $bamPath = File::Spec->catfile( $bamDir , $bamFileName );
checkNonEmptyFile($bamPath,0);
if( (not -e $bamPath) ) { exit(0); }


# find the reference seqeunce:
my $refSeqFile = $chrEnds{ $chrom }->{file};
checkNonEmptyFile($refSeqFile,1);


my $indelDir = File::Spec->catdir( $chromDir, "Indel" );
my $indelFile          = File::Spec->catfile( $indelDir, "indel_$binId$suffixStr.txt" );
my $statsFile          = File::Spec->catfile( $indelDir, "stats_$binId$suffixStr.txt" );
my $shadowFile         = File::Spec->catfile( $indelDir, "shadow_$binId$suffixStr.txt" );
my $shadowsAlignedFile = File::Spec->catfile( $indelDir, "shadows_aligned_$binId$suffixStr.txt" );
my $clustersFile       = File::Spec->catfile( $indelDir, "clusters_$binId$suffixStr.txt" );
my $mergedClustersFile = File::Spec->catfile( $indelDir, "merged_clusters_$binId$suffixStr.txt" );
my $contigFile         = File::Spec->catfile( $indelDir, "contig_$binId$suffixStr.fa" );
my $readsFile          = File::Spec->catfile( $indelDir, "reads_$binId$suffixStr.txt" );
my $alignFile          = File::Spec->catfile( $indelDir, "align_$binId$suffixStr.txt" );
my $alignedContigFile  = File::Spec->catfile( $indelDir, "aligned_contig_$binId$suffixStr.fa" );

my $binIdBinSize   = $binId * $binSize;
my $binIdBinSize1  = ( $binId + 1 ) * $binSize;
my $beginPos       = $binIdBinSize + 1;
my $endPos         = $binIdBinSize1;


my @tempFiles;

my $bamRegion = "$chrom:$beginPos-$endPos";


#############################################################################
# indel finder:
my $indelFinderCmd = "$casavaBin "
  . " --applicationType=IndelFinder"
  . " --inReadsPath=$bamPath"
  . " --spReadThreshold=$spReadThresholdIndels"
  . " --prasThreshold=$prasThreshold"
  . " --srasThreshold=$srasThreshold"
  . " --summaryFilePath=$statsFile"
  . " --refSequence=$refSeqFile"
  . " --bamRegion=$bamRegion"
  . " --outputFilePath=$shadowFile";

$indelFinderCmd .= " --filterUnanchored" if($isFilterUnanchored);

# grouper paper:
#  . " --spReadThreshold=10" #$spReadThresholdIndels"

$indelFinderCmd .= " > $indelFile";

executeCmd( $indelFinderCmd, $grouperVerbosity );
push @tempFiles, $shadowFile;


#############################################################################
# align cand:
my $alignCandIndelReadsCmd = "$casavaBin --applicationType=AlignCandIndelReads"
  . " --inReadsPath=$shadowFile"
  . " --sampleStatsPath=$statsFile"
  . " --sdFlankWeight=$sdFlankWeight"
  . " --alignScoreThresh=$alignScoreThresh"
  . " --refSequence=$refSeqFile"
  . " --outputFilePath=$shadowsAlignedFile";

# grouper paper:
#  . " --sdFlankWeight=10"#$sdFlankWeight"
#  . " --alignScoreThresh=300"#$alignScoreThresh"

executeCmd( $alignCandIndelReadsCmd, $grouperVerbosity );
push @tempFiles, $shadowsAlignedFile;


#############################################################################
# cluster finder:
my $clusterFinderCmd = "$casavaBin --applicationType=ClusterFinder"
  . " --minGroupSize=$minGroupSize"
  . " --inReadsPath=$shadowsAlignedFile"
  . " --summaryFilePath=$statsFile"
  . " --outputFilePath=$clustersFile"
  . " --spReadThreshold=$spReadThresholdClusters";

# grouper paper:
#  . " --maxDistance=15"

executeCmd( $clusterFinderCmd, $grouperVerbosity );
push @tempFiles, $clustersFile;


#############################################################################
# cluster merger:
my $clusterMergerCmd = "$casavaBin --applicationType=ClusterMerger"
  . " --rawClustersFilePath='$clustersFile'"
  . " --outputFilePath='$mergedClustersFile'";

executeCmd( $clusterMergerCmd, $grouperVerbosity );
push @tempFiles, $mergedClustersFile;


#############################################################################
# small assember:
my $smallAssemblerCmd = "$casavaBin --applicationType=SmallAssembler"
  . " --min-coverage=$minCoverage"
  . " --inReadsPath=$mergedClustersFile"
  . " --outputFilePath=$contigFile"
  . " --outputReadsFilePath=$readsFile";

# grouper paper:
#  . " --word-length=37" (default)
#  . " --max-word-length=47" (default)
#  . " --min-coverage=1"#$minCoverage" (default)
#  . " --min-seed-reads=2"

executeCmd( $smallAssemblerCmd, $grouperVerbosity );


#############################################################################
# span contigs:
my $spanContigCmd = "$spanContigBin -c $contigFile -s $shadowFile -t shadow ";

executeCmd( $spanContigCmd, $grouperVerbosity );


#############################################################################
# align contigs:
my $alignContigCmd = "$casavaBin --applicationType=AlignContig"
  . " --refSequence=$refSeqFile"
  . " --variant"
  . " --report"
  . " --min-context=$minContext"
  . " --contigFilePath=$contigFile"
  . " --outputFilePath=$alignFile"
  . " --alignedContigsPath=$alignedContigFile"
  # " --score-gap-open=24 --score-gap-extend=0 --min-score=0"
  #. " --score-gap-open=12 --score-gap-extend=0 --min-score=0"
  . " --score-gap-open=6 --score-gap-extend=0 --min-score=0"
  . " --score-match=1 --score-mismatch=2";

# grouper paper:
#  . " --score-match=2"
#  . " --score-mismatch=5"
#  . " --score-gap-open=30"
#  . " --score-gap-extend=0"
#  . " --min-score=0"

executeCmd( $alignContigCmd, $grouperVerbosity );


unlink(@tempFiles) if(not $isSaveTemp);

printLog( "$0 Finished\n", 0);
