#!/usr/bin/env perl
# PROJECT: CASAVA
# MODULE:  $RCSfile: callSmallVariants.pl,v $
# AUTHOR:  Chris Saunders
#
# Copyright 2009 Illumina, Inc
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).

use warnings FATAL => 'all';
use strict;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use IO::File;
use Sys::Hostname;
use Carp;
use Getopt::Long;
use File::Temp;
use File::Copy qw(move);

use Casava::Common::Log;
use Casava::Common::IOLib qw(executeCmd);
use Casava::PostAlignment::Sequencing::Config qw(loadConfiguration
  %chrEnds %CONF_APP %CONF_PROJ isSpliceJunctionChrom readProjectParameters inputBltQC);
use Casava::PostAlignment::Sequencing::SamLib qw(getSamtoolsBin getChangeChrLabelType changeChrLabel getRefSizeFH);


my $argvStr     = join ' ', @ARGV;
my $cmdline     = "$0 $argvStr";
my $scriptName = (File::Spec->splitpath($0))[2];

my $projectDir        = "";
my $binId             = 0;
my $chrom             = "";
my $sampleRateSetSuffix = '';
my $help              = 0;

my $usage       =<<END;
$scriptName [options]
\t--projectDir   - project directory
\t--binId        - bin id
\t--chrom        - chromosome
\t-h|--help      - print this message
END


my $result            = GetOptions(
    "projectDir=s" => \$projectDir,
    "binId=s"      => \$binId,
    "chrom=s"      => \$chrom,
    "help|h"       => \$help
);

if ((not $result)  or $help) {
    errorExit "\n\n$usage";
}

errorExit "ERROR: Incorrect number of arguments\n$usage"
  unless ( $projectDir ne "" );

if(not ($binId =~ /^\d+$/)){
    errorExit "ERROR: Invalid binId: $binId\n";
}

errorExit "ERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n" if(@ARGV);

{
    my $hostname          = hostname();
    printLog( "Running $hostname:[$cmdline]\n", 0);
}

loadConfiguration( $projectDir );

inputBltQC(%CONF_PROJ);

my $minQbasecall = $CONF_PROJ{variantsMinQbasecall};
if(($CONF_PROJ{qualityType} eq "Phred64") and
   ($minQbasecall < 0)) {
    $minQbasecall = 0;
}

################################################################################
# Configuration
my $confDir           = $CONF_APP{dirConf};
my $confFile          = $CONF_APP{projectConf};
my $bamDirName        = $CONF_APP{dirBam};
my $binSize           = $CONF_PROJ{binSizeBuild};
my $dirCurrentBuild   = $CONF_PROJ{dirBuildParsed};
my $readMode          = $CONF_PROJ{readMode};
my $chromDir          = File::Spec->catdir( $dirCurrentBuild, $chrom);
my $inputDir          = File::Spec->catdir( $chromDir, $binId);
my $bam_f             = 'sorted.bam';
my $sites_txt_f       = "sites$sampleRateSetSuffix.txt";
my $snps_txt_f        = "snps.unfiltered$sampleRateSetSuffix.txt";
my $indels_txt_f      = "indels.unfiltered$sampleRateSetSuffix.txt";

my $binDir            = '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/bin';
my $libexecDir        = '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/libexec/CASAVA-1.8.2';

my $samtoolsBin      = getSamtoolsBin(%CONF_PROJ);
my $finishBin           = "python ". File::Spec->catfile($binDir, "finish_variant_files.py");
my $starlingBin        = File::Spec->catfile( $libexecDir , 'starling' );

my $binIdBinSize      = $binId * $binSize;
my $binIdBinSize1     = ( $binId + 1 ) * $binSize;

my $isNoReadTrim      = ((defined $CONF_PROJ{variantsNoReadTrim}) and $CONF_PROJ{variantsNoReadTrim});

my $isPaired          = ($readMode eq 'paired');

my $isNoIndels        = ((defined $CONF_PROJ{variantsNoIndels}) and $CONF_PROJ{variantsNoIndels});
my $isSkipContigs     = ((defined $CONF_PROJ{variantsSkipContigs}) and $CONF_PROJ{variantsSkipContigs});
$isSkipContigs        = ($isSkipContigs or (not $isPaired));

my $isWriteRealigned  = ((defined $CONF_PROJ{variantsWriteRealigned}) and $CONF_PROJ{variantsWriteRealigned});

my $isNoSites         = ((defined $CONF_PROJ{variantsNoSitesFiles}) and $CONF_PROJ{variantsNoSitesFiles});
my $isAddBacon        = ((defined $CONF_PROJ{variantsAddBacon}) and $CONF_PROJ{variantsAddBacon});

my $minPEScore        = $CONF_PROJ{QVCutoff};
#my $minPEScore        = $CONF_PROJ{variantsMinPEMapScore};
my $minSEScore        = $CONF_PROJ{QVCutoffSingle};
#my $minSEScore        = $CONF_PROJ{variantsMinSEMapScore};
my $isSEScoreRescue   = ((defined $CONF_PROJ{variantsSEMapScoreRescue}) and $CONF_PROJ{variantsSEMapScoreRescue});

my $isIndyModel   = ((defined $CONF_PROJ{variantsIndependentErrorModel}) and $CONF_PROJ{variantsIndependentErrorModel});

my $maxIndelSize  = ($isPaired ? $CONF_PROJ{variantsMaxIndelSizePEDefault} : $CONF_PROJ{variantsMaxIndelSizeSEDefault} );
$maxIndelSize  = ((defined $CONF_PROJ{variantsMaxIndelSize}) ? $CONF_PROJ{variantsMaxIndelSize} : $maxIndelSize );

my $isFilterUnanchored  = ((defined $CONF_PROJ{isIgnoreUnanchored}) and $CONF_PROJ{isIgnoreUnanchored});

################################################################################
my %buildChromsBinSizes = ();
readProjectParameters( %buildChromsBinSizes, "BUILD_BIN_SIZES", $projectDir);
my @buildChroms = keys %buildChromsBinSizes;


sub reportMsg {
    my ($msg,$is_error) = @_;
    if($is_error) {
        errorExit "ERROR: $msg\n";
    } else  {
        logWarning($msg);
    }
}


sub checkNonEmptyFile {
    my ($file,$is_error) = @_;
    if        ( not -e $file ) {
        reportMsg("File \"$file\" does not exist.",$is_error);
    } elsif ( -z $file ) {
        reportMsg("File \"$file\" is empty.",$is_error);
    }
}

# if bin directory is missing, then skip allele calling in this bin:
if(not -e $inputDir) {
    reportMsg("Bin directory \"$inputDir\" does not exist.",0);
    exit(0);
}

if(not -d $inputDir) {
    reportMsg("Path \"$inputDir\" does not refer to a directory.",1);
}

# Find the input files:
my $bamFile = File::Spec->catfile( $chromDir, $bamDirName, $bam_f );
checkNonEmptyFile($bamFile,0);

# find the reference seqeunce:
my $refSeqFile = $chrEnds{ $chrom }->{file};
checkNonEmptyFile($refSeqFile,1);

# translate casava configuration into the corresponding starling variant-caller commands:
#
my $sitesFile = File::Spec->catfile( $inputDir, $sites_txt_f );
my $snpsFile = File::Spec->catfile( $inputDir, $snps_txt_f );
my $vFlags = " -bsnp-diploid $CONF_PROJ{variantsSnpTheta}";
$vFlags .= " -bsnp-diploid-file $snpsFile -clobber";
if(not $isNoSites) {
    $vFlags .= " -bsnp-diploid-allele-file $sitesFile";
}

$vFlags .= " -min-paired-align-score $minPEScore";
$vFlags .= " -min-single-align-score $minSEScore";

if($isSEScoreRescue){
    $vFlags .= " -single-align-score-rescue-mode";
}

if( (defined $CONF_PROJ{variantsIncludeSingleton}) and $CONF_PROJ{variantsIncludeSingleton} ){
    $vFlags .= " -include-singleton";
}

if( (defined $CONF_PROJ{variantsIncludeAnomalous}) and $CONF_PROJ{variantsIncludeAnomalous} ){
    $vFlags .= " -include-anomalous";
}

if(not $isIndyModel) {
    $vFlags .= " -bsnp-ssd-no-mismatch $CONF_PROJ{variantsSiteErrorDepNoMismatch}";
    $vFlags .= " -bsnp-ssd-one-mismatch $CONF_PROJ{variantsSiteErrorDepDefault}";
    if ( defined($CONF_PROJ{variantsMinVexp}) ) {
        $vFlags .= " -min-vexp $CONF_PROJ{variantsMinVexp}";
    }

    if ( defined($CONF_PROJ{variantsVexpIter}) ) {
        $vFlags .= " -max-vexp-iterations $CONF_PROJ{variantsVexpIter}";
    }
}

if ( $CONF_PROJ{singleScoreForPE} eq "YES") {
    $vFlags .= " -single-align-score-exclude-mode";
}
$vFlags .= " -min-qscore $minQbasecall";
if ( $CONF_PROJ{qualityType} eq "Solexa64" ) {
    $vFlags .= " -qlogodds";
}

$vFlags .= " -filter-unanchored" if($isFilterUnanchored);

my $beginPos=$binIdBinSize+1;
my $endPos=$binIdBinSize1;
$vFlags .= " -report-range-begin $beginPos -report-range-end $endPos";
$vFlags .= " -single-seq-reference $refSeqFile";

my $isMDFilter=(($CONF_PROJ{variantsMDFilterCount}>=0) and
                ($CONF_PROJ{variantsMDFilterFlank}>=0));
if ( $isMDFilter ){
    $vFlags .= " -max-window-mismatch $CONF_PROJ{variantsMDFilterCount} $CONF_PROJ{variantsMDFilterFlank}"
}

if ( defined($CONF_PROJ{variantsSnpMaxFilterFrac}) ) {
    $vFlags .= " -snp-max-basecall-filter-fraction $CONF_PROJ{variantsSnpMaxFilterFrac}";
}

if( (defined $CONF_PROJ{variantsPrintAllGT}) and $CONF_PROJ{variantsPrintAllGT} ){
    $vFlags .= " -print-all-poly-gt";
}

if( (not defined $CONF_PROJ{variantsPrintUsedAlleleCounts}) or $CONF_PROJ{variantsPrintUsedAlleleCounts} ){
    $vFlags .= " -print-used-allele-counts";
}

if($isNoReadTrim) {
    $vFlags .= " -no-ambiguous-path-clip";
}


my $realignedHeader = "unsorted.realigned";
my $realignedBamPath = File::Spec->catfile( $inputDir, $realignedHeader . ".bam" );
if( $isWriteRealigned ) {
    $vFlags .= " -realigned-read-file $realignedBamPath -realign-submapped-reads";
}

if( $isAddBacon ) {
    my $baconAllelesFile = File::Spec->catfile( $inputDir, "sort.count");
    my $baconSnpsFile = File::Spec->catfile( $inputDir, "snp.txt" );
    $vFlags .= " -bacon-allele $baconAllelesFile -bacon-snp $baconSnpsFile";

    my $isDenseAlleleCalls = 0;
    if( exists $CONF_PROJ{isDenseAlleleCalls} and ($CONF_PROJ{isDenseAlleleCalls} == 1) ) {
        $isDenseAlleleCalls = 1;
    }

    if( not $isDenseAlleleCalls ){
        $vFlags .= " -bacon-allele-print-empty";
    }

    $vFlags .= " -bacon-call-thresh $CONF_PROJ{snpThreshold}";
    $vFlags .= " -bacon-second-call-thresh $CONF_PROJ{snpThreshold2}";
    $vFlags .= " -bacon-het-snp-ratio-thresh $CONF_PROJ{snpMaxRatio}";
}

$vFlags .= " -bam-seq-name '$chrom'";

if(-e $bamFile) {
    $vFlags .= " -bam-file $bamFile";
} else {
    exit(0);
}

if(defined $CONF_PROJ{skipVariableMetadata} and $CONF_PROJ{skipVariableMetadata}) {
    $vFlags .= " -skip-variable-metadata";
}




######################################################################
#
# add indel calling options:
#


my $indelsFile;

if(not $isNoIndels) {

    my $indelsTheta         = $CONF_PROJ{variantsIndelTheta};
    my $indelsErrorRate     = $CONF_PROJ{variantsIndelErrorRate};

    $indelsFile = File::Spec->catfile( $inputDir, $indels_txt_f );

    my $isIndelsErrorRate = ((defined $indelsErrorRate) and ($indelsErrorRate ne ""));
    if($isIndelsErrorRate){
        errorExit "ERROR: Invalid argument for --indelsErrorRate=x, expecting 0<=x<=1\n" 
          if(($indelsErrorRate > 1.) or ($indelsErrorRate < 0.));
    }

    # loop over chromosomes to get the total non-ambiguous genome size:
    my $knownBasesGenome = 0;
    foreach my $chrom (@buildChroms) {
        next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );
        $knownBasesGenome += $chrEnds{$chrom}->{knownBases};
    }

    $vFlags .= " -bindel-diploid $indelsTheta";
    $vFlags .= " -bindel-diploid-file $indelsFile";
    $vFlags .= " -genome-size $knownBasesGenome";
    $vFlags .= " -max-indel-size $maxIndelSize";

    if(defined $CONF_PROJ{variantsCanIndelMin}) {
        $vFlags .= " -min-candidate-indel-reads $CONF_PROJ{variantsCanIndelMin}";
    }

    if(defined $CONF_PROJ{variantsCanIndelMinFrac}) {
        $vFlags .= " -min-candidate-indel-read-frac $CONF_PROJ{variantsCanIndelMinFrac}";
    }

    if(defined $CONF_PROJ{variantsSmallCanIndelMinFrac}) {
        $vFlags .= " -min-small-candidate-indel-read-frac $CONF_PROJ{variantsSmallCanIndelMinFrac}";
    }

    if(defined $CONF_PROJ{variantsCanIndelMaxDensity}) {
        $vFlags .= " -max-candidate-indel-density $CONF_PROJ{variantsCanIndelMaxDensity}";
    }

    if($isIndelsErrorRate){
        $vFlags .= " -indel-error-rate $indelsErrorRate";
    }
}


my $finishedAlignedContigFile;
my $finishedReadsFile;

if(not $isSkipContigs) {
    my $indelDir = File::Spec->catdir( $chromDir, "Indel" );
    if(not (-e $indelDir and -d $indelDir)) {
        reportMsg("Can't find indel directory \"$indelDir\".",1);
    }

    my $dataSetSuffix       = $CONF_PROJ{dataSetSuffix};
    my $suffixStr = ( ($dataSetSuffix ne "") ? "_$dataSetSuffix" : "");

    my $alignedContigFile = File::Spec->catfile( $indelDir, "aligned_contig_$binId$suffixStr.fa" );
    my $readsFile = File::Spec->catfile( $indelDir, "reads_$binId$suffixStr.txt" );

    errorExit("ERROR: Can't find file: $alignedContigFile\n") unless (-e $alignedContigFile);
    errorExit("ERROR: Can't find file: $readsFile\n") unless (-e $readsFile);

    my $finishCmd = "$finishBin -contigs '$alignedContigFile' -reads '$readsFile'";

    executeCmd($finishCmd , 3);

    $finishedAlignedContigFile = $alignedContigFile . ".finished";
    $finishedReadsFile = $readsFile . ".finished";

    errorExit("ERROR: Can't find file: $finishedAlignedContigFile\n")
      unless (-e $finishedAlignedContigFile);
    errorExit("ERROR: Can't find file: $finishedReadsFile\n")
      unless (-e $finishedReadsFile);

    $vFlags .= " -indel-contigs $finishedAlignedContigFile";
    $vFlags .= " -indel-contig-reads $finishedReadsFile";
}


my $vOutFile = File::Spec->catfile( $inputDir, "variants.info$sampleRateSetSuffix.txt" );
$vFlags .= " > $vOutFile";


# Actually run starling...:
#
my $vCmd = $starlingBin . $vFlags;
executeCmd( $vCmd, 5 );


# compress sites file
#
# TODO -- this should be a direct zlib write by starling
#
if(not $isNoSites) {
    my $cmd = "$CONF_PROJ{cmdGzip} -f $sitesFile";
    executeCmd( $cmd , 5);
}

if(not $isSkipContigs) {
    unlink($finishedAlignedContigFile,$finishedReadsFile);
}

# process realigned BAM
if( $isWriteRealigned ) {
    # my $bamChangeChrType = getChangeChrLabelType($CONF_PROJ{bamChangeChromLabels});

    # my $tmpSamPath = $realignedSamPath.".tmp";
    # open(my $newSamFH,"> $tmpSamPath");
    # open(my $SAMFH,$realignedSamPath);
    # while(<$SAMFH>) {
    #     if(/^@/)
    #     {
    #         print $newSamFH $_;
    #         next;
    #     }
    #     my @spl=split("\t");
    #     $spl[2] = changeChrLabel($spl[2],$bamChangeChrType);
    #     print $newSamFH join("\t",@spl);
    # }
    # move($tmpSamPath,$realignedSamPath) or errorExit("ERROR: File move failed: $!\n");

    # my $samtoolsBin = getSamtoolsBin(%CONF_PROJ);
    # my $refSizeFH = getRefSizeFH($bamChangeChrType,@buildChroms);

    my $sortedRealignedHeader = "sorted.realigned";
    my $sortedRealignedBamPrefix = File::Spec->catfile( $inputDir, $sortedRealignedHeader );
    my $bamSortCmd = "$samtoolsBin sort $realignedBamPath $sortedRealignedBamPrefix";
    executeCmd($bamSortCmd, 0);
    unlink($realignedBamPath);
}

printLog( "$0 Finished\n", 0);

