#!/usr/bin/env perl

=head1 NAME

=head1 SYNOPSIS

  createSubProjectFakes.pl [options] <validationSummary.txt>

  Options:
  -log <file>  Log file (optional; default is createSubProjectFakes.pl.log)
  -warn <file> Warning file (optional; default is defined in gapRes.config)
  -h           Detailed message (optional)

=head1 DESCRIPTION

This is a wrapper program for the Gap Resolution sub system that is responsible
generating the fasta and qual files of closed assemblies to be used as fake
reads.

=head1 DEPENDENCIES

The following scripts (configurable in config file) must exist in the same
path as validateSubProject.pl unless the path to the script is defined in the config
file:

  * shredFasta.pl
  
The input file contains information about the validation statuses for
each sub project directory. The format of the validationSummary.txt file are as
follows, one per line with each item delimited by a tab.

  1. sub project directory
  2. PASS|FAIL
  3. comment (optional)
  
Each sub project directory must contain a validinfo.txt file (previously
generated by validateSubProject.pl).  Note that this file contains the
validation status.  However, the status defined in the input validationSummary.txt
file supercedes the status within the validinfo.txt file. In other words, the
status in validationSummary is the one that is used to determine whether or not
to process the creation of fakes (status=pass) or to create primer files
(status=fail) for each sub project.

The status within the validationSummary.txt file determines the course of action.

If the validation is successful, create fasta and qual files of the contig
containing the anchors and place them in the sub project directory naming it
<subProjectDirName>.a1 and <subProjectDirName>.a1.qual. Copy the fasta and
qual files to the fakes dir in the current working directory.

The following files must exist in each sub project directory:

  * validinfo.txt - create by validateSubProject.pl
  * 454AllContigs.fna - created by Newbler
  * 454AllContigs.qual - created by Newbler

The format of the primerinfo.txt file is as follows:
 
  FASTA_TAG=<fasta tag name>
  TARGET_REGION=<start> <length>
  TEMPLATE=GDNA
  LEFT_PRIMER_NAME=<name of left primer>
  RIGHT_PRIMER_NAME=<name of right primer>
  EXCLUDED_REGION=<start> <length>
      
  Note that the EXCLUDE_REGION key/value pair is optional. If used, more than one
  entry could be defined.
	       
A default config file named gapRes.config residing in 
<installPath>/config is used to specify the following parameters:

(configurable)

  createSubProjectFakes.trimClosedGapConsensusUsingAnchorPos=0
    Specify if the closed gap consensus should be trimmed at the anchor position (0|1)
    
  createSubProjectFakes.trimConsensusSeqToThisManyBasesAwayFromAnchor=0
    If trimming consensus of closed gaps is turned on, specify how many bases
    away from anchor position to keep.
    
  createSubProjectFakes.shredClosedGapConsensus=1
    Specify if the closed consensus fasta and qual files should be shredded (0|1).
    
  createSubProjectFakes.shredClosedGapConsensusIfGreaterThanThisLength=2000
    Specify the minimum length of the closed consensus sequence to be considered for
    shredding.

  shredFasta.fragmentLength=1000
    Specify the fragment length when shredding the repeat contig consensus.
  
  shredFasta.overlapLength=100
    Specify the overlap length when shredding the repeat contig consensus.

(system configuration)

  createSubProjectFakes.fakesDir=fakes
    Specify the name of the directory to store the fasta and qual files of the
    fakes.
    
  createSubProjectFakes.fakesFileExtension=.a1
    Specify the file extension of the fakes.
    
=head1 VERSION

$Revision: 1.10 $

$Date: 2010-01-06 22:00:39 $

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

S.Trong 2009/07/25 creation

=item *

S.Trong 2009/12/29 - added -log and -warn options.

=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Cwd;
use Cwd qw(abs_path);
use Carp;
use Carp qw(cluck);
use Getopt::Long;
use File::Path;
use File::Copy;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Utilities::RunProcess qw(runProcess);
use PGF::Utilities::Properties;
use PGF::GapResolution::ParseGapValidationInfo;
use PGF::GapResolution::ParseValidationSummary;
use PGF::GapResolution::Warnings;
use PGF::Parsers::FastaParse;
use PGF::Utilities::Logger;
use PGF::Utilities::FastaSeq qw(formatSeq);
use vars qw( $optHelp $optDebug $optLogFile $optWarningsFile );

#============================================================================#
# INPUT VALIDATION
#============================================================================#
my $programExecution = abs_path(dirname($0))."/".basename($0)." @ARGV";

if( !GetOptions(
        "debug"=>\$optDebug,
        "log=s"=>\$optLogFile,
        "warn=s"=>\$optWarningsFile,
        "h"=>\$optHelp,
    )
) {
    printhelp(1);
}

printhelp(2) if $optHelp;

if ( @ARGV != 1 ) {
    my $errMsg = "Required input parameters are missing in command line.";
    print STDERR "$errMsg\n";
    printhelp(1);
}

#============================================================================#
# INITIALIZATION
#============================================================================#

my $DEBUG = $optDebug ? 1:0;
my $configFile = defined $ENV{GAPRES_CONFIG} ?
    $ENV{GAPRES_CONFIG} : "$RealBin/../config/gapRes.config";
my $OBJ_PROPS = PGF::Utilities::Properties->new(-configFile=>$configFile);
   $OBJ_PROPS->setExceptionIfEntryNotFound(1); # confess if entry in config file
                                               # is not found.
my $OBJ_LOGGER = PGF::Utilities::Logger->new();
my $outputDir = getcwd;
my $OBJ_WARNINGS = PGF::GapResolution::Warnings->new(
    path=>$outputDir, logger=>$OBJ_LOGGER);
   $OBJ_WARNINGS->setLogFile($optWarningsFile) if $optWarningsFile;
my $logfile = $optLogFile ? $optLogFile : "$outputDir/".basename($0) . ".log";

my $validationSummaryFile = $ARGV[0];
    
#============================================================================#
# VALIDATE INPUTS
#============================================================================#
my $errMsg = '';

# Set path for logging.
#
setFileForLogging($logfile);

# Log execution into log file.
#
logExecution($programExecution);

# Check sub project list file.
#
if (!-s $validationSummaryFile) {
    $errMsg .= "The $validationSummaryFile is missing or zero size.\n";
}

if ( $errMsg  ){
    print STDERR "$errMsg.\nUse the -h option for more info.\n";
    logError($errMsg,1);
}

#============================================================================#
# MAIN
#============================================================================#

my @validationStatuses = ();

# validate and get sub project directories.
#
my $objValidation = PGF::GapResolution::ParseValidationSummary->new(
    $validationSummaryFile);

while ( $objValidation->getNextEntry ) {
    my $subProjectDir = $objValidation->getPath;
    next if !length $subProjectDir;
    
    if ( !-s $subProjectDir ) {
        my $errMsg = "in file $validationSummaryFile, directory $subProjectDir does not exist.\n";
        print STDERR "WARNING: $errMsg";
        logError($errMsg);
        next;
    }
    
    # Add sub project dir name to warnings.
    #
    $OBJ_WARNINGS->setSubProjectName($subProjectDir);

    # Initialize file name and paths
    #
    my $subProjectName = basename($subProjectDir);
    my $assemDir = $subProjectDir.'/'.
        $OBJ_PROPS->getProperty("validateSubProject.assemblyDirectory");
    my $validInfoFile = $subProjectDir.'/'.
        $OBJ_PROPS->getProperty("validateSubProject.validateGapAssemblyOutputFile");
    my $contigsFasta = $assemDir.'/'.
        $OBJ_PROPS->getProperty("validateSubProject.assemblyContigsFasta");
    my $contigsQual= $assemDir.'/'.
        $OBJ_PROPS->getProperty("validateSubProject.assemblyContigsQual");
        
    # check for existence of required files
    #
    next if !checkForRequiredFiles(
        $validInfoFile,
        $contigsFasta,
        $contigsQual,
    );
    
    # Create consensus fasta and qual files of the sub project.
    #
    if( $objValidation->isSuccessful ) {
        createOutputFiles($subProjectDir, $validInfoFile, $contigsFasta, $contigsQual);
    }
}

# If warning messages present, then save in .warnings.out file.
#
$OBJ_WARNINGS->createFile() if $OBJ_WARNINGS->getNumberOfWarnings;

exit 0;

#============================================================================#
# SUBROUTINES
#============================================================================#
sub checkForRequiredFiles {
    
    my @files = @_;
    
    my $success = 1;
    
    foreach my $file (@files) {
        if ( !$OBJ_WARNINGS->fileExists($file) ) {
            $success = 0;
            last;
        }
    }
    
    return $success;
}

#============================================================================#
sub createContigOrientationFile {
    
    my $scaffoldFile = shift;
    my $outputFile = shift;
    
    logOutput("Creating contig orientation file ...", 1);
    
    my $objScaffold = PGF::Newbler::Scaffolds454->new($scaffoldFile);
    $objScaffold->createContigOrientationFile($outputFile);
    
    my $success = $OBJ_WARNINGS->fileExists($outputFile);
    
    logOutputFileCreation($outputFile);
    
    return $success;
    
}

#============================================================================#
sub createOutputFiles {
    
    my $subProjectDir = shift;
    my $validationInfoFile = shift;
    my $contigsFasta = shift;
    my $contigsQual = shift;
    
    my $success = 1;
    my $objGapValidation = PGF::GapResolution::ParseGapValidationInfo->new(
        $validationInfoFile);
    
    # Get contig where left anchor is found.
    #
    my $leftAnchorContig = $objGapValidation->getLeftContig();
    if ( !$leftAnchorContig ) {
        my $errMsg = "failed to parse left anchor contig from file $validationInfoFile\n";
        logError($errMsg);
        $success = 0;
    }
    
    # Get contig where right anchor is found.
    #
    my $rightAnchorContig = $objGapValidation->getRightContig();
    if ( !$rightAnchorContig ) {
        my $errMsg ="failed to parse right anchor contig from file $validationInfoFile\n";
        logError($errMsg);
        $success = 0;
    }
    
    # If both anchors are on the same contig and validation is successful,
    # then create fakes.
    #
    if ( $success ) {
        my $outputFastaFile = $subProjectDir.'/'.basename($subProjectDir).
            $OBJ_PROPS->getProperty("createSubProjectFakes.fakesFileExtension");
        my $fastaTag = basename($subProjectDir)." $leftAnchorContig";
        $success = createClosedConsensus($objGapValidation, $leftAnchorContig,
            $fastaTag, $contigsFasta, $contigsQual, $outputFastaFile);
    }
    
    return $success;
    
}

#============================================================================#
sub createClosedConsensus {
    
    my $objGapValidation = shift;
    my $contig = shift;
    my $fastaTag = shift;
    my $contigFasta = shift;
    my $contigQual = shift;
    my $subProjectFastaFile = shift;
    
    my $success = 1;
    my $fakesDir = $OBJ_PROPS->getProperty("createSubProjectFakes.fakesDir");
    my $shredConsensus = $OBJ_PROPS->getProperty("createSubProjectFakes.shredClosedGapConsensus");
    my $trimConsensus = $OBJ_PROPS->getProperty("createSubProjectFakes.trimClosedGapConsensusUsingAnchorPos");
    my $minFastaLengthToShred =
        $OBJ_PROPS->getProperty("createSubProjectFakes.shredClosedGapConsensusIfGreaterThanThisLength");
    my $subProjectQualFile = $subProjectFastaFile . '.qual';
    
    if ( !-e $fakesDir ) {
        mkdir( $fakesDir, 0777 );
    }
    
    # Create consensus fasta and qual files (fakes)
    #
    my $contigOrientation = '+';
    my $seq = getConsensusSeq($contig, $contigOrientation, $contigFasta, 1);
    my $qual = getConsensusSeq($contig, $contigOrientation, $contigQual, 1);
    my $isQualFile = 0;
    
    if ( $trimConsensus ) {
        $seq =~ s/\s+//g;
        $qual =~ s/\n/ /g;
        $seq = trimConsensus($objGapValidation, $seq, $isQualFile) if $trimConsensus;
        $isQualFile = 1;
        $qual = trimConsensus($objGapValidation, $qual, $isQualFile) if $trimConsensus;
        $seq = formatSeq($seq, 50);
        $qual = formatSeq($qual, 50);
    }
        
    my $successFasta = createFastaFile($subProjectFastaFile, $fastaTag, $seq);
    my $successQual = createFastaFile($subProjectQualFile, $fastaTag, $qual);
    
    # Copy fasta file to fakes dir, then run the fasta shredder if
    # shredding is turned on in the config file.
    #
    if ( $successFasta ) {
        # shred consensus?
        if ( $shredConsensus && length($seq) > $minFastaLengthToShred ) {
            $isQualFile = 0;
            $success = shredConsensus($subProjectFastaFile, $isQualFile, $fakesDir);
        }
        # copy fasta to fakes dir.
        #
        if (-e $subProjectFastaFile) {
            my $fakesFastaFile = $fakesDir.'/'.basename($subProjectFastaFile);
            $success = copy($subProjectFastaFile, $fakesFastaFile);
            logError("failed to copy $subProjectFastaFile to $fakesFastaFile: $!\n")
                if !$success;
            logOutputFileCreation($fakesFastaFile);
        }
    } else {
        $success = 0;
    }
    if ( $successQual ) {
        # shred qual?
        if ( $shredConsensus && length($seq) > $minFastaLengthToShred ) {
            $isQualFile = 1;
            $success = shredConsensus($subProjectQualFile, $isQualFile, $fakesDir);
        }
        # copy qual to fakes dir.
        #
        if (-e $subProjectQualFile) {
            my $fakesFastaFile = $fakesDir.'/'.basename($subProjectQualFile);
            $success = copy($subProjectQualFile, $fakesFastaFile);
            logError("failed to copy $subProjectQualFile to $fakesFastaFile: $!\n")
                if !$success;
            logOutputFileCreation($fakesFastaFile);
        }
    } else {
        $success = 0;
    }
    
    return $success;
    
}

#============================================================================#
sub trimConsensus {

    my $objGapValidation = shift;
    my $seq = shift;
    my $isQualFile = shift;
    
    my $nBasesAwayFromAnchorToKeep = $OBJ_PROPS->getProperty(
        "createSubProjectFakes.trimConsensusSeqToThisManyBasesAwayFromAnchor");
    my ($anchorStart, $anchorEnd) = $objGapValidation->getAnchorPos();
    my $consensusLength = $objGapValidation->getLeftContigLength();
    
    # Compute number of bases to trim from the beginning of the left anchor
    # position and the end of the right anchor position plus a padding
    # defined in the config file with the parameter
    # trimConsensusSeqToThisManyBasesAwayFromAnchor
    #
    my $nBasesToTrimFromBeg = $anchorStart - 1 - $nBasesAwayFromAnchorToKeep;
    my $nBasesToTrimFromEnd = $consensusLength - $anchorEnd-
        $nBasesAwayFromAnchorToKeep;
    $nBasesToTrimFromBeg = 0 if $nBasesToTrimFromBeg < 0;
    $nBasesToTrimFromEnd = 0 if $nBasesToTrimFromEnd < 0;

    # Trim consensus sequence.
    #
    if ( $isQualFile ) {
        $seq = trimQuality($seq, $nBasesToTrimFromBeg, $nBasesToTrimFromEnd);
    } else {
        $seq = substr($seq, $nBasesToTrimFromBeg);
        $seq = substr($seq, 0, -1*$nBasesToTrimFromEnd);
    }
    
    return $seq;
    
}

#============================================================================#
sub trimQuality {
    
    my $seq = shift;
    my $nBasesToTrimFromBeg = shift;
    my $nBasesToTrimFromEnd = shift;
    
    my $idx = 0;
    my $trimmedSeq = '';
    while ( $seq =~ /(\d+)/g && $idx < $nBasesToTrimFromBeg ) {
        $idx++;
        $trimmedSeq = $';
    }
    $seq = reverse $trimmedSeq;
    $idx = 0;
    while ( $seq =~ /(\d+)/g && $idx < $nBasesToTrimFromEnd ) {
        $idx++;
        $trimmedSeq = $';
    }
    $trimmedSeq =~ s/^\s+|\s+$//g;
    $seq = reverse $trimmedSeq;
    
    return $seq;
}
    
#============================================================================#
sub shredConsensus {
    
    my $inputFile = shift;
    my $isQualFile = shift;
    my $fakesDir = shift;
    
    my $fragLength = $OBJ_PROPS->getProperty("shredFasta.fragmentLength");
    my $overlapLength = $OBJ_PROPS->getProperty("shredFasta.overlapLength");
    
    my $outputFile = "$inputFile.shred";
    
    # shred fasta sequences if fasta file found.
    #
    logOutput("Shredding $inputFile to $fragLength bp lengths with $overlapLength bp overlap",1);
    my $success = runShredFasta($inputFile, $isQualFile, $outputFile, $fragLength, $overlapLength);
        
    # Rename output shredded file back to repeat fasta file.
    #
    if ( $success ) {
        if ( -s $inputFile && -s $outputFile) {
            unlink( $inputFile);
            rename ($outputFile, $inputFile);
            logOutput("Renaming $outputFile to $inputFile ...",1);
        }
        
    }
    
    return $success;
    
}
    
#============================================================================#
sub runShredFasta {
    
    my $fastaFile = shift;
    my $isQualityFile = shift;
    my $outputFile = shift;
    my $fragLength = shift;
    my $overlapLength = shift;
    
    my $script = getScript("script.shredFasta");
    my $args = "-l $fragLength -o $overlapLength";
       $args .= " -q" if $isQualityFile;
       $args .= " $fastaFile $outputFile";
    
    my $cmd = "$script $args";

    my %processInfo = runCommand($cmd);
     
    checkProcess(%processInfo);
    
    my $success = $OBJ_WARNINGS->fileExists($outputFile);
    
    logOutputFileCreation($outputFile);
    
    return $success;
    
}

#============================================================================#
sub getConsensusSeq {
    
    my $contig = shift;
    my $contigOrientation = shift;
    my $fastaFile = shift;
    my $rawFormat = shift || 0;
    
    my $seq = '';
    my $defLine = '';
    my $objFastaParser = PGF::Parsers::FastaParse->new($fastaFile);
    
    while ($objFastaParser->MoreEntries) {
        $objFastaParser->ReadNextEntry( -rawFormat=>$rawFormat );
        my $tagName = $objFastaParser->Name;
        if ( $tagName eq $contig ) {
            $objFastaParser->reverseComp if $contigOrientation eq '-';
            $seq = $objFastaParser->Seq;
            $defLine = $objFastaParser->Comment;
            last;
        }
    }
    $seq =~ s/^\s+|\s+$//g;
    
    if ( wantarray ) {
        return $seq, $defLine;
    } else {
        return $seq;
    }
}

#============================================================================#
sub createFastaFile {
    
    my $file = shift;
    my $header = shift;
    my $seq = shift;
    
    my $success = 0;
    
    unless (open FH, ">$file" ) {
        my $errMsg = "failed to create file $file: $!\n";
        print STDERR "$errMsg";
        logError($errMsg);
        return $success;
    }
    print FH ">$header\n";
    print FH "$seq\n";
    close FH;
    
    $success = $OBJ_WARNINGS->fileExists($file);
    
    logOutputFileCreation($file);
    
    return $success;
}

#============================================================================#
sub checkForFileExistence {
    
    my $file = shift;
    
    my $success = 1;
    
    if ( !-s $file ) {
        my $errMsg = "file $file does not exist or is zero size.\n";
        logError($errMsg, 0);
        $success = 0;
    }
    
    return $success;
    
}

#============================================================================#
sub setFileForLogging {
    
    my $logFile = shift;
    
    $OBJ_LOGGER->setLogOutFileAppend($logFile);
    $OBJ_LOGGER->setLogErrorFileAppend($logFile);
    
}

#============================================================================#
sub getScript {
    
    my $scriptType = shift;
    
    my $script = $OBJ_PROPS->getProperty("$scriptType");
       $script = "$FindBin::RealBin/$script" if
            $script !~ /\//; # add path to script if not specified in config file
            
    # Check if script exists.
    #
    if ( !-e $script ) {
        my $errMsg = "cannot find script $script defined in config file as $scriptType.\n";
        print $errMsg;
        logError($errMsg, 1);
    }
    
    return $script;
    
}

#============================================================================#
sub checkProcess {
    
    my %processInfo = @_;
    
    my $success = 1;
    
    if ( $processInfo{exitCode} ) {
        my $errMsg = '';
        if ( defined $processInfo{logStdoutMessage} &&
            $processInfo{logStdoutMessage} ) {
            $errMsg .= $processInfo{stdoutMessage};
        }
        $errMsg .= $processInfo{stderrMessage}."\n";
        print STDERR "$errMsg\n";
        logError($errMsg,0);
        $success = 0;
    }
    
    return $success;
}

#============================================================================#
sub runCommand {
   
    my $cmd = shift;
    
    # Execute command, capture stderr, stdout, exitcode
    #
    my $errMessage = "CMD: $cmd\n";
    
    print "Running $cmd ...\n\n";
    logOutput($cmd);
    
    my %processInfo = runProcess($cmd,
        {-checkExecutable=>0,
        }
    );
    
    return %processInfo;
    
}

#============================================================================#
sub logExecution {
    
    my $programExecution = shift;
    
    my $msg = "Command: ".$programExecution."\n".
              "Current directory: ".getcwd;
    logOutput($msg);
}

#============================================================================#
sub logError {
    my $message = shift;
    my $confess = shift || 0;
    
    $OBJ_LOGGER->logError($message);
    $OBJ_WARNINGS->add($message);
    
    if ( $confess ) {
        $OBJ_WARNINGS->createFile() if $OBJ_WARNINGS->getNumberOfWarnings;
        confess $message;
    }
    
}
    
#============================================================================#
sub logOutput {
    my $message = shift;
    my $printMsg = shift || 0;
    
    $OBJ_LOGGER->logOut($message);
    print "$message\n" if $printMsg;
}
    
#============================================================================#
sub logOutputFileCreation {
    
    my $outputFile = shift;
    
    $outputFile = abs_path($outputFile);
    
    my $fileExists = -s $outputFile ? 1:0;
    my $msg = $fileExists ? "Created $outputFile" :
        "failed to create $outputFile";
    logOutput($msg,1);
    
    return $fileExists;
    
}

#============================================================================#
sub printhelp {
    my $verbose = shift || 1;
    pod2usage(-verbose=>$verbose);
    exit 1;
}

#============================================================================#
