#!/usr/bin/env perl

=head1 NAME

=head1 SYNOPSIS

  newblerAssemSubProjectWithSanger.pl [options] <subdirectory list> <sffInfo file>

  Options:
  -o <dir>      output assembly directory (optional) default=Newbler
  -log <file>   log file (optional; default is <script name>.log in the current directory)
  -warn <file>  warning file (optional; default is defined in gapRes.config)
  -h            detailed message   (optional)
  -d            debug printing     (optional)


=head1 DESCRIPTION

This program takes a list of gap project subdirectories as well as an 
sffInfo file, which lists the name and locations of each sff file for the 
project, and converts any reads specified in a read list into sff or 
fasta/qual depending on if it is a 454 or sanger type read. It organizes 
the data into three directories created within each gap subdirectory for 
paired 454 data, unpaired 454 data, and sanger-type data (fasta/quals).

The organized read data for each gap subproject is then assembled with newbler.
It finds the location of newbler's runAssembly using the config file param
"newblerEnv" which points to the installation of newbler.  This is also used 
to set the env variable SFFFILE_PATH needed by reads2sff.
The assembly output can be found in a directory within each gap subdirectory 
and its name is configurable (default = Newbler).  

A default config file named gapRes.config residing in
<installPath>/config is used to specify the following parameters:

  script.reads2sff=reads2sff.pl
  newblerAssemSubProject.newblerEnv=<path to newbler installation>
  newblerAssemSubProject.newblerOptions=-consed -g -nrm -rip
  newblerAssemSubProject.pairedDataDir=pairedData
  newblerAssemSubProject.unpairedDataDir=unpairedData

If the value of the config parameter script.nnnn doesn't contain file paths, 
newblerAssemSubProject.pl assumes that the script name resides in the
<installPath>/bin directory.

=head1 VERSION

$Revision: 1.6 $

$Date: 2010-01-11 21:24:06 $

=head1 AUTHOR(S)

=head1 HISTORY

=over

=item *

Kurt M. LaButti 2009-08-11 creation

=item *

Stephan Trong 2009-08-05
  - added ability to skip and create warnings file if sub project fails.
  
Kurt M. LaButti
  - modified this code (newblerAssemSubproject.pl) into a new component.

Stephan Trong 2010-01-11
  - added -log and -warn options.
  
=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Cwd;
use Cwd qw(abs_path);
use Carp;
use Carp qw(cluck);
use Getopt::Long;
use File::Path;
use File::Copy;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Utilities::Properties;
use PGF::Utilities::RunProcess qw(runProcess);
use PGF::Utilities::Logger;
use PGF::GapResolution::Warnings;
use vars qw( $optHelp $optDebug $optOutputDir $optLogFile $optWarningsFile);

#============================================================================#
# INPUT VALIDATION
#============================================================================#
# Capture the full command line before GetOptions removes the switches from
# @ARGV, so the log records exactly how the program was invoked.
my $programExecution = abs_path(dirname($0))."/".basename($0)." @ARGV";

if( !GetOptions(
		"h"           => \$optHelp,
		"d"           => \$optDebug,
		"o=s"         => \$optOutputDir,
		"log=s"       => \$optLogFile,
		"warn=s"      => \$optWarningsFile,
		)
    ) {
    printhelp(1);
}

# Honour -h before looking at the positional arguments; otherwise "-h" on its
# own would only ever produce the brief usage message.
printhelp(2) if $optHelp;

# Both positional arguments (subdirectory list and sffInfo file) are required.
printhelp(1) if @ARGV < 2;


#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#

# input files (the two positional arguments)
#
my $optInputSubdirList = shift @ARGV;
my $sffInfoFile        = shift @ARGV;

# config properties object; the GAPRES_CONFIG environment variable overrides
# the default config file location.
#
my $DEBUG = 0;
my $configFile = defined $ENV{GAPRES_CONFIG} ?
   $ENV{GAPRES_CONFIG} : "$RealBin/../config/gapRes.config";
my $OBJ_PROPS = PGF::Utilities::Properties->new(-configFile=>$configFile);
   $OBJ_PROPS->setExceptionIfEntryNotFound(1); # confess if entry in config file
                                               # is not found.
# logger and warnings objects
#
my $OBJ_LOGGER = PGF::Utilities::Logger->new();
my $OBJ_WARNINGS = PGF::GapResolution::Warnings->new(logger=>$OBJ_LOGGER);
   $OBJ_WARNINGS->setLogFile($optWarningsFile) if $optWarningsFile;

# output directory name: -o wins, otherwise the value from the config file
#
my $outputAssemDir = ($optOutputDir) ? $optOutputDir : $OBJ_PROPS->getProperty(
    "newblerAssemSubProject.newblerAssemblyDirectory");

# config file params
#
# NOTE(review): $scaffoldInfoFile and $anchorFileExt are not referenced by the
# rest of the visible code; the lookups are kept so a missing config entry is
# still reported exactly as before.
#
my $outputLogDir    = getcwd; #$OBJ_PROPS->getProperty("createGapResProject.assemInfoDirName");
my $logFile         = $optLogFile ? $optLogFile : "$outputLogDir/".basename($0).".log";
my $scaffoldInfoFile= $OBJ_PROPS->getProperty("idContigRepeats.scaffFileName");
my $anchorFileExt   = $OBJ_PROPS->getProperty("idContigRepeats.anchorFileExtension");
my $readsListFof    = $OBJ_PROPS->getProperty("getSubProjReads.readListFileName");
my $fastaDataDir    = $OBJ_PROPS->getProperty("getSubProjReads.outputFastaDirectory");
my $pairedDataDir   = $OBJ_PROPS->getProperty("newblerAssemSubProject.pairedDataDir");
my $unpairedDataDir = $OBJ_PROPS->getProperty("newblerAssemSubProject.unpairedDataDir");
my $newblerEnv      = $OBJ_PROPS->getProperty("newblerAssemSubProject.newblerEnv");
my $newblerOptions  = $OBJ_PROPS->getProperty("newblerAssemSubProject.newblerOptions");


# Set path for logging.
#
setFileForLogging($logFile);

# Log execution into log file.
#
logExecution($programExecution);


# Prepend the newbler location to PATH and derive SFFFILE_PATH from it.
#
logOutput("setting ENV PATH: $newblerEnv");
$ENV{PATH}         = $newblerEnv.":$ENV{PATH}";
$ENV{SFFFILE_PATH} = $newblerEnv."/sfffile";
print "PATH: $ENV{PATH}\n" if $optDebug;
print "SFFFILE_PATH: $ENV{SFFFILE_PATH}\n" if $optDebug;

#============================================================================#
# VALIDATE INPUTS
#============================================================================#
    
# Both positional inputs must name existing, non-empty files: first the file
# listing the subdirectory locations, then the sffInfo file.
#
confess "Input subdirectory list $optInputSubdirList does not exist or is empty!\n"
    unless -s $optInputSubdirList;

confess "sffInfo file: $sffInfoFile does not exist or is empty.\n"
    unless -s "$sffInfoFile";

#============================================================================#
# MAIN
#============================================================================#

# validate subdirectories and necessary files
# 
#    each row of @$subdirListArray: [0]=subdir, [1]=ref to list of read list files
#
print "Validation stage...\n" if $optDebug;
print "$optInputSubdirList $readsListFof $pairedDataDir $unpairedDataDir $fastaDataDir  $optInputSubdirList\n" if $optDebug;


my $subdirListArray = validateSubdirectories ($optInputSubdirList, 
                                              $readsListFof, 
                                              $pairedDataDir,
                                              $unpairedDataDir,
                                              $fastaDataDir,
                                              $optInputSubdirList);


# Loop through each subdirectory 
#
print "Looping through subdirs...\n" if $optDebug;
foreach my $row (@$subdirListArray) {
    
    my $subdir = $row->[0];
    my $readsList = join " ", @{$row->[1]};
    print "\t$subdir...\n" if $optDebug;

    # Add sub project dir name to warnings.
    #
    $OBJ_WARNINGS->setSubProjectName($subdir);
    
    # Run reads2sff for each contig in the subdir
    #
    my $commandLine = 
        "-sffinfo $sffInfoFile" 
        ." -odir $subdir/$unpairedDataDir"
        ." -odirPair $subdir/$pairedDataDir"
        ." $readsList"; 
    print "Running reads2sff $commandLine\n" if $optDebug;
    runMyScript("script.reads2sff", "$commandLine");
     
    # Run non454Reads2Fasta
    #
    $commandLine = 
        "-si $sffInfoFile"
        ." -gs $subdir" 
        ." -od $fastaDataDir"
        ." $readsList"; 
    print "Running non454Reads2Fasta $commandLine\n" if $optDebug;
    runMyScript("script.non454Reads2Fasta", "$commandLine");
    
    # Check the 3 expected read data dirs and include each one in the command
    # line only if it is not empty (checkDirContents returns 0 when the
    # directory has at least one entry). The fasta dir is a special case:
    # its files are listed explicitly, leaving out dot files and quals.
    #
    my $commandPortion = "";
   
    if ( checkDirContents("$subdir/$pairedDataDir") == 0 ) {
        $commandPortion .= " -p $subdir/$pairedDataDir/*sff";
    }
    if ( checkDirContents("$subdir/$unpairedDataDir") == 0 ) {
        $commandPortion .= " $subdir/$unpairedDataDir/*sff";
    }
    if ( checkDirContents("$subdir/$fastaDataDir") == 0 ) {
        # The directory may hold only qual/dot files, in which case there is
        # nothing to add.
        my $fastaFiles = getFastasInDir("$subdir/$fastaDataDir/");
        $commandPortion .= $fastaFiles if defined $fastaFiles;
    }
    
    # runAssembly command line; stderr is redirected to a temporary file that
    # is inspected afterwards.
    #
    $commandLine = 
        "runAssembly $newblerOptions" 
        ." -o $subdir/$outputAssemDir" 
        ."$commandPortion"
        ." 2> $subdir/runAssembly.tmp.log";
    
    # assemble; kick stderr to file
    #    
    print "Running Newbler: $commandLine\n\n";
    logOutput($commandLine);
    my $exitStatus = system ("$commandLine");
    
    # Check for success of assembly, if it failed log error
    #
    print "Checking for assembly success\n" if $optDebug;
    my $result = checkError("$subdir/runAssembly.tmp.log");

    # A non-zero status with an empty stderr file would otherwise go
    # completely unnoticed, so record it as well.
    #
    if ( $result == 0 && $exitStatus != 0 ) {
        logError("ERROR: runAssembly failed for $subdir "
            ."(system() returned $exitStatus) without writing to stderr.\n");
    }
    
    # Remove tmp stderr file (it is kept when it contains error output)
    #
    unlink "$subdir/runAssembly.tmp.log" if ($result == 0);
    print "\n\n" if $optDebug;
}

# If warning messages present, then save in .warnings.out file.
#
$OBJ_WARNINGS->createFile() if $OBJ_WARNINGS->getNumberOfWarnings;


exit 0;

#============================================================================#
# SUBROUTINES
#============================================================================#
# Look up the script configured under $configScript, run it with $params
# (a single blank when no parameters are given) and pass the resulting
# process information to checkProcess.
sub runMyScript {

    my $configScript = shift;
    my $params       = shift;

    $params = " " unless defined $params;

    my $scriptPath  = getScript("$configScript");
    my %processInfo = runCommand("$scriptPath $params");

    checkProcess(%processInfo);

}
    
#============================================================================#
# Register $logFile with the logger object through both its "out" and
# "error" append setters, so the two kinds of message share one file.
sub setFileForLogging {

    my ($logPath) = @_;

    $OBJ_LOGGER->setLogOutFileAppend($logPath);
    $OBJ_LOGGER->setLogErrorFileAppend($logPath);

}

#============================================================================#
# Return the path of the script configured under the property $scriptType.
# A configured value without any '/' is prefixed with this program's own
# directory. If the resulting path does not exist, the error is printed and
# passed to logError with the confess flag set.
sub getScript {

    my ($scriptType) = @_;

    my $script = $OBJ_PROPS->getProperty("$scriptType");

    # add path to script if not specified in config file
    if ( index($script, '/') < 0 ) {
        $script = "$FindBin::RealBin/$script";
    }

    # Check if script exists.
    #
    unless ( -e $script ) {
        my $errMsg = "ERROR: cannot find script $script defined in config file as $scriptType.\n";
        print $errMsg;
        logError($errMsg, 1);
    }

    return $script;

}

#============================================================================#
# Inspect the process information hash returned by runCommand. When the
# exit code is non-zero, log the captured stderr text, preceded by the
# captured stdout text if the logStdoutMessage entry is set.
sub checkProcess {

    my %processInfo = @_;

    if ( $processInfo{exitCode} ) {
        my @parts;
        push @parts, $processInfo{stdoutMessage}
            if $processInfo{logStdoutMessage};
        push @parts, $processInfo{stderrMessage};
        logError( join '', @parts );
    }

}

#============================================================================#
# Announce and log $cmd, execute it through runProcess (with the
# -checkExecutable option set to 0) and return the process information
# hash that runProcess produced.
sub runCommand {

    my ($cmd) = @_;

    print "Running $cmd ...\n\n";
    logOutput($cmd);

    my %info = runProcess( $cmd, { -checkExecutable => 0 } );

    return %info;

}

#============================================================================#
# Write the invoking command line and the current working directory to the
# log as a single two-line message.
sub logExecution {

    my ($invocation) = @_;

    logOutput( join "\n",
        "Command: $invocation",
        "Current directory: " . getcwd() );
}

#============================================================================#
# Record $message in both the logger and the warnings object. When the
# optional second argument is true, the warnings file is written (if any
# warnings were collected) and the program confesses with $message.
sub logError {
    my ($message, $fatal) = @_;
    $fatal ||= 0;

    $OBJ_LOGGER->logError($message);
    $OBJ_WARNINGS->add($message);

    if ( $fatal ) {
        if ( $OBJ_WARNINGS->getNumberOfWarnings ) {
            $OBJ_WARNINGS->createFile();
        }
        confess $message;
    }

}

#============================================================================#
# Pass $message to the logger object's logOut method.
sub logOutput {
    my ($text) = @_;

    $OBJ_LOGGER->logOut($text);
}
    
#============================================================================#
# Print the POD usage at the requested verbosity (1 when none or a false
# value is given) and terminate with exit status 1.
sub printhelp {
    my ($verbose) = @_;
    $verbose = 1 unless $verbose;

    pod2usage(-verbose=>$verbose);

    # Fallback in case pod2usage returns instead of exiting.
    exit 1;
}

#============================================================================#
# Read the file of gap sub-project directories and collect, for every usable
# directory, the read list files named in its fof file.
#
# Parameters:
#   $subdirList      file with one sub-project directory per line
#   $readsListFof    name of the read-list fof file inside each directory
#   $pairedDataDir, $unpairedDataDir, $fastaDataDir, $optInputSubdirList
#                    kept for interface compatibility; only echoed in the
#                    debug output
#
# Returns a reference to an array of rows [ $dir, [ "$dir/<list>", ... ] ]
# (an empty array when no directory qualifies).
#
# A directory is skipped, with a logged warning, when it is not a directory,
# when its fof file is missing, empty or unreadable, or when any read list
# named in the fof is missing or empty. Skipping the whole sub-project in the
# last case avoids handing a half-validated read list to the downstream
# commands. Blank lines in either file are ignored.
#
# Confesses (through logError) when $subdirList itself cannot be opened.
sub validateSubdirectories {
    my ( $subdirList, 
         $readsListFof, 
         $pairedDataDir,
         $unpairedDataDir,
         $fastaDataDir,
         $optInputSubdirList)  = @_;
    my $ref_to_dirList = [];

    print "$subdirList"
         ." $readsListFof"
         ." $pairedDataDir"
         ." $unpairedDataDir"
         ." $fastaDataDir"
         ." $optInputSubdirList\n" if $optDebug;
    
    # open subdir list
    #
    my $dirListFh;
    unless( open $dirListFh, '<', $subdirList ) {
        my $errMsg = "ERROR: failed to open file $subdirList.\n";
        logError($errMsg, 1);
    }
    
    # go through each subdir 
    #
  SUBDIR:
    while (my $dir = <$dirListFh>) {
        chomp $dir;

        print "subdir=$dir\n" if $optDebug;

        next SUBDIR if $dir !~ /\S/;    # ignore blank lines

        # check sub directory 
        #
        if ( !-d $dir ) {
            my $errMsg = "WARNING: in file $subdirList, directory $dir does not exist.\n";
            logError($errMsg);
            next SUBDIR;
        }

        # Check for read list fof file, validate contents if it exists
        #
        my $fofPath = "$dir/$readsListFof";
        if ( !-s $fofPath ) {
            my $errMsg = "WARNING: read list fof: $fofPath does not exist or is empty.\n";
            logError($errMsg);
            next SUBDIR;
        }

        my $fofFh;
        unless ( open $fofFh, '<', $fofPath ) {
            my $errMsg = "WARNING: read list fof: $fofPath cannot be opened ($!).\n";
            logError($errMsg);
            next SUBDIR;
        }

        my @readsList   = ();
        my $listMissing = 0;

        while (my $list = <$fofFh>) {
            chomp $list;

            print "list=$list\n" if $optDebug;

            next if $list !~ /\S/;      # ignore blank lines

            if ( !-s "$dir/$list" ) {
                my $errMsg = "WARNING: read list: $dir/$list does not exist or is empty.\n";
                logError($errMsg);
                $listMissing = 1;
                last;
            }

            push @readsList, "$dir/$list";
        }
        close $fofFh;

        # One bad read list invalidates the whole sub-project.
        next SUBDIR if $listMissing;

        # A fof made only of blank lines names no read lists at all.
        if ( !@readsList ) {
            my $errMsg = "WARNING: read list fof: $fofPath does not exist or is empty.\n";
            logError($errMsg);
            next SUBDIR;
        }

        push @$ref_to_dirList, [ $dir, [@readsList] ];
    }
    close $dirListFh;
    
    return $ref_to_dirList; 
}
 
#============================================================================#
# Return ($leftContigName, $rightContigName) taken from the last non-blank
# line of the file "$subdir/$scaffoldInfoFile".
#
# Each line is split on whitespace into the columns
#   gapName gapSize leftContigSize leftContigName
#   rightContigSize rightContigName scaffoldName
# Blank lines are skipped so that a trailing empty line cannot wipe out the
# values read from the last real record. Both values are undef when the file
# has no records. Confesses when the file cannot be opened.
sub readScaffInfo {
    
    my ($subdir, $scaffoldInfoFile) = @_;
    my $leftContig;
    my $rightContig;
    
    print "readScaffInfo: $subdir $scaffoldInfoFile\n" if $optDebug;
 
    my $path = "$subdir/$scaffoldInfoFile";
    open(my $fh, '<', $path)
        or confess "Can't open $path ($!)\n";
    
    while (my $line = <$fh>) {
        chomp $line;
        next if $line !~ /\S/;

        my @fields = split /\s+/, $line;

        # columns 3 and 5 hold the left and right contig names
        ($leftContig, $rightContig) = @fields[3, 5];
    }
 
    close $fh;
 
    return $leftContig, $rightContig;
}

#============================================================================#
# Decide whether the assembly failed by looking at its captured stderr file.
#
# Returns 0 when $file is missing or empty (treated as success). Otherwise
# the whole file is read, passed to logError, and 1 is returned.
# Confesses when a non-empty file cannot be opened.
sub checkError {
    my ($file) = @_;
 
    # if the assembly was successful the 2> file will be empty
    #   
    return 0 unless -s $file;

    print "assembly FAILED!!\n" if $optDebug;

    # read the whole file in one go; $/ is only undefined inside the do block
    #
    open(my $fh, '<', $file) or confess "Can't open file $file ($!)\n";
    my $fileContents = do { local $/ = undef; <$fh> };
    close $fh;

    # log contents (error)
    #        
    logError($fileContents);
    return 1;
}

#============================================================================#
# Report whether the directory $path is empty.
#
# Returns 0 as soon as an entry other than "." and ".." is found, and 1 when
# the directory has no such entry. A directory that cannot be opened is
# reported as empty (1), so callers simply leave it out.
sub checkDirContents
{
    my ($path) = @_;

    print "Checking Dir $path..." if $optDebug;

    my $dh;
    unless ( opendir $dh, $path ) {
        print "returning 1\n" if $optDebug;
        return 1;
    }

    while ( defined( my $entry = readdir $dh ) ) {
        next if $entry eq '.' || $entry eq '..';
        closedir $dh;
        print "returning 0\n" if $optDebug;
        return 0;
    }
    closedir $dh;
    print "returning 1\n" if $optDebug;
    return 1;
}

#============================================================================#
# Build the command-line fragment listing the fasta files found in $dir.
#
# Every directory entry that neither starts with "." nor ends in "qual" is
# kept. Each kept name is appended directly to $dir (so $dir is expected to
# end with a path separator) and preceded by a single space. Names are
# sorted so the resulting string does not depend on readdir order.
#
# Returns the fragment, or the empty string when no entry qualifies.
# Confesses when $dir cannot be opened.
sub getFastasInDir {
    my ($dir) = @_;
    
    print "Grabbing Fastas from $dir..." if $optDebug;

    opendir(my $dh, $dir) or confess "Can't open directory $dir: $!\n";

    my @files = sort grep { !/^\.|qual$/ } readdir $dh;
    
    closedir $dh;
    
    return join '', map { " $dir$_" } @files;
}







