#!/usr/bin/env perl

=head1 NAME

  idContigRepeats.pl - Identifies unique/repeat boundary for contigs flanking a 
                      gap and generates unique sequence tags that can be used 
                      to check for gap closure.

=head1 SYNOPSIS

  idContigRepeats.pl [-h] -scaff <454 scaff file> -ace <acefile> -subdirs <list> -lib <lib info file> -od <path>

  Options:
  -h        detailed message (optional)
  -scaff    454Scaffolds.txt file
  -lib      librart info file
  -ace      path to acefile  (required)
  -od       output directory (required) (where assemInfo and gapdirs reside)
  -subdirs  list of gap subdirectories (required)
            *subdirs should contain a file detailing the following in 
            tab delimited format:
              1. subproject name
              2. est. gap size
              3. left contig size
              4. left contig name
              5. right contig size
              6. right contig name
              7. scaffold name

=head1 DESCRIPTION

This program is a wrapper that calls on several components to identify the 
unique/repeat boundary for contigs in a subproject.  This boundary is used 
by subsequent software to identify pools of reads for use in reassembly, 
and for primer design.  A Subproject represents the contig information 
flanking a single gap in a scaffold and should contain a file that details 
the following in a tab delim format:

    1. subproject name
    2. est. gap size
    3. left contig size
    4. left contig name
    5. right contig size
    6. right contig name
    7. scaffold name

The input to this program is the newbler 454Scaffolds.txt, library info file,
acefile, and a list of subdirectories.  As previously mentioned the subdirectories 
should contain a text file detailing the contig information flanking a gap.  
The Scaffolds.txt file is used to reverse complement individual contig fasta if it 
is in the negative orientation in the scaffold.  

#ace2contigs
Fasta and Qual sequence for the -ace <acefile> is generated by calling 
ace2contigs and is deposited in a configurable location (see config section 
below).  
(See component help menu for further details.)

#fasta2MegaBlastDb.pl
The Fasta sequence for the acefile is then used to create a blast database
by calling fasta2MegaBlastDb.pl.  The database is deposited in the same 
location as the acefile Fasta seq. 
(See component help menu for further details.)

#fastaParser.pl
The scaffInfo.txt file in each subdirectory listed in -subdirs <list>  is 
parsed and fastaParser.pl is used to create the Fasta sequence for each contig.
If the contig is in the - orientation it will be reverse complemented. 
(See component help menu for further details.)

#idRepeatBoundary.pl
The blast database and contig fastas are then used by idRepeatBoundary.pl.  The
 contig fasta is aligned to the database and results are parsed for repeats 
that meet configurable thresholds.  A configurable sliding window is used to 
check for the presence of repeat nearest the gap.  If repeat is identified the
 window keeps sliding away from the gap until a configurable amount of unique 
sequence is found.  This defines the unique/repeat boundary that is used to 
determine which data can be trusted for reassembly and for primer design. 
(See component help menu for further details.)

The output is a subDirectory/<contigname>.boundary file and a 
subDirectory/<contigname>.anchor file deposited in each subdirectory 
in -subdirs <list>.

<contigname>.boundary:
#uniqueStart    uniqueEnd       repeatStart     repeatEnd
51      39630   1       50

<contigname>.anchor:
>contig00013
GTCGAGCGGGATGGTGCCGGTCTCGCCGATCCCCTGCCACGCCACCGTCC


A default config file named gapRes.config residing in <installDir/config> is 
used to specify the name and location of the software components as well as 
options for each component.  To specify your own config file, set the 
environmental variable GAP_RES_CONFIG to the path and name of the custom 
config file.

The config parameters used by idContigRepeats.pl are as follows:
(components are identified by "script.").

# idContigRepeats.pl
#
script.createFastaFromAce=ace2contigs
script.createRefBlastDb=fasta2MegaBlastDb.pl
script.createContigFasta=parsefasta2
script.identifyRepeatBoundary=idRepeatBoundary.pl
script.blastLocation=/usr/X11R6/bin/

#**NOTE**
#The blast location can also be defined by the environmental 
#variable BLAST_LOC;
#********

idRepeatBoundary.aligner=blastall 
idContigRepeats.cleanupTmpFiles=1
idContigRepeats.outputLogDir=assemInfo
idContigRepeats.scaffFileName=scaffInfo.txt
idContigRepeats.bondaryFileExt=.boundary
idContigRepeats.anchorFileExt=.anchor

# fasta 2 aligner Db config
#
fasta2MegaBlastDb.formatDbOptions=-p F -o -i

# idRepeatBoundary.pl
#
idRepeatBoundary.blastOptions=-p blastn -F F -e 1e-5
idRepeatBoundary.repeatLength=100
idRepeatBoundary.repeatIdentity=95
idRepeatBoundary.windowLength=500
idRepeatBoundary.subWindowLength=100
idRepeatBoundary.windowStep=250
idRepeatBoundary.uniqueAnchorLength=50
idRepeatBoundary.boundaryPadLength=50
idRepeatBoundary.aligner=blastall 

# ace2contigs
#
ace2contigs.outputFileName=454AllContigs
ace2contigs.options=-q 


=head1 VERSION

$Revision: 1.25 $

$Date: 2010-01-07 19:42:23 $

=head1 AUTHOR(S)

=head1 HISTORY

=over

=item *

Kurt M. LaButti 2008/10/31 creation

=item *

S.Trong 2009/08/05 - added ability to skip and create warnings file if sub project fails.

=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Cwd;
use Cwd qw(abs_path);
use Carp;
use Carp qw(cluck);
use Getopt::Long;
use File::Path;
use File::Copy;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Utilities::RunProcess qw(runProcess);
use PGF::Utilities::Properties;
use PGF::Utilities::RunProcess qw(runProcess);
use PGF::Utilities::Logger;
use PGF::Newbler::Scaffolds454;
use PGF::GapResolution::Warnings;
use vars qw(  $optHelp $optDebug $optInputSubdirList $optOutputDir 
	      $optLibraryInfoFile $optInputAcefile $optInputScaffoldFile);

#============================================================================#
# INPUT VALIDATION
#============================================================================#
my $programExecution = abs_path(dirname($0))."/".basename($0)." @ARGV";


if( !GetOptions(
		"h"           => \$optHelp,
		"d"           => \$optDebug,
                "od=s"        => \$optOutputDir,
		"subdirs=s"   => \$optInputSubdirList,
		"lib=s"       => \$optLibraryInfoFile,
		"ace=s"       => \$optInputAcefile,
		"scaff=s"     => \$optInputScaffoldFile
		)
    ) {
    printhelp(1);
}

if ( defined $optHelp ) {
    printhelp(2);
}


printhelp() if !$optInputAcefile;
printhelp() if !$optInputSubdirList;
printhelp() if !$optInputScaffoldFile;
printhelp() if !$optLibraryInfoFile;
printhelp() if !$optOutputDir;


#============================================================================#
# VALIDATE INPUTS
#============================================================================#
    
# validation code goes here...
if ( !-s $optInputSubdirList ) {
    confess "Input subdirectory list $optInputSubdirList does not exist or is empty.\n";
}

if ( !-s $optInputAcefile ) {
    confess "Input acefile $optInputAcefile does not exist or is empty.\n";
}

if ( !-s $optInputScaffoldFile ) {
    confess "Input scaffold file $optInputScaffoldFile does not exist or is empty.\n";
}

if ( !-e $optOutputDir ) {
    confess "Output directory $optOutputDir does not exist or is empty.\n";
}

if ( !-s $optLibraryInfoFile ) {
    confess "Input library file $optLibraryInfoFile does not exist or is empty.\n";
}
#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#

#initialize objects
#
my $configFile = defined $ENV{GAPRES_CONFIG} ?
    $ENV{GAPRES_CONFIG} : "$RealBin/../config/gapRes.config";
my $outputDir = getcwd;
my $propertyObj = PGF::Utilities::Properties->new(-configFile=>$configFile);
   $propertyObj->setExceptionIfEntryNotFound(1); # confess if entry in config file
                                                 # is not found.
my $OBJ_LOGGER = PGF::Utilities::Logger->new();
my $OBJ_SCAFF  = PGF::Newbler::Scaffolds454->new("$optInputScaffoldFile");
my $OBJ_WARNINGS = PGF::GapResolution::Warnings->new(
    path=>$outputDir, logger=>$OBJ_LOGGER);

#get config file scripts and params
#
my $assemInfoDir = $propertyObj->getProperty("createGapResProject.assemInfoDirName");
   $assemInfoDir = "$optOutputDir/$assemInfoDir" if $assemInfoDir !~ /\//;
# prepend output dir to assem dir if path is not specified in $assemInfoDir.
my $logfile      = "$optOutputDir/".basename($0).".log";

my $blastLoc     =  $propertyObj->getProperty("script.blastLocation"); 
$ENV{BLAST_LOC}  = $blastLoc;

my $outputAllFasta       = $propertyObj->getProperty("ace2contigs.outputFileName"); 
my $ace2contigsOptions   = $propertyObj->getProperty("ace2contigs.options"); 
my $scaffoldInfoFileName = $propertyObj->getProperty("idContigRepeats.scaffFileName"); 
my $cleanupTmpFiles = $propertyObj->getProperty("idContigRepeats.cleanupTmpFiles"); 
my $boundaryFileExt = $propertyObj->getProperty("idContigRepeats.boundaryFileExtension");
my $anchorFileExt   = $propertyObj->getProperty("idContigRepeats.anchorFileExtension");
my $formatDbOptions = $propertyObj->getProperty("fasta2MegaBlastDb.formatDbOptions"); 
my $blastAligner    = $propertyObj->getProperty("idRepeatBoundary.aligner");
my $blastOptions    = $propertyObj->getProperty("idRepeatBoundary.blastOptions");
my $repeatLength    = $propertyObj->getProperty("idRepeatBoundary.repeatLength");
my $repeatIdentity  = $propertyObj->getProperty("idRepeatBoundary.repeatIdentity");
my $windowLength    = $propertyObj->getProperty("idRepeatBoundary.windowLength");
my $subWindowLength = $propertyObj->getProperty("idRepeatBoundary.subWindowLength");
my $windowStep      = $propertyObj->getProperty("idRepeatBoundary.windowStep");
my $anchorLength    = $propertyObj->getProperty("idRepeatBoundary.uniqueAnchorLength");
my $padLength       = $propertyObj->getProperty("idRepeatBoundary.boundaryPadLength");


my %optionsHash = (
		   'scaffInfoFile'  => $scaffoldInfoFileName,
		   'blastLoc'       => "$blastLoc/$blastAligner",
		   'blastOptions'   => $blastOptions,
		   'outputAllFasta' => $outputAllFasta,
		   'repeatLength'   => $repeatLength,
		   'libraryInfoFile'=> $optLibraryInfoFile,
		   'repeatIdentity' => $repeatIdentity,
		   'windowLength'   => $windowLength,
		   'subWindowLength'=> $subWindowLength,
		   'windowStep'     => $windowStep,     
		   'anchorLength'   => $anchorLength,   
		   'padLength'      => $padLength,      
		   'boundaryFileExt'=> $boundaryFileExt,
		   'anchorFileExt'  => $anchorFileExt,  
		   'assemInfoDir'   => $assemInfoDir
		   );

#============================================================================#
# INITIALIZATION
#============================================================================#

# Set path for logging.
#
setFileForLogging($logfile);

# Log execution into log file.
#
logExecution($programExecution);

#============================================================================#
# MAIN
#============================================================================#


#validate subdirectories
#
my $subdirArray = validateSubdirectories ($scaffoldInfoFileName, $optInputSubdirList);

if ($optDebug) {
    print "subDirList:\n";
    foreach my $dir (@$subdirArray) { print "$dir\n"; }
    print "\n";
}

#create ace fasta
#                                   #script  #options
runMyScript("script.createFastaFromAce", "$ace2contigsOptions $optInputAcefile $assemInfoDir/$outputAllFasta");

#create ace fasta database
#
runMyScript("script.createRefBlastDb", "-o \'$formatDbOptions\' -if $assemInfoDir/${outputAllFasta}.fasta");

#create fasta for each contig in each subdir and identify the repeat boundary
#for each
#
analyzeSubdir (\%optionsHash, $subdirArray);

# If warning messages present, then save in .warnings.out file.
#
$OBJ_WARNINGS->createFile() if $OBJ_WARNINGS->getNumberOfWarnings;

exit 0;

#============================================================================#
# SUBROUTINES
#============================================================================#
sub runMyScript {
    
    my ($configScript, $params) = @_;
    
    my $script = getScript("$configScript"); 
    my $cmd = "$script $params";
    my %processInfo = runCommand($cmd);
    
    checkProcess(%processInfo);
    
}
    
#============================================================================#
sub setFileForLogging {
    
    my $logFile = shift;
    
    $OBJ_LOGGER->setLogOutFileAppend($logFile);
    $OBJ_LOGGER->setLogErrorFileAppend($logFile);
    
}

#============================================================================#
sub getScript {
    
    my $scriptType = shift;
    
    my $script = $propertyObj->getProperty("$scriptType");
       $script = "$FindBin::RealBin/$script" if
            $script !~ /\//; # add path to script if not specified in config file
            
    # Check if script exists.
    #
    if ( !-e $script ) {
        my $errMsg = "ERROR: cannot find script $script defined in config file as $scriptType.\n";
        print $errMsg;
        logError($errMsg, 1);
    }
    
    return $script;
    
}

#============================================================================#
sub checkProcess {
    
    my %processInfo = @_;
    
    if ( $processInfo{exitCode} ) {
        my $errMsg = '';
        if ( defined $processInfo{logStdoutMessage} &&
            $processInfo{logStdoutMessage} ) {
            $errMsg .= $processInfo{stdoutMessage};
        }
        $errMsg .= $processInfo{stderrMessage};
        logError($errMsg,1);
    }
    
}

#============================================================================#
sub runCommand {
   
    my $cmd = shift;
    
    # Execute command, capture stderr, stdout, exitcode
    #
    my $errMessage = "CMD: $cmd\n";
    
    print "Running $cmd ...\n\n";
    logOutput($cmd);
    
    my %processInfo = runProcess($cmd,
        {-checkExecutable=>0,
        }
    );
    
    return %processInfo;
    
}

#============================================================================#
sub logExecution {
    
    my $programExecution = shift;
    
    my $msg = "Command: ".$programExecution."\n".
              "Current directory: ".getcwd;
    logOutput($msg);
}

#============================================================================#
sub logError {
    my $message = shift;
    my $confess = shift || 0;
    
    $OBJ_LOGGER->logError($message);
    $OBJ_WARNINGS->add($message);
    
    if ( $confess ) {
        $OBJ_WARNINGS->createFile() if $OBJ_WARNINGS->getNumberOfWarnings;
        confess $message;
    }
    
}
    
#============================================================================#
sub logOutput {
    my $message = shift;
    
    $OBJ_LOGGER->logOut($message);
}
    
#============================================================================#
sub printhelp {
    my $verbose = shift || 1;
    pod2usage(-verbose=>$verbose);
    exit 1;
}

#============================================================================#
sub validateSubdirectories {
 
    my ($scaffoldInfoFileName, $fileOfDirs) = @_;
    my $ref_to_A;
    
    print "$scaffoldInfoFileName $fileOfDirs\n" if $optDebug;


    unless( open DIRLIST, "$fileOfDirs" ) {
        my $errMsg = "ERROR: failed to open file $fileOfDirs.\n";
        logError($errMsg, 1);
    }
    
    while (my $dir = <DIRLIST>) {
	chomp $dir;

	if ( !-e $dir) {
            my $errMsg = "Directory $dir does not exist.\n";
            logError($errMsg, 1);
        }

	if ( !-e "$dir/$scaffoldInfoFileName") {
            my $errMsg = "File $scaffoldInfoFileName does not exist.\n";
            logError($errMsg, 1);
        }

        push @$ref_to_A, $dir;
    }
    close DIRLIST;
    
    return $ref_to_A; 
}

#============================================================================#
sub readScaffInfo {
    
    my ($subdir, $scaffoldInfoFileName) = @_;
    my $leftContig;
    my $rightContig;
    
    print "readScaffInfo: $subdir $scaffoldInfoFileName\n" if $optDebug;

    open FILE, "$subdir/$scaffoldInfoFileName" 
	or confess "Can't open $subdir/$scaffoldInfoFileName ($!)\n";
    
    while (my $line = <FILE>) {
	chomp $line;
	my ($gapName,
	    $gapSize,
	    $leftContigSize,
	    $leftContigName,
	    $rightContigSize, 
	    $rightContigName,
	    $scaffoldName) = split /\s+/, $line;
	
	$rightContig = $rightContigName;
	$leftContig = $leftContigName; 
    }

    close FILE;

    return $leftContig, $rightContig;
}



#============================================================================#
sub analyzeSubdir {
    my ($optionsHash, $subDirArray) = @_;    

#$optionsHash = (
#		   'scaffInfoFile'  => $scaffoldInfoFileName,
#		   'blastLoc'       => "$blastLoc/$blastAligner",
#		   'blastOptions'   => $blastOptions,
#		   'outputAllFasta' => $outputAllFasta,
#		   'repeatLength'   => $repeatLength
#		   'libraryInfoFile' => $optLibraryInfoFile,
#		   'repeatIdentity' => $repeatIdentity,
#		   'windowLength'   => $windowLength,
#		   'subWindowLength'=> $subWindowLength,
#		   'windowStep'     => $windowStep,     
#		   'anchorLength'   => $anchorLength,   
#		   'padLength'      => $padLength,      
#		   'boundaryFileExt'=> $boundaryFileExt,
#		   'anchorFileExt'  => $anchorFileExt,  
#                  'assemInfoDir'      => $assemInfoDir
#		   );

    
    print "analyzeSubdir: [$optionsHash->{scaffInfoFile}][$optionsHash->{blastLoc}][$optionsHash->{blastOptions}][$optionsHash->{outputAllFasta}] $optionsHash->{assemInfoDir}\n" if $optDebug;
    print join "\n", @$subDirArray if $optDebug;    
    print "\n" if $optDebug;    


    foreach my $subdir (@$subdirArray) {
	
	chomp $subdir;
	
	my ($leftContig, $rightContig) = readScaffInfo ($subdir, $optionsHash->{scaffInfoFile});

	my $c = 0;
	
	for ($leftContig, $rightContig ) {

	    my $end = $c == 0 ? "L" : "R";
	    
	    #build idRepeatBondary command line
	    #
	    my $options = 
		"-id $optionsHash->{assemInfoDir}/$optionsHash->{outputAllFasta}.fasta " 
		."-if $subdir/${_}.fasta " 
		."-e $end " 
		."-k "
		."-li $optionsHash->{libraryInfoFile} " 
		."-blastcmd \'$optionsHash->{blastLoc} $optionsHash->{blastOptions}\' " 
		."-bo $subdir/${_}$optionsHash->{boundaryFileExt} " 
		."-ao $subdir/${_}$optionsHash->{anchorFileExt} " 
		."-rl $optionsHash->{repeatLength} " 
		."-ri $optionsHash->{repeatIdentity} " 
		."-wl $optionsHash->{windowLength} " 
		."-ws $optionsHash->{windowStep} " 
		."-sl $optionsHash->{subWindowLength} " 
		."-pl $optionsHash->{padLength} "
		."-al $optionsHash->{anchorLength} ";

	    #determine which orientation the contig is in the scaffold
	    #
	    my $orientation = $OBJ_SCAFF->getContigOrientation("$_");
	    
	    my $commandLine = 
		($orientation =~ /\-/) ? "-r -f $optionsHash->{assemInfoDir}/$optionsHash->{outputAllFasta}.fasta $_ -o $subdir/${_}.fasta"
		: "-f $optionsHash->{assemInfoDir}/$optionsHash->{outputAllFasta}.fasta  $_ -o $subdir/${_}.fasta";
	    
	    runMyScript("script.createContigFasta", "$commandLine"); 
	    runMyScript("script.identifyRepeatBoundary",$options);    
			

	    
	    #check for expected output  (anchor will not exist for all repeat contigs)
	    #
	    unless (-s "$subdir/${_}.anchor")   { warn "$subdir/${_}.anchor does not exist or is file size 0\n";}      #update 5jan2010
	    unless (-s "$subdir/${_}.boundary") { confess "$subdir/${_}.boundary does not exist or is file size 0\n";}      
	    
	    ++$c;	
	    
	}
    }
    
    
    
}




