#!/usr/bin/env perl

=head1 NAME

non454Reads2Fasta.pl

=head1 SYNOPSIS
  
  non454Reads2Fasta.pl   -si       <sffinfo.txt file>
                         -gs       <path to gap subproject>
                         -od       <outputdir name>
                         -d        debug mode
                         -h        this help menu

                         readlistfile1 readlistfile2 ... etc

  Note:all given options are required except help


=head1 DESCRIPTION

    This script takes as input a readlist or readlist to retrieve.
    The read id to be retrieved must be in the 1st (whitespace 
    separated) column of the readlist(s) supplied on the command line.

    The non sff files specified in the -si <sffinfo.txt> file are searched
    and the fasta/quals depsited in a multi fasta file, named after the 
    input file specified in the sffinfo.txt file, in a specified output
    directory -od in the gap subproject -gs.

    The sffinfo.txt file contains the following 3 tab delim column format 
    libraryName fastaFileLocation P||U (for paired or unpaired data) 

    Note: Newbler ace file information (eg. _left,_right,.pr)
          is stripped from the read id to enable use of readlists
          generated by NewblerAce2ReadPair.pl


    Options are as follows:
    -si <sffinfo.txt file>
    -gs path to gap subproject.  
    -od Output Fasta/Quals will be deposited in this directory in 
        the gap subproject directory.



    space separated command line arguments are the read lists.

=head1 VERSION

$Revision: 1.7 $

$Date: 2010-01-07 19:24:12 $

=head1 HISTORY

=over

=item *

Kurt M. LaButti 2009/08/11 Creation

=back

=cut

# Includes



use strict;
use Carp;
use vars qw( $RealBin $optDebug $optSffInfo $optGapSubproject $optOutputDir $opth );
use FindBin qw($RealBin);
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use lib "$RealBin/../lib";
use File::Path;
use File::Basename;

#============================================================================#
# INPUT VALIDATION
#============================================================================#

if( !GetOptions(
		"si=s"=>\$optSffInfo,
		"gs=s"=>\$optGapSubproject,
		"od=s"=>\$optOutputDir,
		"d"=>\$optDebug,
		"h"=>\$opth,
		)
) {
    printHelp();
}

pod2usage(-verbose=>2) and exit 1 if defined $opth;

# Check ARGVs
#
if (@ARGV < 1){
    print STDERR "no input read files given\n";
    printHelp();
}
else{
    foreach my $file (@ARGV){
	if (! -e $file){
	    print STDERR "$file given on command line not accessible\n";
	    printHelp();
	}
    }
}

# Validate all options
#

if (! defined $optSffInfo || ! defined $optGapSubproject || ! defined $optOutputDir ){
    printHelp();
}

if (! -s $optSffInfo){
    print STDERR "$optSffInfo is not found\n";
    printHelp();    
}

if ( ! -d $optGapSubproject){
    confess "output directory: $optGapSubproject does not exist\n";
}

if ( ! -d "$optGapSubproject/$optOutputDir") {
    mkpath("$optGapSubproject/$optOutputDir");
}
if ( ! -w "$optGapSubproject/$optOutputDir") {
    confess "$optGapSubproject/$optOutputDir Not writable\n";
    printHelp();
}

#============================================================================#
# MAIN
#============================================================================#

# set script to pull fasta
#
my $script = "$RealBin/fastaParser.pl";   

# parse sffinfo file for sanger type libs
#
my $A_fastaLocs = getSangerFastaLocs($optSffInfo);

# execute the next code only if there is non 454 data
#
unless ($#$A_fastaLocs < 0) {
        
    # get non 454 reads from the reads lists and store them in tmp files
    #
    my $A_tmpReadsLists = parseNon454Reads(\@ARGV);
    
    # print fasta and quals specified in each tmpFile for each library
    #
    unless ($#$A_tmpReadsLists < 0) {
	printFastaQuals($script, $optGapSubproject, $optOutputDir, $A_fastaLocs, $A_tmpReadsLists);
	unless ($optDebug) { cleanup($A_tmpReadsLists); }
    }
}

#============================================================================#
sub printHelp {
     my $optVerbose = shift || 1;
    pod2usage(-verbose=>$optVerbose);
    exit 1;
}

#============================================================================#
sub cleanup {
    my ($A_readLists) = @_;
    foreach my $readsList (@$A_readLists){
	unlink $readsList or confess "Can't cleanup $readsList: $!\n";	
    }
}

#============================================================================#
sub parseNon454Reads {
    my ($A_readLists) = @_;
    my %seq;
    my (@tmpFiles,@F,$read);
    foreach my $readsList (@$A_readLists){
	open (IN , $readsList) or confess "can't open $readsList\n";
	my @reads;
	while(<IN>){
	    chomp;
	    @F = (split/\s+/,$_);
	    

	    next if ($F[0] =~ /\w{14}((\.\d+\-\d+)|(\_(right|left)))?$/ ); #added $ 7jan10
	    
	    
	    print "non454Read: $read\n" if $optDebug;

	    ($read = $F[0]) =~ s/\_(?:left|right).*$//;
	    $read =~ s/\.\d+\-\d+.*$//;
	    $read =~ s/\.pr\d+.*$//;
	    
	    push(@reads, $read);
	    
	}

	close(IN);
	
	if ($#reads >=0) {
	    my $tmpFile = "${readsList}.non454.tmp";
	    push(@tmpFiles, $tmpFile);
	    open TMP, ">$tmpFile" or confess "can't open $tmpFile: $!\n";
	    print TMP join "\n", @reads;
	    print TMP "\n";    
	    close TMP;
	}
    }
    return \@tmpFiles;    
}

#============================================================================#
sub printFastaQuals {
    my ($script, $gapSubproject, $outputDir, $A_fastaLocs, $A_tmpReadsLists) = @_;
    my $cmd;
    
    foreach my $fastaLoc (@$A_fastaLocs) {
	my ($library, $location, $isPaired) = split /\s+/, $fastaLoc;
	my $outputFastaFile = "$gapSubproject/$outputDir/$library";
	my $outputQualFile  = "$gapSubproject/$outputDir/${library}.qual";
	#clear any pre existing file
	open OF, ">$outputFastaFile" or confess "can't open $outputFastaFile: $!\n";
	open OQ, ">$outputQualFile"  or confess "can't open $outputQualFile: $!\n";

	foreach my $tmpReadsList (@$A_tmpReadsLists) {
	    if (-s "${location}.qual") {
		$cmd = "$script -f $location -i $tmpReadsList >> $outputFastaFile";
		print "$cmd\n" if ($optDebug);	 
		system($cmd);
		$cmd = "$script -f ${location}.qual -i $tmpReadsList >> $outputQualFile";
		print "$cmd\n" if ($optDebug);	 
		system($cmd);
	    } else {
		confess "can't open $outputQualFile: $!\n";
	    }
	}
	close OF;
	close OQ;
   #delete output if they are empty (no reads found)
	unlink $outputFastaFile unless (-s $outputFastaFile);
	unlink $outputQualFile  unless (-s $outputQualFile);
	
    }
}

#============================================================================#
sub getSangerFastaLocs {
   my  $sffInfoFile = shift @_;
   my @sangerLocs;
   open (SFFINFO,"$sffInfoFile") or die "Can't open $sffInfoFile\n";
    while (my $line = <SFFINFO>) {
	chomp $line;	
	my ($lib, $location, $isPaired) = split /\s+/, $line;	
	next if ($lib =~ /sff$/);
	print "SANGER=$line\n";
	push (@sangerLocs, $line);
    }
    close (SFFINFO);
   return \@sangerLocs;
}

#============================================================================#
sub seq2fasta{
    my $seq = $_[0];
    $seq =~ s/(.{50})/$1\n/g;
    ($seq .= "\n") =~ s/\n+$/\n/;
    return $seq;
}

#====================================================================================
sub qual2fasta{
    my $qual = $_[0];
    $qual =~ s/^\s*/ /;
    $qual =~ s/((?:\s\d+){50})/$1\n/g;
    ($qual .= "\n") =~ s/(?:\n\s|\n+)/\n/g;
    $qual =~ s/^\s*//;
    return $qual;
}

exit 0;
