#!/usr/bin/env perl

=head1 NAME

shredFasta.pl

=head1 SYNOPSIS

  shredFasta.pl [options] <fastaFile> <outputFile>

  Options:
  -l <number>           Fragment length (required)
  -o <number>           Fragment overlapping length (required)
  -q                    Specify this if <fastaFile> is the quality file
  -qmax <number>        In conjunction with the -q option, specify the maximum
                        consensus qualtiy (optional)
  -qoverlap <number>    In conjunction with the -q option, specify the maximum qualtiy
                        value at overlapping ends (optional)
  -h                    Help message (optional)

=head1 DESCRIPTION

This script is used to chop sequences within a fasta file into overlapping
fragments.  A fasta containing multiple entries is allowed.  Each fragment 
is separated by a fasta header tag named using the original read name with
an iteration number appened to the end (using a '_' delimiter).

$Revision: 1.4 $

$Date: 2009-10-08 17:26:38 $

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

S.Trong 2008/11/13 creation

=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use Carp;
use Carp qw(cluck);
use Cwd;
use Cwd qw(abs_path);
use File::Path;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Newbler::Scaffolds454;
use PGF::Parsers::FastaParse;
use PGF::Utilities::FastaSeq qw(formatSeq);
use vars qw( $optHelp $optFragmentLength $optFragmentOverlap $optFileIsQual
    $optQualityAtOverlap $optMaxQuality);

#============================================================================#
# INPUT VALIDATION
#============================================================================#
if( !GetOptions(
        "l=n"=>\$optFragmentLength,
        "o=n"=>\$optFragmentOverlap,
        "q"=>\$optFileIsQual,
        "qoverlap"=>\$optQualityAtOverlap,
        "qmax"=>\$optMaxQuality,
        "h"=>\$optHelp,
    )
|| !@ARGV ) {
    usage();
}

usage(2) if $optHelp;

$optQualityAtOverlap = -1 if !$optQualityAtOverlap;
$optMaxQuality = -1 if !$optMaxQuality;

#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#
my $DEBUG = 0;

if ( @ARGV != 2 ) {
    print STDERR "Required inputs are missing.\n";
    usage();
}

my ($inputFile, $outputFile) = @ARGV;

#============================================================================#
# VALIDATE INPUTS
#============================================================================#

if ( !-s $inputFile) {
    print STDERR "The input file does not exist or is zero size.\n";
    exit 1;
}

if ( !$optFragmentLength  ){
    print STDERR "The -l option must be defined.\n";
    usage();
    exit 1;
}


if ( !$optFragmentOverlap){
    print STDERR "The -o option must be defined.\n";
    usage();
    exit 1;
}

if ( $optFragmentLength <= $optFragmentOverlap ) {
    print STDERR "The fragment length (-l option) must be greater than the fragment overlap (-o option).\n";
    usage();
    exit 1;
}

#============================================================================#
# MAIN
#============================================================================#

my $objFastaParser = new PGF::Parsers::FastaParse($inputFile);

open FH, ">$outputFile" or confess "ERROR: failed to create file $outputFile: $!|";

while ($objFastaParser->MoreEntries) {
    $objFastaParser->ReadNextEntry( -rawFormat=>0 );
    my $fastaTag = $objFastaParser->Tag;
    my $fastaName = $objFastaParser->Name;
    my $fastaComment = $objFastaParser->Comment;
    my $seq = $objFastaParser->Seq;
    my $length = $objFastaParser->Length;
    my $shred = '';
    
    if ( $length > $optFragmentOverlap ) {
        if ( $optFileIsQual ) {
            $seq = shredQual($fastaName, $fastaComment, $seq, $optFragmentLength,
                $optFragmentOverlap, $optQualityAtOverlap, $optMaxQuality);
        } else {
            $seq = shredFasta($fastaName, $fastaComment, $seq, $optFragmentLength,
                $optFragmentOverlap);
        }
    } else {
        print FH "$fastaTag\n";
        $seq = formatSeq($seq, 50);
    }
    print FH "$seq\n" if length $seq;
}

close FH;

#============================================================================#
# SUBROUTINES
#============================================================================#
sub shredFasta {
    
    my $fastaName = shift;
    my $fastaComment = shift;
    my $seq = shift;
    my $fragmentLength = shift;
    my $overlapLength = shift;

    my $iter = 1;
    my $len = length($seq);
    my $start=0;
    my $shred = '';
    
    while ( $start+$fragmentLength < $len ) {
        my $readSeq = substr($seq,$start,$fragmentLength);
	$readSeq = formatSeq($readSeq, 50);
        my $header = ">${fastaName}_$iter";
           $header .= " $fastaComment" if length $fastaComment;
        $shred .= "$header\n$readSeq\n";	    
        $start += $fragmentLength-$overlapLength;
        $iter++;
    }

    if ( $start+$fragmentLength>=$len) {
        my $readSeq = substr($seq,$start,$len-$start+1);
	$readSeq = formatSeq($readSeq, 50);
        my $header = ">${fastaName}_$iter";
           $header .= " $fastaComment" if length $fastaComment;
        $shred .= "$header\n$readSeq\n";	    
    }
    
    $shred =~ s/\n+$//;
    
    return $shred;
    
}

#============================================================================#
sub shredQual {
    
    my $fastaName = shift;
    my $fastaComment = shift;
    my $qual = shift;
    my $fragmentLength = shift;
    my $overlapLength = shift;
    my $qualityAtOverlap = shift;
    my $maxQuality = shift;
    
    my $iter = 1;
    my $len = length($qual);
    
    my $shred = '';
    my $qualPosition = 0;
    my $readQual = '';
    my $overlappingQvals = '';
    my $lengthOfOverlappingQvals = 0;

    while ( $qual =~ /(\d+)/g ) {
	my $qval = $1;
	
	# if $maxQuality is not a negative number, set quality value to
	# $maxQuality if qual value exceeds this.
	#
	$qval = $maxQuality if $maxQuality >= 0 && $qval > $maxQuality;

	$readQual .= "$qval ";
	$qualPosition++;

	# check if we need to create a new fragment.
	#
	if ( $qualPosition >= ($fragmentLength * $iter) -
	     ($lengthOfOverlappingQvals * ($iter - 1))
	) {
	    
	    # new fragment is overlapping end from previous fragment +
	    # this new fragment.
	    #
	    $readQual = $overlappingQvals . $readQual;
	    chop $readQual;
	    
	    # get quality values at overlapping ends.
	    #
	    ($overlappingQvals, $lengthOfOverlappingQvals) =
		getQualityAtOverlappingEnd(
		    $readQual,
		    $fragmentLength,
		    $overlapLength,
		    $qualityAtOverlap
	    );

	    $readQual = formatSeq($readQual, 50);
	    
            my $header = ">${fastaName}_$iter";
               $header .= " $fastaComment" if length $fastaComment;
            $shred .= "$header\n$readQual\n";	    
	    $iter++;
	    $readQual = '';
	}
    }
    
    if ( length $readQual ) {
	$readQual = $overlappingQvals . $readQual;
	chop $readQual;
	$readQual = formatSeq($readQual, 50);
        my $header = ">${fastaName}_$iter";
           $header .= " $fastaComment" if length $fastaComment;
        $shred .= "$header\n$readQual\n";	    
    }
    
    $shred =~ s/\n+$//;
    
    return $shred;
    
}
    
#============================================================================#
sub getQualityAtOverlappingEnd {
    
    my $readQual = shift;
    my $fragmentLength = shift;
    my $overlapLength = shift;
    my $qualityAtOverlap = shift;
    
    my @qvalsAtReadEnd = (split /\s/, $readQual)[$fragmentLength-
	$overlapLength..$fragmentLength-1];
    my $lengthOfOverlappingQVals = scalar @qvalsAtReadEnd;
    my $qvalsAtOverlappingEnd = '';
    
    # If $qualityAtOverlap is not a negative number (e.g., -qoverlap option
    # was specified in the argument), set the quality for the overlapping
    # end to $qualityAtOverlap if the quality at the right end of the
    # input fragment is less than $qualityAtOverlap. Otherwise, assign the
    # overlapping quality as the quality of the input fragment.
    #
    if ( $qualityAtOverlap >= 0 ) {
	foreach my $qval ( @qvalsAtReadEnd ) {
	    $qval = $qualityAtOverlap if $qval > $qualityAtOverlap;
	    $qvalsAtOverlappingEnd .= "$qval ";
	}

    # If $qualityAtOverlap is negative (e.g., -qoverlap option was not
    # specified), set the quality of the overlapping end to the same quality
    # as the right end of the input fragment.
    #
    } else {
	$qvalsAtOverlappingEnd = (join " ", @qvalsAtReadEnd) . " ";
    }
    
    return $qvalsAtOverlappingEnd, $lengthOfOverlappingQVals;

}

#============================================================================#
sub usage {
    my $verbose = shift || 1;
    pod2usage(-verbose=>$verbose);
    exit 1;
}

#============================================================================#
