#!/usr/bin/env perl

=head1 NAME

reads2fasta.pl

=head1 SYNOPSIS
  
  reads2fasta.pl      -pd       <phd_dir>
                      -of       <path to fasta output file>
                      -oq       <path to qual output file>
                      -help

                      readlistfile1 readlistfile2 ... etc

  Note:all given options are required except help


=head1 DESCRIPTION

    This script takes as input a readlist or readlist to retrieve.
    The read id to be retrieved must be in the 1st (whitespace 
    separated) column of the readlist(s) supplied on the command line.

    If the phd file for the read exists in the phd_dir specified
    by -pd by the command line, the fasta and quals are written.

    Note: Newbler ace file information (eg. _left,_right,.pr)
          is stripped from the read id to enable use of readlists
          generated by NewblerAce2ReadPair.pl


    Options are as follows:
    -pd phd_dir where the phd files are kept
    -of path to output fasta file.  This file will contain 
        the fasta sequences.
    -of path to output qual file.  This file will contain 
        the quality scores.

    space separated command line arguments are the read lists.

=head1 VERSION

$Revision: 1.3 $

$Date: 2009-08-26 17:18:15 $

=head1 HISTORY

=over

=item *

Brian Foster 2008/12/12 Creation

=back

=cut

# Includes



use strict;
use Carp;
use vars qw( $RealBin $optphddir $optofasta $optoqual $opth );
use FindBin qw($RealBin);

use Getopt::Long;
use Pod::Usage;
use lib "$RealBin/../lib";
use File::Path;
use File::Basename;

#============================================================================#
# INPUT VALIDATION
#============================================================================#

if( !GetOptions(
		"pd=s"=>\$optphddir,
		"of=s"=>\$optofasta,
		"oq=s"=>\$optoqual,
		"h"=>\$opth,
		)
) {
    printHelp();
}

pod2usage(-verbose=>2) and exit 1 if defined $opth;

# Check ARGVs
#
if (@ARGV < 1){
    print STDERR "no input read files given\n";
    printHelp();
}
else{
    foreach my $file (@ARGV){
	if (! -e $file){
	    print STDERR "$file given on command line not accessible\n";
	    printHelp();
	}
    }
}

# Validate all options
#

if (! defined $optphddir || ! defined $optofasta || ! defined $optoqual ){
    printHelp();
}
if (-e $optofasta || -e $optoqual){
    confess "$optofasta or $optoqual already exist\n";
    printHelp();
}
$optphddir =~ s/\/$//;
if (! -d $optphddir){
    print STDERR "$optphddir is not found\n";
    printHelp();    
}

if ( ! -d dirname($optofasta)){
    mkpath (dirname($optofasta));
}
if ( ! -w dirname($optofasta)){
    print STDERR dirname($optofasta), "Not writable\n";
    printHelp();
}

if ( ! -d dirname($optoqual)){
    mkpath (dirname($optoqual));
}
if ( ! -w dirname($optoqual)){
    print STDERR dirname($optoqual), "Not writable\n";
    printHelp();
}

#============================================================================#
# MAIN
#============================================================================#

# get hash of phd files key = read id , value = full path to phd
#
opendir(DIR, "$optphddir")or confess "Cannot opendir $optphddir $!";
# note what about phd.2 etc
my %phdfiles =  map{$_=~/^(.*)\.phd\.\d+$/,"${optphddir}/$_"} grep {$_=~/^.*\.phd\.\d+$/}  readdir(DIR);
closedir(DIR);

# get read/phd info
#
my %seq;
my (@F,$read);
foreach my $file (@ARGV){
    open (IN , $file) or confess "can't open $file\n";
    while(<IN>){
	chomp;
	@F = (split/\s+/,$_);
	($read = $F[0]) =~ s/\_(?:left|right).*$//;
	$read =~ s/\.\d+\-\d+.*$//;
	$read =~ s/\.pr\d+.*$//;
	if (exists $phdfiles{$read}){
	    ($seq{$read}{seq},$seq{$read}{qual}) = _parsePhd($phdfiles{$read});
	}
    }
    close(IN);
}
# print fasta and quals if found
#
if (scalar(keys %seq)){
    open(FASTA,">$optofasta") or die "Can't open $optofasta\n";
    open(QUAL,">$optoqual") or die "Can't open $optoqual\n";
    foreach my $read (keys %seq){
	print FASTA ">$read\n", seq2fasta($seq{$read}{seq});
	print QUAL  ">$read\n", qual2fasta($seq{$read}{qual});
    }
    close(FASTA);
    close(QUAL);
}


#============================================================================#
sub printHelp {
     my $optVerbose = shift || 1;
    pod2usage(-verbose=>$optVerbose);
    exit 1;
}

#============================================================================#
sub _parsePhd{
    my ($file) = @_;
    my $parse = 0;
    my ($seq,$qual);
    my ($b,$q,$rest);
    open (IN_PHD,"$file") or die "Can't open $file\n";
    $seq = $qual = "";
    while(<IN_PHD>){
	chomp;
	if ($_ =~ /BEGIN_DNA/){
	    $parse = 1;
	    next;
	}
	next unless ($parse);
	last if ($_ =~ /END_DNA/);
	($b,$q,$rest) = split/\s+/,$_;
	$seq .= $b;
	$qual.= " " . $q;
    }
    close(IN_PHD);
    return $seq,$qual;
}

#============================================================================#
sub seq2fasta{
    my $seq = $_[0];
    $seq =~ s/(.{50})/$1\n/g;
    ($seq .= "\n") =~ s/\n+$/\n/;
    return $seq;
}

#====================================================================================
sub qual2fasta{
    my $qual = $_[0];
    $qual =~ s/^\s*/ /;
    $qual =~ s/((?:\s\d+){50})/$1\n/g;
    ($qual .= "\n") =~ s/(?:\n\s|\n+)/\n/g;
    $qual =~ s/^\s*//;
    return $qual;
}

exit 0;
