#!/usr/bin/env perl

=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=head1 NAME

spliceSites.pl - produce non-redundant exon set.

=head1 SYNOPSIS

 spliceSites.pl options < /path/to/refFlat.txt >output_file_name 

Options:

=over 4

=item *

--refSequences=s

The genome folder with assembled chromosome sequences in fasta format.

=item *

--leftFlank=i

The number of bases that should be saved from the left exon flanking the splice 
site, or shorter if the adjacent exon(s) are shorter than the flank length.
There is no default for this parameter.

=item *

--rightFlank=i

The number of bases that should be saved from the right exon flanking the splice
site, or shorter if the adjacent exon(s) are shorter than the flank length. If 
no right flank is entered, the --leftFlank value is used.

=item *

--verbose=i

Level of logging verbosity. Default is 1 (warnings and informational messages only).

=cut

#=item *
#
#--inputChromPrefix=s
#
#If specified, the value is replaced by '--outputChromPrefix' in refFlat.txt 
#chromosome names. Default value: 'chr'. Note, it is strongly recommended to
#use RefFlat / reference genome combination that does not require tweaking 
#--inputChromPrefix and --outputChromPrefix
#
#=item *
#
#--outputChromPrefix=s
#
#If specified, the value replaces the '--inputChromPrefix' in refFlat.txt chromosome 
#names. Default value: 'chr'. Note, it is strongly recommended to use RefFlat / reference 
#genome combination that does not require tweaking --inputChromPrefix and --outputChromPrefix

=item *

--ignoreEmptyResults

If specified, the script will not fail in case the result set of splice sites is empty. 
Otherwise, the script will exit with non-zero exit code and a message.

=back

=head1 DESCRIPTION

The splice junction file is a fasta file containing a non-redundant set of 
splice junctions from known RefSeq transcripts.

The splice junctions are defined as the boundaries between two exons. An example
could be splice junction AB which is the juction between exon A and exon B. An 
equal number of bases are taken from the end of exon A and the start of exon B. 
If the total length of the exon is less that this number, the length of the exon
is taken.The sequences are taken from the forward genomic strand, irrespective 
if the gene is encoded on the forward or reverse strand. These two sequences are
then concatenated to form the splice junction AB.

=head1 DIAGNOSTICS

=head2 Exit status

0: successful completion
1: abnormal completion
2: fatal error

=head2 Errors

All error messages are prefixed with "ERROR: ".

=head2 Warnings

All warning messages generated by CASAVA are prefixed with "WARNING: ".

=head1 CONFIGURATION AND ENVIRONMENT

=head1 DEPENDENCIES

=over 4

=item Standard perl modules

strict, warnings, Getopt::Long, Pod::Usage

=item External perl modules

=item CASAVA perl modules

Casava::Common::Log

=back

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module.

All documented features are fully implemented.

Please report problems to Illumina Technical Support (support@illumina.com)

Patches are welcome.

=head1 AUTHOR

Jerry Kakol

=cut
use strict;
use warnings;
use Getopt::Long qw(:config gnu_compat);
use Pod::Usage;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';

use Casava::Common::Log qw(initLog errorExit logInfo logWarning logDebug);
use Casava::PostAlignment::Sequencing::GenomicIOLib qw(readFasta);

initLog( undef, 1, 0, 0, 0);

my $outputChromPrefix = '';
my $inputChromPrefix = '';
my $chromNameSource = '';
my $maskRefSeqFiles = undef;
my $help = 0;
my $man = 0;
my $log_level = 1;

my $chr_fasta_dir;
my $left_flank;
my $right_flank;

my $splice_sites_produced=0;
my $ignoreEmptyResults=0;

GetOptions('outputChromPrefix=s'    => \$outputChromPrefix,
           'inputChromPrefix=s'     => \$inputChromPrefix,
           'chromNameSource=s'      => \$chromNameSource,
           'maskRefSeqFiles=s'      => \$maskRefSeqFiles,
           'refSequences=s'         => \$chr_fasta_dir,
           'leftFlank=s'            => \$left_flank,
           'rightFlank=s'           => \$right_flank,
           'verbose|v=i'            => \$log_level,
           'help|?'                 => \$help,
           'ignoreEmptyResults'     => \$ignoreEmptyResults,
           'man'                    => \$man) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2, -input => $1) if ($man and $0 =~ /(.*)/);
pod2usage("$0: --chromNameSource must be contigName or fileName.\n") unless grep /^$chromNameSource$/, ('contigName', 'fileName');
pod2usage("$0: --maskRefSeqFiles not specified.\n") unless $maskRefSeqFiles;
pod2usage("$0: --refSequences not specified.\n") unless $chr_fasta_dir;
pod2usage("$0: --leftFlank not specified.\n") unless $left_flank;

initLog( undef, $log_level, 0, 0, 0);

$right_flank = $left_flank unless defined $right_flank;

# Input as per refFlat table is defined as follows:
#
# CREATE TABLE `refFlat` (
#   `geneName` varchar(255) NOT NULL default '',
#   `name` varchar(255) NOT NULL default '',
#   `chrom` varchar(255) NOT NULL default '',
#   `strand` char(1) NOT NULL default '',
#   `txStart` int(10) unsigned NOT NULL default '0',
#   `txEnd` int(10) unsigned NOT NULL default '0',
#   `cdsStart` int(10) unsigned NOT NULL default '0',
#   `cdsEnd` int(10) unsigned NOT NULL default '0',
#   `exonCount` int(10) unsigned NOT NULL default '0',
#   `exonStarts` longblob NOT NULL,
#   `exonEnds` longblob NOT NULL);
#
# Warning:
#    `exonStarts` and `exonEnds` fileds end with spurious commas.


sub printSpliceSites($$\%)
{
    my ($chr, $chr_sequence, $refGenes) = @_;
    my $sitesPrinted = 0;

    my $lookupChrom = $chr;
    $lookupChrom =~ s/\.[^\.]+$// if ('fileName' eq $chromNameSource);
    $lookupChrom =~ s/^$outputChromPrefix//;

    logDebug "Printing $chr from annotation $lookupChrom", 2;
    foreach my $symbol (keys %{$refGenes->{$lookupChrom}})
    {
        foreach my $site (keys %{$refGenes->{$lookupChrom}{$symbol}})
        {
            my ($start, $end, $prev_exon_end,
                $next_exon_start) = split (/\t/, $site);
            my $lf = $prev_exon_end - $start + 1;
            my $rf = $end - $next_exon_start + 1;
            my $contig = "$symbol"."_$lf"."_$rf"."_${chr}_${prev_exon_end}_${next_exon_start}";
            if (length($chr_sequence) < $next_exon_start - 1 + $rf) {
                logWarning "Failed to extract sequence for $contig. Likely cause: invalid combination of Reference genome and annotation file";
            }
            else {
                my $seq = substr ($chr_sequence, $start - 1, $lf) . substr ($chr_sequence, $next_exon_start - 1, $rf);
                print ">$contig\n";
                print "$seq\n";
                ++$sitesPrinted;
            }
        }
    }
    return $sitesPrinted;
}

my %genes;

while (<>)
{
    chomp;
    my ($symbol, $accession, $chromosome, $strand,
        $tx_start, $tx_end, $cds_start, $cds_end,
        $exon_count, $exon_starts, $exon_ends) = split (/\t/);
    $exon_starts =~ s/,$//;
    $exon_ends    =~ s/,$//;
    my @exon_starts = split (/,/, $exon_starts);
    my @exon_ends    = split (/,/, $exon_ends);
    for (my $i = 0; $i < @exon_starts; $i++) {
        ++$exon_starts[$i]; }
    if (@exon_starts == 1)
    {
        #logInfo "Transcript $accession of gene $symbol is comprised of one exon only. Skipping.", 0;
        next;
    }
    $symbol = $accession if (!$symbol);
    if ($symbol !~ /^[[:alnum:]]+$/)
    {
        my $logSymbol = $symbol;
        $symbol =~ s/[^[:alnum:]]+/-/g;
        logDebug "$logSymbol\t$symbol", 3;
    }
    $chromosome =~ s/^$inputChromPrefix//;
    my $l = @exon_starts - 1;
    for (my $i = 0; $i < $l; $i++)
    {
        my $left_coord = $exon_ends[$i] - $left_flank + 1;
        $left_coord = $exon_starts[$i] if $left_coord < $exon_starts[$i];
        my $right_coord = $exon_starts[$i + 1] + $right_flank - 1;
        $right_coord = $exon_ends[$i + 1] if $right_coord > $exon_ends[$i + 1];
        # This statement stores the splice sites and at the same time makes them non-redundant
        $genes{$chromosome}{$symbol}{"$left_coord\t$right_coord\t$exon_ends[$i]\t" . ($exon_starts[$i + 1])} = undef;
  }
}
logDebug "Finished gathering and consolidating splice sites coordinates. Extracting sequences...",2;

my $globMask = File::Spec->catfile( $chr_fasta_dir, $maskRefSeqFiles);
my @files = glob($globMask);
errorExit "ERROR: Could not find a single reference in $globMask" if (!scalar(@files));

my $fastaFilesString = join (' ', map {"'$_'"} @files); 
my $transformFastaCmd = File::Spec->catfile('/home/psgendb/local/pkg/CASAVA_v1.8.2-build/libexec/CASAVA-1.8.2', 'transformFasta.pl') .
    " --targetPath - --chromNameSource=$chromNameSource --removeBreaks $fastaFilesString";
logDebug "executing: $transformFastaCmd", 2;
open FASTA, '-|', "$transformFastaCmd ";
my $refChromName = '';
while (<FASTA>)
{
    chomp;
    my $line = $_;
    if ($line =~ m/^>([^\s]*)/)
    {
        errorExit "ERROR: unexpected header while looking for fasta data line in: $transformFastaCmd" if $refChromName;
        $refChromName = $1;
    }
    else
    {
        errorExit "ERROR: unexpected data while looking for fasta header in: $transformFastaCmd" if !$refChromName;
        $splice_sites_produced += printSpliceSites($refChromName, $line, %genes);
        $refChromName = '';
    }
}

$ignoreEmptyResults || $splice_sites_produced || 
    errorExit "ERROR: Generated empty set of splice sites. Most likely cause is mismatch between annotation chromosome names and chromosome file names in ${chr_fasta_dir}. Use --ignoreEmptyResults to avoid failure.";
