=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=cut

package Casava::Io::SeqGeneMd::SeqGeneFile;

use strict;
use warnings "all";
use Carp;
use Exporter 'import';

use lib '@CASAVA_FULL_PERL_LIBDIR@'; # substituted by CMake during install

use Casava::Io::SeqGeneMd::GeneFeature;
use Casava::Io::SeqGeneMd::GeneCollection;
use Casava::Io::SeqGeneMd::Gene;
use Casava::Io::SeqGeneMd::RNA;
use Casava::Io::SeqGeneMd::UTR;
use Casava::Io::SeqGeneMd::CDS;

my $FALSE   = 0;
my $TRUE    = 1;


sub new {
    my $class = shift;
    my $args = shift;   # accept hash ref with args

    if (!defined $args) {
        die "Error - Seq_Gene_File has required arguments\n";

    # check for required args
    } elsif (! exists $args->{'fh'}) {
        die "Error - fh is required in Seq_Gene_File\n";
    } elsif (! exists $args->{'group_label'}) {
        die "Error - group_label is required in Seq_Gene_File\n";

    } else {
        bless $args, $class;
    }
}


sub fh {
    my ($self, $val) = @_;
    if (defined $val) {
        $self->{'fh'} = $val;
    } else {
        return $self->{'fh'};
    }
}
sub group_label {
    my ($self, $val) = @_;
    if (defined $val) {
        $self->{'group_label'} = $val;
    } else {
        return $self->{'group_label'};
    }
}
# a Gene_Collection object
sub gene_collection {
    my ($self, $val) = @_;
    if (defined $val) {
        $self->{'gene_collection'} = $val;
    } else {
        return $self->{'gene_collection'};
    }
}


#
# print seq_gene to given FH, or STDOUT if FH not given;
# only print entries with this group label
# if rename_group_label is also given, rename these group labels
#
# 
sub print_seq_gene {
    my ($self, $FH_OUTPUT, $rename_group_label) = @_;
    
    $FH_OUTPUT      = *STDOUT if (!defined $FH_OUTPUT);
    my $FH_INPUT    = $self->fh;
    my $group_label = $self->group_label;

    # print seq_gene.md header
    my $hdr = <$FH_INPUT>;
    print $FH_OUTPUT $hdr;

    my $match_group_label = $FALSE;

    while (<$FH_INPUT>) {
        chomp;
        my ($tax_id,     $chromosome,   $chr_start,  $chr_end,
            $chr_orient, $contig,       $ctg_start,  $ctg_end,
            $ctg_orient, $feature_name, $feature_id, $feature_type,
            $label,      $accver,       $evidence
        ) = split(/\t/);


        # restrict parsing to primary assembly
        next if ($label ne $group_label);
        $match_group_label = $TRUE;

        # rename group label, if desired
        if (defined $rename_group_label) {
            $label = $rename_group_label;
        }

        # print seq_gene line
        my $print_line = join ( "\t",
            (   $tax_id,     $chromosome,   $chr_start,  $chr_end,
                $chr_orient, $contig,       $ctg_start,  $ctg_end,
                $ctg_orient, $feature_name, $feature_id, $feature_type,
                $label,      $accver,       $evidence,
            ) );
        print $FH_OUTPUT "$print_line\n";
    }

    # throw error if group_label wasn't matched
    if ($match_group_label == $FALSE) {
        die "ERROR: group label " . $self->group_label . " didn't match any entry in seq_gene.md\n";
    }
}


# parse seq_gene file into objects, and print to fh; type is string that must
# contain refFlat or refGene
sub print_refFlat_refGene {
    my ($self, $type, $fh) = @_;
    $fh = *STDOUT if (!defined $fh);

    # check print type
    if ( (!defined $type) 
            || (($type !~ /refFlat/) && ($type !~ /refGene/))){
        print STDERR "ERROR: print type must be refFlat or refGene\n";
        return;
    }

    $self->gene_collection( Casava::Io::SeqGeneMd::GeneCollection->new() );
    
    # first sort seq_gene by chr, then chr_start, then feature type
    # so we always see GENE first, then RNAs, then UTRs and CDSs
    my @lines = $self->_sort_seq_gene();

    # next parse entire file for GENE entries, counting the number of times
    # each GeneId is found. Since a GeneId may occur more than once in a genome,
    # and we must produce a unique transcript_id for refFlat, we will append
    # an 'index' number to transcipt_ids of genes we observe multiple times.
    my $gene_cnt = $self->_count_multi_genes(\@lines);

    my $match_group_label = $FALSE;

    foreach my $line (@lines) {
        chomp $line;
        my (
            $tax_id,     $chr,          $chr_start,  $chr_end,
            $chr_orient, $contig,       $ctg_start,  $ctg_end,
            $ctg_orient, $feature_name, $feature_id, $feature_type,
            $label,      $transcript,   $evidence
        ) = split(/\t/, $line);

        # restrict parsing to primary assembly
        next if ($label ne $self->group_label);
        $match_group_label = $TRUE;

        # ignore features that don't have a transcript id
        next if $feature_type ne "GENE" and $transcript eq "-";

        if ( $feature_id =~ /^GeneID:(\d+)$/ ) {
            my $gene_id = $1;


            if ( $feature_type eq "GENE" ) {

                # empty and print past GENEs that we've passed up
                $self->gene_collection->print_finished_genes($type, $fh, $chr, $chr_start);

                # get index of multi-gene, if this gene was found more than once
                my $cnt = 0;
                if (exists $gene_cnt->{$gene_id}) {
                    $cnt = $gene_cnt->{$gene_id}--;
                }

                # a GENE 'feature_name' is actually the gene symbol
                my $gene = Casava::Io::SeqGeneMd::Gene->new( {
                    'gene_id'   => $gene_id,
                    'symbol'    => $feature_name,
                    'chr'       => $chr,
                    'start'     => $chr_start,
                    'end'       => $chr_end,
                    'orient'    => $chr_orient,
                    'index'     => $cnt,
                     } );
                $self->gene_collection->add_gene($gene);

            } elsif ( $feature_type eq "RNA" ) {

                # a RNA 'feature_name' is actually the transcript_id 
                # (which is redundant with $transcript)
                my $rna = Casava::Io::SeqGeneMd::RNA->new( {
                    'transcript_id' => $transcript,
                    'chr'           => $chr,
                    'start'         => $chr_start,
                    'end'           => $chr_end,
                    'orient'        => $chr_orient,
                     } );
                $self->gene_collection->add_feature($gene_id, $transcript, $rna);

            } elsif ( $feature_type eq "UTR" ) {

                # a RNA 'feature_name' is actually the transcript_id 
                # (which is redundant with $transcript)
                my $utr = Casava::Io::SeqGeneMd::UTR->new( {
                    'chr'           => $chr,
                    'start'         => $chr_start,
                    'end'           => $chr_end,
                    'orient'        => $chr_orient,
                     } );
                $self->gene_collection->add_feature($gene_id, $transcript, $utr);

            } elsif ( $feature_type eq "CDS" ) {

                # a CDS 'feature_name' is actually the protein_id
                my $cds = Casava::Io::SeqGeneMd::CDS->new( {
                    'chr'           => $chr,
                    'start'         => $chr_start,
                    'end'           => $chr_end,
                    'orient'        => $chr_orient,
                    'protein_id'    => $feature_name,
                     } );
                $self->gene_collection->add_feature($gene_id, $transcript, $cds);

            # unrecognized feature type
            } elsif ( $feature_type eq "PSEUDO" ) {

            } else {
                print STDERR "WARNING: Unrecognized feature_type $feature_type. Skipping.\n";
            }
    
    
        } else {
            print STDERR "WARNING: feature_id $feature_id didn't include GeneID\n\n";
        }
        
    }
    # print remaining genes
    $self->gene_collection->print_all_genes($type, $fh);


    # throw error if group_label wasn't matched
    if ($match_group_label == $FALSE) {
        die "ERROR: group label " . $self->group_label . " didn't match any entry in seq_gene.md\n";
    }
}


# return hash of number of times each GENE's GeneID was encountered
# but only for genes observed more than once
sub _count_multi_genes {
    my ($self, $lines) = @_;
    my %gene_cnt;

    foreach my $line (@$lines) {
        chomp $line;
        my (
            $tax_id,     $chr,          $chr_start,  $chr_end,
            $chr_orient, $contig,       $ctg_start,  $ctg_end,
            $ctg_orient, $feature_name, $feature_id, $feature_type,
            $label,      $transcript,   $evidence
        ) = split(/\t/, $line);

        # restrict parsing to primary assembly
        next if ($label ne $self->group_label);

        # if GENE, add to count of this GeneID
        if ( $feature_id =~ /^GeneID:(\d+)$/ ) {
            my $gene_id = $1;

            if ( $feature_type eq "GENE" ) {
                $gene_cnt{$gene_id}++;
            }
        }
    }

    # now remove genes that were only seen once
    foreach my $gene (keys %gene_cnt) {
        if ($gene_cnt{$gene} == 1) {
            delete $gene_cnt{$gene};   
        }
    }
    return \%gene_cnt;
}


# sort seq_gene lines -- see _sort_Seq_Gene method for criteria
sub _sort_seq_gene {
    my $self = shift;
    my $fh = $self->fh;
    my @lines;

    #discard header
    my $hdr = <$fh>;
    
    # slurp all lines into a list
    while(<$fh>) {
       push @lines, $_; 
    }
    return (sort _sort_Seq_Gene @lines);
}


# Sort Seq_Gene lines in order of chromosome, chrom start pos, then feature_type
# This sort allows parsing in a simple single pass, so we encounter GENE types
# first, then RNA types, and then their associated UTR and CDS types.
sub _sort_Seq_Gene {

    my @first  = split( '\t', $a);
    my @second = split( '\t', $b);
    my $compare;

    # first sort (alpha) by chromosome
    $compare = ($first[1] cmp $second[1] );
    if ($compare != 0) {
        return $compare;
    }
    # sort (numeric) by chr start position
    $compare = ($first[2] <=> $second[2] );
    if ($compare != 0) {
        return $compare;
    }
    # sort by feature_type in this order: GENE, RNA, CDS, UTR, PSEUDO
    my $first_type = $first[11];
    my $second_type = $second[11];
    my $first_rank;
    my $second_rank;

    my %feature_type = (
        "GENE"  => 1,
        "RNA"   => 2,
        "UTR"   => 3,
        "CDS"   => 4,
        "PSEUDO"=> 5,   );

    if (!exists $feature_type{$first_type}) {
        $first_rank = 10;
    } else {
        $first_rank = $feature_type{$first_type};
    }
    if (!exists $feature_type{$second_type}) {
        $second_rank = 10;
    } else {
        $second_rank = $feature_type{$second_type};
    }
    $compare = ($first_rank <=> $second_rank);

    return $compare;
}

1;