#!/usr/bin/env perl
# PROJECT: CASAVA
# MODULE:  $RCSfile: snp2gff.pl,v $
#
# Copyright (c) 2006 Solexa, 2007,2008 Illumina
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).
#
# The script converts a CASAVA SNP file to GFF format. See usage.
use warnings FATAL => 'all';
use strict;
use POSIX qw(strftime);

use IO::File;
use Sys::Hostname;
use Getopt::Long;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use Casava::Common::Log;
use Casava::Common::Gff;
# Usage text: printed on --help and passed to errorExit whenever the command
# line is invalid or incomplete.
my $usage =
    "The script converts CASAVA snp file to gff format.\n"
  . "snp2gff.pl [options]\n"
  . "\t-s, --snpFile=PATH    - PATH to input SNP file\n"
  . "\t-g, --gffFile=PATH    - PATH to output GFF file\n"
  . "\t-c, --chromosome=NAME - chromosome/scaffold NAME\n"
  . "OPTIONAL\n"
  . "\t--shift=NUMBER        - Shift all coordinates by NUMBER\n"
  . "\t-m, --methodName=NAME - Method name (default \"Illumina_CASAVA\")\n"
  . "\t-b, --binSize=NUMBER  - The data will be divided to NUMBER size bins (default 10000000)\n"
  . "\t-t, --title=NAME      - Title in the gff header\n"
  . "\t-h, --help            - prints this message\n"
  . "EXAMPLES\n"
  . "./snp2gff.pl -c c1 --snpFile=ch1.snp.txt --gffFile=ch1.snp.gff\n";

# Option defaults; each one is overwritten by GetOptions when the matching
# switch is present on the command line.
my $help         = 0;
my $snpFile      = '';                            # -s, required
my $gffFile      = '';                            # -g, required
my $chromosome   = '';                            # -c, required
my $shift        = 0;                             # --shift
my $binSizeBuild = 10000000;                      # -b
my $title        = 'Illumina CASAVA SNP file';    # -t
my $methodName   = 'Illumina_CASAVA';             # -m
my $result       = GetOptions(
    "methodName|m=s" => \$methodName,
    "chromosome|c=s" => \$chromosome,
    "shift=s"        => \$shift,
    "title|t=s"      => \$title,
    "binSize|b=s"    => \$binSizeBuild,
    "snpFile|s=s"    => \$snpFile,
    "gffFile|g=s"    => \$gffFile,
    "help|h"         => \$help
);

# A command line that failed to parse is always an error.
# NOTE(review): errorExit comes from Casava::Common::Log and is assumed to
# report the message and terminate the script - confirm.
if ( !$result ) {
    errorExit "\n$usage";
}    # if

# Honour --help before insisting on the required options; otherwise a bare
# "snp2gff.pl -h" would be rejected as an incomplete command line.
if ($help) {
    print $usage;
    exit(0);
}    # if

# All three of --snpFile, --gffFile and --chromosome must be supplied.
if ( $gffFile eq '' || $snpFile eq '' || $chromosome eq '' ) {
    errorExit "\n$usage";
}    # if
# Column layouts of the SNP table variants this script understands.  Every
# map takes a field name to the zero-based index of its column in a
# tab-separated row.

# Layout chosen for 12-column tables; columns 8 and 9 are left unmapped.
my %snpFields4;
@snpFields4{qw(position A C G T modified_call used total)} = ( 0 .. 7 );
@snpFields4{qw(reference type)} = ( 10, 11 );

# Layout chosen for 13-column tables; adds score and confidence, and column
# 10 is left unmapped.
my %snpFields5;
@snpFields5{
    qw(position A C G T modified_call used total score confidence)
} = ( 0 .. 9 );
@snpFields5{qw(reference type)} = ( 11, 12 );

# Layout in effect for the current input; starts out empty and is filled in
# once the first line of the SNP file has been inspected.
my %snpFields;
# Open the SNP table for reading.  The mode is passed as a separate argument
# so that characters in the file name cannot be taken as part of an open mode.
my $snpFileIn = IO::File->new( $snpFile, '<' )
  || errorExit "$0::Couldn't create/open file handle for [$snpFile] $!\n";

# Build the GFF container from the title and method name given on the
# command line.
my %config = (
    title  => $title,
    source => $methodName,
);
my $gffSetRef = createGFF(%config);

#my $temp = '^c';
#$chromosome =~ s/$temp(.+)/chr$1/g;

# Strip only a trailing ".gff" so that the extension appended when writing is
# not doubled; a ".gff" elsewhere in the path (e.g. in a directory name) is
# left untouched.
$gffFile =~ s/\.gff\z//;

# NOTE: the value of --binSize is not consulted here.  The per-row bin index
# that used to be derived from it was never consumed, and computing it made
# the script abort on --binSize=0.

# $rowIndex is 0 until the first line of the file has been consumed.
my $rowIndex = 0;
while ( my $line = <$snpFileIn> ) {
    chomp $line;
    my @row = split /\t/, $line;

    # The first line only selects the column layout from its field count; it
    # is never converted into a feature.
    if ( $rowIndex == 0 ) {
        my $fieldCount = scalar @row;
        if ( $fieldCount == 12 ) {
            %snpFields = %snpFields4;
        }    # if
        elsif ( $fieldCount == 13 ) {
            %snpFields = %snpFields5;
        }    # elsif
        else {

            # Best effort: report the problem and fall back to the 12-column
            # layout rather than stopping.
            print( STDERR
                  "ERROR: unrecognized format of snp file [$snpFile] # fields="
                  . $fieldCount
                  . "\n" );
            %snpFields = %snpFields4;
        }    # else
        $rowIndex++;
        next;
    }

    # Blank lines carry no data; without this guard they would trip a fatal
    # "uninitialized value" warning in the arithmetic below.
    next unless @row;

    # A SNP covers a single base, so start and end are the same shifted
    # position.
    my $start  = $row[ $snpFields{position} ] + $shift;
    my $end    = $start;
    my $score  = 1000;                                 # TODO refactor, redefine
    my $strand = '+';
    my $base   = $row[ $snpFields{modified_call} ];

    # Shorten the type label: "SNP_diff" becomes "hom", and any other "SNP_"
    # prefix is dropped.
    my $type = $row[ $snpFields{type} ];
    $type =~ s/SNP_diff/hom/g;
    $type =~ s/SNP_//g;

    my $snpName = sprintf( "%s_%s", $base, $type );
    my $attribute =
      sprintf( "ID=%s_%d;Name=%s", $chromosome, $start, $snpName );
    addSNPFeature( %{$gffSetRef}, $chromosome, $start, $end, $score, $strand,
        $attribute );
    $rowIndex++;
}
close $snpFileIn;

# Write the header followed by all collected features.
my $file = IO::File->new( "$gffFile.gff", '>' )
  || errorExit "$0::Couldn't create/open file handle for [$gffFile.gff] $!\n";
dumpGFFHeader( %{$gffSetRef}, $file );
dumpGFF( %{$gffSetRef}, $file );

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close must be checked for an output file.
close($file)
  || errorExit "$0::Couldn't close file handle for [$gffFile.gff] $!\n";
