#!/usr/bin/env perl
# PROJECT: CASAVA
# MODULE:  $RCSfile: makeGenomeBamFasta.pl,v $
#
# Copyright (c) 2010 Illumina
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).
#
# \author Chris Saunders
#
# This script merges reference sequences into a single fasta file with
# headers designed to allow easy viewing of CASAVA's BAM files
#

use warnings FATAL => 'all';
use strict;

use File::Copy qw(move);
use File::Spec;
use Getopt::Long;
use Sys::Hostname;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';
use Casava::Common::Log;
use Casava::Common::IOLib qw(executeCmd);
use Casava::PostAlignment::Sequencing::Config
  qw(loadConfiguration isSpliceJunctionChrom %chrEnds %CONF_APP %CONF_PROJ readProjectParameters);
use Casava::PostAlignment::Sequencing::SamLib qw(getSamtoolsBin getChangeChrLabelType changeChrLabel);

my $bamChangeChrType = 0;

#------------------------------------------------------------------------------

sub getBamChrLabel($)
{
    my ($chrName) = @_;

    return changeChrLabel($chrName,$bamChangeChrType);
}

#------------------------------------------------------------------------------

my $argvStr     = join ' ', @ARGV;
my $scriptName = (File::Spec->splitpath($0))[2];

my $projectDir  = "";
my $help        = 0;

my $usage =
    "$scriptName [options]\n"
  . "\t--projectDir   - project directory\n"
  . "\t--help         - print this message\n";

my $result      = GetOptions(
    "projectDir=s"  => \$projectDir,
    "help|h"        => \$help
);

if ((not $result) or $help) {
    errorExit "\n\n$usage";
}
errorExit "ERROR: Incorrect number of arguments\n$usage"
  if ($projectDir eq "");

loadConfiguration($projectDir);

# Configuration:
my $confDir         = $CONF_APP{dirConf};
my $projConfFile    = $CONF_APP{projectConf};
my $bamDirName      = $CONF_APP{dirBam};
my $samtoolsBin      = getSamtoolsBin(%CONF_PROJ);
$bamChangeChrType = getChangeChrLabelType($CONF_PROJ{bamChangeChromLabels});

my %buildChromsBinSizes = ();
readProjectParameters( %buildChromsBinSizes, "BUILD_BIN_SIZES", $projectDir );
my @buildChroms = keys %buildChromsBinSizes;

#------------------------------------------------------------------------------

{
    my $hostname        = hostname();
    printLog("Running $hostname:[$0 $argvStr]\n", 0);
}


my $genomeDir = 'genome';
my $genomeBamFilename = 'sorted.bam';

my $outputPath = File::Spec->catdir($projectDir,$genomeDir,$bamDirName);
errorExit "ERROR: Can't find genome bam output directory: $outputPath\n" unless (-d $outputPath);


# get the complete list of reference sequences to concatenate into the 
# bam fasta:
#
my $genomeFastaPath = File::Spec->catfile($outputPath, $genomeBamFilename. ".fa.gz");
my $tmpGenomeFastaPath = "$genomeFastaPath.incomplete";
my $gzipCmd = "| gzip -c > $tmpGenomeFastaPath";
open(my $genomeFastaFH, $gzipCmd) 
    or errorExit("ERROR: can't open process: '$gzipCmd' $!\n");

# order references in fasta file by chromosome size:
my @chromList = sort { $chrEnds{$b}->{length} <=> $chrEnds{$a}->{length} } @buildChroms;
foreach my $chrom (@chromList) {
    next if ( isSpliceJunctionChrom($chrom, %CONF_PROJ) );

    # get the reference sequence corresponding to this chromosome:
    my $refSeqFile = $chrEnds{$chrom}->{file};
    open(my $refSeqFH,"<$refSeqFile") or errorExit("ERROR: can't open file: '$refSeqFile'\n");
    my $header = <$refSeqFH>;
    errorExit("ERROR: unexpected header format in fasta file: '$refSeqFile'\n") if( substr($header,0,1) ne ">");

    my $printChrom = getBamChrLabel($chrom);
    print $genomeFastaFH ">$printChrom ".substr($header,1);
    while(<$refSeqFH>) { print $genomeFastaFH $_; }
}
close($genomeFastaFH)
    or errorExit("ERROR: Failed to close gzip process: $!\n");

move($tmpGenomeFastaPath,$genomeFastaPath) 
    or errorExit("ERROR: Failed to move fasta file from '$tmpGenomeFastaPath' to '$genomeFastaPath' $!\n");
