#!/usr/bin/env perl
# PROJECT: CASAVA
# AUTHOR:  Roman Petrovski
#
# Copyright (c) 2008 Illumina
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).
#
# The script creates the workflow for alignability computation
#

use warnings FATAL => 'all';
use strict;

use Getopt::Long;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';

use Casava::Common::Log;
# Set up the Casava logger before anything below can report an error.
# NOTE(review): the meaning of the three arguments is defined by
# Casava::Common::Log, not by this file -- confirm before changing them.
initLog( "", 0, 5 );

my $batchSize = 30000000; #lines per one query file
my $readLength = undef;   #required, there is no default
my $qryFilePathStart = "";
my $help = 0;

# The usage text is built BEFORE the options are parsed so that the
# "(default ...)" hint shows the real default and not a user-supplied value.
# It deliberately does not interpolate $readLength: that variable is undef
# unless --readLength was given, and under 'use warnings FATAL' interpolating
# undef aborts the script (this used to break --help and the
# "readLength not specified" diagnostic).
my $usage       =
    "fasta2query.pl [options]\n"
  . "Creates query files out of a single fasta file\n"
  . "\t--readLength|-rl     - read length to compute the alignability on (required, no default)\n"
  . "\t--batchSize|-bs      - length of single query file (default $batchSize)\n"
  . "\t--queryFilePathStart - beginning of query file path. Sequential numbers will be appended to it\n"
  . "\t--help               - Print this help\n";

my $result = GetOptions(
    "readLength|rl=s"       => \$readLength,
    "batchSize|bs=i"        => \$batchSize,
    "queryFilePathStart=s"  => \$qryFilePathStart,
    "help"                  => \$help
);

# GetOptions returns false on unknown options or badly typed values; going on
# would silently run with defaults the user did not ask for.
errorExit "ERROR: invalid command line\n$usage" if ( !$result );

if ($help) {
    print $usage;
    exit(0);
}    # if

errorExit "ERROR: queryFilePathStart not specified\n$usage" if ( $qryFilePathStart eq "");
errorExit "ERROR: readLength not specified\n$usage" if ( !defined ($readLength) );

# The read length is used as a substr() length and in loop arithmetic, so it
# has to be a positive whole number.
errorExit "ERROR: readLength must be a positive integer\n$usage"
    if ( $readLength !~ /\A[0-9]+\z/ || $readLength < 1 );


# State shared by the subs and the read loop below.

# Sequence text collected since the last '>' header line; the read loop
# appends to it and dumpRecords() empties it.
# NOTE(review): a lexical $a hides the special $a used by sort blocks --
# keep that in mind before adding any sort { ... } to this file.
my $a = "";
# Reset to $batchSize whenever a query file is opened and reduced by 2 for
# every record written (a record is a header line plus a read line).
my $batchLeft = 0; #lines left to dump for current query file
# Count of query files opened so far; supplies the two-digit suffix of the
# next file name and is never reset.
my $qryFilesNumber = 0;
# Offset of the next read within the sequence being dumped; written as the
# ">offset" header of each record and reset to 0 for every sequence.
my $pos = 0;
# 1-based counter advanced after each non-empty sequence is dumped; supplies
# the first number in the query file name.
my $partno = 1;

# Closes the current query file (the RESULT handle) if one is open.
#
# close() itself flushes any buffered output, so no separate flush step is
# needed. A failing close means buffered data could not be written (for
# example because the disk is full) and the query file is incomplete, so the
# failure is reported with die instead of being ignored.
#
# Calling this sub while RESULT is not open is a harmless no-op; that is the
# situation on the very first call, before any query file has been created.
sub flushAndClose
{
    # fileno() yields undef for a handle that is not open
    return unless defined fileno(RESULT);

    close(RESULT)
      or die "ERROR: fasta2query.pl Couldn't close query file: $!\n";
}

# Makes sure RESULT refers to a query file that still has room for a record.
#
# When the line budget of the current file is used up ($batchLeft <= 0, which
# is also the state before the first file exists) the current file is closed
# and the next one is created and logged. The new file is named
#     <queryFilePathStart><partno>_<NN>
# where <NN> is $qryFilesNumber printed with at least two digits; that counter
# is advanced for every file opened. $batchLeft is then reset to $batchSize.
#
# Dies if the new file cannot be created.
sub checkOpenNewChunk 
{
    return if ( $batchLeft > 0 );

    my $numberString = sprintf "%d_%02d", $partno, $qryFilesNumber++;
    my $queryFilePath = "${qryFilePathStart}${numberString}";

    flushAndClose();

    # Three-argument open: the path is taken literally, so leading '>' or '&'
    # characters or surrounding whitespace in it cannot change the open mode.
    open( RESULT, '>', $queryFilePath )
      or die "ERROR: fasta2query.pl Couldn't open file handle for $queryFilePath $!\n";
    $batchLeft = $batchSize;
    printLog( "Creating $queryFilePath\n", 0 );
}

# Writes the sequence collected in $a to RESULT as query records.
#
# Every window of $readLength characters, starting at offsets 0, 1, 2, ...
# up to (sequence length - $readLength), becomes one two-line record:
#     >offset
#     READ
# Characters other than N, A, C, G and T are written as N. Afterwards the
# sequence buffer is emptied and $partno is advanced. Nothing happens when
# no sequence text has been collected.
#
# NOTE(review): a new query file is only started when the line budget runs
# out, so records of consecutive sequences can land in the same file with
# offsets restarting at 0 -- confirm that this is what downstream expects.
sub dumpRecords 
{
    my $seqLength = length($a);
    return unless $seqLength;

    $pos = 0;
    foreach my $offset ( 0 .. $seqLength - $readLength )
    {
        # switch to a fresh query file once the current one is full
        checkOpenNewChunk();

        my $query = substr( $a, $offset, $readLength );
        $query =~ s/[^NACGT]{1}/N/g;

        print RESULT ">$pos\n" . $query . "\n";
        ++$pos;

        # one record = header line + read line
        $batchLeft -= 2;
    }

    $a = "";
    ++$partno;
}


# Read the FASTA input line by line (the <> operator takes the files left on
# the command line after option parsing, or STDIN when there are none).
# A '>' header line ends the sequence collected so far, which is dumped as
# query records; every other line is appended to the sequence buffer $a.
while ( my $line = <> )
{
    chomp $line;
    # Tolerate CRLF line ends: a carriage return left in the buffer would be
    # written out as a spurious N base at every line boundary.
    $line =~ s/\r\z//;

    if ( $line =~ /^>/ )
    {
        dumpRecords();
    }
    else
    {
        $a .= $line;
    }
}

# Dump the last sequence (no header follows it) and close the final file.
dumpRecords();
flushAndClose();

