#!/usr/bin/env perl

=head1 LICENSE

Copyright (c) 2007-2009 Illumina, Inc.

This software is covered by the "Illumina Genome Analyzer Software
License Agreement" and the "Illumina Source Code License Agreement",
and certain third party copyright/licenses, and any user of this
source file is bound by the terms therein (see accompanying files
Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
Illumina_Source_Code_License_Agreement.pdf and third party
copyright/license notices).

This file is part of the Consensus Assessment of Sequence And VAriation
(CASAVA) software package.

=cut

=head1 NAME

buildSeq.pl

Build a fastq/fasta/scarf format file from a list of _qseq.txt files or their binary equivalent.

=cut

=head1 DESCRIPTION

Used the content of a tiles-file to identify the list of files to use as input.

=cut

=head1 AUTHOR

Come Raczy

=head1 SYNOPSIS

    $ buildSeq.pl
        [--fastq|--fasta|--scarf]
        [--numeric|--symbolic]
        [--quality-filter ]
        [--contam-filter ]
        [--input-directory <input-directory>]
        [--filters-directory <filters-directory>]
        [--positions-directory <positions-directory>]
        [--positions-format <positions-format>]
        [--use-bases <use-bases-mask>]
        [--cycles <cycles-list>]
        [--instrument <instrument>]
        [--run-number <run-number>]
        [--eamss <true|false>]
        [--flow-cell-id <flow-cell-id>]
        [--tiles-file <tiles-file>]
        [--lane-prefix <lane-prefix>]
        [--read <read-number>]
        [--qseq-suffix <qseq-suffix>]

Runs buildSeq.pl on the specified input.

=head1 DETAIL

The top level operations are:

* Open the files
* iterate through the available clusters
* create the name of the sequence
* produce the sequence

=cut

use warnings;
use strict;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Carp;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl'; # substituted by CMake during install
use Casava::Common::Qseq;
use Casava::Common::Bcl;
use Casava::Common::QseqRead;
use Casava::Common::Utils;


my $usage=<<"END";
Usage: $0 
    [--fastq|--fasta|--scarf]
    [--numeric|--symbolic]
    [--quality-filter ]
    [--input-directory <input-directory>]
    [--input-format <bcl|qseq>]
    [--filters-directory <filters-directory>]
    [--positions-directory <positions-directory>]
    [--positions-format <pos|locs|clocs>]
    [--use-bases <use-bases-mask>]
    [--cycles <cycles-list>]
    [--instrument <instrument>]
    [--run-number <run-number>]
    [--eamss <true|false>]
    [--flow-cell-id <flow-cell-id>]
    [--tiles-file <tiles-file>]
    [--lane-prefix <lane-prefix>]
    [--read-number <read-number>]
    [--qseq-suffix <qseq-suffix>]
END

my $man = 0;
my $help = 0;
my ($fastq, $fasta, $scarf) = (undef, undef, undef);
my ($numeric, $symbolic) = (undef, undef);
my $qualityFilter = undef;
my $inputDirectory = undef;
my $inputFormat = "bcl"; # default input format is BCL
my $filtersDirectory = undef;
my $positionsDirectory = undef;
my $positionsFormat = undef;
my $useBasesString = undef;
my $cyclesString = undef;
my $instrument = undef;
my $runNumber = undef;
my $eamss = undef;
my $flowCellId = undef;
my $tilesFile = undef;
my $lanePrefix = undef;
my $readNumber = undef;
my $qseqSuffix = undef;

GetOptions('help|?' => \$help,
           'man' => \$man,
           'fastq' => \$fastq,
           'fasta' => \$fasta,
           'scarf' => \$scarf,
           'numeric' => \$numeric,
           'symbolic' => \$symbolic,
           'quality-filter' => \$qualityFilter,
           'input-directory=s' => \$inputDirectory,
           'input-format=s' => \$inputFormat,
           'filters-directory=s' => \$filtersDirectory,
           'positions-directory=s' => \$positionsDirectory,
           'positions-format=s' => \$positionsFormat,
           'use-bases=s' => \$useBasesString,
           'cycles=s' => \$cyclesString,
           'instrument=s' => \$instrument,
           'run-number=s' => \$runNumber,
           'eamss=s' => \$eamss,
           'flow-cell-id=s' => \$flowCellId,
           'tiles-file=s' => \$tilesFile,
           'lane-prefix=s' => \$lanePrefix,
           'read-number=s' => \$readNumber,
           'qseq-suffix=s' => \$qseqSuffix,
           ) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-verbose => 2,  -input => $1) if ($man and $0 =~ /(.*)/);
pod2usage("$0: too many positional arguments.\n")  if (0 < @ARGV);

my $outputMode = ($scarf ? 2 : ($fasta ? 1 : 0)); # default output to fastq

pod2usage("$0: the use-bases string must be specified\n")  unless $useBasesString;
my @useBases = parseUseBases($useBasesString);

pod2usage("$0: the list of cycles must be specified\n")  unless $cyclesString;
my @cycles = split(/\s+/, $cyclesString);

pod2usage("$0: the full path of the tiles-file must be specufied\n")  unless $tilesFile;
pod2usage("$0: the lane prefix must be specified\n")  unless $lanePrefix;
my @tiles;
croak "ERROR: didn't find any tile for $lanePrefix in $tilesFile" unless get_tiles_by_lane($tilesFile, $lanePrefix, @tiles);

my $withEamss = 1;
carp "WARNING: Invalid 'eamss' option: $eamss: assuming 'true'\n" unless uc($eamss) =~ /^YES|NO|TRUE|FALSE$/;
$withEamss = 0 if uc($eamss) =~ /^FALSE|NO$/;

# sub expandQuality
# Expand symbolic quality values to numeric ones
sub expandQuality($)
{
    my $qv=$_[0];
    if ($numeric)
    {
        $qv=expandSymbolicQualityString($qv);
    } # else
    return ($qv);
} # expandQuality


foreach my $tileName (@tiles) {
    my $qseqMap;
    my $tileNumber = $1 if $tileName =~ /^${lanePrefix}_0*(\d+)$/;
    if ('bcl' eq $inputFormat)
    {
        my $laneNumber = $1 if $lanePrefix =~ /^s_(\d+)$/;
        $qseqMap = Casava::Common::Bcl->new($instrument, $runNumber, $laneNumber, $tileNumber, $inputDirectory,
                                                      \@cycles, [], $readNumber, $positionsDirectory,
                                                      $positionsFormat, $filtersDirectory, $withEamss);
    }
    elsif ('qseq' eq $inputFormat)
    {
        my $ubQseqFilePath = File::Spec->catfile($inputDirectory, sprintf("%s_%u_%04u%s", $lanePrefix, $readNumber, $tileNumber, $qseqSuffix));
        $qseqMap = Casava::Common::Qseq->new($ubQseqFilePath, "<");
    }
    else
    {
        die "Unsupported input format: $inputFormat: supported formats are 'bcl' and 'qseq'";
    }
    my $qualString;

    while ( defined( my $readTmpRef = $qseqMap->getRead ) ) {

        if ($qualityFilter) {
            my $filter = $readTmpRef->filter();
            next unless ($filter==1);
        }

        my $lane = $readTmpRef->lane();
        my $readNum = $readTmpRef->readNum();
        my $runNumber = $readTmpRef->runNumber();
        my $tile = $readTmpRef->tile();
        my $x = $readTmpRef->X();
        my $y = $readTmpRef->Y();
        my $machineName = $readTmpRef->machineName();
        my $index = $readTmpRef->Index();

        my $seqString = $readTmpRef->seq();
        $seqString =~ tr/./N/;
        my $qualityString = $readTmpRef->qualityString(); 

        my $seqName = getReadID($machineName, $runNumber, $flowCellId, $lane, $tile, $x, $y,
                                $readNum, $index);
        if (@useBases)
        {
            my @data    = split('',$seqString);
            my @quality = split('',$qualityString);
            $seqString = join('',@data[@useBases]);
            $qualityString  = join('',@quality[@useBases]);
        }
        if ($outputMode==0) {
            $qualString=expandQuality($qualityString);
            print '@'."${seqName}\n${seqString}\n+${seqName}\n${qualString}\n";
        }
        elsif ($outputMode==1) {
            print ">${seqName}\n${seqString}\n"; 
        }
        elsif ($outputMode==2) {
            $qualString=expandQuality($qualityString);
            print "${seqName}:${seqString}:${qualString}\n";
        }
    }
    $qseqMap->close;
}
