#!/usr/bin/env perl

# File: buildCoverage.pl
# Copyright (c) 2005, 2006 Solexa
# Author: A. J. Cox
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).

# Description: build coverage map from sets of alignment data

use warnings;
use strict;

print "#RUN_TIME ". scalar(localtime)."\n";
print '#SOFTWARE_VERSION CASAVA-1.8.2'."\n";


die "Usage: buildCoverage.pl exportFile genomeName [scoreThreshold]"
    unless ((@ARGV == 2) || (@ARGV == 3));

my $exportName = $ARGV[0];
my $genomeName = $ARGV[1];
my $scoreThreshold= (@ARGV > 2) ? $ARGV[2] : -100000000;
my @bases=('A', 'C', 'G', 'T');

open(EXPORT, "<$exportName") || die("problem opening $exportName");
open(GENOME, "<$genomeName") || die("problem opening $genomeName");

my $line=<GENOME>; # ignore 1st line
my $isDos=0;

if ($line =~ /\r\n$/)
{
    print STDERR "WARNING: $genomeName looks like a DOS format text file, will attempt to compensate\n";
    $isDos=1;
} # if

my %baseCall;
my $seq;
my $genome="?"; # to make everything index from 1
my $i;
my $max=0;
my $maxNonWild=0;
my $maxWild;
my $cons;
my $adjPos; # adj for circularization
my $genomeLength;
my $base;

while (<GENOME>)
{
    s/\r\n$/\012/g if ($isDos);
    chomp;
    $genome .=  $_;
} # while

close(GENOME);

$genomeLength=length($genome)-1; 
# -1 to account for '?' at the beginning
$genome=uc($genome);

# set arrays to correct size (saves multiple memory reallocations)
for $base (@bases)
{
    $baseCall{$base}[$genomeLength]=0;
}

# Export file fields.
my $expectedNumFields = 22;
my $readFieldInd = 8;
my $matchPosFieldInd = 12;
my $matchStrandFieldInd = 13;
my $singleReadAlignScoreFieldInd = 15;
my $passedFilterReadInd = 21;

while (<EXPORT>)
{
    if (/^\#/) { print; next; } 
    chomp;
    my @fields = split(/\t/, $_);

    die("Unexpected number of fields " . scalar(@fields))
	unless (@fields == $expectedNumFields);

    # Skip read unless passed filter.
    next if ($fields[$passedFilterReadInd] ne 'Y');

    # Skip read if not aligned.
    next if ($fields[$matchPosFieldInd] eq '');

    # Skip read if alignment score too low.
    next if ($fields[$singleReadAlignScoreFieldInd] < $scoreThreshold);

    if ($fields[$matchStrandFieldInd] eq 'R')
    {
	$seq = reverse($fields[$readFieldInd]);
	$seq =~ tr/ACGTacgt/TGCATGCA/;
    } #if
    else
    {
	$seq = uc($fields[$readFieldInd]);
    } #else

    for ($i=0; $i<length($seq);$i++)
    {
        $adjPos = $fields[$matchPosFieldInd] + $i;
        $adjPos-=$genomeLength if ($adjPos>$genomeLength);
        $adjPos+=$genomeLength if ($adjPos<1);
        $base=substr($seq,$i,1)++;
        $baseCall{$base}[$adjPos]++ if ($base=~/[ACGT]/);
    } #for
} #while

close(EXPORT);

my $genomeBase;

for ($i=1; $i<=$genomeLength; $i++)
{
    $genomeBase=substr($genome, $i, 1);
    print $i, " ", $genomeBase;
    $genomeBase=uc($genomeBase);
    $max=0;
    $maxNonWild=0;
    $cons="X";
    foreach $base (@bases)
    {
	$baseCall{$base}[$i]=0 unless defined ($baseCall{$base}[$i]);
	print "\t$baseCall{$base}[$i]";
	if ($baseCall{$base}[$i]>$max)
	{
	    $cons=$base;
	    $max=$baseCall{$base}[$i];
	} #if
	elsif (($max!=0)&&($baseCall{$base}[$i]==$max))
	{ # a tie
	    $cons.=$base;
	} #elsif
	next if ($base eq $genomeBase);
	if ($baseCall{$base}[$i]>$maxNonWild)
	{
	    $maxNonWild=$baseCall{$base}[$i];
	} #if
    } #foreach
    $maxWild=$baseCall{$genomeBase}[$i];
    $maxWild=0 unless (defined($maxWild));
    print "\t$cons\t$maxWild\t$maxNonWild\n";
} #for i
