#!/usr/bin/env perl

# Copyright (c) Illumina 2009
# Author: Richard Shaw
# This software is covered by the "Illumina Genome Analyzer Software
# License Agreement" and the "Illumina Source Code License Agreement",
# and certain third party copyright/licenses, and any user of this
# source file is bound by the terms therein (see accompanying files
# Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
# Illumina_Source_Code_License_Agreement.pdf and third party
# copyright/license notices).

# Takes a CASAVA Task Manager task file as input and produces a graph of
# the specified workflow in DOT format.
# (This can then be displayed by tools such as dotty.)

use warnings;
use strict;

use File::Basename;
use Getopt::Long;
use Carp;

use lib '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/lib/CASAVA-1.8.2/perl';

use Casava::Common::Log;

#------------------------------------------------------------------------------

my $label_field_ind = 0;
my $label_num_field_ind = 1;
my $rule_type_field_ind = 2;
my $user_label_field_ind = 3;
my $state_field_ind = 4;
my $prereq_label_field_ind = 5;
my $cmd_field_ind =  6;
my $num_fields = 7;

my $discard = 0;
my $keep_name = 1;
my $keep_value = 2;
my $keep_name_and_value = 3;

my $use_description_field = 0;
my $dont_abbreviate_commands = 0;
my $help = 0;
my $workflow_file = "";
my $discard_regex = "";


my $result     = GetOptions(
    "tasks|t=s" => \$workflow_file,
    "useDescription" => \$use_description_field,
    "dontAbbreviate" => \$dont_abbreviate_commands,
    "discardRegex=s" => \$discard_regex,
    "help|h"         => \$help
);

my $usage       =
    "task2dot.pl [options]\n"
  . "Generates visual graph from TaskManager task file in .dot format\n"  
  . "\t--tasks|t            - PATH to tasks file (required).\n"
  . "\t--useDescription     - use task description field for name generation instead of being smart about the command line content.\n"
  . "\t--dontAbbreviate     - Don't abbreviate commands. Works only if useDescription is not set\n"
  . "\t--discardRegex       - regular expression to discard matches from generated label names. Works only with dontAbbreviate\n"
  . "\t--help|-h            - Print this help\n";


if ($help) {
    print $usage;
    exit(0);
}    # if

errorExit "ERROR: tasks file not specified\n$usage" if ( $workflow_file eq "");


my %option_use_map = ('exportDir' => $discard,
		      'projectDir' => $discard,
		      'runId' => $discard,
		      'referenceSeq' => $discard,
		      'snpFile' => $discard,
		      'sortCountFile' => $discard,
		      'gffFile' => $discard,
		      'binSize' => $discard,
		      'maxSD' => $discard,
		      'minObservations' => $discard,
		      'lane' => $keep_name_and_value,
		      'chrom' => $keep_name_and_value,
		      'binId' => $keep_value,
		      'target' => $keep_value);

my $not_applic_str = 'N/A';

my $indent_str = ' ' x 4;

my $box_shape = 'box';
my $inv_triangle_shape = 'invtriangle';

my $label_qualifier = 'LABEL';
my $shape_qualifier = 'SHAPE';

#------------------------------------------------------------------------------

sub replace_dot_special_chars($)
{
    my ($str) = @_;
    $str =~ tr/\/\-\./\_/;
    return $str;
}

#------------------------------------------------------------------------------

sub make_key($;$)
{
    my ($qualifier, $main_key) = @_;
    return join(':', $qualifier, $main_key);
}

#------------------------------------------------------------------------------

sub append_label_num($;$)
{
    my ($label_str, $label_num) = @_;

    # Square brackets are significant in DOT, so cannot use the TASKS
    # notation of Task Manager; instead append _N<num>.
    return $label_str . '_N' . $label_num;
}

#------------------------------------------------------------------------------

sub abbreviate_cmd($)
{
    my ($cmd_str) = @_;

    die("Empty command field")
	if ($cmd_str eq '');

    $cmd_str =~ s/\s\s+/ /g;

    my @cmd_parts = split(/\s/, $cmd_str);

    my $main_cmd = shift @cmd_parts;

    # If the first token is an interpreter, the actual cmd is the next token.
    if ($main_cmd eq 'perl') {
	die("Interpreter but no script")
	    if (@cmd_parts < 1);

	$main_cmd = shift @cmd_parts;

	if ($main_cmd eq '-I') {
	    die("Must have at least library path and script after perl -I")
		if (@cmd_parts < 2);

	    shift @cmd_parts; # Discard the library path.
	    $main_cmd = shift @cmd_parts; # Extract the script.
	}
    }

    # Strip off any path.
    $main_cmd = basename($main_cmd);

    my @abbrev_cmd_parts = ($main_cmd);

    foreach my $cmd_part (@cmd_parts) {
	my $option;
	my $value;

    if ($cmd_part =~ /\-\-(\w+)\=(\w+)/) 
    {
	    $option = $1;
	    $value = $2;
	} 
	elsif ($cmd_part =~ /\-\-(\w+)/) 
	{
	    $option = $1;
	}

	if (defined($option)) {
	    my $option_use = $option_use_map{$option};

	    die("Unhandled option $option")
		if (!defined($option_use));

	    next if ($option_use == $discard);

	    if (($option_use == $keep_name)
		|| ($option_use == $keep_name_and_value)) {
		push(@abbrev_cmd_parts, $option);
	    }

	    if (($option_use == $keep_value)
		|| ($option_use == $keep_name_and_value)) {
		die("No value specified for option $option")
		    if (!defined($value));

		push(@abbrev_cmd_parts, $value);
	    }
	} else {
	    # Not an option but a simple arg -> include it.

	    push(@abbrev_cmd_parts, $cmd_part);
	}
    }

    return join(' ', @abbrev_cmd_parts);
}

sub abbreviate_simple($)
{
    my ($cmd) = @_;

    if ("" ne $discard_regex)
    {
        my $pattern = "$discard_regex";
        $cmd =~ s/$pattern/.../g;
    }
 
    my $lineLength = int (sqrt length($cmd)) * 2;
    my $pattern = " ";#"(.{$lineLength})";
    my $ret = join('\n', split(/$pattern/, $cmd));
    $ret =~ s/\\n\\n/\\n/g;
    $ret =~ s/^\\n//;

    $ret =~ s/\"/\\"/g;
    
    return $ret;        
}

#------------------------------------------------------------------------------

sub parse_prereqs($)
{
    my ($cmd) = @_;
    my @prereqs;

    if ($cmd =~ /TASKS \((.*)\)/) {
	my $prereqs_str = $1;
	my @prereq_strs = split(',', $prereqs_str);

	foreach my $prereq_str (@prereq_strs) {
	    $prereq_str = replace_dot_special_chars($prereq_str);

	    if ($prereq_str =~ /(\w+)\[(\d+)\]/) {
		my $prereq = append_label_num($1, $2);
		push(@prereqs, $prereq);
	    } else {
		die("Unexpected format of prereq string : $prereq_str");
	    }
	}

	# print "Prereqs : ", join(' ', @prereqs), "\n";
    } else {
	die("Unexpected prereqs string format : $cmd");
    }

    return @prereqs;
}

#------------------------------------------------------------------------------

sub write_dot_line($;$)
{
    local *OSTRM = shift;
    my $line = shift;

    print OSTRM $indent_str, $line, "\n"; 
}

#------------------------------------------------------------------------------

sub write_dot_header($)
{
    local *OSTRM = shift;
    print OSTRM 'digraph G {', "\n";
    write_dot_line(*OSTRM, 'size ="8,11";');
    write_dot_line(*OSTRM, 'rankdir= LR;');
    write_dot_line(*OSTRM, 'rotate=90;');    
}

#------------------------------------------------------------------------------

sub write_dot_flow_line($;$;$)
{
    local *OSTRM = shift;
    my $from_str = shift;
    my $to_str = shift;
    my $line = join(' -> ', $from_str, $to_str) . ';';
    write_dot_line(*OSTRM, $line);
}

#------------------------------------------------------------------------------

sub add_token_dot_label($;$;$;$)
{
    my ($token_dot_label_map_ref, $token, $dot_label, $shape) = @_;

    if (!defined($shape)) {
	$shape = $box_shape;
    }

    $token_dot_label_map_ref->{make_key($label_qualifier, $token)}
    = $dot_label;

    if ($shape ne $box_shape) {
	$token_dot_label_map_ref->{make_key($shape_qualifier, $token)}
	= $shape;
    }
}

#------------------------------------------------------------------------------

sub write_token_dot_label_map($;\%)
{
    local *OSTRM = shift;
    my $token_dot_label_map_ref = shift;

    foreach my $key (keys %$token_dot_label_map_ref) 
    {
    	my ($qualifier, @token_parts) = split(':', $key);
    	my $token = join(':', @token_parts); # just in case colon in token.
    
    	next if ($qualifier ne $label_qualifier);
    
    	my $dot_label = $token_dot_label_map_ref->{$key};
    
    	die("No dot_label for token $token")
    	    if (!defined($dot_label));
    
    	my $shape 
    	    = $token_dot_label_map_ref->{make_key($shape_qualifier, $token)};
    
    	if (!defined($shape)) {
    	    $shape = $box_shape;
    	}
    
    	my $shape_str = 'shape=' . $shape;
    	my $label_str = 'label="' . $dot_label . '"';
    	my $orientation_str = ( $inv_triangle_shape eq $shape ) ? 'orientation=90' : '';
    
    	my $line
    	    = $token . "\t" . '[' . join(',', $shape_str, $label_str, $orientation_str) . ']';
    
    	write_dot_line(*OSTRM, $line);
    }
}

#------------------------------------------------------------------------------

sub write_dot_footer($)
{
    local *OSTRM = shift;
    print OSTRM '}', "\n";
}

#------------------------------------------------------------------------------

sub process_synch_rule($;$;$;$;$;\%)
{
    local *OSTRM = shift;
    my ($label, $label_num, $user_label, $cmd, $token_dot_label_map_ref) = @_;

    # Do not want to append the _N<num> to the display label.
    my $display_label = $label;

    $label = append_label_num($label, $label_num);

    my @prereqs = parse_prereqs($cmd);

    if ($use_description_field)
    {
       $display_label = $user_label; 
    }

    # Whether or not the display label differs from the Task Manager label,
    # want to display it in a triangle.
    add_token_dot_label($token_dot_label_map_ref, $label, $display_label,
			$inv_triangle_shape);


    foreach my $prereq (@prereqs) {
	write_dot_flow_line(*DOT, $prereq, $label);
    }
}

#------------------------------------------------------------------------------

my %token_dot_label_map;
my %label_shape_map;

my $dot_file;

if ($workflow_file =~ /(\S+).txt$/) {
    $dot_file = join('.', $1, 'dot');
} else {
    $dot_file = join('.', $workflow_file, 'dot');
}

open(DOT, ">$dot_file")
    or die("Failed to open $dot_file for writing\n");

write_dot_header(*DOT);

open(WORKFLOW, "<$workflow_file")
    or die("Failed to open $workflow_file");

while (my $line = <WORKFLOW>) {
    chomp $line;
    my @fields = split(/\t/, $line);

    # Although the fields are tab-separated they are also space-padded!
    # Trim off the spaces so the same tokens in different fields match.
    for (my $field_ind = 0; $field_ind < $num_fields; ++$field_ind) {
	$fields[$field_ind] =~ s/^\s+//;
	$fields[$field_ind] =~ s/\s+$//;
    }

    my $label = $fields[$label_field_ind];
    my $label_num = $fields[$label_num_field_ind];
    my $rule_type = $fields[$rule_type_field_ind];
    my $user_label = $fields[$user_label_field_ind];
    my $state = $fields[$state_field_ind];
    my $prereq_label = $fields[$prereq_label_field_ind];
    my $cmd = $fields[$cmd_field_ind];

    # Task Manager allows forward slashes and dashes in labels
    # but dotty does not allow these in tokens.
    $label = replace_dot_special_chars($label);

    if ($rule_type eq 'task') {
	# Make the label unique by appending its number using the notation 
	# _#<num>.
	# Square brackets as in the TASKS list are significant in DOT so
	# cannot be used.
	my $raw_label = $label;
	$label = append_label_num($label, $label_num);

    if ($use_description_field)
    {
        $cmd = $user_label;
    }
    elsif (!$dont_abbreviate_commands)
    {
	   $cmd = abbreviate_cmd($cmd);
    }
    else
    {
        $cmd = abbreviate_simple($cmd);
    }

	# default shape is box
	add_token_dot_label(\%token_dot_label_map, $label, $cmd);

	# Explicit dependency
	if ($prereq_label ne $not_applic_str) {
	    # Replace special chars (inc. slash) after checking for 'N/A'.
	    $prereq_label = replace_dot_special_chars($prereq_label);

	    # A simple prereq must be unique, so assume it has a label number
	    # of 0.
	    my $prereq_label_num = 0;
	    $prereq_label = append_label_num($prereq_label, $prereq_label_num);

	    write_dot_flow_line(*DOT, $prereq_label, $label);
	}

	# Implicit dependency : If label_num > 0 : dependency upon task with
	# same label but label_num one less.
	if ($label_num > 0) {
	    my $implicit_prereq_label
		= append_label_num($raw_label, $label_num - 1);
	    write_dot_flow_line(*DOT, replace_dot_special_chars($implicit_prereq_label), $label);
	}
	    

	# print $cmd, "\n";
    } elsif ($rule_type eq 'synch') {
	process_synch_rule(*DOT, $label, $label_num, $user_label, $cmd,
			   %token_dot_label_map);
    } else {
	die("Unhandled rule type $rule_type");
    }
}

close(WORKFLOW);

write_token_dot_label_map(*DOT, %token_dot_label_map);
write_dot_footer(*DOT);

close(DOT);


#------------------------------------------------------------------------------

