#!/usr/local/bin/perl -wT

#########
# author: rmp for edgrif
#
# This script was originally written by Roger Pettet for Ed Griffiths. Its purpose
# is to trawl through all the newsletters in one or more YearNNNN directories and prepare
# an index.shtml file for each directory. The index file contains nested lists of links
# to the major sections within the newsletter. This makes it a bit easier to just eyeball
# the newsletters and see what's in the latest one.
#
# N.B. We rely on the format of the newsletters being just-so and in particular we rely on
# the format of the anchors and headers being:
#
#     <a name="header_abbreviation"></a><hN>The actual header</hN>
#
# The script picks out the text of the anchor flag and the text of the actual header.
#
#
#


use strict;

#########
# Settings shared by the subs below:
#   $WEB_PATH       - prefix placed in front of every generated href
#   $FILE_SUFFIX    - extension of the monthly newsletter files
#   $DEFAULT_ANCHOR - anchor name used when a header supplies an empty one
#
my $WEB_PATH       = '.';
my $FILE_SUFFIX    = 'html';
my $DEFAULT_ANCHOR = 'Top';

#########
# Every command line argument is expected to be a directory; each one
# that really is a directory gets its own index.shtml.
#
foreach my $dir (@ARGV) {
  unless (-d $dir) {
    print STDERR qq($dir is not a directory\n);
    next;
  }

  #########
  # We run under -T, so the directory name has to be laundered through a
  # regexp capture before it can be used to open a file for writing.
  # Directory names may legitimately contain almost any character, so the
  # pattern accepts everything - this satisfies taint mode but performs
  # no real validation.
  #
  $dir =~ m/(.*)/s;
  my $cleandir = $1;

  # Collect the HTML fragment for this directory, then write it out.
  my $content = "";
  munge_files(\$content, $cleandir);
  print_index(\$content, $cleandir);
}

1;


#########
# munge_files(\$content, $dir)
#
# Looks in $dir for the twelve month-named newsletter files
# (January.$FILE_SUFFIX .. December.$FILE_SUFFIX) and appends to the string
# referenced by $content_ref one "<h3>Month</h3><ul>...</ul><br />" section
# for every file in which at least one anchored header of the form
#
#     <a name="header_abbreviation"></a><hN>The actual header</hN>
#
# was found. Each header becomes a list item linking to
# "$WEB_PATH/Month.$FILE_SUFFIX#anchor"; headers other than <h2> are wrapped
# in their own nested <ul>. Headers containing a few boilerplate phrases
# are left out of the index.
#
# Files that are missing or cannot be opened are reported on STDERR and
# skipped. Nothing useful is returned; the result is delivered through
# $content_ref.
#
sub munge_files {
  my ($content_ref, $dir) = @_;

  my @valid_files = qw(January February March April May June July August September October November December);

  for my $file_head (@valid_files) {
    my $file = qq($dir/$file_head.$FILE_SUFFIX);

    if (! -e $file) {
      print STDERR qq($file not found\n);
      next;
    }

    # Three-argument open with a lexical handle: the file name can never
    # be mistaken for a mode, and the handle is scoped to this iteration.
    my $fin;
    if (! open($fin, '<', $file)) {
      print STDERR qq(Could not open $file: $!\n);
      next;
    }

    #########
    # Slurp the whole file. "local" confines the change to the input
    # record separator to this block, so it is restored automatically.
    #
    my $file_contents = do { local $/; <$fin> };
    close($fin);

    # A failed read yields undef; treat that as an empty file.
    $file_contents = "" unless defined $file_contents;

    my @headers = $file_contents =~ /<a.*?><h.>.*?<\/h.>/mig;
    print STDERR qq(Found ), scalar @headers, qq( headers in $file\n);

    next unless @headers;

    # we could insert an extra link here to get to the overview in one
    # go...

    $$content_ref .= qq(\n<h3>$file_head</h3>\n<ul>\n);

    for my $header (@headers) {

      # Boilerplate headers that appear in every newsletter are not
      # worth indexing.
      next if $header =~ /Next User Group Meeting/
           || $header =~ /monthly build now available/
           || $header =~ /ACEDB User Group Newsletter/;

      #########
      # The link text is the header with all html tags removed.
      #
      (my $link = $header) =~ s/<.*?>//mig;

      #########
      # Pull the anchor name out of the header. This is a capture rather
      # than a substitution so that a header without a name="..."
      # attribute really does fall back to the default anchor instead of
      # leaking the raw header markup into the href.
      #
      my ($aname) = $header =~ /<a\s+name="([^"]*)"/i;
      $aname = $DEFAULT_ANCHOR unless defined $aname && length $aname;

      # <h2> marks a top-level section; every other header level is
      # emitted as a sub-entry inside its own nested list.
      my $is_section = $header =~ /<h2>/i;

      $$content_ref .= qq(    <ul>\n    ) unless $is_section;

      #########
      # append the link to our content
      #
      $$content_ref .= qq(  <li><a href="$WEB_PATH/$file_head.$FILE_SUFFIX#$aname">$link</a></li>\n);

      $$content_ref .= qq(    </ul>\n) unless $is_section;
    }

    $$content_ref .= qq(</ul><br />);
  }
}

#########
# print_index(\$content, $dir)
#
# Writes $dir/index.shtml: a fixed html preamble (including a "machine
# generated" warning comment), a title and <h1> naming the year, the
# fragment referenced by $content_ref, and a fixed footer.
#
# The year shown in the title is the last component of $dir; trailing
# slashes are ignored so that "Year2003/" and "Year2003" give the same
# result. Dies if the index file cannot be opened or closed. Reports the
# generated file name on STDERR.
#
sub print_index {
  my ($content_ref, $dir) = @_;

  # Last path component, ignoring any trailing slashes. If there is no
  # such component (e.g. $dir is "/") fall back to $dir itself.
  my ($year) = $dir =~ m{([^/]+)/*\z};
  $year = $dir unless defined $year;

  my $index_file = qq($dir/index.shtml);

  open(my $fout, '>', $index_file)
    or die(qq(Could not open index file $index_file: $!));

  print {$fout} qq(<html>\n) ;

  # warn user that file is machine generated...
  print {$fout} qq(\n<!--                                                                          -->\n) ;
  print {$fout} qq(<!-- This file was machine generated by mkindex.pl                            -->\n) ;
  print {$fout} qq(<!-- You should not hand edit this file, alter mkindex.pl instead.            -->\n) ;
  print {$fout} qq(<!--                                                                          -->\n\n) ;

  print {$fout} qq(<head>
<title>Acedb Newsletters for $year</title>
<link rel="stylesheet" type="text/css" href="../newsletter.css" TITLE="Style sheet for Newsletters">
</head>
<body>
<h1 align="center">Acedb Newsletters for $year</h1>\n\n);

  print {$fout} $$content_ref;

  print {$fout} qq(\n\n<HR>\n);

  print {$fout} qq(<ADDRESS><A HREF="mailto:acedb\@sanger.ac.uk">acedb\@sanger.ac.uk</A></ADDRESS>\n);

  print {$fout} qq(\n</body>\n</html>\n);

  # Buffered write errors (disk full, etc.) only show up at close time.
  close($fout)
    or die(qq(Could not close index file $index_file: $!));

  print STDERR qq(Generated index $dir/index.shtml\n);
}
