#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) RepeatProteinMask
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Given a sequence file and a protein database, mask
##      all hits using blastx.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2005 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#  ChangeLog:
#
#    $Log: RepeatProteinMask,v $
#    Revision 1.23  2009/03/26 23:44:11  rhubley
#      - WUBlast to ABBlast changes
#      - Fixed the matrix parsing in WUBlastSearchEngine
#      - Fixed an optimization problem in ProcessRepeats
#
#    Revision 1.22  2009/03/26 17:27:20  rhubley
#       - Stupid Debian/Ubuntu switched /bin/sh from BASH to DASH thus breaking
#         many many scripts in the holy name of POSIX compliance.  To quote the
#         mormon south park episode: "Dumb dumb dumb dumb dumb".
#         I went through and changed all occurances of ">&" to "> file 2>&1"
#
#    Revision 1.21  2008/10/28 22:33:28  rhubley
#      - added a "-noTRF" option to RepeatProteinMask.
#
#    Revision 1.20  2008/05/01 22:28:46  rhubley
#      - Added support in configure for RepeatProteinMask (TRF)
#
#    Revision 1.19  2006/09/06 20:42:19  rhubley
#     - formating changes
#
#    Revision 1.18  2006/08/08 18:48:02  rhubley
#      - prettyPrinted' everything
#      - Added usage header to ProcessRepeats
#      - Added -lcambig option to ProcessRepeats: Prints ambiguous DNA transposon
#        fragment names in lower case ( everything else in upper case ).
#
#    Revision 1.17  2006/03/31 20:22:44  rhubley
#      - Documentation in ProcessRepeats
#      - Bug fixes in RepeatProteinMask
#
#    Revision 1.16  2005/05/10 16:22:27  rhubley
#      - Changed spacing
#
#    Revision 1.15  2005/05/02 21:09:04  rhubley
#      - type column work
#
#    Revision 1.14  2005/05/02 21:01:45  rhubley
#      - Fixed querystatlen
#
#    Revision 1.13  2005/05/02 20:54:49  rhubley
#     - More adding type column
#
#    Revision 1.12  2005/05/02 20:53:13  rhubley
#      - Added type column
#
#    Revision 1.11  2005/05/02 18:49:49  rhubley
#      - Updated default parameters
#
#    Revision 1.10  2005/04/26 00:04:03  rhubley
#      - fixed querystatlen
#
#    Revision 1.9  2005/04/21 20:14:04  rhubley
#      - Complexity adjust mand effective query length
#
#    Revision 1.8  2005/04/19 23:52:34  rhubley
#      - fixed a bug where it was marking hits as coming from RepeatMasker instead
#        of wublastx
#
#    Revision 1.7  2005/04/19 23:20:34  rhubley
#      - Give this a shot
#
#    Revision 1.6  2005/04/15 20:48:24  rhubley
#      - Bug fix ( didn't print all TRF matches )
#      - default changes ( 0.0001, and 333 )
#
#    Revision 1.5  2005/04/14 23:28:04  rhubley
#      - Updates
#
#    Revision 1.4  2005/04/14 19:16:43  rhubley
#     - Small changes
#
#    Revision 1.3  2005/04/14 00:21:33  rhubley
#      - Working toward a perfect RepeatProteinMask
#
#    Revision 1.2  2005/04/12 23:36:10  rhubley
#      - Updates including Arians changes
#
#    Revision 1.1  2005/04/12 17:52:18  rhubley
#      - Adding a new component to the RepeatMasker suite.  RepeatProteinMask
#        masks sequence based on it's similarity with interspersed repeat
#        peptides ( RepeatPeps.lib ).  This uses WUBlastX to do it's work.
#
#
###############################################################################
#
# To Do:
#
#

=head1 NAME

RepeatProteinMask - Mask Repeat Proteins in DNA sequence

=head1 SYNOPSIS

  RepeatProteinMask [-pvalue #] [-minscore #] [-wordsize #] [-maxAADist] 
                    [-noLowSimple] [-noTRF] [-queryStatLen #] <fasta file>

=head1 DESCRIPTION

The options are:

=over 4

=item -h(elp)

=item -pvalue #

The threshold for accepting matches. Matches must hava a pvalue
less than this number. Default is *NOT* to threshold...it used
to be 0.0001!!!!

=item -minscore #

Threshold on minscore.  Note no default is added. So all hits 
will be returned unless a minscore is used.

=item -wordsize #

The wordsize to use with the wublastx search. Default is 3

=item -querystatlen #

The effective length of the query to use in statistical calculations.

=item -maxaadist #

The maximum distance to consider two blastx hits as the same
(and thus contributing to Sum P scores).  Default is 333.

=item -noLowSimple

Turns off masking/annotating of low complexity and simple repeats
in the final output.  Low complexity and simple repeat analysis 
will still occur prior to looking for matches to the RepeatPep
database.

=item -noTRF

Turns off masking/annotating of tandem repeats in the input sequence.

Detailed help

=back

=head1 SEE ALSO

=over 4

RepeatModeler

=back

=head1 COPYRIGHT

Copyright 2005 Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Data::Dumper;
use Carp;
use Getopt::Long;

# RepeatMasker Libraries
use RepeatMaskerConfig;
use SeqDBI;
use FastaDB;
use WUBlastXSearchEngine;
use CrossmatchSearchEngine;
use TRF;
use TRFResult;
use SearchEngineI;

#
# Class Globals & Constants
#
my $CLASS = "RepeatProteinMask";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepeatMaskerConfig::DEBUGALL == 1 );

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @opts =
    qw( help consensi=s wordsize=s pvalue=s maxaadist=s nolowsimple minscore=s querystatlen=s );

#
# Get the supplied command line options, and set flags
#
my %options = ();
unless ( &GetOptions( \%options, @opts ) ) {
  exec "pod2text $0";
  exit( 1 );
}

# Print the internal POD documentation if something is missing
if ( $options{'help'} || !$#ARGV < 1 ) {

  # This is a nifty trick so we don't have to have
  # a duplicate "USAGE()" subroutine.  Instead we
  # just recycle our POD docs.  See PERL POD for more
  # details.
  exec "pod2text $0";
  die;
}

# This will miss alot. But in GC rich DNA a lower score
# will cause false positives
my $minScore = 20;
if ( defined $options{'minscore'} ) {
  $minScore = $options{'minscore'};
}

my $maxAADist = 333;
if ( defined $options{'maxaadist'} ) {
  $maxAADist = $options{'maxaadist'};
}

#determines which matches above which P value will be ignored
#my $cutoffP = 0.0001;
#if ( defined $options{'pvalue'} ) {
#  $cutoffP = $options{'pvalue'};
#}

my $wordSize = 3;
if ( defined $options{'wordsize'} ) {
  $wordSize = $options{'wordsize'};
}

my $fastaFile = "";
my $fileDir   = "";
if ( -s $ARGV[ 0 ] ) {
  $fastaFile = $ARGV[ 0 ];
  $fileDir   = ( File::Spec->splitpath( $fastaFile ) )[ 1 ];
}
else {
  die $CLASS . ": Missing fasta file parameter!\n";
}

#
# Assume we want to place the results next to the original file
#
if ( $fileDir ne "." && $fileDir ne "" ) {
  chdir( $fileDir );
}

# open up the consensi database
#
if ( exists $options{'nolowsimple'} ) {
  print
      "Identifying Simple and Low Complexity Repeats...(masking turned off)\n";
}
else {
  print "Masking Simple and Low Complexity Repeats...\n";
}
my $seqDB = FastaDB->new( fileName => $fastaFile,
                          openMode => SeqDBI::ReadOnly,
                          maxIDLength => 50 );

if ( $seqDB->getSeqCount() <= 0 ) {
  die "\n\nSomething is wrong with the input sequence. Possibly a formatting "
      . "problem.  Please correct your sequence data and resubmit.\n";
}

my $scratchDir = "/tmp";
my $trf = TRF->new( pathToEngine => $RepeatMaskerConfig::TRF_PRGM,
                    workDir      => $scratchDir );

my ( $numMasked, $TRFResults ) = ();
if ( ! defined $options{'noTRF'} )
{ 
  ( $numMasked, $TRFResults ) =
    TRFMask( $seqDB, $trf, "$fastaFile.trf_masked", $scratchDir );
  print "   - TRF         : " . $numMasked . "\n";
}

if ( !-s "$fastaFile.trf_masked" ) {
  system( "cp $fastaFile $fastaFile.trf_masked" );
}

#
# RepeatMasker
#
`$RepeatMaskerConfig::REPEATMASKER_DIR/RepeatMasker -engine crossmatch -qq -noint -no_is $fastaFile.trf_masked > /dev/null 2>&1`;
my $RMResults;
if ( -e "$fastaFile.trf_masked.out" && -e "$fastaFile.trf_masked.masked" ) {
  $RMResults =
      CrossmatchSearchEngine::parseOutput(
                                  searchOutput => "$fastaFile.trf_masked.out" );
  print "   - RepeatMasker: " . $RMResults->size() . "\n";
  system( "mv $fastaFile.trf_masked.masked $fastaFile.trf_rm_masked" );
}
else {
  print "   - RepeatMasker: 0\n";
  system( "mv $fastaFile.trf_masked $fastaFile.trf_rm_masked" );
}

unlink( "$fastaFile.trf_masked" )     if ( -e "$fastaFile.trf_masked" );
unlink( "$fastaFile.trf_masked.out" ) if ( -e "$fastaFile.trf_masked.out" );
unlink( "$fastaFile.trf_masked.cat" ) if ( -e "$fastaFile.trf_masked.cat" );
unlink( "$fastaFile.trf_masked.tbl" ) if ( -e "$fastaFile.trf_masked.tbl" );
unlink( "$fastaFile.trf_masked.log" ) if ( -e "$fastaFile.trf_masked.log" );

#
# Comparison against database of transposon proteins
#
# wublastx the simple-masked consensi vs the transposable element
# protein database with the fasta line format
#      >TWIFBIG#DNA/HAT-Ac
#
# This name may be *immediately* followed by #ReverseORF to indicate
# that the product is encoded on the opposite strand of the
# transposable element. It needs to be right after it, otherwise it
# may fall on the next line in the blastx output's hit description.
#
print "Masking Repeat Proteins...\n";

# Set the environment
$ENV{BLASTMAT}   = "$RepeatMaskerConfig::WUBLAST_DIR/matrix";
$ENV{WUBLASTMAT} = $ENV{BLASTMAT};

if ( !-s "$RepeatMaskerConfig::REPEATMASKER_LIB_DIR/RepeatPeps.lib.xps" ) {
  system(
"$RepeatMaskerConfig::XDFORMAT_PRGM -p $RepeatMaskerConfig::REPEATMASKER_LIB_DIR/RepeatPeps.lib"
  );
}

my $searchEngineX =
    WUBlastXSearchEngine->new( pathToEngine => "$RepeatMaskerConfig::WUBLAST_DIR/blastx" );
$searchEngineX->setQuery( "$fastaFile.trf_rm_masked" );
$searchEngineX->setSubject(
                   "$RepeatMaskerConfig::REPEATMASKER_LIB_DIR/RepeatPeps.lib" );
$searchEngineX->setFilterWords( 1 );
if ( defined $options{'pvalue'} ) {
  $searchEngineX->setPValueCutoff( $options{'pvalue'} );
}
if ( $minScore ne "" ) {
  $searchEngineX->setMinScore( $minScore );
}
my $additionalParams = "-W=$wordSize hspsepqmax=$maxAADist";
my $queryStatLen     = 1000000;
if ( defined $options{'querystatlen'} ) {
  $queryStatLen = $options{'querystatlen'};
}
$additionalParams .= " -Y=$queryStatLen";

$searchEngineX->setAdditionalParameters( $additionalParams );
$searchEngineX->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
$searchEngineX->setMaskLevel( 85 );
my ( $status, $resultCollection ) = $searchEngineX->search();

print "   - Protein Hits = " . $resultCollection->size() . "\n";
if ( $resultCollection->size() > 0 ) {
  if ( exists $options{'nolowsimple'} ) {
    unlink( "$fastaFile.trf_rm_masked" );
    maskResults( resultsRef => $resultCollection,
                 fastaFile  => $fastaFile );
  }
  else {
    maskResults( resultsRef => $resultCollection,
                 fastaFile  => "$fastaFile.trf_rm_masked" );
    unlink( "$fastaFile.trf_rm_masked" );
    system( "mv $fastaFile.trf_rm_masked.masked $fastaFile.masked" );
  }
}
else {
  if ( exists $options{'nolowsimple'} ) {
    system( "cp $fastaFile $fastaFile.masked" );
  }
  else {
    system( "mv $fastaFile.trf_rm_masked $fastaFile.masked" );
  }
}

# Sort through results and create annotation file.
open ANO, ">$fastaFile.annot";
my $hdrStr = sprintf(
"%-10.10s %6s %-8.8s %-18.18s %-8.8s %-8.8s %1s %-18.18s %-18.18s %-8.8s %-8.8s\n",
  "pValue", "Score",  "Method", "SeqID", "Begin", "End",
  " ",      "Repeat", "Type",   "Begin", "End"
);
print ANO $hdrStr;
unless ( exists $options{'nolowsimple'}
         || !defined $RMResults )
{
  $resultCollection->addAll( $RMResults );
  $resultCollection->sort(
    sub ($$) {
      $_[ 0 ]->getQueryName() cmp $_[ 1 ]->getQueryName()
          || $_[ 0 ]->getQueryStart() <=> $_[ 1 ]->getQueryStart();
    }
  );
}

if ( $resultCollection->size() > 0 ) {
  my $prevQueryName = $resultCollection->get( 0 )->getQueryName();
  for ( my $k = 0 ; $k < $resultCollection->size() ; $k++ ) {
    my $result = $resultCollection->get( $k );
    unless ( exists $options{'nolowsimple'} ) {
      my $queryStart = $result->getQueryStart();
      my $queryName  = $result->getQueryName();
      while (
                 exists $TRFResults->{$queryName}
              && @{ $TRFResults->{$queryName} }
              && (    $queryStart > $TRFResults->{$queryName}->[ 0 ]->getStart()
                   || $queryName ne $prevQueryName )
          )
      {
        my $outStr = sprintf(
"%-10.2s %6d %8s %-18.18s %-8d %-8d %1s %-18.18s %-18.18s %8.8s %8.8s\n",
          "-",
          $TRFResults->{$queryName}->[ 0 ]->getScore(),
          "TRF",
          $queryName,
          $TRFResults->{$queryName}->[ 0 ]->getStart(),
          $TRFResults->{$queryName}->[ 0 ]->getEnd(),
          "+",
          "",
          "Tandem_Repeat",
          "-",
          "-"
        );

        print ANO "$outStr";
        shift @{ $TRFResults->{$queryName} };
      }
    }
    my $orient = "+";
    $orient = "-" if ( $result->getOrientation() eq "C" );

    # Break up the subject name into repeat name and repeat type
    my $subjName = $result->getSubjName();
    my $subjType = $result->getSubjType();
    if ( $result->getSubjName() =~ /(\S+)\#(\S+)/ ) {
      $subjName = $1;
      $subjType = $2;
    }

    my $outStr;
    if ( !$result->getPValue() =~ /[\d\.\-\e]+/ ) {

      # Result came from RepeatMasker
      $outStr = sprintf(
           "%-10.2s %6d %8s %-17.17s %-8d %-8d %1s %-15.15s %-15.15s %8d %8d\n",
           "-",                      $result->getScore(),
           "RMasker",                $result->getQueryName(),
           $result->getQueryStart(), $result->getQueryEnd(),
           $orient,                  $subjName,
           $subjType,                $result->getSubjStart(),
           $result->getSubjEnd()
      );

    }
    else {
      $outStr = sprintf(
           "%-10.2e %6d %8s %-17.17s %-8d %-8d %1s %-15.15s %-15.15s %8d %8d\n",
           $result->getPValue(),     $result->getScore(),
           "WUBlastX",               $result->getQueryName(),
           $result->getQueryStart(), $result->getQueryEnd(),
           $orient,                  $subjName,
           $subjType,                $result->getSubjStart(),
           $result->getSubjEnd()
      );

    }
    print ANO "$outStr";

  }
}
else {
  unless ( exists $options{'nolowsimple'} ) {

    # See if there are any TRF results to print
    foreach my $queryID ( keys( %{$TRFResults} ) ) {
      foreach my $result ( @{ $TRFResults->{$queryID} } ) {
        my $outStr =
            sprintf(
"%-10.2s %6d %8s %-17.17s %-8d %-8d %1s %-15.15s %-15.15s %8.8s %8.8s\n",
          "-", $result->getScore(), "TRF", $queryID, $result->getStart(),
          $result->getEnd(), "+", "", "Tandem_Repeat", "-", "-" );
        print ANO "$outStr";
      }
    }
  }
}
close ANO;

# Cya!
print "Done!\n";
exit;

#-------------------- S U B R O U T I N E S ------------------------------#

##---------------------------------------------------------------------##
##
##  maskResults()
##
##  Use: maskDatabase( resultsRef => #ref#,
##                     fastaFile => "/jo/bob/seq.fa",
##                   );
##
##
##
##---------------------------------------------------------------------##
sub maskResults {
  my %parameters = @_;

  # Parameter checking
  die $CLASS
      . "::maskResults(): Missing or invalid resultsRef "
      . "parameter!\n"
      if ( !defined $parameters{'resultsRef'} );
  my $resultsRef = $parameters{'resultsRef'};

  die $CLASS . "::maskResults(): Missing fastaFile parameter!\n"
      if (    !defined $parameters{'fastaFile'}
           || !-s $parameters{'fastaFile'} );
  my $fastaFile = $parameters{'fastaFile'};

  my %maskRanges = ();
  my $maskDB = FastaDB->new( fileName => $fastaFile,
                             openMode => SeqDBI::ReadOnly );
  open OUT, ">$fastaFile.masked";

  for ( my $k = 0 ; $k < $resultsRef->size() ; $k++ ) {
    my $result = $resultsRef->get( $k );
    push @{ $maskRanges{ $result->getQueryName() } },
        [
          $result->getQueryStart() - 1,
          $result->getQueryEnd() - $result->getQueryStart() + 1
        ];
  }

  my %idsSeen = ();
  foreach my $idKey ( keys( %maskRanges ) ) {
    $idsSeen{$idKey} = 1;
    print OUT ">" . $idKey . " " . $maskDB->getDescription( $idKey ) . "\n";
    my $seq = $maskDB->getSequence( $idKey );
    foreach my $range ( @{ $maskRanges{$idKey} } ) {

      #print " Masking seq: " .  $range->[ 0 ] . " - " .  $range->[ 1 ] . "\n";
      substr( $seq, $range->[ 0 ], $range->[ 1 ] ) = "N" x $range->[ 1 ];
    }
    $seq =~ s/(.{50})/$1\n/g;
    print OUT "$seq\n";
  }

  # Write out any records which didn't have any masking
  foreach my $idKey ( $maskDB->getIDs() ) {
    next if ( exists $idsSeen{$idKey} );
    print OUT ">" . $idKey . " " . $maskDB->getDescription( $idKey ) . "\n";
    my $seq = $maskDB->getSequence( $idKey );
    $seq =~ s/(.{50})/$1\n/g;
    print OUT "$seq\n";
  }
  close OUT;

}

########################################################################################
########################################################################################
########################################################################################
########################################################################################

sub TRFMask {
  my $seqDB      = shift;
  my $trfObj     = shift;
  my $maskFile   = shift;
  my $scratchDir = shift;

  open OUT, ">$maskFile"
      || die $CLASS . "::TRFMask: Could not open file $maskFile!\n";

  # Create a tempDirectory for us
  my $tmpDir;
  do {
    $tmpDir = $scratchDir . "/trfResults-" . time();
  } while ( -d $tmpDir );
  mkdir( $tmpDir );
  $scratchDir = $tmpDir;

  # Foreach sequence
  my $repeatsMasked = 0;
  my %allResults    = ();
  foreach my $seqID ( $seqDB->getIDs() ) {

    print OUT ">" . $seqID . " " . $seqDB->getDescription( $seqID ) . "\n";
    my $seqLen = $seqDB->getSeqLength( $seqID );

    # TODO: Make this capable of batching small sequences!
    # Break into 5mb pieces...NOTE: Must keep track of seqID
    for ( my $i = 0 ; $i < $seqLen ; $i += 5000000 ) {
      my $batchSeq;

      # Create temp seq file
      open TMPFILE, ">$scratchDir/tmpseq.fa"
          || die $CLASS
          . ": Could not open "
          . "temporary file $scratchDir/tmpseq.fa for output!\n";
      print TMPFILE ">seq1\n";
      if ( $i + 5000000 > $seqLen ) {
        $batchSeq = $seqDB->getSubstr( $seqID, $i );
      }
      else {
        $batchSeq = $seqDB->getSubstr( $seqID, $i, 5000000 );
      }
      print TMPFILE "$batchSeq\n";
      close TMPFILE;

      # Run TRF
      my $resultRef = $trf->search( sequenceFile => "$scratchDir/tmpseq.fa",
                                    workDir      => $scratchDir );

      #print $CLASS. ": TRF Returned " . scalar( @{$resultRef} ) . " results\n"
      #if ( $DEBUG );

      foreach my $result ( @{$resultRef} ) {

        # TODO: Document why?
        if (    $result->getCopyNumber() > 4
             && $result->getPeriod() > 1 )
        {

          # Mask
          my $start = $result->getStart() - 1;
          my $len   = $result->getEnd() - $start;

          #print "Masking: ".$result->toString()."\n";
          substr( $batchSeq, $start, $len ) = "N" x $len;
          $repeatsMasked++;
          push @{ $allResults{$seqID} }, $result;
        }
      }

      # write chunk out
      $batchSeq =~ s/(.{50})/$1\n/g;
      print OUT "$batchSeq\n";

    }
  }
  close OUT;
  unlink( "$scratchDir/tmpseq.fa" ) if ( -e "$scratchDir/tmpseq.fa" );

  system( "rm -rf $scratchDir" );

  #print "   $repeatsMasked Tandem Repeats Masked\n";

  return ( $repeatsMasked, \%allResults );

}

1;