#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) DupMasker
##  Author:
##      Robert M. Hubley   rhubley@systemsbiology.org
##  Description:
##      Search a sequence against a library of known segmental duplications
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2002-2004 Developed by
#* Robert Hubley, Zhaoshi Jiang, Arian Smit and Evan Eichler.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#
# ChangeLog
#
#     $Log: DupMasker,v $
#     Revision 1.7  2009/03/26 17:27:20  rhubley
#        - Stupid Debian/Ubuntu switched /bin/sh from BASH to DASH thus breaking
#          many many scripts in the holy name of POSIX compliance.  To quote the
#          mormon south park episode: "Dumb dumb dumb dumb dumb".
#          I went through and changed all occurances of ">&" to "> file 2>&1"
#
#     Revision 1.6  2008/05/29 21:37:40  rhubley
#       - Fixed numbering problem in alignments.  When there were complete lines
#         of gap characters the index would still increment/decrement by one.
#
#       - Fixed problem with fragmenting alignments.  The "x" cliping boundaries
#         were not being honored.  This left single "X" characters in the
#         alignments.
#
#       - Added some more error checking to DupMasker
#
#     Revision 1.5  2008/05/14 18:29:15  rhubley
#      - Added "-gi" fix to WUBlastSearchEngine.pm
#
#     Revision 1.4  2008/04/28 19:50:50  rhubley
#       - Added gff option to DupMasker.
#       - Fixed WUBlastSearchEngine default temp directory
#
#     Revision 1.3  2008/03/24 18:45:59  rhubley
#      - Documentation change
#
#     Revision 1.2  2008/03/24 18:21:22  rhubley
#       - Pretty-fy the wublast missing message
#
#     Revision 1.1  2008/03/17 19:03:57  rhubley
#       - DupMasker integration
#       - Some ALU Work in ProcessRepeats
# 
#
###############################################################################
#
# To Do:
#
=head1 NAME

DupMasker - Annotate segmental duplications in a sequence

=head1 SYNOPSIS

  DupMasker [-version] [-maxDiv #] [-maxWidth #] [-forceSearch] 
            [-align] [-gff] <myfile.fa>

=head1 DESCRIPTION

  Search a file against a library known segmental 
  duplications. If the file doesn't already have a 
  RepeatMasker *.out file this script will generate
  one.  The output is placed in a file named 
  myfile.fa.duplicons.

  The basic program flow is:

     - RepeatMask the sequence ( Human libraries )
     - Search the masked squence against the duplicon library
     - Build realignment regions for each duplicon
     - Realign using non-repeat-masked dna
     - Join duplicons and output results

The options are:

=over 4

=item -version

Displays the version of the program

=item -maxDiv <value>

Filter out duplicon seeds which have a divergence greater than 
this value.

=item -maxWidth <value>

The maximum non-repetitive/non-seed realign gaps, default is 300bp

=item -forceSearch

DupMasker uses RepeatMasker .out and previous runs of DupMasker's *.dupout
files by default.  Use this option if you would like to rerun these searches.

=item -align

Produce alignments. These are stored in the output file.

=item -gff     

Creates an additional Gene Feature Finding (gff) output file.

=back

=head1 SEE ALSO

RepeatMasker

=head1 COPYRIGHT

Copyright 2007-2008 Robert Hubley, Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Getopt::Long;
use Data::Dumper;

# RepeatMasker Libraries
use RepeatMaskerConfig;
use SearchResult;
use SearchResultCollection;
use WUBlastSearchEngine;
use CrossmatchSearchEngine;
use SeqDBI;
use SimpleBatcher;
use FastaDB;

#
# Version
#
#  This is a neat trick.  CVS allows you to tag
#  files in a repository ( i.e. cvs tag "2003/12/03" ).
#  If you check out that release into a new
#  directory with "cvs co -r "2003/12/03" it will
#  place this string into the $Name: open-3-3-0 $ space below
#  automatically.  This will help us keep track
#  of which release we are using.  If we simply
#  check out the code as "cvs co Program" the
#  $Name: open-3-3-0 $ macro will be blank so we should default
#  to what the ID tag for this file contains.
#
my $CVSNameTag = '$Name: open-3-3-0 $';
my $CVSIdTag = '$Id: DupMasker,v 1.7 2009/03/26 17:27:20 rhubley Exp $';
my $Version = $CVSNameTag;
$Version = $CVSIdTag if ( $Version eq "" );

#
# Magic numbers/constants here
#  ie. my $PI = 3.14159;
#   
my $DEBUG = 0;
my $dupliconLib = "$FindBin::RealBin/Libraries/dupliconlib.fa";
my $REPEATMASKER = "$FindBin::RealBin/RepeatMasker";
my $WUBLASTN = "$RepeatMaskerConfig::WUBLASTN_PRGM";
my $XDFORMAT = "$RepeatMaskerConfig::XDFORMAT_PRGM";
my $MATRICES = "$FindBin::RealBin/Matrices/wublast/nt";

die   "\n\nThe WUBlast search engine doesn't appear to be configued.\n"
    . "Please install WUBlast and rerun the RepeatMasker configure\n"
    . "program.\n\n" if ( ! -x $WUBLASTN );

die   "\n\nThe dupliconlib doesn't appear to be installed in the\n"
    . "RepeatMasker Libraries/ directory.  Please download and install\n"
    . "this library before proceeding.\n\n" if ( ! -s $dupliconLib );

## Seed clustering parameters 
my $maxSeedInsertion = 7000;
my $maxDupliconSeedOverlap = 30;

## Seed realignment boundaries parameters
# realignNonRepMaxWidth: A boundary for realignment ends at the consensus 
#                        boundaries or the first occurance of a 
#                        non-repetitive,non-seed sequence window of
#                        this size. 
my $realignNonRepMaxWidth = 300; 


#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @getopt_args = (
    '-version', # print out the version and exit
    '-maxDiv=i' , # maximal duplicon seed divergence
    '-maxWidth=i', # realignNonRepMaxWidth zj 5-10-07
    '-forceSearch',
    '-gff',
    '-align'
);

my %options = ();
Getopt::Long::config("noignorecase", "bundling_override");
unless (GetOptions(\%options, @getopt_args)) {
    usage();
}

sub usage {
  print "$0 - $Version\n";
  exec "pod2text $0";
  exit;
}

if ($options{'version'}) {
  print "$Version\n";
  exit;
}

my $maxDupliconDiv = 0;
if ( $options{'maxDiv'} )
{
  $maxDupliconDiv = $options{'maxDiv'};
}

if( $options{'maxWidth'} ) { #zj added 5-10-07
  $realignNonRepMaxWidth=$options{'maxWidth'};			
}


#
# ARGV Processing
#
if ( ! defined $ARGV[0] ) {
  usage();
}
my $fastaFile = $ARGV[0];

die "\nFile does not exist $fastaFile!!!\n\n" if ( ! -s $fastaFile );

#
#  Open up the duplicon library
#
my $libDB = FastaDB->new( fileName => $dupliconLib,
                          openMode => SeqDBI::ReadOnly, 
                          maxIDLength => 50 );

## 
## Create a repeatmasked sequence file if necessary.
##
my $db = FastaDB->new( fileName => $fastaFile, openMode => SeqDBI::ReadOnly,
                       maxIDLength => 50 );
if ( $options{'forceSearch'} || ! -e "$fastaFile.out" )
{
  print "Searching for repeats...\n";
  system("$REPEATMASKER -w $fastaFile > /dev/null 2>&1" );
}
else
{
  print "Using existing repeat annotations in $fastaFile.out\n";
}

# TODO: Make a temporary directory for all the output files.
my $tmpIdx = 1;
my $tmpMaskedFile = "$fastaFile.dup.tmpmask";
if ( -e $tmpMaskedFile )
{
  #die   "$tmpMaskedFile already exists!  Please remove this file and rerun"
  #    . "the program.\n";
}
&maskSequence( seqDB => $db, annotationFile => "$fastaFile.out", 
               outputFile => $tmpMaskedFile );


##
## Search for duplicons using RepeatMasker unless already done.
##
if ( $options{'forceSearch'} || ! -e "$fastaFile.dupout" )
{
  print "Searching for duplicons...\n";

  my $cmd = "$REPEATMASKER -w -nolow -lib $dupliconLib $tmpMaskedFile";
  print "$cmd\n" if ( $DEBUG );
  system(   "$cmd > /dev/null 2>&1" );

  unless ( $DEBUG ) 
  {
    unlink("$tmpMaskedFile") if ( -e "$tmpMaskedFile" );
    unlink("$tmpMaskedFile.cat") if ( -e "$tmpMaskedFile.cat" );
    unlink("$tmpMaskedFile.tbl") if ( -e "$tmpMaskedFile.tbl" );
    unlink("$tmpMaskedFile.log") if ( -e "$tmpMaskedFile.log" );
    unlink("$tmpMaskedFile.ori.out") if ( -e "$tmpMaskedFile.ori.out" );
    unlink("$tmpMaskedFile.masked") if ( -e "$tmpMaskedFile.masked" );
  }

  if ( -s "$tmpMaskedFile.out" )
  {
    print "  -Created duplicon seed file $fastaFile.dupout\n";
    system("mv $tmpMaskedFile.out $fastaFile.dupout");
  }
  else
  {
    print "No duplicon seeds found in the sequence file!\n";
    exit;
  }
}else
{
  print "Using existing duplicon seeds in $fastaFile.dupout\n";
}

if ( ! -e "$fastaFile.dupout" ) 
{
  die "Could not generate $fastaFile.dupout\n";
}


##
## Read in annotations
##
my $repResultsCollection =
       CrossmatchSearchEngine::parseOutput( searchOutput => "$fastaFile.out" );

my $dupResultsCollection =
       CrossmatchSearchEngine::parseOutput( searchOutput => "$fastaFile.dupout" );


##
## Filter out low scoring duplicons
## Apply divergence filter if required
##
my $dupResIter = $dupResultsCollection->getIterator();
if ( $DEBUG ) 
{
  print "Original Search ( ** denotes removed annots with scores < 300 ";
  if ( $maxDupliconDiv ) 
  {
    print ", or div >= $maxDupliconDiv ";
  }
  print "):\n";
  print "-------------------------------------------\n";
}
my %queryLengths = ();
while ( $dupResIter->hasNext() ) 
{
  my $annot = $dupResIter->next();
  if ( $annot->getScore() < 300
       || ( $maxDupliconDiv && $annot->getPctDiverge() >= $maxDupliconDiv ) )   
  { #zj 5-10-07
    print "**" . $annot->toStringFormatted() if ( $DEBUG );
    $dupResIter->remove();
  }else {
    print "  " . $annot->toStringFormatted() if ( $DEBUG );
  }
  if ( ! defined $queryLengths{ $annot->getQueryName() } )
  { 
    $queryLengths{ $annot->getQueryName() } =
         $annot->getQueryEnd() + $annot->getQueryRemaining();
  }
}
undef $dupResIter;

print "\n\n" if ( $DEBUG );

##
## Cluster collinear duplicon hits.
##    This creates groups of duplicons that require realignment.
##
$dupResIter = $dupResultsCollection->getIterator();
my %originalSeedBoundaries = ();  
my $subElementsString = undef;
if ( $DEBUG ) 
{
  print "Realignment Clusters ( unexpanded )\n";
  print "-----------------------------------\n";
}
while ( $dupResIter->hasNext() ) 
{
  my $annot = $dupResIter->next();

  my $neighborIter = $dupResIter->getIterator();
  my $conRemaining = $annot->getSubjRemaining();
  my $annotConGapEnd = -$annot->getSubjEnd();
  if ( $annot->getOrientation() eq "C" ) 
  {
    $conRemaining = $annot->getSubjStart();
    $annotConGapEnd = $annot->getSubjStart();
  }
  push @{ $originalSeedBoundaries{ $annot } }, 
                   { 'start' => $annot->getQueryStart(),
                     'end' => $annot->getQueryEnd(),
                     'orient' => $annot->getOrientation() };
  $subElementsString = " -> " . $annot->toStringFormatted if ( $DEBUG );
  while ( $neighborIter->hasNext() ) 
  {
    my $neighbor = $neighborIter->next();

    #print "considering:\n  Annot: " . $annot->toStringFormatted() . 
    #      "\n  Neighbor: " . $neighbor->toStringFormatted() ."\n"
    #  if ( $DEBUG );

    # last if ( too-far-out > consensus + maxInsert );
    last if ( ( $neighbor->getQueryStart() - $annot->getQueryEnd() ) > 
              ( $conRemaining + $maxSeedInsertion ) );
    # last if different query sequence
    last if ( $annot->getQueryName() ne $neighbor->getQueryName() );

    my $neighConGapEnd = $neighbor->getSubjStart();
    if ( $neighbor->getOrientation() eq "C" ) 
    {
      $neighConGapEnd = -$neighbor->getSubjEnd();
    }
 
    # is this element compatible?
    if (    $annot->getSubjName() eq $neighbor->getSubjName() 
         && $annot->getOrientation eq $neighbor->getOrientation()
         && ( $annotConGapEnd + $neighConGapEnd ) > -$maxDupliconSeedOverlap )
    {
      # Yes - Remove element and resize range of the parent
      $annot->setQueryEnd( $neighbor->getQueryEnd() );
      $annot->setQueryRemaining( $neighbor->getQueryRemaining() );
      push @{ $originalSeedBoundaries{ $annot } }, 
                      { 'start' => $neighbor->getQueryStart(),
                        'end' => $neighbor->getQueryEnd(),
                        'orient' => $neighbor->getOrientation() };
      $subElementsString .= " -> " . $neighbor->toStringFormatted if ( $DEBUG );
      $neighborIter->remove();
    }
  }
  if ( $DEBUG ) 
  { 
    print "" . $annot->toStringFormatted();
    if ( @{ $originalSeedBoundaries{ $annot } } > 1 )
    {
      print "$subElementsString";
    }
  }
}
undef $dupResIter;

print "\n\n" if ( $DEBUG );

##
## Create a uniq block list from the repeats
##    A list of regions/blocks that are neither seed or
##    repeat sequence.  This is used to determine
##    realignment ranges in a subsequent step.
## 
my $repResIter = $repResultsCollection->getIterator();
my %uniqBlocks = ();
my $lastRepeat;
while ( $repResIter->hasNext() ) 
{
  my $repeat = $repResIter->next();

  if ( defined $lastRepeat && 
       $lastRepeat->getQueryName() eq $repeat->getQueryName() ) 
  {
    my $gapSize = $repeat->getQueryStart() - 
                  $lastRepeat->getQueryEnd() - 1;
    if ( $gapSize > 0 ) 
    {
      push @{ $uniqBlocks{ $repeat->getQueryName() } },
             { 'start' => $lastRepeat->getQueryEnd(),
               'end' => $repeat->getQueryStart() };
    }
    
  }
  $lastRepeat = $repeat;
}


## 
## Expand clusters/singletons to include flanking repeat regions.
##   - Place elements which cannot be expanded into an orphan list.
##
my $orphans = SearchResultCollection->new();
$dupResIter = $dupResultsCollection->getIterator();
if ( $DEBUG ) 
{
  print "Realignment Clusters ( expanded )\n";
  print "-----------------------------------\n";
}
while ( $dupResIter->hasNext() ) 
{
  my $annot = $dupResIter->next();

  my $leftStart = $annot->getQueryStart() - $annot->getSubjStart() + 1;
  my $leftEnd = $annot->getQueryStart();
  my $rightStart = $annot->getQueryEnd();
  my $rightEnd = $annot->getQueryEnd + $annot->getSubjRemaining();
  if ( $annot->getOrientation() eq "C" ) 
  {
    $leftStart = $annot->getQueryStart() - $annot->getSubjRemaining() + 1;
    $rightEnd = $annot->getQueryEnd + $annot->getSubjStart();
  }
  $leftStart = 1 if ( $leftStart < 1 );
  $rightEnd = $annot->getQueryEnd() + $annot->getQueryRemaining() 
      if ( $rightEnd > ( $annot->getQueryEnd() + 
                         $annot->getQueryRemaining() ) );

  ## Loop over uniq blocks
  my $leftBoundry = $leftStart;
  my $rightBoundry = $rightEnd;
  foreach my $uniqBlock (  @{ $uniqBlocks{ $annot->getQueryName() } } )
  {
    next if ( $uniqBlock->{'end'} < $leftStart );
    last if ( $uniqBlock->{'start'} > $rightEnd );

    if ( getOverlapSize( $uniqBlock->{'start'}, $uniqBlock->{'end'},
                         $leftStart, $leftEnd ) > $realignNonRepMaxWidth ) 
    {
      $leftBoundry = $uniqBlock->{'end'};
    }
    if ( getOverlapSize( $uniqBlock->{'start'}, $uniqBlock->{'end'},
                         $rightStart, $rightEnd ) > $realignNonRepMaxWidth )
    {
      $rightBoundry = $uniqBlock->{'start'};
      last;
    }
  }
  $leftBoundry = $annot->getQueryStart() 
          if ( $leftBoundry > $annot->getQueryStart() );
  $rightBoundry = $annot->getQueryEnd() 
          if ( $rightBoundry < $annot->getQueryEnd() );

  if ( $leftBoundry == $annot->getQueryStart() &&
       $rightBoundry == $annot->getQueryEnd() ) 
  {
    $orphans->add( $annot );
    $dupResIter->remove();
    print "O " . $annot->toStringFormatted() if ( $DEBUG );
    next;
  }
  $annot->setQueryStart( $leftBoundry );
  $annot->setQueryEnd( $rightBoundry );
  print "  " . $annot->toStringFormatted() if ( $DEBUG );

}
undef $dupResIter;

print "\n\n" if ( $DEBUG );

##
## Group overlapping realignment ranges into one search
##
$dupResultsCollection->sort( \&byNameBeginEndrev );
$dupResIter = $dupResultsCollection->getIterator();
my @ranges = ();
if ( $DEBUG )
{
  print "Realignment Blocks:\n";
  print "-------------------\n";
}
while ( $dupResIter->hasNext() ) 
{
  my %consensi;
  my $annot = $dupResIter->next();
  # Look for overlap
  my $neighborIter = $dupResIter->getIterator();
  my $rangeStart = $annot->getQueryStart();
  my $rangeEnd = $annot->getQueryEnd();
  push @{ $consensi{ $annot->getSubjName() } }, 
              ( @{ $originalSeedBoundaries{ $annot } } ); 
  my $lastAnnot = $annot;
  # Cluster clusters
  print " considering $rangeStart - $rangeEnd\n" if ( $DEBUG );
  print "      --> " . $annot->toStringFormatted() if ( $DEBUG );
  while ( $neighborIter->hasNext() ) 
  {
    my $neighbor = $neighborIter->next();
    print "   vs " . $neighbor->toStringFormatted() if ( $DEBUG );
    last if ( $neighbor->getQueryStart() > $lastAnnot->getQueryEnd() );
    last if ( $neighbor->getQueryName() ne $lastAnnot->getQueryName() );
    $rangeEnd = $neighbor->getQueryEnd() 
         if ( $neighbor->getQueryEnd() > $rangeEnd ); 
    print "  adding: " . $neighbor->getQueryEnd() . "\n" if ( $DEBUG );
    push @{ $consensi{ $neighbor->getSubjName() } }, 
               ( @{ $originalSeedBoundaries{ $neighbor } } ); 
    $lastAnnot = $neighbor;
    $neighborIter->remove();
  }
  # Add orphans which are fully contained inside a realignment range
  # and remove from orphan list
  my $orphanIter = $orphans->getIterator();
  while ( $orphanIter->hasNext() ) 
  {
    my $orphan = $orphanIter->next();
    next if ( $orphan->getQueryEnd() < $rangeStart ||
              $orphan->getQueryEnd() > $rangeEnd );
    last if ( $orphan->getQueryStart() > $rangeEnd );
    if ( $orphan->getQueryStart() >= $rangeStart )
    {
      print "Adding Orphan -> " . $orphan->toStringFormatted() if ( $DEBUG );
      print "  to range: " . $rangeStart . " - " . $rangeEnd . "\n" if ( $DEBUG );
      push @{ $consensi{ $orphan->getSubjName() } },
         ( @{ $originalSeedBoundaries{ $orphan } } );
      my $annot = $orphanIter->remove();
    }
  }
  push @ranges, { 'id' => $annot->getQueryName(),
                  'start' => $rangeStart,
                  'end' => $rangeEnd,
                  'consensi' => { %consensi } };
  if ( $DEBUG ) 
  {
    print   "  " . $annot->getQueryName() 
          . ":$rangeStart-$rangeEnd vs " 
          . join( ",", keys( %consensi )) . "\n";
  }
  undef $neighborIter;
}
undef $dupResIter;

print "\n\n" if ( $DEBUG );

#
# Setup the search engine
#
my $searchEngine =
    WUBlastSearchEngine->new( pathToEngine => $WUBLASTN );
if ( not defined $searchEngine ) {
  die "Cannot execute $WUBLASTN";
}
$searchEngine->setMinScore( 180 );
if ( $options{'align'} )
{
  $searchEngine->setGenerateAlignments( 1 );
}else 
{
  $searchEngine->setGenerateAlignments( 0 );
}
$searchEngine->setGapInit( -35 );
$searchEngine->setBandwidth( 20 );
$searchEngine->setInsGapExt( -7 );
$searchEngine->setDelGapExt( -6 );
$searchEngine->setMaskLevel( 1 );
$searchEngine->setMinMatch( 9 );
$searchEngine->setUseDustSeg( 1 );
$searchEngine->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
$searchEngine->setMatrix( "$MATRICES/20p41g.matrix" );
$searchEngine->setMaskLevel( 90 );

##
## Realign
##
my $totalResultCollection = SearchResultCollection->new();

my $uniqID = 1;
print "Realigning: ";
foreach my $reAlignRange ( @ranges ) 
{
  # Create a temp file name
  $tmpIdx = 1;
  my $tmpLibFile = "/tmp/tmplibfile-$tmpIdx";
  while ( -e $tmpLibFile )
  {
    $tmpLibFile = "/tmp/tmplibfile-" . $tmpIdx++;
  }

  # Extract consensi and save to temp file
  open OUT, ">$tmpLibFile" or die "Can't create lib file $tmpLibFile: $?\n";

  foreach my $conID ( keys( %{ $reAlignRange->{'consensi'} } ) ) 
  {
    my $seq = $libDB->getSequence( $conID );
    print OUT ">" . $conID . "\n";
    $seq =~ s/(\S{50})/$1\n/g;
    $seq .= "\n"
        unless ( $seq =~ /.*\n+$/s );
    print OUT $seq;
  }
  close OUT;

  # Create a temp file name
  $tmpIdx = 1;
  my $tmpQueryFile = "/tmp/tmpqueryfile-$tmpIdx";
  while ( -e $tmpQueryFile )
  {
    $tmpQueryFile = "/tmp/tmpqueryfile-" . $tmpIdx++;
  }

  # Extract query range and save to a temp file
  open OUT, ">$tmpQueryFile" or die "Can't create lib file $tmpQueryFile: $?\n";
  print OUT ">" . $reAlignRange->{'id'} . 
            "_[" . $reAlignRange->{'start'} . "-" . 
            $reAlignRange->{'end'} . "]\n";
  my $seq = $db->getSubstr( $reAlignRange->{'id'}, $reAlignRange->{'start'}, 
                  ( $reAlignRange->{'end'} - $reAlignRange->{'start'} ) );
  $seq =~ s/(\S{50})/$1\n/g;
  $seq .= "\n"
      unless ( $seq =~ /.*\n+$/s );
  print OUT $seq;
  close OUT;
   
  # Run WUBLAST
  $searchEngine->setSubject( "$tmpLibFile" );
  system( "$XDFORMAT -n $tmpLibFile > /dev/null 2>&1" );
 
  $searchEngine->setQuery( "$tmpQueryFile" );

  if ( $DEBUG ) 
  {
    print "Running a realignment... $tmpQueryFile vs $tmpLibFile\n";
  }
  print ".";
  my ( $status, $searchResultCol ) = $searchEngine->search();
  if ( $status ) {
    die "ERROR from search engine (", $? >> 8, ") \n";
  }

  # Remove temporary files
  unless ( $DEBUG )
  {
    unlink( $tmpQueryFile );
    unlink( $tmpLibFile );
    unlink( $tmpLibFile . ".xnd" );
    unlink( $tmpLibFile . ".xns" );
    unlink( $tmpLibFile . ".xnt" );
  }

  # Adjust coordinates
  my $resIter = $searchResultCol->getIterator();
  while ( $resIter->hasNext() ) 
  {
    my $annot = $resIter->next();
    my $seqID = $annot->getQueryName();
    print "Adjusting $seqID\n" if ( $DEBUG );
    print $annot->toStringFormatted( SearchResult::NoAlign ) ."\n" 
        if ( $DEBUG );
    if ( $seqID =~ /(.*)_\[(\d+)-(\d+)\]/ )
    {
      my $startPos = $2;
      $annot->setQueryName( $1 );
      $annot->setQueryStart( $annot->getQueryStart() + $startPos );
      $annot->setQueryEnd( $annot->getQueryEnd() + $startPos );
      $annot->setQueryRemaining(   $queryLengths{ $annot->getQueryName() } 
                                 - $annot->getQueryEnd() );
      print $annot->toStringFormatted( SearchResult::NoAlign ) ."\n" 
          if ( $DEBUG );
    }
    else
    {
      die "Seqname $seqID is not in the valid format!\n";
    }
  }
  undef $resIter;


  #
  # New Filter
  #
  my $searchIter = $searchResultCol->getIterator();
  while ( $searchIter->hasNext() )
  {
    my $searchAnnot = $searchIter->next();
    my $searchNeighIter = $searchIter->getIterator();
    my $chainSeedScore = 0;

    next if ( defined $searchAnnot->getOverlap() &&
                $searchAnnot->getOverlap() eq "u" );

    # Determine if this annotation overlaps a previous seed.  
    if ( defined $reAlignRange->{'consensi'}->{ $searchAnnot->getSubjName() } )
    {
      foreach my $range (
         @{ $reAlignRange->{'consensi'}->{ $searchAnnot->getSubjName() } } )
          {
        # check range against this annotation -- increasing seed 
        # score if overlaps and short circuiting foreach
        if ( $searchAnnot->getOrientation() eq $range->{'orient'} &&
             getOverlapSize( $range->{'start'}, $range->{'end'},
                             $searchAnnot->getQueryStart(),
                             $searchAnnot->getQueryEnd() ) > 0 )
        {
          $chainSeedScore++;
          last;
        }
      }

      if ( $chainSeedScore > 0 ) 
      {
        $totalResultCollection->add( $searchAnnot );
        $searchAnnot->setOverlap( "u" );

        # Go left
        my $leftIter = $searchIter->getIterator();
        $leftIter->previous();
        my $count = 0;
        my $lastAnnot = $searchAnnot;
        while ( $leftIter->hasPrevious() ) 
        {
          my $leftAnnot = $leftIter->previous();
          next if ( $leftAnnot->getScore() < 300 );
          next if ( defined $leftAnnot->getOverlap() && $leftAnnot->getOverlap() eq "u" );
          last if ( $count > 10 );
          $count++;
          if ( &areCongruous( $lastAnnot, $leftAnnot ) == 1 ) 
          { 
            $totalResultCollection->add( $leftAnnot );
            $leftAnnot->setOverlap( "u" );
            $lastAnnot = $leftAnnot;
          } 
        }
 

        # Go Right
        my $rightIter = $searchIter->getIterator();
        $lastAnnot = $searchAnnot;
        $count = 0;
        while ( $rightIter->hasNext() ) 
        {
          my $rightAnnot = $rightIter->next();
          next if ( $rightAnnot->getScore() < 300 );
          next if ( defined $rightAnnot->getOverlap() && $rightAnnot->getOverlap() eq "u" );
          last if ( $count > 10 );
          $count++;
          if ( &areCongruous( $lastAnnot, $rightAnnot ) == 1 ) 
          { 
            $totalResultCollection->add( $rightAnnot );
            $rightAnnot->setOverlap( "u" );
            $lastAnnot = $rightAnnot;
          } 
        }
      }
    }
    else {
      warn "Can't find " . $searchAnnot->getSubjName() . " in consensi hash!\n";
    }
  }
}
print "\n";


##
## Reintegrate orphan annotations
##
my $orphanIter = $orphans->getIterator();
while ( $orphanIter->hasNext() ) 
{
  my $orphan = $orphanIter->next();
  # TODO: put back ID generation mechanism.
  #  $orphan->setId( $uniqID++ );
  $totalResultCollection->add( $orphan );
}


##
## Report
##
print "Results:\n" if ( $DEBUG );
print "--------\n" if ( $DEBUG );
$totalResultCollection->sort( \&byNameBeginEndrev );
my $resIter = $totalResultCollection->getIterator();
my %usedIDs = ();
my $newID = 1;
while ( $resIter->hasNext() ) 
{
  my $annot = $resIter->next();
  $annot->setOverlap( undef );
  # TODO: put back ID generation mechanism.
  #if ( $usedIDs{ $annot->getId() } )
  #{
  #  $annot->setId( $usedIDs{ $annot->getId() } );
  #}
  #else 
  #{
  #  $usedIDs{ $annot->getId() } = $newID;
  #  $annot->setId( $newID++ );
  #}
  print "".  $annot->toStringFormatted() if ( $DEBUG );
}
undef $resIter;


##
## Save results
##
print "Saving final annotations to $fastaFile.duplicons\n";
if ( $options{'align'} )
{
  $totalResultCollection->write( "$fastaFile.duplicons", 
                                 SearchResult::AlignWithQuerySeq );
}else
{
  $totalResultCollection->write( "$fastaFile.duplicons", 
                                 SearchResult::NoAlign );
}
if ( $options{'gff'} )
{
  print "Saving final annotations in GFF format to $fastaFile.duplicons.gff\n";
  &exportGFF( $totalResultCollection, $fastaFile );
}


##
## Goodbye
##
exit;


######################## S U B R O U T I N E S ############################

sub exportGFF 
{
  my $resultsColl = shift;
  my $fileName = shift;

  open OUTGFF, ">$fileName.duplicons.gff" || 
        die "exportGFF(): Could not open up $fileName.duplicons.gff for"
          . "writing!\n";
  my $resIter = $resultsColl->getIterator();
  print OUTGFF "##gff-version 2\n";
  printf OUTGFF "##date %4d-%02d-%02d\n", ( localtime )[ 5 ] + 1900,
      ( localtime )[ 4 ] + 1, ( localtime )[ 3 ];    # date as 1999-09-24...
  ( my $seqname = $fileName ) =~ s/^.*\///;
  print OUTGFF "##sequence-region $seqname\n";
  while ( $resIter->hasNext() ) 
  {
    my $currentAnnot = $resIter->next();
    print OUTGFF ""
            . $currentAnnot->getQueryName()
            . "\tDupMasker\tsimilarity\t"
            . $currentAnnot->getQueryStart() . "\t"
            . $currentAnnot->getQueryEnd() . "\t"
            . $currentAnnot->getScore() . "\t"
            . ( $currentAnnot->getOrientation() eq 'C' ? '-' : '+' ) . "\t.\t"
            . "Target \"Motif:"
            . $currentAnnot->getSubjName() . "\" "
            . $currentAnnot->getSubjStart() . " "
            . $currentAnnot->getSubjEnd() . "\n";
  }
  undef $resIter;
  close OUT;
}

sub areCongruous 
{
  my $annot1 = shift;
  my $annot2 = shift;


  # Are they in the same contig sequence?
  if ( $annot1->getQueryName() eq $annot2->getQueryName() )
  {
    # Are they in the same contig sequence?
    if ( $annot1->getSubjName() eq $annot2->getSubjName() )
    {
      # Are they in the same orientation?
      if ( $annot1->getOrientation() eq $annot2->getOrientation() ) 
      {
        # Are their consensus positions reasonable?
        my $subjGapSize = $annot2->getSubjStart() - $annot1->getSubjEnd();
        $subjGapSize = $annot1->getSubjStart() - $annot2->getSubjEnd() 
               if ( $annot1->getOrientation() eq "C" );
        my $queryGapSize = $annot2->getQueryStart() - $annot1->getQueryEnd();
  
        my $gapAdjusted = $subjGapSize - $queryGapSize;
        #if ( $gapSize > -300 && $gapSize < 7000 ) 
        #if ( $gapSize > -7000 && $gapSize < 7000 ) 
        if ( $gapAdjusted > -7000 && $gapAdjusted < 7000 ) 
        {
          return( 1 ); 
        }
      }
    }
  }
  return 0;
}


sub getOverlapSize
{
  my $range1Begin = shift;
  my $range1End = shift;
  my $range2Begin = shift;
  my $range2End = shift;

  my $overlap = 0;
  if (    $range1Begin >= $range2Begin
       && $range1Begin <= $range2End   )
  {
    #      -------
    #   ------
    # or
    #     -----
    #   --------
    if ( $range1End <= $range2End )
    {
      #     -----
      #   --------
      $overlap = $range1End - $range1Begin + 1;
    }else
    {
      #      -------
      #   ------
      $overlap = $range2End - $range1Begin + 1;
    }
  }elsif (    $range1End >= $range2Begin
           && $range1End <= $range2End   )
  {
    #   -------
    #      ------
    # or
    #   --------
    #    -----
    if ( $range1End >= $range2End )
    {
      #   --------
      #    -----
      $overlap = $range2End - $range2Begin + 1;
    }else
    {
      #   -------
      #      ------
      $overlap = $range1End - $range2Begin + 1;
    }
  }
  return $overlap;
}

sub byNameBeginEndrev ($$) {
  ( $_[0]->getQueryName() ) cmp( $_[1]->getQueryName() )
      || ( $_[0]->getQueryStart() ) <=> ( $_[1]->getQueryStart() )
      || ( $_[1]->getQueryEnd() ) <=> ( $_[0]->getQueryEnd() );
}


##-------------------------------------------------------------------------##
## Use:  my ( $seq_cnt, $totalSeqLen, $nonMaskedSeqLen, $totGCLevel,
##             $totBPMasked ) =
##                    &maskSequence ( $seqDB, $annotationFile,
##                                    $outputFile );
##  Returns
##
##     $seq_cnt:          The number of sequences in the FASTA file.
##     $totalSeqLen:      The absoulte length of all sequences combined.
##     $nonMaskedSeqLen:  Length of sequence (excluding runs of >20 N's
##                         and X's) of the pre-masked sequence.
##     $totGCLevel:       The GC content of the original sequence.
##     $totBPMasked:      The total bp we masked
##
##-------------------------------------------------------------------------##
sub maskSequence {
  my %parameters = @_;

  my $maskFormat     = $parameters{'maskFormat'} || "";
  my $seqDB          = $parameters{'seqDB'};
  my $annotationFile = $parameters{'annotationFile'};
  my $outputFile     = $parameters{'outputFile'};
  my $minDivergence  = $parameters{'minDivergence'};
  my $maxDivergence  = $parameters{'maxDivergence'};


  my %annots = ();

  #
  # Open up a search results object
  #
  my $searchResults =
      CrossmatchSearchEngine::parseOutput( searchOutput => $annotationFile );

  #
  # Read in annotations and throw away the rest
  #
  my $prevResult;
  for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) {
    my $result = $searchResults->get( $i );
    my $start  = $result->getQueryStart();
    my $end    = $result->getQueryEnd();
    if (    defined $prevResult
         && $prevResult->getQueryName() eq $result->getQueryName()
         && $prevResult->getQueryEnd() >= $start )
    {
      next if ( $prevResult->getQueryEnd() >= $end );
      $start = $prevResult->getQueryEnd() + 1;
    }
    next if ( defined $minDivergence && $result->getPctDiverge() < $minDivergence );
    next if ( defined $maxDivergence && $result->getPctDiverge() > $maxDivergence );
    push @{ $annots{ $result->getQueryName() } },
        {
          'begin' => $start,
          'end'   => $end
        };
    $prevResult = $result;
  }
  undef $searchResults;

  my @seqIDs     = $seqDB->getIDs();
  my $seq_cnt    = scalar( @seqIDs );
  my $sublength  = $seqDB->getSubtLength();
  my $totGCLevel = 0;
  if ( $sublength > 0 )
  {
    $totGCLevel = 100 * $seqDB->getGCLength() / $sublength;
  }
 
  $totGCLevel = sprintf "%4.2f", $totGCLevel;
  my $totalSeqLen     = 0;
  my $totBPMasked     = 0;
  my $nonMaskedSeqLen = 0;
  my $workseq         = "";
  open OUTFILE, ">$outputFile";

  foreach my $seqID ( @seqIDs ) {
    my $seq = $seqDB->getSequence( $seqID );
    $totalSeqLen += length $seq;
    $workseq = $seq;
    $nonMaskedSeqLen += length $workseq;
    while ( $workseq =~ /([X,N]{20,})/ig )
    {
      $nonMaskedSeqLen -= length( $1 );
    }
    foreach my $posRec ( @{ $annots{$seqID} } ) {
      my $beginPos = $posRec->{'begin'};
      my $endPos   = $posRec->{'end'};
      my $repLen   = $endPos - $beginPos + 1;
      substr( $workseq, $beginPos - 1, $repLen ) = "0" x ( $repLen );
      if ( $maskFormat eq 'xsmall' ) {
        substr( $seq, $beginPos - 1, $repLen ) =
            lc( substr( $seq, $beginPos - 1, $repLen ) );
      }
      elsif ( $maskFormat eq 'x' ) {
        substr( $seq, $beginPos - 1, $repLen ) = "X" x ( $repLen );
      }
      else {
        substr( $seq, $beginPos - 1, $repLen ) = "N" x ( $repLen );
      }
      $totBPMasked += $repLen;
    }
    print OUTFILE ">" . $seqID;
    my $desc = $seqDB->getDescription( $seqID );
    if ( $desc ne "" ) {
      print OUTFILE " " . $desc;
    }
    print OUTFILE "\n";
    $seq =~ s/(\S{50})/$1\n/g;
    $seq .= "\n"
        unless ( $seq =~ /.*\n+$/s );
    print OUTFILE $seq;
  }
  close OUTFILE;

  return ( $seq_cnt, $totalSeqLen, $nonMaskedSeqLen, $totGCLevel,
           $totBPMasked );
}

1;