#!/usr/bin/env perl

=head1 NAME

validateGapAssembly.pl

=head1 SYNOPSIS

  validateGapAssembly.pl [options] <leftAnchorFastaFile> <rightAnchorFastaFile>
    <readinfo.txt> <libinfo.txt> <sffinfo.txt> <contigs.fasta> <contigs.qual>
    <outputFile (e.g., validinfo.txt)>
    
  Options:
  -gsize <number>             Gap size (required)
  -gsizeStd <number>          Gap size standard deviation (required)
  -aligner <name>             Path and name of aligner to use for aligning anchors to reference (required)
  -alignerParams <params>     Aligner parameters (required)
  -formatdb <name>            Path and name of formatdb (required)
  -pctId <number>             Minimum percent identity for aligning anchors to reference (optional; default=95)
  -alignLen <number>          Minimum alignment length for aligning anchors to reference (optional; default=40)
  -pctValidReads <number>     Minimum percent of valid read pairs (optional; default=90)
  -minQual <number>           Minimum avg consensus quality between anchors (optional; default=30)
  -insertSizeStdMult <number> Library insert size standard deviation multiplier (optional; default=1)
  -debug                      Prints additional information in output file (optional)
  -h                          Detailed help message (optional)

=head1 DESCRIPTION

This software component is part of the Gap Resolution sub system that is
responsible for validating a sub project for closure after it has been
reassembled. The following validation are performed:

Unless otherwise noted, anchor here refers to the anchor sequence obtained
from the left and right contigs of the gap prior to reassembly.  This is done
upstream using idContigRepeats.pl.

1. Validate anchor distance.
   The left and right anchors are aligned (using -aligner <aligner>) to the
   contigs of the assembly. If the anchors reside on the same contig and the
   distance is within the gap size (-gsize option) +/- standard deviation (-gsizeStd
   option), the anchor distance is considered to be valid. Otherwise, the anchor
   distance is invalid.  Alignments of the anchors to the assembly contigs are
   filtered by percent identity (-pctId option) and the alignment length (-alignLen
   option).

2. Validate read pairing.
   For each read pair, determine the library insert size by mapping the read
   to the it's corresponding sff file (via 454's sffinfo script) and determining
   the library insert sizes and standard deviations x a multiplier
   (-insertSizeStdMult option) using the sffinfo.txt and libinfo.txt files. Read
   pairs are considered valid only if they meet all of the following criteria:
   a) the read pairs are located on the same contig and their distance are
   within the library insert size +/- standard deviation * a multiplier
   (-insertSizeStdMult option), b) the orientation of the reads point towards
   each other, then that read pair is deemed valid, and c) the percent of the
   valid read pairs to invalid read pairs is >= 90% (configurable using
   -pctValidReads option).

3. Validate consensus quality.
   If the anchors reside on the same contig of the assembly, the average quality
   between the anchors is determined and must be >= 30 (configurable using
   -minQual option) to be considered valid. Otherwise, the consensus quality is
   invalid.

4. If validation fails and the anchors are on different contigs, set
   doPrimerDesign=1 for designing primers.

The specified output file contains information pertaining to the validation
in a key/value pair.  The following entries are reported:

  leftAnchorContig=name of contig
  leftAnchorContigLength=number
  leftAnchorStart=number
  leftAnchorEnd=number
  rightAnchorContig=name of contig
  rightAnchorContigLength=number
  rightAnchorStart=number
  rightAnchorEnd=number
  anchorStart=number
  anchorEnd=number
  anchorDistance=number
  gapSize=number
  gapSizeStdDev=number
  numConsistentReadPairs=number
  numInconsistentReadPairs=number
  pctConsistent=number
  avgConsensusQualityBetweenAnchors=number
  isDistanceValid=0|1
  isReadPairingValid=0|1
  isQualityValid=0|1
  status=PASS|FAIL
  doPrimerDesign=0|1
  Comment=comment entry

The Status is defined as SUCCESSFUL if all three validations passed.  Otherwise,
it is reported as FAILED.

Description of input files:

  * leftAnchorFastaFile - fasta file containing the sequence of the left anchor
  * rightAnchorFastaFile - fasta file containing the sequence of the right anchor
  * readinfo.txt - file containing read pairing information of the assembly.
    This file is generated by newblerAce2ReadPair.pl. For more information on
    the format of the file, refer to newblerAce2ReadPair.pl's documentation.
  * libinfo.txt - file containing library insert size and standard deviation.
    This file is generated by parseNewblerMetrics.pl.  For more information on
    the format of the file, refer to parseNewblerMetrics.pl's documentation.
  * sffinfo.txt - file containing the path of the sff file, it's corresponding
    library and the type of the sff file. This file is generated by
    parseNewblerMetrics.pl. For more information on the format of the file,
    refer to parseNewblerMetrics.pl's documentation.
  * contigs.fasta - fasta file containing the contigs of the assembly.
  * contigs.qual - qual file of the corresponding contigs.fasta.
  * outputFile - name of the output file containing the information of the validation.

=head1 VERSION

$Revision$

$Date$

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

S.Trong 2008/12/05 creation

=item *

S.Trong 2009/03/02 - added library insert size checking for phrap based reads.

=back

=cut

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use Carp;
use Carp qw(cluck);
use Cwd;
use Cwd qw(abs_path);
use File::Path;
use File::Basename;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::GapResolution::AlignAnchorSeqs;
use PGF::Utilities::Properties;
use PGF::Parsers::FastaParse;
use PGF::GapResolution::SffReads;
use PGF::GapResolution::ParseSffInfo;
use PGF::GapResolution::ParseLibInfo;
use PGF::GapResolution::ReadName;
use vars qw( $optHelp $optGapSize $optGapSizeStdDev $optDebug
    $optMinAnchorSeqPctIdentity $optMinAnchorSeqAlignmentLength
    $optPctReadPairConsistency $optMinAvgConsensusQualityBetweenAnchors
    $optFormatdb $optAligner $optAlignerParams $optLibInsertSizeStdDevMultiplier);

#============================================================================#
# INPUT VALIDATION
#============================================================================#
if( !GetOptions(
        "gsize=f"=>\$optGapSize,
        "gsizeStd=f"=>\$optGapSizeStdDev,
        "pctId=n"=>\$optMinAnchorSeqPctIdentity,
        "alignLen=n"=>\$optMinAnchorSeqAlignmentLength,
        "pctValidReads=n"=>\$optPctReadPairConsistency,
        "minQual=n"=>\$optMinAvgConsensusQualityBetweenAnchors,
        "insertSizeStdMult=f"=>\$optLibInsertSizeStdDevMultiplier,
        "formatdb=s"=>\$optFormatdb,
        "aligner=s"=>\$optAligner,
        "alignerParams=s"=>\$optAlignerParams,
        "debug"=>\$optDebug,
        "h"=>\$optHelp,
    )
) {
    usage();
}

usage(2) if $optHelp;

if ( @ARGV != 8 ) {
    print STDERR "Some arguments are missing.\n\n";
    usage();
}

#============================================================================#
# INITIALIZE VARIABLES
#============================================================================#
my $leftAnchorSeqFile = $ARGV[0];
my $rightAnchorSeqFile = $ARGV[1];
my $readInfoFile = $ARGV[2];
my $libInfoFile = $ARGV[3];
my $sffInfoFile = $ARGV[4];
my $contigsFasta = $ARGV[5];
my $contigsQual = $ARGV[6];
my $outputFile = $ARGV[7];

my $DEBUG = $optDebug ? $optDebug : 0;
my $blastOutputExt = '.blast.out';
my $leftAnchorSeqBlastOutFile = $leftAnchorSeqFile.$blastOutputExt;
my $rightAnchorSeqBlastOutFile = $rightAnchorSeqFile.$blastOutputExt;

if ( defined $ENV{SFFFILE_PATH} ) {
    if ( ! -x $ENV{SFFFILE_PATH} ) {
        print STDERR "$ENV{SFFFILE_PATH} is not found or not executable\n";
        usage();
    }
}
else {
    print STDERR "ERROR: the SFFFILE_PATH environmental variable specifying the location of the sffinfo executable is not defined!\n";
    usage();
}

# Set default params.
#
$optMinAvgConsensusQualityBetweenAnchors = 30 if !$optMinAvgConsensusQualityBetweenAnchors;
$optMinAnchorSeqPctIdentity = 95 if !$optMinAnchorSeqPctIdentity;
$optMinAnchorSeqAlignmentLength = 40 if !$optMinAnchorSeqAlignmentLength;
$optPctReadPairConsistency = 90 if !$optPctReadPairConsistency;
$optLibInsertSizeStdDevMultiplier = 1 if !$optLibInsertSizeStdDevMultiplier;

#============================================================================#
# VALIDATE INPUTS
#============================================================================#

if ( !$optFormatdb ) {
    print STDERR "The -formatdb option must be specified.\n";
    exit 1;
}

if ( !$optAligner ) {
    print STDERR "The -aligner option must be specified.\n";
    exit 1;
}

if ( !$optAlignerParams ) {
    print STDERR "The -alignerParams option must be specified.\n";
    exit 1;
}

if ( !$optFormatdb ) {
    print STDERR "The -formatdb option must be specified.\n";
    exit 1;
}

if ( !-s $leftAnchorSeqFile ) {
    print STDERR "The left anchor seq file $leftAnchorSeqFile is either not found or is zero size.\n";
    exit 1;
}

if ( !-s $rightAnchorSeqFile ) {
    print STDERR "The right anchor seq file $rightAnchorSeqFile is either not found or is zero size.\n";
    exit 1;
}

if ( !-s $readInfoFile ) {
    print STDERR "The read info file $readInfoFile is either not found or is zero size.\n";
    exit 1;
}

if ( !-s $libInfoFile ) {
    print STDERR "The library info file $libInfoFile is either not found or is zero size.\n";
    exit 1;
}

if ( !-s $sffInfoFile ) {
    print STDERR "The sff info file $sffInfoFile is either not found or is zero size.\n";
    exit 1;
}

if ( !-s $contigsFasta ) {
    print STDERR "The contigs fasta file $contigsFasta is either not found or is zero size.\n";
    exit 1;
}

if ( !$optGapSize ) {
    print STDERR "The -gs <gapSize> option is required.\n\n";
    usage();
}

if ( !$optGapSizeStdDev ) {
    print STDERR "The -gsd <gapSizeStandardDev> option is required.\n\n";
    usage();
}

#============================================================================#
# MAIN
#============================================================================#

# Align anchor sequences to the assembled contigs.
#
my $objAligner = PGF::GapResolution::AlignAnchorSeqs->new();

# Set alignment filtering conditions.
#
$objAligner->setFormatdb($optFormatdb);
$objAligner->setAligner($optAligner);
$objAligner->setAlignerParams($optAlignerParams);
$objAligner->setMinPctIdentity($optMinAnchorSeqPctIdentity);
$objAligner->setMinAlignmentLength($optMinAnchorSeqAlignmentLength);

# Perform alignment.
#
$objAligner->alignAnchorSeqs($contigsFasta, $leftAnchorSeqFile, $rightAnchorSeqFile,
    $leftAnchorSeqBlastOutFile, $rightAnchorSeqBlastOutFile);

if ( !-s $leftAnchorSeqBlastOutFile && !-s $rightAnchorSeqBlastOutFile ) {
    confess "ERROR: The blast files $leftAnchorSeqBlastOutFile and/or $rightAnchorSeqBlastOutFile does not exist or is zero size.\n";
}

# Parse alignment files.
#
my ($leftAnchorContig, $leftAnchorStart, $leftAnchorEnd) =
    $objAligner->parseAlignmentFile($leftAnchorSeqBlastOutFile);
my ($rightAnchorContig, $rightAnchorStart, $rightAnchorEnd) =
    $objAligner->parseAlignmentFile($rightAnchorSeqBlastOutFile);
my ($leftAnchorContigLength, $rightAnchorContigLength) = getContigLengths(
    $contigsFasta, $leftAnchorContig, $rightAnchorContig);
    
open OFILE, ">$outputFile" or confess "ERROR: failed to create output file $outputFile\n";

print OFILE "leftAnchorContig=$leftAnchorContig\n";
print OFILE "leftAnchorContigLength=$leftAnchorContigLength\n";
print OFILE "leftAnchorStart=$leftAnchorStart\n";
print OFILE "leftAnchorEnd=$leftAnchorEnd\n";
print OFILE "rightAnchorContig=$rightAnchorContig\n";
print OFILE "rightAnchorContigLength=$rightAnchorContigLength\n";
print OFILE "rightAnchorStart=$rightAnchorStart\n";
print OFILE "rightAnchorEnd=$rightAnchorEnd\n";

# If both anchor sequences align to the same reference contig, evaluate
# distance of anchors, read pair consistency and consensus quality.
#
my $isDistanceValid = 0;
my $isReadParingValid = 0;
my $isQualityValid = 0;
my $doPrimerDesign = 0;
my $status = '';
my $statusComment = '';

# Get anchor start and end accounting for reverse complementation.
#
my $anchorStart = 0;
my $anchorEnd = 0;

if ( $leftAnchorContig eq $rightAnchorContig ) {
    ($anchorStart, $anchorEnd) = getPosOfAnchors($leftAnchorStart,
        $leftAnchorEnd, $rightAnchorStart, $rightAnchorEnd);
}
        

print OFILE "anchorStart=$anchorStart\n";
print OFILE "anchorEnd=$anchorEnd\n";

# If both anchors aligned to one or more contigs, check validation criteria.
#
if ( length $leftAnchorContig && length $rightAnchorContig ) {
    # Check if anchors are on the same contig...
    #
    if ( $leftAnchorContig eq $rightAnchorContig ) {
        # If anchors aligned properly, then check other validation conditions.
        #
        if ( $anchorStart && $anchorEnd ) {
            # validate that the anchors are within gapsize +/- stdDev.
            #
            $isDistanceValid = validateAnchorDistance($anchorStart, $anchorEnd,
                $optGapSize, $optGapSizeStdDev, *OFILE);
            
            # validate that the read pairing, using the following criteria:
            # 1. read pairing orientation points towards each other
            # 2. read pairing insert sizes are valid
            #
            $isReadParingValid = validateReadPairing($optPctReadPairConsistency,
                $optLibInsertSizeStdDevMultiplier, $readInfoFile, $leftAnchorContig,
                $libInfoFile, $sffInfoFile, *OFILE);
            
            # validate that the avg quality of consensus sequence between anchors meet
            # threshold.
            #
            $isQualityValid = validateConsensusQuality($optMinAvgConsensusQualityBetweenAnchors,
                $contigsQual, $leftAnchorContig, $anchorStart, $anchorEnd, *OFILE);
            
            $statusComment .= "invalid anchor distance, " if !$isDistanceValid;
            $statusComment .= "invalid read pairing, " if !$isReadParingValid;
            $statusComment .= "invalid consensus quality, " if !$isQualityValid;
            
            # If anchors aligned improperly, report and fail validation.
            #
        } else {
            $statusComment .= "invalid anchor orientation on contig";
        }
    # If anchors aligned on different contigs, set flag to design primers.
    #
    } else {
        $doPrimerDesign = 1;
        $statusComment .= "anchors are on different contigs, ";
    }
    
# If one or more anchors failed to align to the contigs, report and fail
# validation.
#
} else {
    $statusComment .= "left anchor failed to align to contig, "
        if !length $leftAnchorContig;
    $statusComment .= "right anchor failed to align to contig, "
        if !length $rightAnchorContig;
}

# Set validation status.
#
if ( $isDistanceValid && $isReadParingValid && $isQualityValid ) {
    $status = "PASS";
} else {
    $status = "FAIL";
}

$statusComment =~ s/,\s+$//;

print OFILE
    "isDistanceValid=$isDistanceValid\n".
    "isReadPairingValid=$isReadParingValid\n".
    "isQualityValid=$isQualityValid\n".
    "status=$status\n".
    "doPrimerDesign=$doPrimerDesign\n".
    "comment=$statusComment\n";

close OFILE;

exit 0;
    
#============================================================================#
# SUBROUTINES
#============================================================================#
sub getPosOfAnchors {
    
    my $leftAnchorStart = shift;
    my $leftAnchorEnd = shift;
    my $rightAnchorStart = shift;
    my $rightAnchorEnd = shift;
    
    my $anchorStart = 0;
    my $anchorEnd = 0;
    
    # If anchors are aligned to the same contig in this orientation
    # left anchor -->   --> right anchor, where --> denotes start to end
    # then set left anchor start as anchor start and right anchor end as
    # anchor end.
    #
    if ( $leftAnchorStart < $leftAnchorEnd &&
         $rightAnchorStart < $rightAnchorEnd &&
         $leftAnchorEnd < $rightAnchorStart ) {
        $anchorStart = $leftAnchorStart;
        $anchorEnd = $rightAnchorEnd;
        
    # If anchors are aligned to the same contig in this orientation
    # right anchor <--   <-- left anchor, where <-- denotes end to start
    # then set left anchor start as anchor start and right anchor end as
    # anchor end.
    #
    } elsif ( $leftAnchorEnd < $leftAnchorStart &&
              $rightAnchorEnd < $rightAnchorStart &&
              $rightAnchorStart < $leftAnchorEnd ) {
        $anchorStart = $rightAnchorEnd;
        $anchorEnd = $leftAnchorStart;
    }
    
    return $anchorStart, $anchorEnd;
    
}

#============================================================================#
sub getContigLengths {
    
    my $fastaFile = shift;
    my @fastaTags = @_;
    
    my $objFastaParser = new PGF::Parsers::FastaParse($fastaFile);
    my %seqLengths = ();
    my @returnSeqLengths = ();
    my $found = 0;
    
    while ($objFastaParser->MoreEntries) {
        $objFastaParser->ReadNextEntry( -rawFormat=>0 );
        my $fastaDefName = $objFastaParser->Name;
        foreach my $tag (@fastaTags) {
            next if $tag ne $fastaDefName;
            $seqLengths{$tag} = $objFastaParser->Length;
            $found++;
        }
        last if $found eq scalar(@fastaTags);
    }
    
    foreach my $tag (@fastaTags) {
        push @returnSeqLengths, $seqLengths{$tag};
    }
    
    return @returnSeqLengths;
    
}

#============================================================================#
sub validateAnchorDistance {
    
    my $anchorStart = shift;
    my $anchorEnd = shift;
    my $gapSize = shift;
    my $gapSizeStdDev = shift;
    my $fh = shift;
    
    my $isValid = 0;
    my $anchorLength = $anchorEnd - $anchorStart + 1;
    
    if ( $anchorLength >= $gapSize - $gapSizeStdDev &&
         $anchorLength <= $gapSize + $gapSizeStdDev ) {
        $isValid = 1;
    }
    
    print $fh "anchorDistance=$anchorLength\n";
    print $fh "gapSize=$gapSize\n";
    print $fh "gapSizeStdDev=$gapSizeStdDev\n";
    
    return $isValid;
    
}
    
#============================================================================#
sub validateReadPairing {
    
    my $validPctReadPairConsistency = shift;
    my $libInsertSizeStdDevMult = shift;
    my $readInfoFile = shift;
    my $refContig = shift;
    my $libInfoFile = shift;
    my $sffInfoFile = shift;
    my $fh = shift;
    
    my $sffinfoCmd = $ENV{SFFFILE_PATH};
    my $happyReadPairCount = 0;
    my $unhappyReadPairCount = 0;
    my $isValid = 0;
    
    # Parse the sffinfo.txt file and store in object.
    #
    my $objParseSffInfo = PGF::GapResolution::ParseSffInfo->new($sffInfoFile);
    my @sffFiles = $objParseSffInfo->getSffFiles();
    
    # This object is used to map read names to it's corresponding sff file
    # by using 454's sffinfo command.  This will be used to map the read
    # to it's library using the read's sff file mapping, the lookup of the
    # sff file to it's library in the sffinfo.txt file.
    #
    my $objSffReads = PGF::GapResolution::SffReads->new();
    $objSffReads->set454SffInfoCmd($sffinfoCmd);
    foreach my $sffFile (@sffFiles) {
        $objSffReads->addSffFile($sffFile);
    }
    
    # Get all library insert sizes parsed from lib info file.  Hash contains
    # key=libraryName, value=[minInsertSize, maxInsertSize]
    #
    my %libInsertSizes = getLibraryInsertSizes($libInfoFile, $libInsertSizeStdDevMult);

    if ($DEBUG) {
        print $fh map "LIB:$_, min:$libInsertSizes{$_}[0], max:$libInsertSizes{$_}[1]\n", keys %libInsertSizes;
    }
    
    unless ( open IFILE, $readInfoFile ) {
        confess "ERROR: failed to open read info file $readInfoFile: $!\n";
    }
    
    print $fh "refContig=$refContig\n" if $DEBUG;
    
    while (my $entry = <IFILE>) {
        chomp $entry;
        next if !length $entry;
        next if $entry =~ /^#/;
        
        my @values = split /\t/, $entry;
        next if @values < 13;
        
        my $readName = $values[0];
        my $readStart = $values[1];
        my $readEnd = $values[2];
        my $readOrientation = $values[3];
        my $readContig = $values[4];
        my $readPairingType = $values[6];
        my $mateName = $values[7];
        my $mateStart = $values[8];
        my $mateEnd= $values[9];
        my $mateOrientation = $values[10];
        my $mateContig = $values[11];
        my $sffName = getSffNameFromRead($readName);
        
        # skip if read's contig is not in the same contig as the reference.
        next if $readContig ne $refContig;
        
        # skip if read pairing is not of type 'P' (pair)
        next if $readPairingType !~ /[PM]/;
        
        if ( $DEBUG ) {
            print $fh "READ: readName:$readName, contig:$readContig, dir: $readOrientation, start:$readStart, end:$readEnd\n";
            print $fh "      mateName:$mateName, contig:$mateContig, dir: $mateOrientation, start:$mateStart, end:$mateEnd\n";
            print $fh "      pairType:$readPairingType, sffName:$sffName\n";
            print $fh "      numHappyReads:$happyReadPairCount, numUnhappyReads:$unhappyReadPairCount\n";
        }
        
        # Add to unhappy pair count, then skip if read pairing type
        # is 'M' (multiply placed)
        if ( $readPairingType eq 'M' ) {
            $unhappyReadPairCount++;
            print $fh "      comment: read pair is multiply placed ... invalid.\n" if $DEBUG;
            next;
        }
        
        # skip if sff name cannot be determined from read name.
        if ( !length $sffName ) {
            print $fh "      comment: sffName is could not be determined ... skipping.\n" if $DEBUG;
            next;
        }
        
        # if read and mate are on different contigs, add to unhappy pair count
        # and skip.
        if ( $readContig ne $mateContig ) {
            $unhappyReadPairCount++;
            print $fh "      comment: read pairs not on same contig ... invalid.\n" if $DEBUG;
            next;
        }
        
        my $checkLibraryInsertSize = 0;
        my $readStartForLibCheck = 0;
        my $readEndForLibCheck = 0;
        
        # read -->  <-- mate
        if ( $readEnd < $mateStart ) {
            if ( $readOrientation eq '+' && $mateOrientation eq '-' ) {
                $readStartForLibCheck = $readStart;
                $readEndForLibCheck = $mateEnd;
                $checkLibraryInsertSize = 1;
            } else {
                $unhappyReadPairCount++;
                print $fh "      comment: wrong orientation ... invalid.\n" if $DEBUG;
            }
            
        # mate -->  <-- read
        } elsif ( $mateEnd < $readStart ) {
            if ( $mateOrientation eq '+' && $readOrientation eq '-' ) {
                $readStartForLibCheck = $mateStart;
                $readEndForLibCheck = $readEnd;
                $checkLibraryInsertSize = 1;
            } else {
                $unhappyReadPairCount++;
                print $fh "      comment: wrong orientation ... invalid.\n" if $DEBUG;
            }
            
        # read -->
        #       <-- mate
        } elsif ( $readStart < $mateEnd && $readEnd >= $mateStart ) {
            $unhappyReadPairCount++;
            print $fh "      comment: read pairs overlap ... invalid.\n" if $DEBUG;
            
        # mate -->
        #       <-- read
        } elsif ( $mateStart < $readEnd && $mateEnd >= $$readStart ) {
            $unhappyReadPairCount++;
            print $fh "      comment: read pairs overlap ... invalid.\n" if $DEBUG;
        }
        
        # If pairing seems valid, check library insert size.
        #
        if ( $checkLibraryInsertSize ) {
            # check if read name is phrap based, if so library checking routine
            # is different than if read is from newbler.
            #
            my $objReadName = PGF::GapResolution::ReadName->new($readName);
            my $isInsertSizeValid = 0;
            if(  $objReadName->isPhrapRead() ) {
                my $library= $objReadName->getLibrary();
                $isInsertSizeValid = checkLibraryInsertSize($library, $readStartForLibCheck,
                    $readEndForLibCheck, \%libInsertSizes, $fh);
            } else {
                $isInsertSizeValid = isNewblerInsertSizeValid($objSffReads,
                    $objParseSffInfo, $sffName, $readStartForLibCheck, $readEndForLibCheck,
                    \%libInsertSizes, $fh);
            }
            if ( $isInsertSizeValid ) {
                $happyReadPairCount++;
            } else {
                $unhappyReadPairCount++;
                print $fh "      comment: wrong insert size ... invalid.\n" if $DEBUG;
            }
        }
        
    }
    close IFILE;
    
    # Check if percent of valid read pairing consistency meets threshold.
    #
    my $totalReadPairCount = $happyReadPairCount + $unhappyReadPairCount;
    my $percentReadPairConsistency = $totalReadPairCount ?
        sprintf("%.2f",$happyReadPairCount/$totalReadPairCount*100) : 0.00;
    
    $isValid = 1 if $percentReadPairConsistency >=
        $validPctReadPairConsistency;
    
    print $fh "numConsistentReadPairs=$happyReadPairCount\n";
    print $fh "numInconsistentReadPairs=$unhappyReadPairCount\n";
    print $fh "pctConsistent=$percentReadPairConsistency\n";
    
    return $isValid;
    
}

#============================================================================#
sub isNewblerInsertSizeValid {
    
    my $objSffReads = shift;
    my $objParseSffInfo = shift;
    my $sffName = shift;
    my $start = shift;
    my $end = shift;
    my $hrefLibInsertSizes = shift;
    my $fh = shift;
    
    # To get the library of the read, we must first map the read to the
    # sff file by using 454's sffinfo program to check if the read is in
    # the sff file.  Once we map the read to the sff file, we look up
    # the library name in the sffinfo.txt file based on the sff file name.
    # With the library name, we can use the libinfo.txt file to get the
    # library insert size and standard deviation.
    #
    my $sffFile = $objSffReads->getSffFile($sffName);
    my $library = $objParseSffInfo->getLibrary($sffFile);
    
    print $fh "      sffFile=$sffFile\n" if $DEBUG;
    print $fh "      library=$library\n" if $DEBUG;
    
    my $isValid = checkLibraryInsertSize($library, $start,
        $end, $hrefLibInsertSizes, $fh);

    return $isValid;
    
}

#============================================================================#
sub checkLibraryInsertSize {
    
    my $library = shift;
    my $start = shift;
    my $end = shift;
    my $hrefLibInsertSizes = shift;
    my $fh = shift;
    
    my $isValid = 0;
    my $readPairLength = $end - $start + 1;
    
    print $fh "      checking insert size ...\n" if $DEBUG;
    
    if ( length $library && $$hrefLibInsertSizes{$library} ) {
        my $minInsertSize = $$hrefLibInsertSizes{$library}->[0];
        my $maxInsertSize = $$hrefLibInsertSizes{$library}->[1];
        my $anchorDistance = $end - $start + 1;
        
        if ( $anchorDistance >= $minInsertSize && $anchorDistance <= $maxInsertSize ) {
            $isValid = 1;
        }
        
        if ($DEBUG) {
            print $fh "      library=$library, start:$start, end:$end, length:",$end-$start+1,"\n";
            print $fh "      minInsertSize:$minInsertSize, maxInsertSize:$maxInsertSize\n";
            print $fh "      isValid=$isValid\n";
        }
    } else {
        print $fh "      library name was not found in hash table.\n" if $DEBUG;
    }
    
    return $isValid;
    
}

#============================================================================#
sub validateConsensusQuality {
    
    my $minConsensusQual = shift;
    my $qualityFile = shift;
    my $fastaTag = shift;
    my $start = shift;
    my $end = shift;
    my $fh = shift;
    
    my $isValid = 0;
    my $qAvg = 0;
    my $objFastaParser = new PGF::Parsers::FastaParse($qualityFile);
    
    while ($objFastaParser->MoreEntries) {
        $objFastaParser->ReadNextEntry( -rawFormat=>0 );
        next if ( $objFastaParser->Name ne $fastaTag );
        my $qual = $objFastaParser->Seq;
        my $sumQval = 0;
        my $numQval = 0;
        my $pos = 0;
        while ($qual =~ /(\d+)\s*/g) {
            $pos++;
            last if $pos > $end;
            if ( $pos >= $start ) {
                $sumQval += $1;
                $numQval++;
            }
        }
        $qAvg = $numQval > 0 ? sprintf("%.2f", $sumQval/$numQval) : 0;
        last;
    }    
    
    print $fh "avgConsensusQualityBetweenAnchors=$qAvg\n";
    
    $isValid = 1 if $qAvg >= $minConsensusQual;
    
    return $isValid;
    
}

#============================================================================#
sub getSffNameFromRead {
    
    my $readName = shift;
    
    # pertains to 454 read names only!
    #
    my $sffName = $readName;
    $sffName =~ s/[\_\.].+$//;
    
    return $sffName;
    
}

#============================================================================#
sub getLibraryInsertSizes {
    
    my $libInfoFile = shift;
    my $libInsertSizeStdDevMultiplier = shift;
    
    my %libraryInsertSizes = ();
    my $objParseLibInfo = PGF::GapResolution::ParseLibInfo->new($libInfoFile);
    
    my @libraries = $objParseLibInfo->getLibraries();
    
    foreach my $library (@libraries) {
        my $insertSize = $objParseLibInfo->getInsertSize($library);
        my $stdDev = $objParseLibInfo->getStdDev($library);
        my $minLibInsertSize = $insertSize - $stdDev *
                $libInsertSizeStdDevMultiplier;
           $minLibInsertSize = 0 if $minLibInsertSize < 0;
        my $maxLibInsertSize = $insertSize + $stdDev *
            $libInsertSizeStdDevMultiplier;
        $libraryInsertSizes{$library} = [$minLibInsertSize,
            $maxLibInsertSize]
    }
    
    return %libraryInsertSizes;
}
    
#============================================================================#
sub usage {
    my $verbose = shift || 1;
    pod2usage(-verbose=>$verbose);
    exit 1;
}

#============================================================================#
