package PGF::Polishing::FindTargets;

=head1 NAME

PGF::Polishing::FindTargets
Module for finding target regions for polishing.

=head1 VERSION

$Revision: 1.6 $

$Date: 2009-08-26 17:18:36 $

=head1 SYNOPSIS

=head1 DESCRIPTION

This module contains methods for finding regions of low quality,
x's in consensus sequences and single subclone regions.

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

Stephan Trong 10/25/2006 Creation

=back

=cut

#============================================================================#

use strict;
use warnings;
use Carp;
use Carp qw(cluck);
use vars qw($RealBin);
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use PGF::Polishing::FindTargets::Region;
use PGF::Cloneview::ComputeReadDepth;
use PGF::Parsers::ParseAce::Unpad;

my $_findRegionObj = PGF::Polishing::FindTargets::Region->new();
my $_unpadObject = PGF::Parsers::ParseAce::Unpad->new();

#============================================================================#
sub new {
    
    my $class = shift;
    my %params = @_;
    
    if ( !defined $params{aceObject} ) {
        croak "Need to define aceObject.";
    }

    my $aceObject = $params{aceObject};
    my $debug = defined $params{debug} ? $params{debug} : 0;
    
    my $self = { _aceObject=>$aceObject,
                 _consensusTags=>{},
                 _debug=>$debug
               };
    
    bless $self, $class;
    return $self;
    
}

#============================================================================#
sub setContig {

# Stores the contig name, sequence and quality of a given contig as attributes
# of the object.

    my $self = shift;
    my $contigName = shift;

    my $aceObject = $self->{_aceObject};
    my $contigRef = $aceObject->contig($contigName);

    $self->{_contigDataRef} = { name=>$contigRef->{number},
                                sequence=>$contigRef->{sequence_unpadded},
                                quality=>$contigRef->{quality},
                              };

}

#============================================================================#
sub findLowQualityRegions {

# Returns the start/stop positions of regions containing quality values less
# than that specified.  The return value is an array of references to a
# hash containing key=start/stop, value=position.

    my $self = shift;
    my $minPhdQuality = shift;
    
    my $contigRef = $self->{_contigDataRef};
    my $consensusQuality = $contigRef->{quality};
    
    my @regions = $_findRegionObj->lowQuality($consensusQuality, 
        $minPhdQuality);

    return @regions;

}

#============================================================================#
sub findXsInConsensus {

# Returns the start/stop positions of regions containing 'x' or 'X' in the
# sequence.  The return value is an array of references to a
# hash containing key=start/stop, value=position.

    my $self = shift;

    my $contigRef = $self->{_contigDataRef};
    my $consensusSequence = $contigRef->{sequence};

    my @regions = $_findRegionObj->xsInBase($consensusSequence);

    return @regions;

}

#============================================================================#
sub findSingleSubclone {

# Returns single subclone regions.  The return value is an array of references
# to a hash containing:
# key=start, value=position,
# key=stop, value=position,
# key=typeOfRegion, value=name of region type
# key=readDepth, value=value of depth of coverage at that region.

    my $self = shift;
    my %params = @_;

    # %params can be:
    # regexForDefiningRegionTypes = hash reference in the form:
    #   hash=<regular expression for matches in read names>
    #   value=<name of region type> 
    #
    # This is used to determine if all reads spanning a region
    # matches the specified regular expression for defining
    # a region type. If one or more reads in the region do not match
    # criteria or if this parameter is not defined, then the type of
    # region is set to 'singleSubclone'.
    #
    # Example: 
    # For determining if a region contains only 454 reads, set:
    # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' }
    #
    # will look for all reads spanning a region in which the 
    # read names contain .c extension.  If true, then define
    # region type as '454only'.
    #
    my %regexForDefiningRegionTypes = ();
    if ( defined $params{regexForDefiningRegionTypes} ) {
        %regexForDefiningRegionTypes = %{$params{regexForDefiningRegionTypes}};
    }

    my $aceObject = $self->{_aceObject};
    my $contigRef = $self->{_contigDataRef};
    my $contigName = $contigRef->{name};
    my @readDataRefs = ();
    my @readPositions = ();
    my @regions = ();
    
    # Initialize $unpadObject to use padded sequence from contig $contigName.
    # This will be used to convert start and end regions to padded positions
    # for determining the read sequence within the defined region.
    #
    $self->_setUnpadObject($contigName);

    # Get all reads for this contig from ace file and store in data structure.
    # Use aligned positions of the read.
    #
    $aceObject->fetchreads(-contig=>$contigName);

    while ( my $readDataRef = $aceObject->fetch() ) {
        my $readName = $readDataRef->{name};
        my $readStartPosition = $readDataRef->{align_start};
        my $readStopPosition = $readDataRef->{align_stop};
        push @readDataRefs, { name=>$readName, 
                              start=>$readStartPosition,
                              stop=>$readStopPosition
                            };
        push @readPositions, "$readStartPosition,$readStopPosition";
    }

    # Compute read depth for this contig.
    #
    my @readDepths = computeReadDepthByPosition( @readPositions );
    
    # Find regions with depth of 1x and 2x.
    #
    my @depthsToGet = ( 1, 2 ); # get 1x and 2x regions.
    my %depthPositions = $_findRegionObj->regionsByDepth(
        \@readDepths, \@depthsToGet );

    if ( $self->{_debug} ) {
        print map "1x: $_->{start}, $_->{stop}\n", @{$depthPositions{1}};
        print map "2x: $_->{start}, $_->{stop}\n", @{$depthPositions{2}};
        print "\n\n";
    }


    # For regions with depth of 2, check single subclone criteria for reads
    # spanning this region...
    #
    # First, sort array by read stop position.  Need to this because
    # getReadsSpanningRegion (which is called within _getRegionsAt1XCoverage
    # and _getRegionsAt2XCoverage) expects the list of read data to be sorted
    # by stop position in ascending order.
    #
    @readDataRefs = map { $_->[0] }
                    sort {$a->[1] <=> $b->[1] } 
                    map { [$_, $_->{stop}] }
                    @readDataRefs;
                    
    # For 1x regions, get type of region and add to @regions array.
    #
    my $atThisReadDepth = 1; # get read spanning 1x region.
    
    push @regions, $self->_getRegionsAt1XCoverage(
        \@readDataRefs,
        \%regexForDefiningRegionTypes,
        @{$depthPositions{$atThisReadDepth}}
    );
    
    # Get reads spanning 2x regions and check for single subclone criteria.
    #
    $atThisReadDepth = 2; # get reads spanning 2x regions.
    
    push @regions, $self->_getRegionsAt2XCoverage(
        \@readDataRefs,
        \%regexForDefiningRegionTypes,
        @{$depthPositions{$atThisReadDepth}}
    );

    # sort regions by starting position.
    #
    @regions = map { $_->[0] }
               sort {$a->[1] <=> $b->[1] }
               map { [$_, $_->{start}] }
               @regions;

    return @regions;

}

#============================================================================#
sub mergeRegions {
   
# Merges regions that are a specified number of bases away from each other.
# The return value is an array of references to a hash with key=start/stop,
# value=position.

    my $self = shift;
    my $A_regions = shift;
    my $mergeIfThisManyBasesApart = shift || 0;
    
    return $_findRegionObj->mergeRegions($A_regions,
                                         $mergeIfThisManyBasesApart);
}

#============================================================================#
sub initConsensusTags {

# Looks for consensus tags grouped by contig and stores as an object attribute.

    my $self = shift;
    my @tagTypes = @_; # example: 'polishLowQualConsensus',
                       # 'polishSingleSubclone'
                       # if defined, only get these consensus tags.
                     
    my %cTagsByContig = $self->getConsensusTagsByContig(@tagTypes);
    
    $self->{_consensusTags} = \%cTagsByContig;
    
}

#============================================================================#
sub getConsensusTagsByContig {

# Returns consensus tags grouped by contig.  The return value is a hash
# where the key=name of contig, value=array holding the string for each
# consensus tag for that contig.

    my $self = shift;
    my @tagTypes = @_; # example: 'polishLowQualConsensus',
                       # 'polishSingleSubclone'
                       # if defined, only get these consensus tags.
    
    my $targetTypeRegEx = @tagTypes ? join "|", @tagTypes : '\S+';
    my $aceObject = $self->{_aceObject};
    my @cTags = $aceObject->tag('CT');
    my %cTagsByContig = ();
    
    foreach my $rawAceCtTag (@cTags) {
        # look for tag like:
        # 'Contig1 polishLowQualConsensus findPolishingRegions 12760 1304'
        #
        if ( $rawAceCtTag =~ /^(\S+) (?:$targetTypeRegEx)/ ) {
            push @{$cTagsByContig{$1}}, $rawAceCtTag;
        }
            
    }

    return %cTagsByContig;
    
}
    
#============================================================================#
sub undefConsensusTags {

# Clears the consensus tags data from the object.

    my $self = shift;
    
    $self->{_consensusTags} = [];

}

#============================================================================#
sub getRegionsToPolishByContig {

# Returns start/stop positions of regions to polish within a given contig.
# The return value is an array of references to a hash with key/value pairs:
# key=start, value=position,
# key=stop, value=position,
# key=tagType, value=name of tag type (available only if mergeRegions=0)

    my $self = shift;
    my %args = @_;
    
    # %args can be:
    #
    # contig=>name of contig (required)
    # extendRegionByThisNumBases=>number of bases to extend ends of region.
    # doNotPolishThisManyBasesAtEndOfContig=>number of bases at ends of contig
    #   to skip polishing.
    # mergeRegions=>0|1  Defines whether to merge overlapping regions. Default
    #                    =0.
    # tagTypes=>array containing specific tag types to retrieve (optional).
    
    return () unless %{$self->{_consensusTags}};
    
    # Get arguments.
    #
    my $contig = $args{contig} || croak 'contig is not defined.';
    my $extendRegionByThisNumBases = $args{extendRegionByThisNumBases} || 0;
    my $doNotPolishThisManyBasesAtEndOfContig =
        $args{doNotPolishThisManyBasesAtEndOfContig} || 0;
    my $mergeRegions = $args{mergeRegions} || 0;
    my $A_tagType = $args{tagType} || [];
    
    my %tagTypesToRetrieve = ();
    
    # if tagTypes is defined, then set %tagTypesToRetrieve hash where
    # key=name of tag type, value=1.
    #
    if ( @{$A_tagType} ) {
        @tagTypesToRetrieve{ @$A_tagType } = (1) x @$A_tagType;
    }
    
    # Get contig attributes.
    #
    my $aceObject = $self->{_aceObject};
    my $contigRef = $aceObject->contig($contig);
    my $contigSequencePadded = $contigRef->{sequence};
    my $contigLength = $contigRef->{size_unpadded};

    my @mergedRegions = ();
    my @targetRegions = ();
    
    # Declare this for the Unpad object to create a pad/unpad table for
    # converting pad to unpad positions and vice versa.
    #
    $_unpadObject->use($contigSequencePadded);    
    
    # Get consensus tags for this contig.
    #
    my @cTags = defined $self->{_consensusTags}{$contig} ?
        @{$self->{_consensusTags}{$contig}} : ();
        
    
    # Get positions of polishing tags for this contig.
    #
    foreach my $rawAceCtTag (@cTags) {
        
        # Look for tag like:
        # 'Contig1 polishLowQualConsensus findPolishingRegions 12760 1304'
        # $1=padded start, $2=padded stop
        #
        if ( $rawAceCtTag =~ /$contig (\S+) \S+ (\-*\d+) (\-*\d+)/ ) {
            
            my $rawAceCtTagType = $1;
            
            # Convert padded positions to unpadded positions.
            #
            my $start = $_unpadObject->padToUnpad($2);
            my $stop = $_unpadObject->padToUnpad($3);
            
            # Don't include region if it is within the ends of the contig
            # by which to ignore polishing.
            #
            next if $stop < $doNotPolishThisManyBasesAtEndOfContig;
            next if $start > $contigLength - $doNotPolishThisManyBasesAtEndOfContig;
            
            # Extend start and stop positions by N number of bases defined
            # by user.
            #
            $start -= $extendRegionByThisNumBases;
            $stop += $extendRegionByThisNumBases;
            
            # Adjust start and stop positions to exclude N number of bases
            # at ends of contig (to skip polishing), where N is defined in
            # $doNotPolishThisManyBasesAtEndOfContig.
            #
            $start = $doNotPolishThisManyBasesAtEndOfContig if
                $start < $doNotPolishThisManyBasesAtEndOfContig;
            $stop = $contigLength - $doNotPolishThisManyBasesAtEndOfContig if
                $stop > $contigLength - $doNotPolishThisManyBasesAtEndOfContig;
            
            if ( $start <= $stop ) {
                
                # If tagTypes to retrieve are defined, check if this tag type
                # is in list.  If not, don't add to target regions.
                #
                if ( %tagTypesToRetrieve &&
                    !$tagTypesToRetrieve{$rawAceCtTagType} ) {
                    next;
                }
                
                push @targetRegions, { start=>$start,
                                       stop=>$stop,
                                       tagType=>$rawAceCtTagType,
                                     };
            }

        }
    }

    # Merge regions if mergeRegion is defined as true in argument.
    #
    if ( $mergeRegions && @targetRegions ) {
        return $_findRegionObj->mergeRegions(\@targetRegions);
    
    # If mergeRegions not defined, return target regions.
    #
    } else {
        return @targetRegions;
    }
    
}
    
#============================================================================#
sub getDoNotPolishRegions {

# Determines the start/stop positions of all regions that are not part of the
# specified regions.  The return value is an array of references to a
# hash containing key=start/stop, value=position.

    my $self = shift;
    my $startOfSegment = shift;
    my $endOfSegment = shift;
    my @polishingRegions = @_;
    
    my @doNotPolishRegions = ();
    
    if ( @polishingRegions ) {
        @doNotPolishRegions = $_findRegionObj->getRegionsNotDefinedInList(
            $startOfSegment, $endOfSegment, \@polishingRegions);
    } else {
        @doNotPolishRegions = ( {start=>1, stop=>$endOfSegment} );
    }
    
    return @doNotPolishRegions;
    
}
    
#============================================================================#
sub changeConsensusQuality {

# Change the given consensus quality to a specified value within the specified
# regions.  The return value is the string containing the modified quality
# values.
    
    my $self = shift;
    my %args = @_;

    # Parameters for %args:
    #
    # consensusQuality=> string of consensus quality values.
    # newQualityValue=> new quality value to adjust within the specified region
    # regions=> reference to array of hashes specifying regions to change
    #           the consensus quality values.
    #           reference to array containing hash with
    #           key/value:
    #           start=>start of position,
    #           stop=>end of position.   
    # modifyQualtiyScoreOnlyIfAboveNewQualityValue=> set this to 1 if you don't
    #           want the quality scores lower than newQualityValue to be changed.
    
    return $_findRegionObj->changeConsensusQuality(%args);
    
}

#============================================================================#
sub _getRegionsAt1XCoverage {
  
# Get regions of 1X coverage.  The return value is an array of references
# to a hash containing:
# key=start, value=position,
# key=stop, value=position,
# key=typeOfRegion, value=name of region type
# key=readDepth, value=value of depth of coverage at that region.

    my $self = shift;
    my $A_readDataRefs = shift;
    my $H_regexForDefiningRegionTypes = shift;
    my @depthPositions = @_;
    
    # Arguments:
    #
    # $A_readDataRefs = reference to array containing hash with key/value:
    #    name=readname,
    #    start=>start position of read,
    #    stop=>stop position of read,
    #
    # $H_regexForDefiningRegionTypes = hash reference in the form:
    #   hash=<regular expression for matches in read names>
    #   value=<name of region type> 
    #
    #   This is used to determine if all reads spanning a region
    #   matches the specified regular expression for defining
    #   a region type. If one or more reads in the region do not match
    #   criteria or if this parameter is not defined, then the type of
    #   region is set to 'singleSubclone'.
    #
    #   Example: 
    #     For determining if a region contains only 454 reads, set:
    #     $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' }
    #
    #     will look for all reads spanning a region in which the 
    #     read names contain .c extension.  If true, then define
    #     region type as '454only'.
    #
    # @depthPositions = array of references to a hash containing key=start/stop,
    # value=position.
    
    my $atThisReadDepth = 1;
    my @regions = ();
    
    foreach my $readDepthRef( @depthPositions ) {
        my $startOfRegion = $readDepthRef->{start};
        my $endOfRegion = $readDepthRef->{stop};
        my @readDataRefsSpanningRegion = $_findRegionObj->getReadsSpanningRegion(
            $A_readDataRefs, $startOfRegion, $endOfRegion, $atThisReadDepth);

        my $typeOfRegion = $_findRegionObj->determineTypeOfRegion(
            \@readDataRefsSpanningRegion, $H_regexForDefiningRegionTypes);
        
        push @regions, { start=>$startOfRegion, 
                         stop=>$endOfRegion,
                         typeOfRegion=>$typeOfRegion,
                         readDepth=>$atThisReadDepth
                       };
    }
    
    return @regions;
}

#============================================================================#
sub _getRegionsAt2XCoverage {

# Get regions of 1X coverage.  The return value is an array of references
# to a hash containing:
# key=start, value=position,
# key=stop, value=position,
# key=typeOfRegion, value=name of region type
# key=readDepth, value=value of depth of coverage at that region.

    my $self = shift;
    my $A_readDataRefs = shift;
    my $H_regexForDefiningRegionTypes = shift;
    my @depthPositions = @_;

    # Arguments:
    #
    # $A_readDataRefs = reference to array containing hash with key/value:
    #    name=readname,
    #    start=>start position of read,
    #    stop=>stop position of read,
    #
    # $H_regexForDefiningRegionTypes = hash reference in the form:
    #   hash=<regular expression for matches in read names>
    #   value=<name of region type> 
    #
    #   This is used to determine if all reads spanning a region
    #   matches the specified regular expression for defining
    #   a region type. If one or more reads in the region do not match
    #   criteria or if this parameter is not defined, then the type of
    #   region is set to 'singleSubclone'.
    #
    #   Example: 
    #     For determining if a region contains only 454 reads, set:
    #     $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' }
    #
    #     will look for all reads spanning a region in which the 
    #     read names contain .c extension.  If true, then define
    #     region type as '454only'.
    #
    # @depthPositions = array of references to a hash containing key=start/stop,
    # value=position.
    
    my $atThisReadDepth = 2;
    my @regions = ();
    
    foreach my $readDepthRef ( @depthPositions ) {
        my $startOfRegion = $readDepthRef->{start};
        my $endOfRegion = $readDepthRef->{stop};
        my $paddedStartOfRegion = $_unpadObject->unpadToPad($startOfRegion);
        my $paddedEndOfRegion = $_unpadObject->unpadToPad($endOfRegion);
        my $isSingleSubclone = 0;

        if ( $self->{_debug} ) {
            print "At 2x: $startOfRegion, $endOfRegion, ".
                  "padded: $paddedStartOfRegion, $paddedEndOfRegion\n";
        }

        # Get reads spanning this region.
        #
        my @readDataRefsSpanningRegion = $_findRegionObj->getReadsSpanningRegion(
            $A_readDataRefs, $startOfRegion, $endOfRegion, $atThisReadDepth);

        # Determine the type of region (e.g., singleSubclone, 454Only,...).
        # This is largely determined by how $H_regexForDefiningRegionTypes
        # is defined.  If the regex defined in the hash does not match all of
        # the read names for all reads spanning this region, then the type of
        # region is defined as singleSubclone.  Otherwise, it is defined based
        # on the value of the hash.
        #
        my $typeOfRegion = $_findRegionObj->determineTypeOfRegion(
            \@readDataRefsSpanningRegion, $H_regexForDefiningRegionTypes);

        # If type of region is 'singleSubclone', then check if the sequences
        # of the reads spanning the regions are identical.  If not, flag as
        # single subclone.
        #
        if ( $typeOfRegion eq 'singleSubclone' ) {
            $isSingleSubclone = $self->_checkReadsForSingleSubcloneCriteria(
                $paddedStartOfRegion, $paddedEndOfRegion,
                \@readDataRefsSpanningRegion);
        
        # If type of region is other (specified by
        # %regexForDefiningRegionTypes), then flag as single subclone.  This
        # means that both reads spanning the region matches the regex
        # criteria defined in the hash.
        #
        } else {
            $isSingleSubclone = 1;
        }

        if ( $self->{_debug} )  {
            print "isSingleSubclone=$isSingleSubclone, ".
                  "typeOfRegion=$typeOfRegion\n\n";
        }

        # Add region if single subclone criteria is met.
        #
        if ( $isSingleSubclone ) {
            push @regions, { start=>$startOfRegion, 
                             stop=>$endOfRegion, 
                             typeOfRegion=>$typeOfRegion,
                             readDepth=>$atThisReadDepth
                           };
        }
    }
    
    return @regions;
    
}
    
#============================================================================#
sub _checkReadsForSingleSubcloneCriteria {
    
# Returns 1 if:
# a) only one read in the region (single coverage).
# b) reads are from the same clone.
# c) reads are from different clones but sequences are different.
#
    
    my $self = shift;
    my $paddedStartOfRegion = shift;
    my $paddedEndOfRegion = shift;
    my $A_readDatas = shift;
    
    # Arguments:
    #
    # $paddedStartOfRegion = padded start of contig region to get sequence.
    # $paddedEndOfRegion = padded end of contig region to get sequence.
    # $A_readDatas = reference to array of references to a hash containing
    #   key=start/stop, value=position.

    # If only one read in region, then return 1.
    #
    return 1 if @$A_readDatas == 1; 

    my $readDataRef = $A_readDatas->[0];
    my $readName = $readDataRef->{name};
    my ($cloneName) = $readName =~ /^([^\.]+)/;
    my $sequenceInRegion = $self->_getReadSequenceInRegion($readDataRef,
        $paddedStartOfRegion, $paddedEndOfRegion);

    # Do all reads belong to the same clone?
    #
    my $numberOfReadsFromSameClone = grep { $_->{name} =~ /^$cloneName\./ } 
        @$A_readDatas;

    if ( $self->{_debug} ) {
        print "NumberOfReadsFromSameClone=$numberOfReadsFromSameClone\n"
    }

    return 1 if $numberOfReadsFromSameClone == @$A_readDatas;
    
    # Do all reads have the same sequence?
    #
    my $isSingleSubclone = 0;
    for ( my $index = 1; $index <= $#$A_readDatas; $index++ ) {
        my $readDataRef = $A_readDatas->[$index];
        my $sequenceOfReadInRegion = $self->_getReadSequenceInRegion(
            $readDataRef, $paddedStartOfRegion, $paddedEndOfRegion);
        
        if ( lc $sequenceOfReadInRegion ne lc $sequenceInRegion ) {
            $isSingleSubclone = 1;
        }
    }
    
    return $isSingleSubclone;
    
}
    
#============================================================================#
sub _getReadSequenceInRegion {

# Returns the read sequence based on the 

    my $self = shift;
    my $readDataRef = shift;         # 
    my $paddedStartOfRegion = shift;
    my $paddedEndOfRegion = shift;
    
    my $aceObject = $self->{_aceObject};
    my $readName = $readDataRef->{name};
    
    # get read sequence from ace file.
    #
    my $aceReadData = $aceObject->read($readName);
    my $paddedSequence = $aceReadData->{sequence};
    my $paddedReadStart = $aceReadData->{start};
    my $paddedReadStop = $aceReadData->{stop};

    my $startOfCut = $paddedStartOfRegion - $paddedReadStart;
    my $lengthOfCut = $paddedEndOfRegion - $paddedStartOfRegion + 1;
    my $sequenceInRegion = substr($paddedSequence, $startOfCut, $lengthOfCut);

    if ( $self->{_debug} ) {
        print "read=$readName, ".
              "seq=",length($paddedSequence).", ".
              "readStart=$paddedReadStart, ".
              "readStop=$paddedReadStop, ".
              "startOfCut=$startOfCut, ".
              "lengthOfCut=$lengthOfCut\n";
        print $sequenceInRegion."\n";
    }

    return $sequenceInRegion;
    
}

#============================================================================#
sub _setUnpadObject {

# Creates and stores the pad/unpad conversion table as an object attribute.

    my $self = shift;
    my $contigName = shift;

    my $aceObject = $self->{_aceObject};
    my $aceContigDataRef = $aceObject->contig($contigName);
    my $paddedContigSequence = $aceContigDataRef->{sequence};

    $_unpadObject->use($paddedContigSequence);

}

#============================================================================#
1;