package PGF::Polishing::FindTargets::Region;

=head1 NAME

PGF::Polishing::FindTargets::Region
Polishing module for finding target regions

=head1 VERSION

$Revision: 1.5 $

$Date: 2009-08-26 17:18:38 $
=head1 SYNOPSIS

=head1 DESCRIPTION

Finds regions of low quality and xs in consensus sequence.

=head1 AUTHOR(S)

Stephan Trong

=head1 HISTORY

=over

=item *

Stephan Trong 09/25/2006 Creation

=item *

Stephan Trong 09/16/2008 Added findOverlaps method.

=back

=cut

#============================================================================#

use strict;
use warnings;
use Carp;

#============================================================================#
sub new {
    
    my $class = shift;
    my %params = @_;
    
    my $self = {};
    
    bless $self, $class;
    return $self;
    
}

#============================================================================#
sub lowQuality {

# Returns the start/stop positions of regions where the quality <= a specified
# quality value.  The return value is an array of references to a hash
# containing key=start/stop, value=position of regions.

    my $self = shift;
    my $sequenceQuality = shift;    # string containing quality values,
                                    # each separated by a space.
    my $minPhdQuality = shift;      # if region contains quality <= this, then
                                    # return start/stop position of the region.
    
    my @regions = ();
    my $basePosition = 0;
    my $startPosition = -1;
    my $stopPosition = -1;

    while ( $sequenceQuality =~ /(\d+)/g ) {
        my $baseQuality = $1;
        $basePosition++;

        if ( $baseQuality <= $minPhdQuality ) {
            if ( $startPosition < 0 ) {
                $startPosition = $basePosition;
            }
            $stopPosition = $basePosition;

        } else {
            if ( $startPosition > 0 && $stopPosition > 0 ) {
                push @regions, { start=>$startPosition, 
                                 stop=>$stopPosition 
                               };
            }
            $startPosition = -1;
            $stopPosition = -1;
        }
    }

    if ( $startPosition > 0 && $stopPosition > 0 ) {
        push @regions, { start=>$startPosition, 
                         stop=>$stopPosition 
                       };
    }

    return @regions;

}

#============================================================================#
sub xsInBase {

# Returns start/stop positions of regions containing 'x' or 'X' as a base.
# The return value is an array of references to a hash containing key=start/stop,
# value=position.

    my $self = shift;
    my $sequence = shift;  # string containing bases.

    my @regions = ();
    my $startPositionOfRegion = -1;
    my $stopPositionOfRegion = -1;

    while ( $sequence =~ /(x)/ig ) {
        my $seqPosition = pos $sequence;

        if ( $seqPosition == $stopPositionOfRegion + 1 ){
            $stopPositionOfRegion = $seqPosition;

        } else {
            if ( $startPositionOfRegion > 0 && $stopPositionOfRegion > 0 ) { 
                push @regions, { start=>$startPositionOfRegion, 
                                 stop=>$stopPositionOfRegion
                               };
            }
            $startPositionOfRegion = $seqPosition;
            $stopPositionOfRegion = $seqPosition;
        }
    }

    if ( $startPositionOfRegion > 0 && $stopPositionOfRegion > 0 ) { 
        push @regions, { start=>$startPositionOfRegion, 
                         stop=>$stopPositionOfRegion
                       };
    }

    return @regions;

}

#============================================================================#
sub regionsByDepth {

# Groups start/stop positions of regions by the depth of coverage.  The
# return value is a hash where key=depth, value=array of references to a
# hash with key=start/stop, value=position.

    my $self = shift;
    my $A_depthDatas = shift;  # array containing changes in depth in the 
                               # format "position,depth".  You can use
                               # PGF::Cloneview::ComputeReadDepth::computeReadDepthByPosition;
                               # to generate this list.
    my $A_depthsToGet = shift; # array containing depth values to retrieve.

    my $index = 0;
    my %depthPositions = ();
    foreach my $depthData ( @$A_depthDatas ) {
        my ($startPosition, $depth) = split /,/, $depthData;
        if ( grep /^$depth$/, @$A_depthsToGet ) {
            if ( $index <= $#$A_depthDatas - 1 ) {
                my ($stopPosition) = split /,/, $A_depthDatas->[$index+1];
                push @{$depthPositions{$depth}}, {
                    start=>$startPosition,
                    stop=>$stopPosition-1
                };
            }
        }
        $index++;
    }

    return %depthPositions;

}

#============================================================================#
sub getReadsSpanningRegion {

# Returns a list of reads spanning a given region based on the specified
# start and stop position.  The return value is an array of references to a
# hash containing key=start/stop, value=position.

    my $self = shift;
    my $A_readDatas = shift;            # Array containing has reference:
                                        #    start=>startPosition,
                                        #    stop=>stopPosition
                                        # ordered by stop position.
    my $startOfRegion = shift;          # Start position of region.
    my $endOfRegion = shift;            # End position of region.
    my $expectedReadDepth = shift || 0; # Define this to stop scanning for
                                        # for reads if this many found.

    # Get read index of the read where the end position is closest to
    # or on the position of the start of the region.
    #
    my $index = _getReadIndexAtStartOfRegion($A_readDatas, $startOfRegion);

    # Get all reads spanning start and end of region.
    #
    my @readLists = ();
    my $numberOfReadsSpanningRegion = 0;
    while ( $index <= $#$A_readDatas ) {
        last if ( $expectedReadDepth && @readLists == $expectedReadDepth );
        my $readData = $A_readDatas->[$index];
        my $readStart = $readData->{start};
        my $readStop = $readData->{stop};
        if ( $readStart <= $startOfRegion && $readStop >= $endOfRegion ) {
            push @readLists, $readData;
        }
        $index++;
    }

    return @readLists;

}

#============================================================================#
sub determineTypeOfRegion {

# Determines the type of a polishing region based on the specified regular
# expression match to the name of the reads within the region.  See notes
# on $H_regexForRegionType below for more details.

    my $self = shift;
    my $A_readDataRefsSpanningRegion = shift;
    my $H_regexForRegionType = shift;

    # Input variables:
    #
    # $A_readDataRefsSpanningRegion is a reference to the array returned by
    # the getReadsSpanningRegion method.
    #
    # $H_regexForRegionType is a reference to a hash containing:
    #   hash=<regular expression for matches in read names>
    #   value=<name of region type>
    #
    # This is used to determine if all reads spanning a region
    # matches the specified regular expression for defining
    # a region type.
    #
    # Example:
    # For determining if a region contains only 454 reads, set:
    # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' }
    #
    # will look for all reads spanning a region in which the
    # read names contain .c extension.  If true, then define
    # region type as '454only'.
    #
    # If no match is found, the region type is set to 'singleSubclone'.

    my $typeOfRegion = 'singleSubclone';

    foreach my $regex ( keys %$H_regexForRegionType ) {
        my $numberOfMatches = grep { $_->{name} =~ /$regex/ }
            @$A_readDataRefsSpanningRegion;

        # If the number of matches equals the number of reads spanning
        # the region, set the region type to that defined in
        # $H_regexForRegionType.
        #
        if ( $numberOfMatches == @$A_readDataRefsSpanningRegion ) {
            $typeOfRegion = $H_regexForRegionType->{$regex};
            last;
        }
    }

    return $typeOfRegion;

}

#============================================================================#
sub mergeRegions {

# Merges regions that are a specified number of bases away from each other.
# The return value is an array of references to a hash with key=start/stop,
# value=position.

    my $self = shift;
    my $A_regions = shift; # reference to array containing hash with key/value:
                           # start=>start of position,
                           # stop=>end of position.
    my $mergeIfThisManyBasesApart = shift || 0;
    
    my @regions = @$A_regions;
    my @mergedRegions = ();

    # sort regions by starting position.
    #
    @regions = map { $_->[0] }
               sort {$a->[1] <=> $b->[1] }
               map { [$_, $_->{start}] }
               @regions;
    
    my $firstRegionRef = shift @regions;
    my $lastStartOfRegion = $firstRegionRef->{start};
    my $lastEndOfRegion = $firstRegionRef->{stop};
    
    while ( my $regionRef = shift @regions ) {
        my $startOfRegion = $regionRef->{start};
        my $endOfRegion = $regionRef->{stop};
        if ( $lastEndOfRegion + $mergeIfThisManyBasesApart <
             $startOfRegion - 1) {
            push @mergedRegions, { start=>$lastStartOfRegion,
                                   stop=>$lastEndOfRegion
                                 };
            $lastStartOfRegion = $startOfRegion;
        }
        $lastEndOfRegion = $endOfRegion if $lastEndOfRegion < $endOfRegion;
    }
    
    push @mergedRegions, { start=>$lastStartOfRegion,
                           stop=>$lastEndOfRegion
                         };
    
    return @mergedRegions;
    
}

#============================================================================#
sub findOverlaps {
    
# Finds overlapping regions for two list of regions passed in as a reference
# to an array containing hash with key/value start=>x, stop=>y.
# The return value is an array of references to a hash with key=start/stop,
# value=position of the overlapping regions.

    my $self = shift;
    my $A_region1 = shift; # reference to array containing hash with key/value:
                           # start=>start of position,
                           # stop=>end of position of first region
    my $A_region2 = shift; # reference to array containing hash with key/value:
                           # start=>start of position,
                           # stop=>end of position of second region
                           
    my @results = ();
    
    foreach my $refRegion1 ( @$A_region1 ) {
        my $start1 = $refRegion1->{start};
        my $stop1 = $refRegion1->{stop};
        
        foreach my $refRegion2 (@$A_region2 ) {
            my $startOverlap;
            my $stopOverlap;
            my $start2 = $refRegion2->{start};
            my $stop2= $refRegion2->{stop};
            
            # case:
            # start1 --------- stop1
            #    start2 ------------- stop2
            # OR
            #    start2 --- stop2
            #
            if ( $start2 >= $start1 && $start2 < $stop1 ) {
                $startOverlap = $start2;
                
                # case:
                # start1 --------- stop1
                #    start2 ------------- stop2
                #
                if ( $stop2 > $stop1 ) {
                    $stopOverlap = $stop1;
                    
                # case:
                # start1 --------- stop1
                #    start2 --- stop2
                #
                } else {
                    $stopOverlap = $stop2;
                }
            
            # case:
            #     start1 ------------- stop1
            # start2 ------------- stop2
            # OR
            #         start2 --- stop2
            #
            } elsif ( $stop2 >= $start1 && $stop2 <= $stop1 ) {
                $stopOverlap = $stop2;
                
                # case:
                #     start1 ------------- stop1
                # start2 ------------- stop2
                #
                if ( $start2 < $start1 ) {
                    $startOverlap = $start1;
                    
                # case:
                #     start1 ------------- stop1
                #          start2 --- stop2
                #
                } else {
                    $startOverlap = $start2;
                }
                
            # case:
            #    start1 --- stop1
            # start2 --------- stop2
            #
            } elsif ( $start1 >= $start2 && $stop1 <= $stop2 ) {
                $startOverlap = $start1;
                $stopOverlap = $stop1;
            }
            
            if ( defined $startOverlap && defined $stopOverlap ) {
                push @results, { start=>$startOverlap,
                                 stop=>$stopOverlap
                               };
            }
        }
    }
        
    return @results;
    
}

#============================================================================#
sub getRegionsNotDefinedInList {
    
# Determines the start/stop positions of all regions that are not part of the
# specified regions.  The return value is an array of references to a
# hash containing key=start/stop, value=position.

    my $self = shift;
    my $startOfSegment = shift;   # the start position of the entire segment.
    my $endOfSegment = shift;     # the end position of the entire segment.
    my $A_definedRegions = shift; # reference to array containing hash with
                                  # key/value:
                                  # start=>start of position,
                                  # stop=>end of position.
                                  
    my @regionsNotDefined = ();
    
    return @regionsNotDefined if !@$A_definedRegions;
    
    # sort defined regions by starting position.
    #
    my @definedRegions = map { $_->[0] }
               sort {$a->[1] <=> $b->[1] }
               map { [$_, $_->{start}] }
               @$A_definedRegions;

    my $firstRegionRef = shift @definedRegions;
    my $lastDefinedRegionStart = $firstRegionRef->{start};
    my $lastDefinedRegionStop = $firstRegionRef->{stop};
    
    if ( $lastDefinedRegionStart > $startOfSegment ) {
        push @regionsNotDefined, { start=>$startOfSegment,
                                   stop=>$lastDefinedRegionStart-1
                                 }
    }
    
    foreach my $definedRegionsRef ( @definedRegions ) {
        my $definedStart = $definedRegionsRef->{start};
        my $definedStop = $definedRegionsRef->{stop};
        push @regionsNotDefined, { start=>$lastDefinedRegionStop+1,
                                   stop=>$definedStart-1
                                 };
        $lastDefinedRegionStop = $definedStop;
    }
    
    push @regionsNotDefined, { start=>$lastDefinedRegionStop+1,
                               stop=>$endOfSegment
                             };
    
    return @regionsNotDefined;

}

#============================================================================#
sub changeConsensusQuality {

# Change the given consensus quality to a specified value within the specified
# regions.  The return value is the string containing the modified quality
# values.

    my $self = shift;
    my %args = @_;
    
    # Parameters for %args:
    #
    # consensusQuality=> string of consensus quality values, with each score
    #                    separated by a base.
    # newQualityValue=> new quality value to adjust within the specified region
    # regions=> reference to array of hashes specifying regions to change
    #           the consensus quality values.
    #           reference to array containing hash with
    #           key/value:
    #           start=>start of position,
    #           stop=>end of position.
    # modifyQualtiyScoreOnlyIfAboveNewQualityValue=> set this to 1 if you don't want
    #           the quality scores lower than newQualityValue to be changed.
    
    my $consensusQuality = $args{consensusQuality};
    my $newQualityValue = $args{newQualityValue};
    my $A_regions = $args{regions};
    my $modifyQualtiyScoreOnlyIfAboveNewQualityValue =
        $args{modifyQualtiyScoreOnlyIfAboveNewQualityValue} || 0;

    # Sort regions by starting position.
    #
    my @regions = map { $_->[0] }
                  sort {$a->[1] <=> $b->[1] }
                  map { [$_, $_->{start}] }
                  @$A_regions;

    my $position = 0;
    my $newConsensusQuality = '';
    
    while ( $consensusQuality =~ /(\d+)/g ) {
        my $qualityValue = $1;
        $position++;
        
        shift @regions if @regions && $position > $regions[0]->{stop};
      
        # if $modifyQualtiyScoreOnlyIfAboveNewQuality is set to true and
        # consensus quality is <= $newQualityValue, don't change.
        #
        if ( $modifyQualtiyScoreOnlyIfAboveNewQualityValue &&
             $qualityValue <= $newQualityValue ) {
            $newConsensusQuality .= "$qualityValue ";
            next;
        }
        
        foreach my $refRegions (@regions) {
            if ( $position >= $refRegions->{start} &&
                 $position <= $refRegions->{stop} ) {
                $qualityValue = $newQualityValue;
                last;
            }
        }

        $newConsensusQuality .= "$qualityValue ";
    }

    $newConsensusQuality =~ s/\s+$//;

    return $newConsensusQuality;

}

#============================================================================#
sub _getReadIndexAtStartOfRegion {

# Binary search algorithm to return the index of  of the read where the end
# position is closest to or on the position of the start of the region.
    
    my $A_readDatas = shift;
    my $position = shift;

    my ($low, $high) = (0, $#$A_readDatas);

    while ( $low <= $high ) {
        my $try = int( ($high+$low)/2 );
        if ( $A_readDatas->[$try]{stop} == $position ) {
            return $try;
        } elsif ( $low == $high ) {
            return $try;
        } elsif ( $A_readDatas->[$try]{stop} < $position ) {
            $low = $try+1;
        } elsif ( $A_readDatas->[$try]{stop} > $position ) {
            $high = $try-1;
        }
    }

    return -1;

}

#============================================================================#
1;