package PGF::Polishing::FindTargets::Region; =head1 NAME PGF::Polishing::FindTargets::Region Polishing module for finding target regions =head1 VERSION $Revision: 1.5 $ $Date: 2009-08-26 17:18:38 $ =head1 SYNOPSIS =head1 DESCRIPTION Finds regions of low quality and xs in consensus sequence. =head1 AUTHOR(S) Stephan Trong =head1 HISTORY =over =item * Stephan Trong 09/25/2006 Creation =item * Stephan Trong 09/16/2008 Added findOverlaps method. =back =cut #============================================================================# use strict; use warnings; use Carp; #============================================================================# sub new { my $class = shift; my %params = @_; my $self = {}; bless $self, $class; return $self; } #============================================================================# sub lowQuality { # Returns the start/stop positions of regions where the quality <= a specified # quality value. The return value is an array of references to a hash # containing key=start/stop, value=position of regions. my $self = shift; my $sequenceQuality = shift; # string containing quality values, # each separated by a space. my $minPhdQuality = shift; # if region contains quality <= this, then # return start/stop position of the region. my @regions = (); my $basePosition = 0; my $startPosition = -1; my $stopPosition = -1; while ( $sequenceQuality =~ /(\d+)/g ) { my $baseQuality = $1; $basePosition++; if ( $baseQuality <= $minPhdQuality ) { if ( $startPosition < 0 ) { $startPosition = $basePosition; } $stopPosition = $basePosition; } else { if ( $startPosition > 0 && $stopPosition > 0 ) { push @regions, { start=>$startPosition, stop=>$stopPosition }; } $startPosition = -1; $stopPosition = -1; } } if ( $startPosition > 0 && $stopPosition > 0 ) { push @regions, { start=>$startPosition, stop=>$stopPosition }; } return @regions; } #============================================================================# sub xsInBase { # Returns start/stop positions of regions containing 'x' or 'X' as a base. # The return value is an array of references to a hash containing key=start/stop, # value=position. my $self = shift; my $sequence = shift; # string containing bases. my @regions = (); my $startPositionOfRegion = -1; my $stopPositionOfRegion = -1; while ( $sequence =~ /(x)/ig ) { my $seqPosition = pos $sequence; if ( $seqPosition == $stopPositionOfRegion + 1 ){ $stopPositionOfRegion = $seqPosition; } else { if ( $startPositionOfRegion > 0 && $stopPositionOfRegion > 0 ) { push @regions, { start=>$startPositionOfRegion, stop=>$stopPositionOfRegion }; } $startPositionOfRegion = $seqPosition; $stopPositionOfRegion = $seqPosition; } } if ( $startPositionOfRegion > 0 && $stopPositionOfRegion > 0 ) { push @regions, { start=>$startPositionOfRegion, stop=>$stopPositionOfRegion }; } return @regions; } #============================================================================# sub regionsByDepth { # Groups start/stop positions of regions by the depth of coverage. The # return value is a hash where key=depth, value=array of references to a # hash with key=start/stop, value=position. my $self = shift; my $A_depthDatas = shift; # array containing changes in depth in the # format "position,depth". You can use # PGF::Cloneview::ComputeReadDepth::computeReadDepthByPosition; # to generate this list. my $A_depthsToGet = shift; # array containing depth values to retrieve. my $index = 0; my %depthPositions = (); foreach my $depthData ( @$A_depthDatas ) { my ($startPosition, $depth) = split /,/, $depthData; if ( grep /^$depth$/, @$A_depthsToGet ) { if ( $index <= $#$A_depthDatas - 1 ) { my ($stopPosition) = split /,/, $A_depthDatas->[$index+1]; push @{$depthPositions{$depth}}, { start=>$startPosition, stop=>$stopPosition-1 }; } } $index++; } return %depthPositions; } #============================================================================# sub getReadsSpanningRegion { # Returns a list of reads spanning a given region based on the specified # start and stop position. The return value is an array of references to a # hash containing key=start/stop, value=position. my $self = shift; my $A_readDatas = shift; # Array containing has reference: # start=>startPosition, # stop=>stopPosition # ordered by stop position. my $startOfRegion = shift; # Start position of region. my $endOfRegion = shift; # End position of region. my $expectedReadDepth = shift || 0; # Define this to stop scanning for # for reads if this many found. # Get read index of the read where the end position is closest to # or on the position of the start of the region. # my $index = _getReadIndexAtStartOfRegion($A_readDatas, $startOfRegion); # Get all reads spanning start and end of region. # my @readLists = (); my $numberOfReadsSpanningRegion = 0; while ( $index <= $#$A_readDatas ) { last if ( $expectedReadDepth && @readLists == $expectedReadDepth ); my $readData = $A_readDatas->[$index]; my $readStart = $readData->{start}; my $readStop = $readData->{stop}; if ( $readStart <= $startOfRegion && $readStop >= $endOfRegion ) { push @readLists, $readData; } $index++; } return @readLists; } #============================================================================# sub determineTypeOfRegion { # Determines the type of a polishing region based on the specified regular # expression match to the name of the reads within the region. See notes # on $H_regexForRegionType below for more details. my $self = shift; my $A_readDataRefsSpanningRegion = shift; my $H_regexForRegionType = shift; # Input variables: # # $A_readDataRefsSpanningRegion is a reference to the array returned by # the getReadsSpanningRegion method. # # $H_regexForRegionType is a reference to a hash containing: # hash= # value= # # This is used to determine if all reads spanning a region # matches the specified regular expression for defining # a region type. # # Example: # For determining if a region contains only 454 reads, set: # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' } # # will look for all reads spanning a region in which the # read names contain .c extension. If true, then define # region type as '454only'. # # If no match is found, the region type is set to 'singleSubclone'. my $typeOfRegion = 'singleSubclone'; foreach my $regex ( keys %$H_regexForRegionType ) { my $numberOfMatches = grep { $_->{name} =~ /$regex/ } @$A_readDataRefsSpanningRegion; # If the number of matches equals the number of reads spanning # the region, set the region type to that defined in # $H_regexForRegionType. # if ( $numberOfMatches == @$A_readDataRefsSpanningRegion ) { $typeOfRegion = $H_regexForRegionType->{$regex}; last; } } return $typeOfRegion; } #============================================================================# sub mergeRegions { # Merges regions that are a specified number of bases away from each other. # The return value is an array of references to a hash with key=start/stop, # value=position. my $self = shift; my $A_regions = shift; # reference to array containing hash with key/value: # start=>start of position, # stop=>end of position. my $mergeIfThisManyBasesApart = shift || 0; my @regions = @$A_regions; my @mergedRegions = (); # sort regions by starting position. # @regions = map { $_->[0] } sort {$a->[1] <=> $b->[1] } map { [$_, $_->{start}] } @regions; my $firstRegionRef = shift @regions; my $lastStartOfRegion = $firstRegionRef->{start}; my $lastEndOfRegion = $firstRegionRef->{stop}; while ( my $regionRef = shift @regions ) { my $startOfRegion = $regionRef->{start}; my $endOfRegion = $regionRef->{stop}; if ( $lastEndOfRegion + $mergeIfThisManyBasesApart < $startOfRegion - 1) { push @mergedRegions, { start=>$lastStartOfRegion, stop=>$lastEndOfRegion }; $lastStartOfRegion = $startOfRegion; } $lastEndOfRegion = $endOfRegion if $lastEndOfRegion < $endOfRegion; } push @mergedRegions, { start=>$lastStartOfRegion, stop=>$lastEndOfRegion }; return @mergedRegions; } #============================================================================# sub findOverlaps { # Finds overlapping regions for two list of regions passed in as a reference # to an array containing hash with key/value start=>x, stop=>y. # The return value is an array of references to a hash with key=start/stop, # value=position of the overlapping regions. my $self = shift; my $A_region1 = shift; # reference to array containing hash with key/value: # start=>start of position, # stop=>end of position of first region my $A_region2 = shift; # reference to array containing hash with key/value: # start=>start of position, # stop=>end of position of second region my @results = (); foreach my $refRegion1 ( @$A_region1 ) { my $start1 = $refRegion1->{start}; my $stop1 = $refRegion1->{stop}; foreach my $refRegion2 (@$A_region2 ) { my $startOverlap; my $stopOverlap; my $start2 = $refRegion2->{start}; my $stop2= $refRegion2->{stop}; # case: # start1 --------- stop1 # start2 ------------- stop2 # OR # start2 --- stop2 # if ( $start2 >= $start1 && $start2 < $stop1 ) { $startOverlap = $start2; # case: # start1 --------- stop1 # start2 ------------- stop2 # if ( $stop2 > $stop1 ) { $stopOverlap = $stop1; # case: # start1 --------- stop1 # start2 --- stop2 # } else { $stopOverlap = $stop2; } # case: # start1 ------------- stop1 # start2 ------------- stop2 # OR # start2 --- stop2 # } elsif ( $stop2 >= $start1 && $stop2 <= $stop1 ) { $stopOverlap = $stop2; # case: # start1 ------------- stop1 # start2 ------------- stop2 # if ( $start2 < $start1 ) { $startOverlap = $start1; # case: # start1 ------------- stop1 # start2 --- stop2 # } else { $startOverlap = $start2; } # case: # start1 --- stop1 # start2 --------- stop2 # } elsif ( $start1 >= $start2 && $stop1 <= $stop2 ) { $startOverlap = $start1; $stopOverlap = $stop1; } if ( defined $startOverlap && defined $stopOverlap ) { push @results, { start=>$startOverlap, stop=>$stopOverlap }; } } } return @results; } #============================================================================# sub getRegionsNotDefinedInList { # Determines the start/stop positions of all regions that are not part of the # specified regions. The return value is an array of references to a # hash containing key=start/stop, value=position. my $self = shift; my $startOfSegment = shift; # the start position of the entire segment. my $endOfSegment = shift; # the end position of the entire segment. my $A_definedRegions = shift; # reference to array containing hash with # key/value: # start=>start of position, # stop=>end of position. my @regionsNotDefined = (); return @regionsNotDefined if !@$A_definedRegions; # sort defined regions by starting position. # my @definedRegions = map { $_->[0] } sort {$a->[1] <=> $b->[1] } map { [$_, $_->{start}] } @$A_definedRegions; my $firstRegionRef = shift @definedRegions; my $lastDefinedRegionStart = $firstRegionRef->{start}; my $lastDefinedRegionStop = $firstRegionRef->{stop}; if ( $lastDefinedRegionStart > $startOfSegment ) { push @regionsNotDefined, { start=>$startOfSegment, stop=>$lastDefinedRegionStart-1 } } foreach my $definedRegionsRef ( @definedRegions ) { my $definedStart = $definedRegionsRef->{start}; my $definedStop = $definedRegionsRef->{stop}; push @regionsNotDefined, { start=>$lastDefinedRegionStop+1, stop=>$definedStart-1 }; $lastDefinedRegionStop = $definedStop; } push @regionsNotDefined, { start=>$lastDefinedRegionStop+1, stop=>$endOfSegment }; return @regionsNotDefined; } #============================================================================# sub changeConsensusQuality { # Change the given consensus quality to a specified value within the specified # regions. The return value is the string containing the modified quality # values. my $self = shift; my %args = @_; # Parameters for %args: # # consensusQuality=> string of consensus quality values, with each score # separated by a base. # newQualityValue=> new quality value to adjust within the specified region # regions=> reference to array of hashes specifying regions to change # the consensus quality values. # reference to array containing hash with # key/value: # start=>start of position, # stop=>end of position. # modifyQualtiyScoreOnlyIfAboveNewQualityValue=> set this to 1 if you don't want # the quality scores lower than newQualityValue to be changed. my $consensusQuality = $args{consensusQuality}; my $newQualityValue = $args{newQualityValue}; my $A_regions = $args{regions}; my $modifyQualtiyScoreOnlyIfAboveNewQualityValue = $args{modifyQualtiyScoreOnlyIfAboveNewQualityValue} || 0; # Sort regions by starting position. # my @regions = map { $_->[0] } sort {$a->[1] <=> $b->[1] } map { [$_, $_->{start}] } @$A_regions; my $position = 0; my $newConsensusQuality = ''; while ( $consensusQuality =~ /(\d+)/g ) { my $qualityValue = $1; $position++; shift @regions if @regions && $position > $regions[0]->{stop}; # if $modifyQualtiyScoreOnlyIfAboveNewQuality is set to true and # consensus quality is <= $newQualityValue, don't change. # if ( $modifyQualtiyScoreOnlyIfAboveNewQualityValue && $qualityValue <= $newQualityValue ) { $newConsensusQuality .= "$qualityValue "; next; } foreach my $refRegions (@regions) { if ( $position >= $refRegions->{start} && $position <= $refRegions->{stop} ) { $qualityValue = $newQualityValue; last; } } $newConsensusQuality .= "$qualityValue "; } $newConsensusQuality =~ s/\s+$//; return $newConsensusQuality; } #============================================================================# sub _getReadIndexAtStartOfRegion { # Binary search algorithm to return the index of of the read where the end # position is closest to or on the position of the start of the region. my $A_readDatas = shift; my $position = shift; my ($low, $high) = (0, $#$A_readDatas); while ( $low <= $high ) { my $try = int( ($high+$low)/2 ); if ( $A_readDatas->[$try]{stop} == $position ) { return $try; } elsif ( $low == $high ) { return $try; } elsif ( $A_readDatas->[$try]{stop} < $position ) { $low = $try+1; } elsif ( $A_readDatas->[$try]{stop} > $position ) { $high = $try-1; } } return -1; } #============================================================================# 1;