package PGF::Polishing::FindTargets; =head1 NAME PGF::Polishing::FindTargets Module for finding target regions for polishing. =head1 VERSION $Revision: 1.6 $ $Date: 2009-08-26 17:18:36 $ =head1 SYNOPSIS =head1 DESCRIPTION This module contains methods for finding regions of low quality, x's in consensus sequences and single subclone regions. =head1 AUTHOR(S) Stephan Trong =head1 HISTORY =over =item * Stephan Trong 10/25/2006 Creation =back =cut #============================================================================# use strict; use warnings; use Carp; use Carp qw(cluck); use vars qw($RealBin); use FindBin qw($RealBin); use lib "$RealBin/../lib"; use PGF::Polishing::FindTargets::Region; use PGF::Cloneview::ComputeReadDepth; use PGF::Parsers::ParseAce::Unpad; my $_findRegionObj = PGF::Polishing::FindTargets::Region->new(); my $_unpadObject = PGF::Parsers::ParseAce::Unpad->new(); #============================================================================# sub new { my $class = shift; my %params = @_; if ( !defined $params{aceObject} ) { croak "Need to define aceObject."; } my $aceObject = $params{aceObject}; my $debug = defined $params{debug} ? $params{debug} : 0; my $self = { _aceObject=>$aceObject, _consensusTags=>{}, _debug=>$debug }; bless $self, $class; return $self; } #============================================================================# sub setContig { # Stores the contig name, sequence and quality of a given contig as attributes # of the object. my $self = shift; my $contigName = shift; my $aceObject = $self->{_aceObject}; my $contigRef = $aceObject->contig($contigName); $self->{_contigDataRef} = { name=>$contigRef->{number}, sequence=>$contigRef->{sequence_unpadded}, quality=>$contigRef->{quality}, }; } #============================================================================# sub findLowQualityRegions { # Returns the start/stop positions of regions containing quality values less # than that specified. The return value is an array of references to a # hash containing key=start/stop, value=position. my $self = shift; my $minPhdQuality = shift; my $contigRef = $self->{_contigDataRef}; my $consensusQuality = $contigRef->{quality}; my @regions = $_findRegionObj->lowQuality($consensusQuality, $minPhdQuality); return @regions; } #============================================================================# sub findXsInConsensus { # Returns the start/stop positions of regions containing 'x' or 'X' in the # sequence. The return value is an array of references to a # hash containing key=start/stop, value=position. my $self = shift; my $contigRef = $self->{_contigDataRef}; my $consensusSequence = $contigRef->{sequence}; my @regions = $_findRegionObj->xsInBase($consensusSequence); return @regions; } #============================================================================# sub findSingleSubclone { # Returns single subclone regions. The return value is an array of references # to a hash containing: # key=start, value=position, # key=stop, value=position, # key=typeOfRegion, value=name of region type # key=readDepth, value=value of depth of coverage at that region. my $self = shift; my %params = @_; # %params can be: # regexForDefiningRegionTypes = hash reference in the form: # hash= # value= # # This is used to determine if all reads spanning a region # matches the specified regular expression for defining # a region type. If one or more reads in the region do not match # criteria or if this parameter is not defined, then the type of # region is set to 'singleSubclone'. # # Example: # For determining if a region contains only 454 reads, set: # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' } # # will look for all reads spanning a region in which the # read names contain .c extension. If true, then define # region type as '454only'. # my %regexForDefiningRegionTypes = (); if ( defined $params{regexForDefiningRegionTypes} ) { %regexForDefiningRegionTypes = %{$params{regexForDefiningRegionTypes}}; } my $aceObject = $self->{_aceObject}; my $contigRef = $self->{_contigDataRef}; my $contigName = $contigRef->{name}; my @readDataRefs = (); my @readPositions = (); my @regions = (); # Initialize $unpadObject to use padded sequence from contig $contigName. # This will be used to convert start and end regions to padded positions # for determining the read sequence within the defined region. # $self->_setUnpadObject($contigName); # Get all reads for this contig from ace file and store in data structure. # Use aligned positions of the read. # $aceObject->fetchreads(-contig=>$contigName); while ( my $readDataRef = $aceObject->fetch() ) { my $readName = $readDataRef->{name}; my $readStartPosition = $readDataRef->{align_start}; my $readStopPosition = $readDataRef->{align_stop}; push @readDataRefs, { name=>$readName, start=>$readStartPosition, stop=>$readStopPosition }; push @readPositions, "$readStartPosition,$readStopPosition"; } # Compute read depth for this contig. # my @readDepths = computeReadDepthByPosition( @readPositions ); # Find regions with depth of 1x and 2x. # my @depthsToGet = ( 1, 2 ); # get 1x and 2x regions. my %depthPositions = $_findRegionObj->regionsByDepth( \@readDepths, \@depthsToGet ); if ( $self->{_debug} ) { print map "1x: $_->{start}, $_->{stop}\n", @{$depthPositions{1}}; print map "2x: $_->{start}, $_->{stop}\n", @{$depthPositions{2}}; print "\n\n"; } # For regions with depth of 2, check single subclone criteria for reads # spanning this region... # # First, sort array by read stop position. Need to this because # getReadsSpanningRegion (which is called within _getRegionsAt1XCoverage # and _getRegionsAt2XCoverage) expects the list of read data to be sorted # by stop position in ascending order. # @readDataRefs = map { $_->[0] } sort {$a->[1] <=> $b->[1] } map { [$_, $_->{stop}] } @readDataRefs; # For 1x regions, get type of region and add to @regions array. # my $atThisReadDepth = 1; # get read spanning 1x region. push @regions, $self->_getRegionsAt1XCoverage( \@readDataRefs, \%regexForDefiningRegionTypes, @{$depthPositions{$atThisReadDepth}} ); # Get reads spanning 2x regions and check for single subclone criteria. # $atThisReadDepth = 2; # get reads spanning 2x regions. push @regions, $self->_getRegionsAt2XCoverage( \@readDataRefs, \%regexForDefiningRegionTypes, @{$depthPositions{$atThisReadDepth}} ); # sort regions by starting position. # @regions = map { $_->[0] } sort {$a->[1] <=> $b->[1] } map { [$_, $_->{start}] } @regions; return @regions; } #============================================================================# sub mergeRegions { # Merges regions that are a specified number of bases away from each other. # The return value is an array of references to a hash with key=start/stop, # value=position. my $self = shift; my $A_regions = shift; my $mergeIfThisManyBasesApart = shift || 0; return $_findRegionObj->mergeRegions($A_regions, $mergeIfThisManyBasesApart); } #============================================================================# sub initConsensusTags { # Looks for consensus tags grouped by contig and stores as an object attribute. my $self = shift; my @tagTypes = @_; # example: 'polishLowQualConsensus', # 'polishSingleSubclone' # if defined, only get these consensus tags. my %cTagsByContig = $self->getConsensusTagsByContig(@tagTypes); $self->{_consensusTags} = \%cTagsByContig; } #============================================================================# sub getConsensusTagsByContig { # Returns consensus tags grouped by contig. The return value is a hash # where the key=name of contig, value=array holding the string for each # consensus tag for that contig. my $self = shift; my @tagTypes = @_; # example: 'polishLowQualConsensus', # 'polishSingleSubclone' # if defined, only get these consensus tags. my $targetTypeRegEx = @tagTypes ? join "|", @tagTypes : '\S+'; my $aceObject = $self->{_aceObject}; my @cTags = $aceObject->tag('CT'); my %cTagsByContig = (); foreach my $rawAceCtTag (@cTags) { # look for tag like: # 'Contig1 polishLowQualConsensus findPolishingRegions 12760 1304' # if ( $rawAceCtTag =~ /^(\S+) (?:$targetTypeRegEx)/ ) { push @{$cTagsByContig{$1}}, $rawAceCtTag; } } return %cTagsByContig; } #============================================================================# sub undefConsensusTags { # Clears the consensus tags data from the object. my $self = shift; $self->{_consensusTags} = []; } #============================================================================# sub getRegionsToPolishByContig { # Returns start/stop positions of regions to polish within a given contig. # The return value is an array of references to a hash with key/value pairs: # key=start, value=position, # key=stop, value=position, # key=tagType, value=name of tag type (available only if mergeRegions=0) my $self = shift; my %args = @_; # %args can be: # # contig=>name of contig (required) # extendRegionByThisNumBases=>number of bases to extend ends of region. # doNotPolishThisManyBasesAtEndOfContig=>number of bases at ends of contig # to skip polishing. # mergeRegions=>0|1 Defines whether to merge overlapping regions. Default # =0. # tagTypes=>array containing specific tag types to retrieve (optional). return () unless %{$self->{_consensusTags}}; # Get arguments. # my $contig = $args{contig} || croak 'contig is not defined.'; my $extendRegionByThisNumBases = $args{extendRegionByThisNumBases} || 0; my $doNotPolishThisManyBasesAtEndOfContig = $args{doNotPolishThisManyBasesAtEndOfContig} || 0; my $mergeRegions = $args{mergeRegions} || 0; my $A_tagType = $args{tagType} || []; my %tagTypesToRetrieve = (); # if tagTypes is defined, then set %tagTypesToRetrieve hash where # key=name of tag type, value=1. # if ( @{$A_tagType} ) { @tagTypesToRetrieve{ @$A_tagType } = (1) x @$A_tagType; } # Get contig attributes. # my $aceObject = $self->{_aceObject}; my $contigRef = $aceObject->contig($contig); my $contigSequencePadded = $contigRef->{sequence}; my $contigLength = $contigRef->{size_unpadded}; my @mergedRegions = (); my @targetRegions = (); # Declare this for the Unpad object to create a pad/unpad table for # converting pad to unpad positions and vice versa. # $_unpadObject->use($contigSequencePadded); # Get consensus tags for this contig. # my @cTags = defined $self->{_consensusTags}{$contig} ? @{$self->{_consensusTags}{$contig}} : (); # Get positions of polishing tags for this contig. # foreach my $rawAceCtTag (@cTags) { # Look for tag like: # 'Contig1 polishLowQualConsensus findPolishingRegions 12760 1304' # $1=padded start, $2=padded stop # if ( $rawAceCtTag =~ /$contig (\S+) \S+ (\-*\d+) (\-*\d+)/ ) { my $rawAceCtTagType = $1; # Convert padded positions to unpadded positions. # my $start = $_unpadObject->padToUnpad($2); my $stop = $_unpadObject->padToUnpad($3); # Don't include region if it is within the ends of the contig # by which to ignore polishing. # next if $stop < $doNotPolishThisManyBasesAtEndOfContig; next if $start > $contigLength - $doNotPolishThisManyBasesAtEndOfContig; # Extend start and stop positions by N number of bases defined # by user. # $start -= $extendRegionByThisNumBases; $stop += $extendRegionByThisNumBases; # Adjust start and stop positions to exclude N number of bases # at ends of contig (to skip polishing), where N is defined in # $doNotPolishThisManyBasesAtEndOfContig. # $start = $doNotPolishThisManyBasesAtEndOfContig if $start < $doNotPolishThisManyBasesAtEndOfContig; $stop = $contigLength - $doNotPolishThisManyBasesAtEndOfContig if $stop > $contigLength - $doNotPolishThisManyBasesAtEndOfContig; if ( $start <= $stop ) { # If tagTypes to retrieve are defined, check if this tag type # is in list. If not, don't add to target regions. # if ( %tagTypesToRetrieve && !$tagTypesToRetrieve{$rawAceCtTagType} ) { next; } push @targetRegions, { start=>$start, stop=>$stop, tagType=>$rawAceCtTagType, }; } } } # Merge regions if mergeRegion is defined as true in argument. # if ( $mergeRegions && @targetRegions ) { return $_findRegionObj->mergeRegions(\@targetRegions); # If mergeRegions not defined, return target regions. # } else { return @targetRegions; } } #============================================================================# sub getDoNotPolishRegions { # Determines the start/stop positions of all regions that are not part of the # specified regions. The return value is an array of references to a # hash containing key=start/stop, value=position. my $self = shift; my $startOfSegment = shift; my $endOfSegment = shift; my @polishingRegions = @_; my @doNotPolishRegions = (); if ( @polishingRegions ) { @doNotPolishRegions = $_findRegionObj->getRegionsNotDefinedInList( $startOfSegment, $endOfSegment, \@polishingRegions); } else { @doNotPolishRegions = ( {start=>1, stop=>$endOfSegment} ); } return @doNotPolishRegions; } #============================================================================# sub changeConsensusQuality { # Change the given consensus quality to a specified value within the specified # regions. The return value is the string containing the modified quality # values. my $self = shift; my %args = @_; # Parameters for %args: # # consensusQuality=> string of consensus quality values. # newQualityValue=> new quality value to adjust within the specified region # regions=> reference to array of hashes specifying regions to change # the consensus quality values. # reference to array containing hash with # key/value: # start=>start of position, # stop=>end of position. # modifyQualtiyScoreOnlyIfAboveNewQualityValue=> set this to 1 if you don't # want the quality scores lower than newQualityValue to be changed. return $_findRegionObj->changeConsensusQuality(%args); } #============================================================================# sub _getRegionsAt1XCoverage { # Get regions of 1X coverage. The return value is an array of references # to a hash containing: # key=start, value=position, # key=stop, value=position, # key=typeOfRegion, value=name of region type # key=readDepth, value=value of depth of coverage at that region. my $self = shift; my $A_readDataRefs = shift; my $H_regexForDefiningRegionTypes = shift; my @depthPositions = @_; # Arguments: # # $A_readDataRefs = reference to array containing hash with key/value: # name=readname, # start=>start position of read, # stop=>stop position of read, # # $H_regexForDefiningRegionTypes = hash reference in the form: # hash= # value= # # This is used to determine if all reads spanning a region # matches the specified regular expression for defining # a region type. If one or more reads in the region do not match # criteria or if this parameter is not defined, then the type of # region is set to 'singleSubclone'. # # Example: # For determining if a region contains only 454 reads, set: # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' } # # will look for all reads spanning a region in which the # read names contain .c extension. If true, then define # region type as '454only'. # # @depthPositions = array of references to a hash containing key=start/stop, # value=position. my $atThisReadDepth = 1; my @regions = (); foreach my $readDepthRef( @depthPositions ) { my $startOfRegion = $readDepthRef->{start}; my $endOfRegion = $readDepthRef->{stop}; my @readDataRefsSpanningRegion = $_findRegionObj->getReadsSpanningRegion( $A_readDataRefs, $startOfRegion, $endOfRegion, $atThisReadDepth); my $typeOfRegion = $_findRegionObj->determineTypeOfRegion( \@readDataRefsSpanningRegion, $H_regexForDefiningRegionTypes); push @regions, { start=>$startOfRegion, stop=>$endOfRegion, typeOfRegion=>$typeOfRegion, readDepth=>$atThisReadDepth }; } return @regions; } #============================================================================# sub _getRegionsAt2XCoverage { # Get regions of 1X coverage. The return value is an array of references # to a hash containing: # key=start, value=position, # key=stop, value=position, # key=typeOfRegion, value=name of region type # key=readDepth, value=value of depth of coverage at that region. my $self = shift; my $A_readDataRefs = shift; my $H_regexForDefiningRegionTypes = shift; my @depthPositions = @_; # Arguments: # # $A_readDataRefs = reference to array containing hash with key/value: # name=readname, # start=>start position of read, # stop=>stop position of read, # # $H_regexForDefiningRegionTypes = hash reference in the form: # hash= # value= # # This is used to determine if all reads spanning a region # matches the specified regular expression for defining # a region type. If one or more reads in the region do not match # criteria or if this parameter is not defined, then the type of # region is set to 'singleSubclone'. # # Example: # For determining if a region contains only 454 reads, set: # $params{regexForDefiningRegionTypes} = { '^(?:[^\.]+)\.c'=>'454only' } # # will look for all reads spanning a region in which the # read names contain .c extension. If true, then define # region type as '454only'. # # @depthPositions = array of references to a hash containing key=start/stop, # value=position. my $atThisReadDepth = 2; my @regions = (); foreach my $readDepthRef ( @depthPositions ) { my $startOfRegion = $readDepthRef->{start}; my $endOfRegion = $readDepthRef->{stop}; my $paddedStartOfRegion = $_unpadObject->unpadToPad($startOfRegion); my $paddedEndOfRegion = $_unpadObject->unpadToPad($endOfRegion); my $isSingleSubclone = 0; if ( $self->{_debug} ) { print "At 2x: $startOfRegion, $endOfRegion, ". "padded: $paddedStartOfRegion, $paddedEndOfRegion\n"; } # Get reads spanning this region. # my @readDataRefsSpanningRegion = $_findRegionObj->getReadsSpanningRegion( $A_readDataRefs, $startOfRegion, $endOfRegion, $atThisReadDepth); # Determine the type of region (e.g., singleSubclone, 454Only,...). # This is largely determined by how $H_regexForDefiningRegionTypes # is defined. If the regex defined in the hash does not match all of # the read names for all reads spanning this region, then the type of # region is defined as singleSubclone. Otherwise, it is defined based # on the value of the hash. # my $typeOfRegion = $_findRegionObj->determineTypeOfRegion( \@readDataRefsSpanningRegion, $H_regexForDefiningRegionTypes); # If type of region is 'singleSubclone', then check if the sequences # of the reads spanning the regions are identical. If not, flag as # single subclone. # if ( $typeOfRegion eq 'singleSubclone' ) { $isSingleSubclone = $self->_checkReadsForSingleSubcloneCriteria( $paddedStartOfRegion, $paddedEndOfRegion, \@readDataRefsSpanningRegion); # If type of region is other (specified by # %regexForDefiningRegionTypes), then flag as single subclone. This # means that both reads spanning the region matches the regex # criteria defined in the hash. # } else { $isSingleSubclone = 1; } if ( $self->{_debug} ) { print "isSingleSubclone=$isSingleSubclone, ". "typeOfRegion=$typeOfRegion\n\n"; } # Add region if single subclone criteria is met. # if ( $isSingleSubclone ) { push @regions, { start=>$startOfRegion, stop=>$endOfRegion, typeOfRegion=>$typeOfRegion, readDepth=>$atThisReadDepth }; } } return @regions; } #============================================================================# sub _checkReadsForSingleSubcloneCriteria { # Returns 1 if: # a) only one read in the region (single coverage). # b) reads are from the same clone. # c) reads are from different clones but sequences are different. # my $self = shift; my $paddedStartOfRegion = shift; my $paddedEndOfRegion = shift; my $A_readDatas = shift; # Arguments: # # $paddedStartOfRegion = padded start of contig region to get sequence. # $paddedEndOfRegion = padded end of contig region to get sequence. # $A_readDatas = reference to array of references to a hash containing # key=start/stop, value=position. # If only one read in region, then return 1. # return 1 if @$A_readDatas == 1; my $readDataRef = $A_readDatas->[0]; my $readName = $readDataRef->{name}; my ($cloneName) = $readName =~ /^([^\.]+)/; my $sequenceInRegion = $self->_getReadSequenceInRegion($readDataRef, $paddedStartOfRegion, $paddedEndOfRegion); # Do all reads belong to the same clone? # my $numberOfReadsFromSameClone = grep { $_->{name} =~ /^$cloneName\./ } @$A_readDatas; if ( $self->{_debug} ) { print "NumberOfReadsFromSameClone=$numberOfReadsFromSameClone\n" } return 1 if $numberOfReadsFromSameClone == @$A_readDatas; # Do all reads have the same sequence? # my $isSingleSubclone = 0; for ( my $index = 1; $index <= $#$A_readDatas; $index++ ) { my $readDataRef = $A_readDatas->[$index]; my $sequenceOfReadInRegion = $self->_getReadSequenceInRegion( $readDataRef, $paddedStartOfRegion, $paddedEndOfRegion); if ( lc $sequenceOfReadInRegion ne lc $sequenceInRegion ) { $isSingleSubclone = 1; } } return $isSingleSubclone; } #============================================================================# sub _getReadSequenceInRegion { # Returns the read sequence based on the my $self = shift; my $readDataRef = shift; # my $paddedStartOfRegion = shift; my $paddedEndOfRegion = shift; my $aceObject = $self->{_aceObject}; my $readName = $readDataRef->{name}; # get read sequence from ace file. # my $aceReadData = $aceObject->read($readName); my $paddedSequence = $aceReadData->{sequence}; my $paddedReadStart = $aceReadData->{start}; my $paddedReadStop = $aceReadData->{stop}; my $startOfCut = $paddedStartOfRegion - $paddedReadStart; my $lengthOfCut = $paddedEndOfRegion - $paddedStartOfRegion + 1; my $sequenceInRegion = substr($paddedSequence, $startOfCut, $lengthOfCut); if ( $self->{_debug} ) { print "read=$readName, ". "seq=",length($paddedSequence).", ". "readStart=$paddedReadStart, ". "readStop=$paddedReadStop, ". "startOfCut=$startOfCut, ". "lengthOfCut=$lengthOfCut\n"; print $sequenceInRegion."\n"; } return $sequenceInRegion; } #============================================================================# sub _setUnpadObject { # Creates and stores the pad/unpad conversion table as an object attribute. my $self = shift; my $contigName = shift; my $aceObject = $self->{_aceObject}; my $aceContigDataRef = $aceObject->contig($contigName); my $paddedContigSequence = $aceContigDataRef->{sequence}; $_unpadObject->use($paddedContigSequence); } #============================================================================# 1;