#!/usr/local/bin/perl ########################## ### Class CDNA_alignment ########################## package main; our $SEE; =head1 NAME CDNA::CDNA_alignment =cut =head1 DESCRIPTION This module provides an object specification for storing and manipulating cDNA alignments. The alignment coordinates along the genomic sequence are given along with a genomic sequence. Splice sites are validated and the spliced orientation is determined. =cut package CDNA::CDNA_alignment; use strict; use Exons_to_geneobj; use Gene_obj; use CDNA::Alignment_segment; use Carp qw (cluck confess croak); use GFF_maker; ## Global Vars our $ALLOW_ATAC_splice_pairs = 1; # ON by default; turn it off if you prefer. ## =over 4 =item new() B Instantiates a new CDNA::CDNA_alignment object. B $cdna_length, $Alignment_obj_aref, $sequence_sref B<$cdna_length> is the length of the complete cDNA sequence. B<$Alignment_obj_aref> is a reference to a list of CDNA::Alignment_segment objects like so: $Alignment_obj_aref = \@Alignment_obj_list B<$sequence_sref> should be a reference to the string containing the genomic sequence like so: $sequence = "gatc....."; $sequence_sref = \$sequence; B $obj_ref returns a reference to a CDNA_alignment object. This object is validated requiring consensus splice sites, and the spliced orientation of the cDNA is determined based on the validating orientation. =back =cut sub new { my $packagename = shift; my ($cdna_length, $Alignment_obj_aref, $sequence_ref) = @_; unless (@$Alignment_obj_aref) { die "No alignment segments available to create alignment object.\n"; } my $self = { acc => undef(), # cDNA accession. title =>undef(), # com_name, title, header, whatever you want to call the cdna. genomic_seq => $sequence_ref, genome_acc => undef, alignment_segs=>$Alignment_obj_aref, orientation => undef(), lend=>undef(), rend=>undef(), length=>0, # alignment span length. num_aligned_nts => 0, #number of cDNA nucleotides found in alignment cdna_length => $cdna_length, # the length of the cDNA sequence. avg_per_id => 0, # the average percent identity of this alignment. error_flag=>0, #default w/o errors. Used to indicate non-consensus splice sites. spliced_orientation => '?', # [+-?] depending on validating consenus splice sites; relative to genomic sequence orientation. num_segments=>0, percent_cdna_aligned => 0, # provides the percentage of the cDNA length that is in the alignment. is_fli => 0 #indicates whether the cDNA is a full-length insert (ie. expected to be complete). }; bless ($self, $packagename); ## Convert alignment to object form: $self->process_alignment(); if ($self->{num_segments} > 1 && (ref $sequence_ref)) { $self->identify_splice_junctions($sequence_ref); } return ($self); } sub process_alignment { my $self = shift; $self->determine_alignment_attributes(); my @alignment_segments = $self->get_alignment_segments(); for (my $i = 0; $i <= $#alignment_segments; $i++) { my $segment = $alignment_segments[$i]; if ($#alignment_segments == 0) { #single segment in alignment $segment->set_type("single"); } elsif ($i == 0) { $segment->set_type("first"); } elsif ($i == $#alignment_segments) { $segment->set_type("last"); } else { #must be internal $segment->set_type("internal"); } } $self->set_num_segments($#alignment_segments + 1); $self->verify_contiguity(); } sub identify_splice_junctions { my $self = shift; my $sequence_ref = shift; my $orientation = $self->get_orientation(); my $opposite_orientation = ($orientation eq '+') ? '-' : '+'; ## Since some cDNAs are supplied in the opposite orientation, the splice sites may be on the reverse strand. ## In this case, we must change the orientation my $error_flag = 0; my $error_text = ""; my $validating_orientation = $orientation; #initialize. my %error_lengths; #used to find best orientation (least errors) foreach my $orient ($orientation, $opposite_orientation) { $error_text = $self->validate_splice_junctions($orient, $sequence_ref); $error_lengths{$orient} = length $error_text; if ($error_text) { print "ERRORS in Splice sites given orientation $orient:\n$error_text\n" if $::SEE; unless ($error_flag) { print "Trying other strand might help?\n" if $::SEE;} $error_text = ""; #rest $error_flag = 1; } else { $error_flag = 0; $validating_orientation = $orient; last; } } if ($error_flag) { print "Sorry, still contains problematic splice sites:\n$error_text\n" if $::SEE; print "Setting error flag for this alignment\n" if $::SEE; $self->set_error_flag("Splice site validations failed"); print "ERROR: SETTING ERROR_FLAG\n" if $::SEE; ## revalidate splice sites using orientation that generated the least errors: my @orients = sort {$error_lengths{$a}<=>$error_lengths{$b}} ('+', '-'); my $best_orient = shift @orients; $self->validate_splice_junctions($best_orient, $sequence_ref); #required for appropriate token printing. } else { $self->set_spliced_orientation($validating_orientation); } } sub validate_splice_junctions { my $self = shift; my ($orient) = shift; my $sequence_ref = shift; my $errors = ""; my (@splice_boundary_pairs) = &get_consensus_splice_sites($orient); my @segments = $self->get_alignment_segments(); my $num_segments = scalar (@segments); if ($num_segments == 1) { ## no introns die "Error, trying to validate splice junctions for single segment alignment! "; } ## analyze introns for (my $i = 1; $i <= $#segments; $i++) { my ($prev_segment, $curr_segment) = ($segments[$i-1], $segments[$i]); my ($prev_lend, $prev_rend) = $prev_segment->get_coords(); my ($curr_lend, $curr_rend) = $curr_segment->get_coords(); my $splice_chars_left = uc substr($$sequence_ref, $prev_rend, 2); my $splice_chars_right = uc substr($$sequence_ref, $curr_lend -2 -1, 2); $prev_segment->set_right_splice_site_chars($splice_chars_left); $curr_segment->set_left_splice_site_chars($splice_chars_right); ## check boundaries: my $splice_pair_OK = 0; CONSENSUS_PAIR: foreach my $consensus_pair (@splice_boundary_pairs) { my ($left_chars, $right_chars) = @$consensus_pair; if ($left_chars eq $splice_chars_left && $right_chars eq $splice_chars_right) { # found consensus ## further validate any AT-AC for donor site extended consensus my ($intron_lend, $intron_rend) = ($prev_rend + 1, $curr_lend - 1); if ($ALLOW_ATAC_splice_pairs) { if ( ( $orient eq '+' && $left_chars eq 'AT') || ($orient eq '-' && $right_chars eq 'AT') ) { unless (&_validates_AT_AC_donor_extended_consensus($orient, $intron_lend, $intron_rend, $sequence_ref)) { next CONSENSUS_PAIR; } } } ## got consensus splice pair $splice_pair_OK = 1; ## left and right local are relative to intron # in methods, they reference segment $prev_segment->set_right_splice_junction(1); $curr_segment->set_left_splice_junction(1); last; } } unless ($splice_pair_OK) { $errors .= "nonconsensus splice pair [$splice_chars_left-$splice_chars_right]\n"; ## validate boundaries separately. ## this is useful if only one site is nonconsensus foreach my $consensus_pair (@splice_boundary_pairs) { my ($left_chars, $right_chars) = @$consensus_pair; if ($left_chars eq $splice_chars_left) { $prev_segment->set_right_splice_junction(1); } if ($right_chars eq $splice_chars_right) { $curr_segment->set_left_splice_junction(1); } } ## make nonconsensus lower case unless ($prev_segment->has_right_splice_junction()) { $prev_segment->set_right_splice_site_chars( lc $splice_chars_left); } unless ($curr_segment->has_left_splice_junction()) { $curr_segment->set_left_splice_site_chars( lc $splice_chars_right); } } } return ($errors); } sub verify_contiguity { my $self = shift; my $num_segments = $self->get_num_segments(); my $orient = $self->get_orientation(); if ($num_segments > 1) { my @alignment_segments = $self->get_alignment_segments(); for (my $i = 1; $i <= $#alignment_segments; $i++) { my $prev_seg = $alignment_segments[$i-1]; my $curr_seg = $alignment_segments[$i]; my $error_flag = 0; my $diff = $curr_seg->{mlend} - $prev_seg->{mrend}; if ($orient eq '+' && $diff != 1) { $error_flag = 1; }elsif ($orient eq '-' && $diff != -1) { $error_flag = 1; } if ($error_flag) { $self->set_error_flag("Incontiguous alignment"); return(); } } } } sub get_consensus_splice_sites () { my $orientation = shift; my @pairs; if ($orientation eq '+') { ## Forward pairs # GT-AG # GC-AG # AT-AC push (@pairs, ['GT', 'AG'], ['GC', 'AG']); if ($ALLOW_ATAC_splice_pairs) { push (@pairs, ['AT', 'AC']); } } else { ## Rev Comp of above: # CT-AC # CT-GC # GT-AT push (@pairs, ['CT', 'AC'], ['CT', 'GC']); if ($ALLOW_ATAC_splice_pairs) { push (@pairs, ['GT', 'AT']); } } return (@pairs); } #### sub _validates_AT_AC_donor_extended_consensus { my ($orient, $intron_lend, $intron_rend, $sequence_ref) = @_; my $forward_consensus = 'ATATCC'; my $reverse_consensus = 'GGATAT'; if ($orient eq '+') { my $long_donor_seq = uc substr($$sequence_ref, $intron_lend - 1, 6); if ($long_donor_seq eq $forward_consensus) { return (1); } } elsif ($orient eq '-') { my $long_donor_seq = uc substr($$sequence_ref, $intron_rend -6, 6); if ($long_donor_seq eq $reverse_consensus) { return (1); } } ## got here, didn't fit long consensus return (0); } sub set_orientation { my $self = shift; my $orientation = shift; $self->{orientation} = $orientation; } =over 4 =item get_aligned_orientation() B Provides the orientation of the incoming cDNA alignment. B none. B [+-] =back =cut sub get_aligned_orientation { my $self = shift; return ($self->{orientation}); } sub get_orientation { # deprecated in favor of get_aligned_orientation() my $self = shift; return ($self->get_aligned_orientation()); } sub set_title { my $self = shift; my $title = shift; $self->{title} = $title; } =over 4 =item get_title() B Provides the title for the cDNA, generally the header from a fasta file B none. B string or undef =back =cut sub get_title { my $self = shift; return ($self->{title}); } sub set_coords { my $self = shift; my ($lend, $rend) = @_; $self->{lend} = $lend; $self->{rend} = $rend; } =over 4 =item get_coords() B Provides the coordinate span for the alignment along the genomic sequence. Orientation is not implied, coordinates always provided relevant to the forward orientation. Use the get_orientation method to determine the strand. B none. B $lend, $rend $lend, $rend are integer values indicating the beginning and end coordinates of the alignment on the genomic sequence. The genomic sequence begins at position 1. =back =cut sub get_coords { my $self = shift; return ($self->{lend}, $self->{rend}); } =over 4 =item get_mcoords() B returns the cDNA sequence coordinates corresponding to the lend and rend returned by get_coords() ie. my ($lend, $rend) = $alignment->get_coords(); // always in forward orientation (lend < rend) my ($mlend, $mrend) = $alignment->get_mcoords(); $mlend corresponds to $lend $mrend corresponds to $rend Bnone B ($mlend, $mrend) =back =cut sub get_mcoords { my $self = shift; my @alignment_segments = $self->get_alignment_segments(); my $leftmost_seg = shift @alignment_segments; my $rightmost_seg = pop @alignment_segments; unless ($rightmost_seg) { $rightmost_seg = $leftmost_seg; #must only be one in which case left = right } my ($mlend, $whatever1) = $leftmost_seg->get_mcoords(); my ($whatever2, $mrend) = $rightmost_seg->get_mcoords(); return ($mlend, $mrend); } =over 4 =item get_intron_coords() B returns list of intron coordinates B none B ( [intron_lend, intron_rend], [intron_lend, intron_rend], ...) lend always less than rend =back =cut sub get_intron_coords { my $self = shift; my @segments = $self->get_alignment_segments(); my @seg_coords; foreach my $segment (@segments) { my ($exon_lend, $exon_rend) = sort {$a<=>$b} $segment->get_coords(); push (@seg_coords, [$exon_lend, $exon_rend]); } my @intron_coords; if (scalar (@seg_coords) >= 2) { ## actually have introns @seg_coords = sort {$a->[0]<=>$b->[0]} @seg_coords; my $curr_seg = shift @seg_coords; while (@seg_coords) { my $next_seg = shift @seg_coords; my ($curr_lend, $curr_rend) = @$curr_seg; my ($next_lend, $next_rend) = @$next_seg; my ($intron_lend, $intron_rend) = ($curr_rend + 1, $next_lend - 1); push (@intron_coords, [$intron_lend, $intron_rend]); $curr_seg = $next_seg; } } return (@intron_coords); } sub determine_alignment_attributes { my $self = shift; my @alignment_segments = $self->get_alignment_segments(); my @coords; my $orientation; my $num_nts_matched = 0; my $per_id_x_length = 0; foreach my $segment (@alignment_segments) { my ($lend, $rend) = $segment->get_coords(); my ($mlend, $mrend) = $segment->get_mcoords(); my $orient = $segment->get_orientation(); if (!$orientation && $orient =~ /[+-]/) { $orientation = $orient; } my $seg_length; if ($mrend && $mlend) { $seg_length = abs($mrend - $mlend) + 1; } else { $seg_length = abs ($rend - $lend) + 1; } $num_nts_matched += $seg_length; my $per_id = $segment->get_per_id(); $per_id_x_length += $per_id * $seg_length; push (@coords, $lend, $rend); } $self->set_orientation($orientation); foreach my $segment (@alignment_segments) { $segment->set_orientation($orientation); } @coords = sort {$a<=>$b} @coords; my $lend = shift @coords; my $rend = pop @coords; $self->set_coords($lend, $rend); $self->{length} = abs ($rend - $lend) + 1; $self->{num_aligned_nts} = $num_nts_matched; $self->{avg_per_id} = $per_id_x_length / $num_nts_matched; if ($self->{avg_per_id} > 100) { die "Error, can't have average per_id > 100%\n";} if ($self->{cdna_length} > 0) { $self->{percent_cdna_aligned} = $num_nts_matched / $self->{cdna_length} * 100; } } =over 4 =item get_alignment_segments() B Returns the alignment segments which comprise an alignment object, ordered by left coordinate position. B none B @alignment_segments @alignment_segments is a list of CDNA::Alignment_segment objects (see below). =back =cut sub get_alignment_segments() { my $self = shift; return (sort {$a->{lend}<=>$b->{lend}} @{$self->{alignment_segs}}); } =over 4 =item add_alignment_segment() B Adds a CDNA::Alignment_segment object to the list of segments of this CDNA_alignment object. B CDNA::Alignment_segment object. B none. =back =cut sub add_alignment_segment() { my $self = shift; my $segment = shift; push (@{$self->{alignment_segs}}, $segment); } =over 4 =item delete_all_segments() B Empties the current list of Alignment_segment objects. B None. B None. =back =cut sub delete_all_segments () { my $self = shift; @{$self->{alignment_segs}} = (); #empty the array } sub set_error_flag () { my $self = shift; my $error = shift; if ($self->{error_flag}) { $self->{error_flag} .= $error; } else { $self->{error_flag} = $error; } } =over 4 =item get_error_flag() B Provides the status of the aligments validation. B none. B [error_text|0] A text string containing the error is returned if an error exists. Otherwise, zero is returned indicating the lack of errors. =back =cut sub get_error_flag () { my $self = shift; return ($self->{error_flag}); } =over 4 =item toString() B Returns an alignment as lines of text providing coordinate information. B none. B alignment_string =back =cut sub toString() { my $self = shift; my $text = "\n\nAlignment: orientation: " . $self->{orientation} . "\n" . "coords: " . $self->{lend} . "-" . $self->{rend} . "\n"; my @segments = $self->get_alignment_segments(); foreach my $segment (@segments) { $text .= $segment->toString(); } return ($text); } #### sub to_GFF3_format { my $self = shift; my %preferences = @_; my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj"; my $match_id = $preferences{match_id} or confess "Error, require match_id attribute"; my $source = $preferences{source} || "PASA"; my $orientation = $self->get_orientation(); my @alignment_segments = $self->get_alignment_segments(); my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available"; my $GFF3_text = ""; foreach my $alignment_segment (@alignment_segments) { my ($genome_lend, $genome_rend) = $alignment_segment->get_coords(); my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords(); my $gff_struct = { seq_id => $seq_id, source => $source, type => "cDNA_match", lend => $genome_lend, rend => $genome_rend, strand => $orientation, attributes => "ID=$match_id; Target=$acc $cdna_lend $cdna_rend +", }; if (my $per_id = $alignment_segment->get_per_id()) { $gff_struct->{score} = $per_id; } $GFF3_text .= &GFF_maker::get_GFF_line($gff_struct); } return ($GFF3_text); } #### sub to_GTF_format { my $self = shift; my %preferences = @_; my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj"; my $gene_id = $preferences{gene_id} || confess "Need gene_id in preferences"; my $transcript_id = $preferences{transcript_id} || $self->get_acc(); my $source = $preferences{source} || "PASA"; my $orientation = $self->get_orientation(); my ($trans_lend, $trans_rend) = sort {$a<=>$b} $self->get_coords(); my @alignment_segments = $self->get_alignment_segments(); my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available"; my $gtf_text = join("\t", ( $seq_id, $source, "transcript", $trans_lend, $trans_rend, ".", $orientation, ".", "gene_id \"$gene_id\"; transcript_id \"$transcript_id\";") ) . "\n"; foreach my $alignment_segment (@alignment_segments) { my ($genome_lend, $genome_rend) = $alignment_segment->get_coords(); my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords(); my $per_id = $alignment_segment->get_per_id() || "."; $gtf_text .= join("\t", ( $seq_id, $source, "exon", $genome_lend, $genome_rend, $per_id, $orientation, ".", "gene_id \"$gene_id\"; transcript_id \"$transcript_id\";") ) . "\n"; } return ($gtf_text); } sub provide_cdna_segment_coords () { my $self = shift; my @segments = $self->get_alignment_segments(); my $coord_mapper = $self->{genome_cdna_coord_mapper}; foreach my $segment (@segments) { my ($lend, $rend) = $segment->get_coords(); my $mlend = $coord_mapper->{$lend}; my $mrend = $coord_mapper->{$rend}; $segment->set_mcoords($mlend, $mrend); } } =over 4 =item remap_cdna_segment_coords() B Method is used on an assembled alignment to renumber the coordinates of the assembled cDNA product, starting at 1 and ending at the assembly length. Orientation is determined by the validated spliced orientation. B none. B none. =back =cut sub remap_cdna_segment_coords () { ## Used when cDNA mapped coordinates aren't known, or when an assembly of other alignments was generated. ## reassigns cdna coords to alignment coords starting at 1 and ending at determined length. ## spliced orientation determines how the coords will map ## if spliced orient is ambiguous, aligned orient is used. my $self = shift; my %coord_mapper; my @alignment_segments = $self->get_alignment_segments(); #remember, already in increasing order. my $spliced_orient = $self->get_spliced_orientation(); my $aligned_orient = $self->get_aligned_orientation(); my $transcript_orientation = ($spliced_orient =~ /[\+\-]/) ? $spliced_orient : $aligned_orient; if ($transcript_orientation eq '-') { @alignment_segments = reverse @alignment_segments; } my $curr_pos = 0; foreach my $segment (@alignment_segments) { my ($lend, $rend) = $segment->get_coords(); my $seglength = abs ($rend - $lend) + 1; my $mlend = $curr_pos + 1; my $mrend = $curr_pos + $seglength; $curr_pos += $seglength; if ($transcript_orientation eq '-') { ($mlend, $mrend) = ($mrend, $mlend); } $segment->set_orientation($transcript_orientation); #print "setting $mlend, $mrend\n"; $coord_mapper{$lend} = $mlend; $coord_mapper{$rend} = $mrend; $segment->set_mcoords($mlend, $mrend); } ## reset aligned_orient to transcript_orient if different ## this should only happen when aligned orient and spliced orient are opposite. ## in which case, the aligned orient is reset to the spliced orient. if ($aligned_orient ne $transcript_orientation) { $self->set_orientation($transcript_orientation); } } =over 4 =item toToken() B similar to the toString() method, but returns a pretty line of text summarizing the alignment and splice site data. B none. B text_line Here is an example: orient(-/-) align: 103762(2613)-104359(2016)ECT....ACE104452(2015)-105229(1238)ECT....ACE105315(1237)-105482(1070)ECT....ACE105582(1069)-105842(809)ECT....ACE105935(808)-105985(758)ECT....ACE106071(757)-106316(512)ECT....ACE106394(511)-106619(286)ECT....ACE106712(285)-106879(118)ECT....ACE107427(117)-107543(1) or orient(+/+) align: 83074(1)-83318(245)EGT....AGE83637(246)-83702(311)EGT....AGE83796(312)-83846(362)EGT....AGE83938(363)-84017(442)EGT....AGE84308(443)-84352(487)EGT....AGE84467(488)-84507(528) The orient specification includes (cDNA sequence alignment orientation/ spliced orientation). These are different when the reverse-complement of the sequence is provided, ascertained by the aligned orientation with consensus splice sites. =back =cut sub toToken () { my $self = shift; my $orientation = $self->get_orientation(); my $spliced_orientation = $self->get_spliced_orientation(); my @alignment_segments = $self->get_alignment_segments(); my $assembled_token = "orient(a$orientation/s$spliced_orientation) align: "; for (my $i = 0; $i <= $#alignment_segments; $i++) { my $segment = $alignment_segments[$i]; $assembled_token .= $segment->toToken(); unless ($i == $#alignment_segments) { $assembled_token .= "...."; } } return ($assembled_token); } sub set_num_segments { my $self = shift; my $num_segments = shift; $self->{num_segments} = $num_segments; } =over 4 =item get_num_segments() B method provides the number of segments composing an alignment. B none. B int =back =cut sub get_num_segments { my $self = shift; return ($self->{num_segments}); } sub set_spliced_orientation { my $self = shift; my $orientation = shift; $self->{spliced_orientation} = $orientation; } =over 4 =item get_spliced_orientation () B provides the validating spliced orientation for an alignment. In some cases this will be different from the alignment orientation; for example, in cases where the cDNA sequence is provided in the reverse orientation. B none B [+|-|undef()] undef is returned if the alignment did not validate properly. See get_error_flag() =back =cut sub get_spliced_orientation { my $self = shift; return ($self->{spliced_orientation}); } =over 4 =item set_acc() B Method sets the accession field of the cDNA sequence. B string Provide the accession for the cDNA corresponding to this alignment. B none. =back =cut sub set_acc () { my $self = shift; my $acc = shift; $self->{acc} = $acc; } =over 4 =item get_acc() B Method provides the accession for the cDNA in the alignment. B none B string =back =cut sub get_acc () { my $self = shift; return ($self->{acc}); } =over 4 =item set_fli_status() B sets the is_fli attribute of the cDNA alignment, indicative of a full-length insert clone (or complete cDNA sequence). B [1|0] 1 = true, 0 = false. B [1|0] =back =cut sub set_fli_status { my $self = shift; my $is_fli_status = shift; $self->{is_fli} = $is_fli_status; } =over 4 =item is_fli() BProvides the full-length insert status of the cDNA. B none. B [0|1] =back =cut sub is_fli { my $self = shift; return ($self->{is_fli}); } =over 4 =item toAlignIllustration() B Provides a single line of text which illustrates the gapped alignment. See the example below. B none. B string Here is an example of an illustrated alignment: ------> <---> <----- (-)asmbl_6711 =back =cut sub toAlignIllustration () { my ($self, $subtract, $rel_max, $max_line_chars) = @_; my $spliced_orient = $self->get_spliced_orientation(); my $orient = $self->get_orientation(); my @segments = $self->get_alignment_segments(); my @chars = (); my $converter = sub {my $coord = shift; return ( int ( ($coord - $subtract)/$rel_max * $max_line_chars + 0.5)); }; foreach my $segment (@segments) { my ($lend, $rend) = $segment->get_coords(); my $l_rel = &$converter($lend); #print "lend: $lend -> l_rel: $l_rel\n" if $::SEE; my $r_rel = &$converter($rend); #print "rend: $rend -> r_rel: $r_rel\n" if $::SEE; for (my $i = $l_rel; $i <= $r_rel; $i++) { $chars[$i] = '-'; } if ($segment->has_left_splice_junction()) { $chars[$l_rel] = '<'; } elsif ( (! $segment->is_first()) && (! $segment->is_single_segment())) { $chars[$l_rel] = '|'; } if ($segment->has_right_splice_junction()) { $chars[$r_rel] = '>'; } elsif ( (! $segment->is_last()) && (! $segment->is_single_segment())) { $chars[$r_rel] = '|'; } } #fill rest of line with spaces. for (my $i = 0; $i <= $#chars; $i++) { unless ($chars[$i]) { $chars[$i] = ' '; } } my $outline = join ("", @chars); my $acc = $self->get_acc(); $acc =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove any control characters from accession. my $fli_status = ($self->is_fli()) ? " FL" : "";; return ($outline . "\t(a$orient/s$spliced_orient)" . $acc . $fli_status); } =over 4 =item get_gene_obj_via_alignment() B Creates a Gene_obj object based on an alignment using the Exons_to_geneobj.pm module. B B Gene_obj The object returned is of the type Gene_obj defined in Gene_obj.pm =back =cut sub get_gene_obj_via_alignment { my $self = shift; my $partial_info_href = shift; # { 5prime => 0|1, 3prime => 0|1 }, optional my $orient = $self->get_spliced_orientation(); if ($orient eq '+' || '-') { return($self->_get_gene_obj_via_alignment_by_orient($orient, $partial_info_href)); } elsif ($orient eq '?') { ## find the orientation that provides the longest ORF my $plus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('+', $partial_info_href); my $minus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('-', $partial_info_href); if ($plus_orient_gene->get_CDS_length() >= $minus_orient_gene->get_CDS_length()) { return($plus_orient_gene); } else { return($minus_orient_gene); } } else { confess "cannot process spliced orientation of $orient "; } } sub _get_gene_obj_via_alignment_by_orient { my ($self, $orient, $partial_info_href) = @_; my @alignment_segments = $self->get_alignment_segments(); my %coords; foreach my $segment (@alignment_segments) { my ($end5, $end3) = $segment->get_coords(); if ($orient eq '-') { ($end5, $end3) = ($end3, $end5); #force coordinates to contain orientation info. } $coords{$end5} = $end3; } my $genomic_seq_ref = $self->{genomic_seq}; my $gene_obj; if ($genomic_seq_ref) { ## find ORF $gene_obj = Exons_to_geneobj::create_gene_obj(\%coords, $genomic_seq_ref, $partial_info_href); } else { ## No ORF $gene_obj = new Gene_obj; $gene_obj->populate_gene_obj({}, \%coords); } return ($gene_obj); } =over 4 =item force_spliced_validation() B Routine used for testing purposes. Any fake alignment can be created and set to validate to the corresponding orientation using this routine. B [+|-] B none. =back =cut sub force_spliced_validation { my $self = shift; my $orientation = shift; unless ($orientation eq '+' || $orientation eq '-') { croak ("cannot force spliced validation to $orientation.\n"); return; } my ($right_splice, $left_splice) = ($orientation eq '+') ? ('XX','YY') : ('YY','XX'); my @alignment_segments = $self->get_alignment_segments(); foreach my $alignment_segment (@alignment_segments) { if ($alignment_segment->is_internal() || $alignment_segment->is_last()) { $alignment_segment->set_left_splice_junction(1); $alignment_segment->set_left_splice_site_chars($left_splice); } if ($alignment_segment->is_internal() || $alignment_segment->is_first()) { $alignment_segment->set_right_splice_junction(1); $alignment_segment->set_right_splice_site_chars($right_splice); } $alignment_segment->set_orientation($orientation); } $self->set_spliced_orientation($orientation); } =over 4 =item clone() BClones a CDNA_alignment object into a new CDNA_alignment object with same attributes. Performs a deep copy, so all alignment segments contained within the cloned CDNA_alignment object are also clones. B none. B new CDNA_alignment =back =cut sub clone { my $self = shift; my $packagename = ref $self; my $clone = {}; bless ($clone, $packagename); foreach my $key (keys %$self) { $clone->{$key} = $self->{$key}; } $clone->{alignment_segs} = []; foreach my $alignment_segment ($self->get_alignment_segments()) { $clone->add_alignment_segment($alignment_segment->clone()); } return ($clone); } =over 4 =item get_genomic_seq_ref() B Returns a scalar refernence to the genomic sequence. B none. B string_ref =back =cut sub get_genomic_seq_ref { my $self = shift; return ($self->{genomic_seq}); } =over 4 =item extractSplicedSequence() B Returns a string corresponding to the spliced cDNA sequence. B none. B scalar =back =cut sub extractSplicedSequence { my $self = shift; my ($genomic_seq_ref) = @_; my $genomic_seq = $genomic_seq_ref; unless ($genomic_seq) { $genomic_seq = $self->{genomic_seq}; unless (ref $genomic_seq) { confess "Can't extract the spliced sequence when no genomic sequence reference is available.\n"; } } my @segments = $self->get_alignment_segments(); my $splicedSequence = ""; my $toggle = 0; foreach my $segment (@segments) { my ($lend, $rend) = $segment->get_coords(); my $length = abs ($rend - $lend) + 1; my $exonseq = substr ($$genomic_seq, $lend - 1, $length); # alternate case among segments to facilitate manual identification of junctions. if ($toggle) { $exonseq = lc $exonseq; $toggle = 0; } else { $exonseq = uc $exonseq; $toggle = 1; } $splicedSequence .= $exonseq; } my $orient = $self->get_orientation(); if ($orient eq "-") { #reverse complement the sequence: $splicedSequence = reverse ($splicedSequence); $splicedSequence =~tr/ACGTacgtyrkmYRKM/TGCAtgcarymkRYMK/; } return ($splicedSequence); } =over 4 =item get_cDNA_to_genomic_coordinates() B converts a cDNA sequence -relative coordinates to the corresponding genomic sequence coordinates. B @coordinates list of integers B @converted_coordinates list of integers. =back =cut sub get_cDNA_to_genomic_coordinates { my $self = shift; my @coordinates = @_; my @ret_coordinates = (); my @segments = $self->get_alignment_segments(); foreach my $cdna_coord (@coordinates) { my $corresponding_segment; foreach my $seg (@segments) { my ($mlend, $mrend) = sort {$a<=>$b} $seg->get_mcoords(); if ($cdna_coord >= $mlend && $cdna_coord <= $mrend) { $corresponding_segment = $seg; last; } } unless (ref $corresponding_segment) { confess "Error, cDNA coordinate ($cdna_coord) not found in segment list: " . $self->toToken(); } my ($lend, $rend) = $corresponding_segment->get_coords(); my ($mlend, $mrend) = $corresponding_segment->get_mcoords(); my $orient = $corresponding_segment->get_orientation(); my $genomic_coord = undef; if ($orient eq '+') { my $diff = $cdna_coord - $mlend; $genomic_coord = $lend + $diff; } elsif ($orient eq '-') { ## mlend > mrend my $diff = $cdna_coord - $mrend; $genomic_coord = $rend - $diff; } push (@ret_coordinates, $genomic_coord); } return (@ret_coordinates); } =item get_genomic_to_cDNA_coordinates() B converts coordinates in the genome to coordinates in the cDNA: B @coordinates list of integers B @converted_coordinates list of integers. Undef is returned for each position that could not be converted to the genomic coordinate system because it was not found within the aligned region. =back =cut #### sub get_genomic_to_cDNA_coordinates { my $self = shift; my @coordinates = @_; my @ret_coordinates = (); my @segments = $self->get_alignment_segments(); foreach my $genomic_coord (@coordinates) { my $corresponding_segment; foreach my $seg (@segments) { my ($lend, $rend) = $seg->get_coords(); if ($genomic_coord >= $lend && $genomic_coord <= $rend) { $corresponding_segment = $seg; last; } } unless (ref $corresponding_segment) { confess "Error, genomic coordinate ($genomic_coord) not found in segment list:" . $self->toToken(); } my ($lend, $rend) = $corresponding_segment->get_coords(); my ($mlend, $mrend) = $corresponding_segment->get_mcoords(); my $orient = $corresponding_segment->get_orientation(); my $diff = $rend - $genomic_coord; my $cdna_coord = undef; if ($orient eq '+') { $cdna_coord = $mrend - $diff; } elsif ($orient eq '-') { $cdna_coord = $mrend + $diff; } push (@ret_coordinates, $cdna_coord); } return (@ret_coordinates); } #### sub overlaps_genome_span { my ($self, $other_alignment) = @_; my ($lend_A, $rend_A) = sort {$a<=>$b} $self->get_coords(); my ($lend_B, $rend_B) = sort {$a<=>$b} $other_alignment->get_coords(); if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B)) { return(1); } else { return(0); } } sub has_overlapping_segment { my ($self, $other_alignment) = @_; unless ($self->overlaps_genome_span($other_alignment)) { return(0); } my @self_segments = $self->get_alignment_segments(); my @other_segments = $self->get_alignment_segments(); foreach my $segment_A (@self_segments) { my ($lend_A, $rend_A) = sort {$a<=>$b} $segment_A->get_coords(); foreach my $segment_B (@other_segments) { my ($lend_B, $rend_B) = sort {$a<=>$b} $segment_B->get_coords(); if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B) ) { return(1); } } } return(0); # no overlapping segment } =over 4 =item is_compatible() B Returns true (1) if this alignment is found compatible with the other_alignment_obj B ($other_alignment_obj, $fuzz_dist) Alignments A and B are of type CDNA::CDNA_alignment B [1|0] Compatibility between alignment objects requires: -within their region of overlap, introns are identical -if both have spliced orientations, they must be on the same strand =back =cut #### sub is_compatible { my $self = shift; my ($other_alignment, $fuzzlength) = @_; if (!defined $fuzzlength) { $fuzzlength = 0; ## no fuzzy termini allowed } ## The compatibility test requires: # -alignments must have the same spliced orientation if not ? # -alignments must overlap # -alignments must have identical introns in their region of overlap (taking into account the fuzz distance for terminal exons) my ($a_lend, $a_rend) = $self->get_coords(); my $a_spliced_orient = $self->get_spliced_orientation(); my $a_num_segments = $self->get_num_segments(); my ($b_lend, $b_rend) = $other_alignment->get_coords(); my $b_spliced_orient = $other_alignment->get_spliced_orientation(); my $b_num_segments = $other_alignment->get_num_segments(); ## overlap test: unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) { return (0); # not compatible } ## transcribed orientation test: if ($a_spliced_orient ne $b_spliced_orient && $a_spliced_orient ne '?' && $b_spliced_orient ne '?') { # neither alignment is ambiguously oriented and they're transcribed on opposite strands: return (0); # not compatible } ## same introns test: if ($a_num_segments > 1 || $b_num_segments > 1) { my @a_introns = $self->get_intron_coords(); my @b_introns = $self->get_intron_coords(); my ($overlapping_lend, $overlapping_rend) = &_get_coords_of_overlap($a_lend, $a_rend, $b_lend, $b_rend); print "Overlapping coords between alignments: $overlapping_lend to $overlapping_rend\n" if $SEE; ## make adjustments to required overlap coordinates considering fuzzlength: if ($fuzzlength) { ## adjust left overlap boundary requirement $overlapping_lend = &_adjust_left_overlap_boundary_via_fuzzlength($overlapping_lend, $self, $other_alignment, $fuzzlength); $overlapping_rend = &_adjust_right_overlap_boundary_via_fuzzlength($overlapping_rend, $self, $other_alignment, $fuzzlength); print "\tadjusted overlapping coords to: $overlapping_lend to $overlapping_rend\n" if $SEE; } ## find introns within adjusted overlap range and ensure identity my @a_intron_coords = $self->get_intron_coords(); my @b_intron_coords = $other_alignment->get_intron_coords(); my @a_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@a_intron_coords); my @b_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@b_intron_coords); if (@a_introns_in_range || @b_introns_in_range) { ## ensure identity: my %all_introns; my %a_introns; foreach my $coordset (@a_introns_in_range) { my $key = join (",", @$coordset); $a_introns{$key} = 1; $all_introns{$key} = 1; } my %b_introns; foreach my $coordset (@b_introns_in_range) { my $key = join (",", @$coordset); $b_introns{$key} = 1; $all_introns{$key} = 1; } foreach my $intron_key (keys %all_introns) { unless ($a_introns{$intron_key} && $b_introns{$intron_key}) { return (0); # not compatible, an intron difference in the overlapping region exists. } } } } ## if got this far, passed all compatibility tests. return (1); # yes, compatible. } #### sub _get_contained_coords { my ($lend, $rend, $coordsets_aref) = @_; my @contained_coords; foreach my $coordset (@$coordsets_aref) { my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset; if ($lend <= $coord_lend && $coord_rend <= $rend) { ## coordset contained push (@contained_coords, $coordset); } } return (@contained_coords); } #### # get introns that overlap lend and rend, and set termini of intron coords to these values if they extend beyond them. sub _get_overlapping_capped_introns { my ($lend, $rend, $coordsets_aref) = @_; my @contained_coords; foreach my $coordset (@$coordsets_aref) { my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset; if ($lend <= $coord_rend && $rend >= $coord_lend) { ## coordset contained if ($coord_lend < $lend) { $coord_lend = $lend; } if ($coord_rend > $rend) { $coord_rend = $rend; } push (@contained_coords, [$coord_lend, $coord_rend]); } } return (@contained_coords); } #### sub _adjust_left_overlap_boundary_via_fuzzlength { my ($overlapping_lend, $alignment_a, $alignment_b, $fuzzlength) = @_; ## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments ## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_lend); my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_lend); my @bounds = (); if ($a_overlapping_segment) { my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords(); push (@bounds, $a_seg_rend); } if ($b_overlapping_segment) { my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords(); push (@bounds, $b_seg_rend); } if (@bounds) { my $max_bound = max_coord(@bounds); my $delta = $max_bound - $overlapping_lend; if ($delta < 0) { confess "Error, delta left bound is less than zero"; } my $fuzz_employed = min_coord($fuzzlength, $delta); $overlapping_lend += $fuzz_employed; } return ($overlapping_lend); } #### sub _adjust_right_overlap_boundary_via_fuzzlength { my ($overlapping_rend, $alignment_a, $alignment_b, $fuzzlength) = @_; ## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments ## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_rend); my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_rend); my @bounds = (); if ($a_overlapping_segment) { my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords(); push (@bounds, $a_seg_lend); } if ($b_overlapping_segment) { my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords(); push (@bounds, $b_seg_lend); } if (@bounds) { my $min_bound = min_coord(@bounds); my $delta = $overlapping_rend - $min_bound; if ($delta < 0) { confess "Error, delta left bound is less than zero"; } my $fuzz_employed = min_coord($fuzzlength, $delta); $overlapping_rend -= $fuzz_employed; } return ($overlapping_rend); } #### sub find_segment_containing_coord { my $self = shift; my ($coord) = @_; my @segments = $self->get_alignment_segments(); foreach my $segment (@segments) { my ($lend, $rend) = $segment->get_coords(); if ($lend <= $coord && $coord <= $rend) { return ($segment); } } return (undef); # none found } sub _overlap { my ($a1_lend, $a1_rend, $a2_lend, $a2_rend) = @_; #print "Checking overlap @_\t"; if ($a2_rend >= $a1_lend && $a2_lend <= $a1_rend) { #overlap #print "YES\n"; return (1); } else { #print "NO\n"; return (0); } } =over 4 =item encapsulates() B Returns true (1) if this alignment encapsulates the span of the other alignment object B ($other_alignment_obj, $fuzz_dist) Alignments A and B are of type CDNA::CDNA_alignment B [1|0] =back =cut sub encapsulates { my $self = shift; my ($alignmentB, $fuzz_dist) = @_; my $alignmentA = $self; if (! defined $fuzz_dist) { $fuzz_dist = 0; } my ($alend, $arend) = $alignmentA->get_coords(); my ($blend, $brend) = $alignmentB->get_coords(); if ($blend + $fuzz_dist >= $alend && $brend - $fuzz_dist <= $arend) { return (1); } else { return (0); } } #### sub _get_coords_of_overlap { my ($a_lend, $a_rend, $b_lend, $b_rend) = @_; unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) { confess "Error, trying to get coordinates of overlapping region for two features that do not overlap!"; } my $overlapping_lend = &max_coord($a_lend, $b_lend); my $overlapping_rend = &min_coord($a_rend, $b_rend); return ($overlapping_lend, $overlapping_rend); } #### sub min_coord { my @coords = @_; @coords = sort {$a<=>$b} @coords; my $min_coord = shift @coords; return ($min_coord); } #### sub max_coord { my @coords = @_; @coords = sort {$a<=>$b} @coords; my $max_coord = pop @coords; return ($max_coord); } 1; #EOM