#!/usr/local/bin/perl package main; our $SEE; package CDNA::Genome_based_cDNA_assembler; =head1 NAME CDNA::Genome_based_cdna_assembler =cut =head1 DESCRIPTION This module is used to assemble compatible cDNA alignments. The algorithm is as follows: must describe this here. =cut use strict; use CDNA::CDNA_alignment; use Data::Dumper; my $DELIMETER = "$;,"; my $FUZZLENGTH = 20; =item new() =over 4 B instantiates a new cDNA assembler obj. B $sequence_sref $sequence_sref is a reference to a scalar containing the genomic sequence string. B $obj_href $obj_href is the object reference newly instantiated by this new method. =back =cut sub new () { my $package_name = shift; my $self = {}; bless ($self, $package_name); $self->_init(@_); return ($self); } sub _init { my $self = shift; my ($sequence_ref) = @_; $self->{incoming_alignments} = []; #these are the alignments to be assembled. $self->{assemblies} = []; #contains list of all singletons and assemblies. $self->{sequence_ref} = $sequence_ref; $self->{fuzzlength} = $FUZZLENGTH; #default setting. } =item assemble_alignments() =over 4 B assembles a series of cDNA aligmnments into one or more cDNA assemblies B @alignments @alignments is an array of CDNA::CDNA_alignment objects B none. =back =cut sub assemble_alignments { my $self = shift; my @alignments = @_; @alignments = reverse sort {$a->{length}<=>$b->{length}} @alignments; #keep in order of decreasing alignment length. $self->{incoming_alignments} = [@alignments]; #return; ## Algorithm: given cdna, merge with rest of cDNAs iteratively until merging complete. ## find unmerged cDNA, merge it if possible with all individual cdna entries. ## continue until all cDNAs have merged products, if possible. my $num_alignments = $#alignments + 1; for (my $i = 0; $i < $num_alignments; $i++) { my $seed_alignment = $alignments[$i]; if ($seed_alignment->{merged}) {next;} print "Checking seed $i\n" if $SEE; my $merged_alignment = $seed_alignment; #initialize to seed alignment. my $merged_flag = 1; my $round = 0; while ($merged_flag) { $merged_flag = 0; $round++; print "Merging round $round\n" if $SEE; for (my $j=0; $j < $num_alignments; $j++) { if ($i == $j) {next;} #no self comparisons. my $other_alignment = $alignments[$j]; if ($self->already_contains($merged_alignment, $other_alignment)) { next;} if ($self->can_merge($merged_alignment, $other_alignment)) { my $initial_merged_fli_status = $merged_alignment->is_fli(); my $initial_merged_orient = $merged_alignment->get_orientation(); $merged_alignment = $self->merge_alignments($merged_alignment, $other_alignment); $alignments[$i]->{merged} = 1; #set seed alignment merge flag. $alignments[$j]->{merged} = 1; #set other alignment merge flag. $merged_flag = 1; #indicates something actually merged this round. $merged_alignment->remap_cdna_segment_coords(); print "merged: " . $merged_alignment->toToken() . "\n" if $SEE; } } } push (@{$self->{assemblies}}, $merged_alignment); #either an assembled product, or something that won't ever merge. } } =item get_assemblies() =over 4 B returns all the alignment assemblies resulting from the assembly procedure. B none. B @assemblies @assemblies is an array of CDNA::CDNA_alignment objects. use the get_acc() method of the alignment object to retrieve all the accessions of the cDNAs that were merged into the assembly. =back =cut sub get_assemblies { my $self = shift; return (@{$self->{assemblies}}); } # private method. Determines if two alignments are compatible with one another. sub can_merge () { my $self = shift; my ($a1, $a2) = @_; print "Checking to see if can merge: " . $a1->get_acc() . ", " . $a2->get_acc() . "\n" if $::SEE; ## See if the coord spans overlap my ($a1_lend, $a1_rend) = $a1->get_coords(); my ($a2_lend, $a2_rend) = $a2->get_coords(); unless (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) { print "failed merge: No overlap between alignment spans. ($a1_lend, $a1_rend) vs. ($a2_lend, $a2_rend)\n" if $::SEE; return(0); } ## Make sure the spliced orientation is equivalent if appropriate my $a1_num_segs = $a1->get_num_segments(); my $a2_num_segs = $a2->get_num_segments(); my $a1_spliced_orientation = $a1->get_spliced_orientation(); my $a2_spliced_orientation = $a2->get_spliced_orientation(); my $a1_is_fli = $a1->is_fli(); my $a2_is_fli = $a2->is_fli(); my $fuzzlength = $self->{fuzzlength}; if ($a1_num_segs > 1 && $a2_num_segs > 1) { #if more than one segment, then spliced orientation is relevant. if ($a1_spliced_orientation ne $a2_spliced_orientation) { print "failed merge: $a1_num_segs segments vs. $a2_num_segs and opposite spliced orientations.\n" if $::SEE; return (0); } } if ($a1_is_fli && $a2_is_fli && ($a1_spliced_orientation ne $a2_spliced_orientation)) { #fli's must have same orient. print "failed merge: (a1-fli: $a1_is_fli, a2-fli: $a2_is_fli) and opposite orientations.\n" if $SEE; return (0); } ## Check all overlapping segments to ensure non-conflicting segments. my @a1_segments = $a1->get_alignment_segments(); my @a2_segments = $a2->get_alignment_segments(); ## align segment orders between a1 and a2 my ($starting_a1, $starting_a2); for (my $i = 0; $i <= $#a1_segments; $i++) { my $a1_seg = $a1_segments[$i]; my ($a1_lend, $a1_rend) = $a1_seg->get_coords(); for (my $j = 0; $j <= $#a2_segments; $j++) { my $a2_seg = $a2_segments[$j]; my ($a2_lend, $a2_rend) = $a2_seg->get_coords(); if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) { $starting_a1 = $i; $starting_a2 = $j; last; } } if (defined ($starting_a1) && defined ($starting_a2)) { last; } } unless (defined ($starting_a1) && defined ($starting_a2)) { print "failed merge: can't align two segments between overlapping alignments.\n" if $::SEE; return (0); } unless ($starting_a1 == 0 || $starting_a2 == 0) { print "failed merge: segment alignment doesn't begin at either cDNA terminus.\n" if $::SEE; return (0); } while ($starting_a1 <= $#a1_segments && $starting_a2 <= $#a2_segments) { my $a1_segment = $a1_segments[$starting_a1]; my $a2_segment = $a2_segments[$starting_a2]; my ($a1_lend, $a1_rend) = $a1_segment->get_coords(); my ($a2_lend, $a2_rend) = $a2_segment->get_coords(); if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) { ## See if have splice sites, do they exist and are they identical if ($a1_segment->has_left_splice_junction() || $a2_segment->has_left_splice_junction()) { if ($a1_segment->has_left_splice_junction() && $a2_segment->has_left_splice_junction() && $a1_lend != $a2_lend) { print "failed merge:\tboth left splice, but unequal coords: L1 ($a1_lend), L2 ($a2_lend)\n" if $::SEE; return (0); } elsif ($a1_segment->has_left_splice_junction() && ($a2_lend + $fuzzlength < $a1_lend)) { #alignment extends beyond a splice junction. print "failed merge:\tL1 left splice, L2 ($a2_lend) < L1 ($a1_lend)\n" if $::SEE; return (0); } elsif ($a2_segment->has_left_splice_junction() && ($a1_lend + $fuzzlength < $a2_lend)) { print "failed merge:\tL2 left splice, L1 ($a1_lend) < L2 ($a2_lend)\n" if $::SEE; return (0); } } if ($a1_segment->has_right_splice_junction() || $a2_segment->has_right_splice_junction()) { if ($a1_segment->has_right_splice_junction() && $a2_segment->has_right_splice_junction() && $a1_rend != $a2_rend) { print "failed merge:\tboth right splice, but unequal coords: R1($a1_rend), R2($a2_rend)\n" if $::SEE; return (0); } elsif ($a1_segment->has_right_splice_junction() && ($a2_rend - $fuzzlength > $a1_rend)) { print "failed merge:\tR1 right splice, R2($a2_rend) > R1 ($a1_rend)\n" if $::SEE; return (0); } elsif ($a2_segment->has_right_splice_junction() && ($a1_rend - $fuzzlength > $a2_rend)) { print "failed merge:\tR2 right splice, R1 ($a1_rend) > R2 ($a2_rend)\n" if $::SEE; return (0); } } } else { print "failed merge: Two ordered segments don't overlap. ($a1_lend, $a1_rend) , ($a2_lend, $a2_rend)\n" if $::SEE; return (0); } $starting_a1++; $starting_a2++; } ## Passed all tests print "Merge tests PASSED.\n" if $::SEE; return (1); } # private method # merges two alignment objects together into an assembly. sub merge_alignments () { my ($self, $a1, $a2) = @_; print "Merging <" . $a1->get_acc() . ">, <" . $a2->get_acc() . ">\n" if $::SEE; my $a1_fli = $a1->is_fli(); my $a2_fli = $a2->is_fli(); print "a1_fli: $a1_fli, a2_fli: $a2_fli\n" if $SEE; ## Determine fli status for merged product. my $merged_fli_status = ($a1_fli || $a2_fli); my $merged_orientation = $self->determine_merged_orientation($a1, $a2); #get a1 segment cooridnates; my %leftsplicecoords; #preferrentially use splice coords over Fuzzlength extensions. my %rightsplicecoords; my %a1_coords; my @a1_segments = $a1->get_alignment_segments(); foreach my $seg (@a1_segments) { my ($lend, $rend) = $seg->get_coords(); $a1_coords{$lend} = $rend; if ($seg->has_left_splice_junction()) { $leftsplicecoords{$lend} = 1; } if ($seg->has_right_splice_junction()) { $rightsplicecoords{$rend} = 1; } } # get a2 segment coordinates: my %a2_coords; my @a2_segments = $a2->get_alignment_segments(); foreach my $seg (@a2_segments) { my ($lend, $rend) = $seg->get_coords(); $a2_coords{$lend} = $rend; if ($seg->has_left_splice_junction()) { $leftsplicecoords{$lend} = 1; } if ($seg->has_right_splice_junction()) { $rightsplicecoords{$rend} = 1; } } my %merged_coords; #print "Coord dumps:\n" . Dumper (\%a1_coords) . Dumper (\%a2_coords) . "\n"; #print "\n\nBEGIN\n"; ## merge the overlapping coordinate sets between a1 and a2 alignments. foreach my $a1_lend (keys %a1_coords) { my $a1_rend = $a1_coords{$a1_lend}; my ($merged_lend, $merged_rend); foreach my $a2_lend (keys %a2_coords) { my $a2_rend = $a2_coords{$a2_lend}; if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) { #overlap ## Determine merged lend; if ($leftsplicecoords{$a1_lend}) { $merged_lend = $a1_lend; } elsif ($leftsplicecoords{$a2_lend}) { $merged_lend = $a2_lend; } else { $merged_lend = min ($a1_lend, $a2_lend); } # Determine merged rend if ($rightsplicecoords{$a1_rend}) { $merged_rend = $a1_rend; } elsif ($rightsplicecoords{$a2_rend}) { $merged_rend = $a2_rend; } else { $merged_rend = max ($a1_rend, $a2_rend); } last; } } if ($merged_lend && $merged_rend) { #print "Adding overlapped \$merged_coords{$merged_lend} = $merged_rend\n"; $merged_coords{$merged_lend} = $merged_rend; } else { #must not have been any overlap; keep a1 coordset #print "Keeping a1 coords: \$merged_coords{$a1_lend} = $a1_rend\n"; $merged_coords{$a1_lend} = $a1_rend; } } #print "adding unconsumed a2 coords:\n"; ## add non-overlapping a2-segments foreach my $a2_lend (keys %a2_coords) { my $overlap = 0; my $a2_rend = $a2_coords{$a2_lend}; foreach my $m_lend (keys %merged_coords) { my $m_rend = $merged_coords{$m_lend}; if (&overlap($a2_lend, $a2_rend, $m_lend, $m_rend)) { $overlap = 1; last; } } if (!$overlap) { #print "Consuming a2 non-overlapping coords.\n"; $merged_coords{$a2_lend} = $a2_rend; #added a2 non-overlapping coordset } else { #print "coords overlapped, not consuming.\n"; } } print Dumper (\%merged_coords) if $::SEE; ## Create a new alignment based on a1 and a2 my @alignment_segments; my $merged_length = 0; foreach my $end5 (keys %merged_coords) { my $end3 = $merged_coords{$end5}; my $alignment_seg = new CDNA::Alignment_segment($end5, $end3); push (@alignment_segments, $alignment_seg); $merged_length += abs ($end3 - $end5) + 1; } my $new_alignment = new CDNA::CDNA_alignment($merged_length, \@alignment_segments, $self->{sequence_ref}); my $new_acc = $self->merge_accs($a1->get_acc(), $a2->get_acc()); $new_alignment->set_acc($new_acc); $new_alignment->set_fli_status($merged_fli_status); $new_alignment->force_spliced_validation($merged_orientation); #print "END.\n\n"; return ($new_alignment); } #private method # returns the minimum of an array of numerical values. sub min { my @x = @_; @x = sort {$a<=>$b} @x; my $y = shift @x; return ($y); } #private method # returns the maximum of an array of numerical values. sub max { my @x = @_; @x = sort {$a<=>$b} @x; my $y = pop @x; return ($y); } #private method # returns true/false, determines whether two coordinate sets overlap each other. sub overlap { my ($a1_lend, $a1_rend, $a2_lend, $a2_rend) = @_; #print "Checking overlap @_\t"; if ($a2_rend >= $a1_lend && $a2_lend <= $a1_rend) { #overlap #print "YES\n"; return (1); } else { #print "NO\n"; return (0); } } =item toAlignIllustration() =over 4 B illustrates the individual cDNAs to be assembled along with the final products. B $max_line_chars(optional) $max_line_chars is an integer representing the maximum number of characters in a single line of output to the terminal. The default is 100. B $alignment_illustration_text $alignment_illustration_text is a string containing a paragraph of text which illustrates the alignments and assemblies. An example is below: ---> <--> <-----> <---> <---------------- (+)gi|1199466 ---> <--> <-----> <---> <------------ (+)gi|1209702 ----> <--> <---- (+)AV827070 ----> <--> <--- (+)AV828861 ----> <--> <--- (+)AV830936 ---> <--> <- (+)H36350 ASSEMBLIES: (1) ----> <--> <-----> <---> <---------------- (+) gi|1199466, gi|1209702, AV827070, AV828861, AV830936, H36350 =back =cut sub toAlignIllustration () { my $self = shift; my $max_line_chars = shift; $max_line_chars = ($max_line_chars) ? $max_line_chars : 100; #if not specified, 100 chars / line is default. ## Get minimum coord for relative positioning. my @coords; my @alignments = @{$self->{incoming_alignments}}; foreach my $alignment (@alignments) { my @c = $alignment->get_coords(); push (@coords, @c); } @coords = sort {$a<=>$b} @coords; print "coords: @coords\n" if $::SEE; my $min_coord = shift @coords; my $max_coord = pop @coords; my $rel_max = $max_coord - $min_coord; my $alignment_text = ""; ## print each alignment followed by assemblies: my $num_alignments = $#alignments + 1; $alignment_text .= "Individual Alignments: ($num_alignments)\n"; my $i = 0; foreach my $alignment (@alignments) { $alignment_text .= (sprintf ("%3d ", $i)) . $alignment->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n"; $i++; } my @assemblies = @{$self->{assemblies}}; my $num_assemblies = $#assemblies + 1; $alignment_text .= "\n\nASSEMBLIES: ($num_assemblies)\n"; foreach my $assembly (@assemblies) { $alignment_text .= " " . $assembly->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n"; } return ($alignment_text); } =over 4 =item set_fuzzlength() B Sets the fuzzlength parameter. B int B none. The fuzzlength is the length allowed to be fuzzy at the terminus of all alignments when compared to overlapping exons containining nearby splice sites. =back =cut sub set_fuzzlength { my $self = shift; my $fuzzlength = shift; $self->{fuzzlength} = $fuzzlength; } #private method # determines if two assemblies have the same accessions. sub composition_same () { my ($name1, $name2) = @_; if ($name1 eq $name2) { return (1); } else { return (0); } } #private method # creates a new name based on the accessions composing two assemblies. sub merge_accs () { my $self = shift; my @names = @_; my @nameaccs; foreach my $name (@names) { my @nameacclist = split (/$DELIMETER/, $name); push (@nameaccs, @nameacclist); } my %unique; foreach my $acc (@nameaccs) { $unique{$acc} = 1; } my @accs = sort {$a cmp $b} keys %unique; my $combo = join ($DELIMETER, @accs); return ($combo); } #private method # checks to see if an assembly already contains all the accessions built into a second assembly. sub already_contains () { ## Checks to see if a1 contains all accessions of a2 my $self = shift; my ($a1, $a2) = @_; my $a1_acc = $a1->get_acc(); my $a2_acc = $a2->get_acc(); my @a1_accs = split (/$DELIMETER/, $a1_acc); my @a2_accs = split (/$DELIMETER/, $a2_acc); my %a1_acc_hash; foreach my $acc (@a1_accs) { $a1_acc_hash{$acc} = 1; } my %a2_acc_hash; foreach my $acc (@a2_accs) { $a2_acc_hash{$acc} = 1; } foreach my $acc (keys %a2_acc_hash) { unless ($a1_acc_hash{$acc}) { return (0); } } ## if still here, then must have all a2 accs in a1. return (1); } #private sub determine_merged_orientation { my ($self, $a1, $a2) = @_; ## Determine the orientation of the merged product. my $num_a1_segments = $a1->get_num_segments(); my $num_a2_segments = $a2->get_num_segments(); my $a1_spliced_orientation = $a1->get_spliced_orientation(); my $a2_spliced_orientation = $a2->get_spliced_orientation(); print "a1_spliced_orient: $a1_spliced_orientation, a2_spliced_orient: $a2_spliced_orientation\n" if $SEE; my $merged_orientation = '+'; #initialize to a default. if ($a1_spliced_orientation eq $a2_spliced_orientation) { $merged_orientation = $a1_spliced_orientation; print "Same spliced orientation: $merged_orientation\n" if $SEE; } elsif ($a1->is_fli() || $a2->is_fli()) { $merged_orientation = ($a1->is_fli()) ? $a1_spliced_orientation : $a2_spliced_orientation; print "a1 is fli, using a1 orient: $merged_orientation\n" if ($a1->is_fli() && $SEE); print "a2 is fli, using a2 orient: $merged_orientation\n" if ($a2->is_fli() && $SEE); } elsif ($num_a1_segments > 1 || $num_a2_segments > 1) { $merged_orientation = ($num_a1_segments > 1) ? $a1_spliced_orientation : $a2_spliced_orientation; print "a1 has multiple segments, using a1 orient: $merged_orientation\n" if ($num_a1_segments > 1 && $SEE); print "a2 has multiple segments, using a2 orient: $merged_orientation\n" if ($num_a2_segments > 1 && $SEE); } else { print "Can't predict the orientation of the merged alignments ($a1_spliced_orientation vs. $a2_spliced_orientation). Using default '+'\n" if $SEE; } return ($merged_orientation); } 1; #EOM