#!/usr/local/bin/perl package main; our $SEE; package CDNA::PASA_alignment_assembler; =head1 NAME CDNA::PASA_alignment_assembler =cut =head1 DESCRIPTION This module is used to assemble compatible cDNA alignments. The algorithm is as follows: must describe this here. =cut use strict; use CDNA::CDNA_alignment; use Data::Dumper; use Carp; ## File scoped globals: my $DELIMETER = "$;,"; our $FUZZLENGTH = 20; =item new() =over 4 B instantiates a new cDNA assembler obj. B none B $obj_href $obj_href is the object reference newly instantiated by this new method. =back =cut sub new { my $package_name = shift; my $self = {}; bless ($self, $package_name); $self->_init(@_); return ($self); } sub _init { my $self = shift; $self->{incoming_alignments} = []; #these are the alignments to be assembled. $self->{assemblies} = []; #contains list of all singletons and assemblies. $self->{fuzzlength} = $FUZZLENGTH; #default setting. my $pasa_bin = `which pasa`; $pasa_bin =~ s/\s//g; unless (-x $pasa_bin) { confess "Error, pasa binary [$pasa_bin] isn't executable or couldn't be found."; } $self->{pasa_bin} = $pasa_bin; } =item assemble_alignments() =over 4 B assembles a series of cDNA aligmnments into one or more cDNA assemblies using a directed acyclic graph. B @alignments @alignments is an array of CDNA::CDNA_alignment objects B none. =back =cut sub assemble_alignments { my $self = shift; my @alignments = @_; @alignments = sort {$a->{lend}<=>$b->{lend}} @alignments; #keep in order of lend across genomic sequence to provide a layout. $self->{incoming_alignments} = [@alignments]; my %accs; my %spliced_orientations; # track so can set later in each assembly based on content. my %aligned_orientations; my %FL_accs; foreach my $alignment (@alignments) { my $acc = $alignment->get_acc(); $accs{$acc} = 0; my $spliced_orient = $alignment->get_spliced_orientation(); $spliced_orientations{$acc} = $spliced_orient; my $aligned_orient = $alignment->get_orientation(); $aligned_orientations{$acc} = $aligned_orient; $FL_accs{$acc} = $alignment->is_fli(); } $self->{accs_in_assemblies} = \%accs; my $num_alignments = $#alignments + 1; $self->force_flexorient('+'); my @assemblies = $self->pasa_cpp_assemblies('+'); $self->force_flexorient('-'); push (@assemblies, $self->pasa_cpp_assemblies('-')); # sort in order of decreasing score @assemblies = reverse sort {$a->{num_contained_aligns}<=>$b->{num_contained_aligns}} @assemblies; if ($SEE) { print "\n\nScore summary for all assemblies (nr set unchosen):\n\n"; foreach my $assembly (@assemblies) { print "score: " . $assembly->{num_contained_aligns} . ", " . $assembly->toToken . "\n"; } print "\n\n"; } my @report_assemblies; my $still_missing = 1; foreach my $assembly (@assemblies) { print "\nAnalyzing assembly.\n" if $SEE; my $contained_aligns_aref = $assembly->{contained_aligns}; my $have_unseen = 0; my $spliced_orient = '?'; my $is_fli = 0; my %aligned_orient_counts; foreach my $acc (@$contained_aligns_aref) { print "got: $acc\n" if $SEE; # check to see if we've encountered this one yet. unless ($accs{$acc}) { $have_unseen = 1; } $accs{$acc} = 1; my $curr_spliced_orient = $spliced_orientations{$acc}; print "curr_spliced_orient: $curr_spliced_orient\n" if $SEE; if ($curr_spliced_orient ne '?') { if ($spliced_orient ne '?' && $spliced_orient ne $curr_spliced_orient) { ## cannot have conflicting spliced orientations in the assembly: corruption. die "Fatal: conflicting spliced orientations in current PASA assembly results (spliced_orient: $spliced_orient, $acc has $curr_spliced_orient).\n"; } $spliced_orient = $curr_spliced_orient; ## retain original spliced orientation. } if ( (!$is_fli) && $FL_accs{$acc}) { $is_fli = 1; } ## track aligned orientation $aligned_orient_counts{ $aligned_orientations{$acc} } ++; } ## set assembly orientation $assembly->set_spliced_orientation($spliced_orient); if ($spliced_orient eq '?') { ## set aligned orientation based on a majority vote my @orients = reverse sort { $aligned_orient_counts{$a} <=> $aligned_orient_counts{$b} } keys %aligned_orient_counts; my $winning_aligned_orient = shift @orients; $assembly->set_orientation($winning_aligned_orient); } else { # got spliced orientation, use it for aligned orientation too. $assembly->set_orientation($spliced_orient); } $assembly->set_fli_status($is_fli); if ($have_unseen) { print $assembly->toToken . "\n" if $SEE; push (@report_assemblies, $assembly); } $still_missing = 0; foreach my $key (keys %accs) { if (! $accs{$key}) { $still_missing = 1; print "still missing: $key\n" if $SEE; } } if (! $still_missing) { last; #got them all. } } $self->{assemblies} = \@report_assemblies; if ($still_missing) { die "Didn't obtain assemblies describing all maximal assemblies.\n"; } } sub pasa_cpp_assemblies { my $self = shift; my $forced_orient = shift; my $prev_input_sep = $/; $/ = "\n"; my $sequence_ref; my $incoming_alignments_aref = $self->{incoming_alignments}; # create input file for pasa-cpp implementation: my $pasa_input = "pasa.$$.$forced_orient.in"; my $pasa_output = "pasa.$$.$forced_orient.out"; my @assemblies; open (TMPIN, ">$pasa_input") or die "Can't open file $pasa_input"; foreach my $alignment (@$incoming_alignments_aref) { my $acc = $alignment->get_acc(); ## commas not allowed in acc name: if ($acc =~ /,/) { die "ERROR, $acc accession contains comma(s). This is not allowed.\n"; } my $orient = $alignment->{fixed_orient}; my $alignText = "$acc,$orient"; unless (ref $sequence_ref) { $sequence_ref = $alignment->get_genomic_seq_ref(); } foreach my $seg ($alignment->get_alignment_segments()) { my ($lend, $rend) = $seg->get_coords(); $alignText .= ",$lend-$rend"; } print TMPIN $alignText . "\n"; } close TMPIN; if ($SEE) { print "PASA_INPUT ($forced_orient):\n====\n"; system "cat $pasa_input"; print "====\n"; } my $cmd = $self->{pasa_bin} . " $pasa_input > $pasa_output"; my $ret = system $cmd; if ($ret) { system "mv $pasa_input pasa_killer.input"; print STDERR "PASA died on input file. See pasa_killer.input"; die; } else { # process the output. open (TMPOUT, $pasa_output) or die "Can't open $pasa_output"; while () { if (/assembly:\s\(\d+\)\scontains\salignments:\s\[([^\]]+)\]\swith\sstructure\s\[([^\]]+)\]/) { print "Extracting assembly output: $_" if $SEE; my $acclist = $1; my $aligndescript = $2; my @x = split (/,/, $aligndescript); shift @x; my $orient = shift @x; my @alignSegs; my $length = 0; foreach my $coordset (@x) { my ($lend, $rend) = sort {$a<=>$b} split (/-/, $coordset); my $seg = new CDNA::Alignment_segment($lend, $rend); $length += ($rend - $lend) + 1; push (@alignSegs, $seg); } my $assembly = new CDNA::CDNA_alignment($length, \@alignSegs, $sequence_ref); my @accs = split (/,/, $acclist); my $num_accs = $#accs + 1; $assembly->{contained_aligns} = [@accs]; $assembly->{num_contained_aligns} = $num_accs; $acclist =~ s/,/\//g; #convert list of accessions into a new accession representing a single entry (unity) # if we keep the commas, use of this assembly in future PASA runs will break the assembler # because of the input file requirements. $assembly->set_acc($acclist); push (@assemblies, $assembly); } } close TMPOUT; if ($SEE) { print "PASA_OUTPUT ($forced_orient):\n####\n"; system "cat $pasa_output"; print "####\n"; } unlink ($pasa_input, $pasa_output) unless $SEE; } $/ = $prev_input_sep; ## restore return (@assemblies); } sub force_flexorient { my $self = shift; my $orient = shift; my $alignments_aref = $self->{incoming_alignments}; my $num_alignments = $#{$alignments_aref} + 1; ## Fix orientations of fli and multi-segment alignments: for (my $i = 0; $i < $num_alignments; $i++) { my $alignment = $alignments_aref->[$i]; my $num_segments = $alignment->get_num_segments(); my $spliced_orientation = $alignment->get_spliced_orientation(); if ($spliced_orientation =~ /^[+-]$/) { # set specifically $alignment->{fixed_orient} = $spliced_orientation; ## adding tag, using only in this module. } else { print "Setting $i to flex orient: $orient.\n" if $SEE; $alignment->{fixed_orient} = $orient; } } } sub unique_entries { my @x = @_; my %z; foreach my $y (@x) { $z{$y}=1; } return (keys %z); } =item get_assemblies() =over 4 B returns all the alignment assemblies resulting from the assembly procedure. B none. B @assemblies @assemblies is an array of CDNA::CDNA_alignment objects. use the get_acc() method of the alignment object to retrieve all the accessions of the cDNAs that were merged into the assembly. =back =cut sub get_assemblies { my $self = shift; return (@{$self->{assemblies}}); } =item toAlignIllustration() =over 4 B illustrates the individual cDNAs to be assembled along with the final products. B $max_line_chars(optional) $max_line_chars is an integer representing the maximum number of characters in a single line of output to the terminal. The default is 100. B $alignment_illustration_text $alignment_illustration_text is a string containing a paragraph of text which illustrates the alignments and assemblies. An example is below: ---> <--> <-----> <---> <---------------- (+)gi|1199466 ---> <--> <-----> <---> <------------ (+)gi|1209702 ----> <--> <---- (+)AV827070 ----> <--> <--- (+)AV828861 ----> <--> <--- (+)AV830936 ---> <--> <- (+)H36350 ASSEMBLIES: (1) ----> <--> <-----> <---> <---------------- (+) gi|1199466, gi|1209702, AV827070, AV828861, AV830936, H36350 =back =cut ; sub toAlignIllustration () { my $self = shift; my $max_line_chars = shift; $max_line_chars = ($max_line_chars) ? $max_line_chars : 100; #if not specified, 100 chars / line is default. ## Get minimum coord for relative positioning. my @coords; my @alignments = @{$self->{incoming_alignments}}; foreach my $alignment (@alignments) { my @c = $alignment->get_coords(); push (@coords, @c); } @coords = sort {$a<=>$b} @coords; print "coords: @coords\n" if $::SEE; my $min_coord = shift @coords; my $max_coord = pop @coords; my $rel_max = $max_coord - $min_coord; my $alignment_text = ""; ## print each alignment followed by assemblies: my $num_alignments = $#alignments + 1; $alignment_text .= "Individual Alignments: ($num_alignments)\n"; my $i = 0; foreach my $alignment (@alignments) { $alignment_text .= (sprintf ("%3d ", $i)) . $alignment->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n"; $i++; } my @assemblies = @{$self->{assemblies}}; my $num_assemblies = $#assemblies + 1; $alignment_text .= "\n\nASSEMBLIES: ($num_assemblies)\n"; foreach my $assembly (@assemblies) { $alignment_text .= " " . $assembly->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n"; } return ($alignment_text); } 1;