package main; our $SEE; package StringGraph; use strict; use warnings; use Carp; use StringNode; use ReadTracker; use base qw (ReadCoverageGraph); no warnings qw (recursion); my $MIN_DISPLAY_SEQ_TAG_LENGTH = 10; # lower than this, and shows up in the dot file as a label. sub new { my $packagename = shift; my $self = { _nodes => {}, #kmerSeq => nodeAddress until after compaction... becomes nodeaddress => $nodeaddress compacted => 0, _edge_color_labels => {}, # keyed on (prev_node, next_node, color) _indirect_edge_color_labels => {}, # keyed on color only }; bless ($self, $packagename); return($self); } sub add_sequence_to_graph { my $self = shift; my ($acc, $sequence_node_names_aref, $weight, $color) = @_; ## Sequence type can be undef, ReferenceTranscript, or GreedyPath unless (defined($weight) && $weight =~ /^\d+/) { confess "Error, params: (seq, weight, color), and weight must be numeric."; } if ($self->{compacted}) { confess "Cannot add more sequences to the graph after it has been compacted"; } my $prev_node_name = $sequence_node_names_aref->[0]; my $prev_node = $self->get_or_create_node($prev_node_name); $prev_node->{_count}++; $prev_node->track_reads($acc); if ($color) { $prev_node->add_color($color); } for (my $i = 1; $i <= $#$sequence_node_names_aref; $i++) { my $next_node_name = $sequence_node_names_aref->[$i]; my $next_node = $self->get_or_create_node($next_node_name); $next_node->track_reads($acc); if ($color) { $next_node->add_color($color); } $next_node->{_count}++; my $prev_edge_weight = $self->get_edge_count($prev_node, $next_node); $self->_set_edge_color_label($prev_node, $next_node, $color, $acc); ## only link together kmers that contain only recognizable nucleotides $self->link_adjacent_nodes($prev_node, $next_node, $weight); my $edge_weight = $self->get_edge_count($prev_node, $next_node); if ($edge_weight < $weight) { confess "Error, added weight $weight to edge $prev_node -> $next_node and didn't stick"; } if ($prev_edge_weight > $edge_weight) { confess "Error, after adding new edge, prev weight of $prev_edge_weight became: $edge_weight "; } $prev_node = $next_node; $prev_node_name = $next_node_name; } return; } sub get_or_create_node { my $self = shift; my ($node_name, $read_accession) = @_; if ($self->node_exists($node_name)) { my $node = $self->get_node($node_name); return($node); } else { # instantiate it, add it to the graph return($self->create_node($node_name)); } } sub create_node { my $self = shift; my ($node_name) = @_; if ($self->node_exists($node_name)) { confess "Error, node $node_name already exists in the graph"; } my $node = new StringNode($node_name); $self->{_nodes}->{$node_name} = $node; return($node); } #### sub compact_graph { my $self = shift; $self->validate_graph(); my $kmer_length = $self->{KmerLength}; unless ($self->{compacted}) { $self->_prep_graph_for_compaction(); $self->{compacted} = 1; } my $COMPACTED = 0; # flag ## identify all nodes that are branched on the left and not branched on the right ## then join together the unbranched nodes starting from these and walking right. my @nodes = $self->get_all_nodes(); ## merge unbranched nodes in runs. #### #### \ #### (-)--- #### / # or # (-)--- # or #### (-)------------ #### / #### - #### \ #### (-) ------------ # or, single line with incompatible decorations: # aaaaaaaaaa b bbbbbbbbbb # ----------(-)---------- my @init_nodes; foreach my $node (@nodes) { my @prev_nodes = $node->get_all_prev_nodes(); my @next_nodes = $node->get_all_next_nodes(); if ( (scalar(@next_nodes) == 1) && ( ( (scalar(@prev_nodes) != 1) # branched before or no node before. || (scalar (@prev_nodes) == 1 && scalar($prev_nodes[0]->get_all_next_nodes() > 1) ) # previous node branches off. ) || ## decoration switch (scalar(@prev_nodes) == 1 && (! &_compatible_decorations($node, $prev_nodes[0])) && (&_compatible_decorations($node, $next_nodes[0])) ) ) ) { # got one push (@init_nodes, $node); } } print "Got " . scalar(@init_nodes) . " init nodes.\n" if $main::SEE; #foreach my $node (@init_nodes) { # print "Init node $node:\n" . $node->toString() . "\n";# if $main::SEE; #} ## Collapse neighboring nodes before branching foreach my $node (@init_nodes) { #print "Init node $node:\n" . $node->toString() . "\n";# if $main::SEE; my @unbranched_path; my ($next_node) = $node->get_all_next_nodes(); unless ($next_node) { print "warning, init node now lacks next nodes... skipping.\n"; # potential bug? or circular dealt with already? next; } my %seen = ($node => 1); while (1) { # avoid circularity if ($seen{$next_node}) { print "Seen $next_node already... avoiding circularity.\n" if $main::SEE; last; } $seen{$next_node} = 1; unless (&_compatible_decorations($node, $next_node)) { print "Incompatible decorations.\n" if $main::SEE; last; } # check for left branching on next node my $num_prev_nodes = scalar ($next_node->get_all_prev_nodes()); if (scalar $num_prev_nodes > 1) { print "Got $num_prev_nodes prev nodes for $next_node; terminating extension.\n" if $main::SEE; last; } push (@unbranched_path, $next_node); my @next_nodes = $next_node->get_all_next_nodes(); if (scalar @next_nodes != 1) { last; } $next_node = shift @next_nodes; } if ($main::SEE) { print "Single unbranched path:\n"; $self->print_path($node, @unbranched_path); } foreach my $next_node (@unbranched_path) { # merge pairs $self->_append_node($node, $next_node); $COMPACTED = 1; } if ($main::SEE) { print "Collapsed:\n"; $self->print_path($node); print "\n\n\n"; } } $self->validate_graph(); return ($COMPACTED); } #### sub _append_node { my $self = shift; my ($node, $next_node) = @_; ## In compacted mode already. if (! _compatible_decorations($node, $next_node)) { confess "Error, trying to join nodes with incompatible decorations."; } my $KmerLength = $self->{KmerLength}; # add counts $node->set_count($node->get_count() + $next_node->get_count()); my $orig_node_seq = $node->get_sequence(); my $orig_next_node_seq = $next_node->get_sequence(); my $new_node_sequence = $orig_node_seq . substr($orig_next_node_seq, $KmerLength-1); # exclude K-1 prefix. # add terminal base $node->set_sequence($new_node_sequence); # add read evidence $node->{_ReadTracker}->append_to_ReadTracker($next_node->{_ReadTracker}); # sever connection between node and next $node->delete_next_node($next_node); $next_node->delete_prev_node($node); # sever connection between next and next-next nodes, and reconnect next-next's with node. my @next_node_next_nodes = $next_node->get_all_next_nodes(); my %next_next_node_edge_count; ## update edge counts. foreach my $next_next_node (@next_node_next_nodes) { my $edge_count = $self->get_edge_count($next_node, $next_next_node); $next_next_node_edge_count{$next_next_node} = $edge_count; } $self->delete_node_from_graph($next_node); foreach my $next_next_node (@next_node_next_nodes) { $self->link_adjacent_nodes($node, $next_next_node, $next_next_node_edge_count{$next_next_node}); } return; } sub _get_edge_color_label_key { my $self = shift; my ($prev_node, $next_node, $color) = @_; my $edge_color_label = join("$;", $prev_node, $next_node, $color); return($edge_color_label); } sub _set_edge_color_label { my $self = shift; my ($prev_node, $next_node, $color, $label) = @_; my $key = $self->_get_edge_color_label_key($prev_node, $next_node, $color); $self->{_edge_color_labels}->{$key} = $label; $self->{_indirect_edge_color_labels}->{$color} = $label; return; } sub _get_edge_color_label { my $self = shift; my ($prev_node, $next_node, $color) = @_; my $key = $self->_get_edge_color_label_key($prev_node, $next_node, $color); my $label = $self->{_edge_color_labels}->{$key}; return($label); } sub _get_indirect_edge_color_label { my $self = shift; my $color = shift; return($self->{_indirect_edge_color_labels}->{$color}); } #### sub toGraphViz { my $self = shift; my (%settings) = @_; my @nodes = $self->get_all_nodes(); my $text = "digraph G {\n"; $text .= #"node [width=0.1,height=0.1,fontsize=10,shape=point];\n" "node [width=0.1,height=0.1,fontsize=10];\n" . "edge [fontsize=12];\n" . "margin=1.0;\n" . "rankdir=LR;\n" . "labeljust=l;\n"; foreach my $node (sort {$a->get_ID() cmp $b->get_ID()} @nodes) { my @prev_nodes = $node->get_all_prev_nodes(); my @next_nodes = $node->get_all_next_nodes(); my $sequence = $node->get_sequence(); #if (my $min_len = $settings{no_short_singletons}) { # # if ( scalar(@prev_nodes) == 0 # && # scalar(@next_nodes) == 0 # && # length($sequence) < $min_len) { # # next; # } #} my $seqLen = length($sequence); my $count = $node->get_count(); my $avg_count = $count; #int($count/$len + 0.5); my $id = hex($node->get_ID()); my $depth =""; if (defined $node->{depth}) { $depth = $node->{depth}; } #$text .= "\t$id \[label=\"$id-L$seqLen-C$avg_count$atts\"$color];\n"; $text .= "\t$id \[label=\"$sequence:D$depth-C$avg_count\"];\n"; foreach my $next_node (@next_nodes) { my $next_id = hex($next_node->get_ID()); my @colors_in_common = &get_colors_in_common($node, $next_node); #my $edge_weight = $self->get_edge_count($node, $next_node); if (@colors_in_common) { foreach my $color (@colors_in_common) { my $edge_label = $self->_get_edge_color_label($node, $next_node, $color); if ($edge_label) { $text .= "\t$id->$next_id [label=\"$edge_label\", color=\"$color\"];\n"; } else { ### look for indirect edge #my $indirect_edge_label = $self->_get_indirect_edge_color_label($color); #$text .= "\t$id->$next_id [label=\"$indirect_edge_label\", color=\"$color\", style=\"dashed\"];\n"; } } } } } $text .= "}\n"; return($text); } sub get_colors_in_common { my ($node, $next_node) = @_; my @colorsA = $node->get_colors(); my @colorsB = $next_node->get_colors(); my %counts; foreach my $color (@colorsA, @colorsB) { $counts{$color}++; } my @colors = grep { $counts{$_} > 1 } keys %counts; return(@colors); } #### sub _prep_graph_for_compaction { my $self = shift; my @nodes = $self->get_all_nodes(); my %new_node_set; ## change lookup so it's not based on sequence anymore. foreach my $node (@nodes) { $new_node_set{"$node"} = $node; } $self->{_nodes} = \%new_node_set; } #### sub delete_node_from_graph { my $self = shift; my ($node) = @_; $self->prune_nodes_from_graph($node); if ($self->{compacted}) { delete $self->{_nodes}->{$node}; } else { my $kmer = $node->get_sequence(); delete $self->{_nodes}->{"$kmer"}; } return; } #### sub purge_nodes_below_count { my $self = shift; my ($min_count) = @_; my @nodes = $self->get_all_nodes(); my $num_deleted = 0; foreach my $node (@nodes) { if ($node->get_count() < $min_count) { my @colors = $node->get_colors(); unless (scalar(@colors) == 1 && $colors[0] eq "black") { next; ## skipping base data (reads) below coverage limit } $self->delete_node_from_graph($node); $num_deleted++; } } print STDERR " -purged nodes below count($min_count), deleted $num_deleted nodes.\n"; return; } #### sub validate_graph { my $self = shift; print STDERR "Validating graph.\n"; my @nodes = $self->get_all_nodes(); my %node_ids_in_graph; foreach my $node (@nodes) { my $node_id = $node->get_ID(); $node_ids_in_graph{$node_id} = $node; } foreach my $node (@nodes) { my $id = $node->get_ID(); my @prev_nodes = $node->get_all_prev_nodes(); foreach my $prev_node (@prev_nodes) { my $prev_id = $prev_node->get_ID(); unless (exists ($node_ids_in_graph{$prev_id})) { print STDERR "Error, $prev_id exists as prev_node to $id, but not in graph\n"; print STDERR $prev_node->get_sequence() . "\n" . $prev_node->toString(); confess; } unless ($prev_node->has_next_node($node)) { confess "Error, prev_node: $prev_id lacks current next node $id as link.\n"; } } my @next_nodes = $node->get_all_next_nodes(); foreach my $next_node (@next_nodes) { my $next_id = $next_node->get_ID(); unless (exists ($node_ids_in_graph{$next_id})) { print STDERR "Error, $next_id exists as next_node to $id, but not in graph\n"; print STDERR $next_node->get_sequence() . "\n" . $next_node->toString(); confess; } unless ($next_node->has_prev_node($node)) { confess "Error, next node: $next_id lacks current node $id as prev link.\n"; } } } return; } #### sub print_path { my $self = shift; my @nodes = @_; my $counter = 0; my $prev_node; foreach my $node (@nodes) { $counter++; printf("%4s", $counter); my $edge_count = ""; if ($prev_node) { $edge_count = " edge_weight: " . $self->get_edge_count($prev_node, $node); } print " " . $node->get_value() . " C:" . $node->get_count() . " " . join(",", $node->get_colors()) . "$edge_count\n"; $prev_node = $node; } print "\n"; return; } sub _compatible_decorations { my ($nodeA, $nodeB) = @_; my @colorsA = $nodeA->get_colors(); my @colorsB = $nodeB->get_colors(); unless (@colorsA || @colorsB) { # no decorations. return(1); } my %counts; foreach my $color (@colorsA, @colorsB) { $counts{$color}++; } my @colors_unique = grep { $counts{$_} == 1 } keys %counts; if (@colors_unique) { #print STDERR "unique colors: @colors_unique\n"; return(0); } else { return(1); } } #### sub score_nodes { my $self = shift; $self->assign_depth_to_nodes(); ## score = sum prev counts for those reads that are consistent, excluding those that are from bifurcating reads. my @nodes = sort {$a->{depth}<=>$b->{depth}} $self->get_all_nodes(); ## assign base scores: foreach my $node (@nodes) { ## define base scores: contributions by reads that are not in prev or next nodes. my %neighboring_reads = $self->gather_read_indices($node->get_all_prev_nodes(), $node->get_all_next_nodes()); my $base_score = 0; my $read_tracker = $node->get_ReadTracker(); my @reads = $read_tracker->get_tracked_read_indices(); foreach my $read (@reads) { if ($node->{depth} == 0 || ! $neighboring_reads{$read}) { my $count = $read_tracker->get_read_base_count($read); $base_score += $count; #print "$node has read [$read] with count [$count]\n"; } } $node->{_base_score} = $base_score; $node->{_sum_score} = $base_score; # init #print "$node : base score = $base_score\n"; } ## assign sum scores foreach my $node (@nodes) { my %indices_exclude; my $read_tracker = $node->get_ReadTracker(); my $highest_prev_score = $node->{_sum_score}; my $highest_prev_node = undef; foreach my $prev_node ($node->get_all_prev_nodes()) { ## want reads that are in current node and prev node, but not other next nodes of prev node (exclusive to current node). my $sum_score = $prev_node->{_sum_score}; my @other_prev_next_nodes = grep { $_ ne $node } $prev_node->get_all_next_nodes(); my %reads_ignore = $self->gather_read_indices(@other_prev_next_nodes); foreach my $read ($read_tracker->get_tracked_read_indices()) { if (! $reads_ignore{$read}) { $sum_score += $read_tracker->get_read_base_count($read); } } push (@{$prev_node->{_forward_scores}}, { next_node => $node, score => $sum_score, } ); if ($sum_score > $highest_prev_score) { $highest_prev_score = $sum_score; $highest_prev_node = $prev_node; } } $node->{_sum_score} = $highest_prev_score; $node->{_best_prev} = $highest_prev_node; } ## find highest sum score my $highest_sum_score = 0; my $best_node = undef; foreach my $node (@nodes) { #print "Node: $node, has sum score: " . $node->{_sum_score} . "\n"; if ($node->{_sum_score} > $highest_sum_score) { $highest_sum_score = $node->{_sum_score}; $best_node = $node; } } my @path_nodes; my $prev_node = $best_node; while ($prev_node) { #print "Backtracked: $prev_node " . $prev_node->{_sum_score} . "\n"; #print $prev_node->toString(); push (@path_nodes, $prev_node); $prev_node = $prev_node->{_best_prev}; } @path_nodes = reverse @path_nodes; return (@path_nodes); } #### sub extract_sequence_from_path { my $self = shift; my (@ordered_nodes) = @_; # ordered left to right my @seqs; foreach my $node (@ordered_nodes) { $node->{_visited} = 1; push (@seqs, $node->get_sequence()); } my $kmer_length = $self->{KmerLength}; my $final_seq = shift @seqs; ## first one gets full kmer treatment. foreach my $seq (@seqs) { $seq = substr($seq, $kmer_length-1); $final_seq .= $seq; } return($final_seq); } #### sub gather_read_indices { my $self = shift; my (@nodes) = @_; my %indices; foreach my $node (@nodes) { my $read_tracker = $node->get_ReadTracker(); my @read_indices = $read_tracker->get_tracked_read_indices(); foreach my $read (@read_indices) { $indices{$read}=1; } } return(%indices); } #### sub assign_depth_to_nodes { my $self = shift; print STDERR "Assigning depth to nodes\n"; my @root_nodes = $self->get_root_nodes(); ## base cases. foreach my $root (@root_nodes) { $root->{depth} = 0; } foreach my $node ($self->get_all_nodes()) { if (! defined ($node->{depth})) { $self->assign_depth($node, {}); } } return; } #### sub assign_depth { my $self = shift; my $node = shift; my $seen_href = shift; my @ancestors = $node->get_all_prev_nodes(); my $max_depth = 0; foreach my $ancestor (@ancestors) { my $depth; if (defined($ancestor->{depth})) { $depth = 1 + $ancestor->{depth}; } else { if ($seen_href->{$ancestor}) { # print "Breaking cycle: $ancestor\n" . join("\n", keys %$seen_href) . "\n"; next; } $seen_href->{$ancestor} = 1; $depth = 1 + $self->assign_depth($ancestor, $seen_href); } if ($depth > $max_depth) { $max_depth = $depth; } } # print "$node\tdepth: $max_depth\n"; $node->{depth} = $max_depth; return($max_depth); } #### sub all_nodes_visited { my $self = shift; my @nodes = $self->get_all_nodes(); foreach my $node (@nodes) { if (! $node->{_visited}) { return(0); } } return(1); # all visited. } #### sub extract_path_from_unvisited_node { my $self = shift; my @nodes = $self->get_all_nodes(); my $highest_scoring_unvisited_node = undef; my $highest_score = -1; foreach my $node (@nodes) { if (! $node->{_visited}) { if ($node->{_base_score} > $highest_score) { $highest_score = $node->{_base_score}; $highest_scoring_unvisited_node = $node; } } } unless ($highest_scoring_unvisited_node) { confess "Error, no unvisited node found"; } my @best_path; my %seen; # don't follow cycles. FIXME: cycles shouldn't be part of the scoring, though... bug somewhere upstream? my $prev_node = $highest_scoring_unvisited_node; while ($prev_node && ! $prev_node->{_visited} && ! $seen{$prev_node}) { print "walking prev_node: $prev_node\n" if $main::SEE; $seen{$prev_node} = 1; unshift(@best_path, $prev_node); $prev_node = $prev_node->{_best_prev}; } ## now track forward. my $next_node = $highest_scoring_unvisited_node; delete $seen{$next_node}; # already used above. while ($next_node && ! $next_node->{_visited} && ! $seen{$next_node}) { print "walking next_node: $next_node\n" if $main::SEE; $seen{$next_node} = 1; my @structs = @{$next_node->{_forward_scores}}; if (@structs) { @structs = sort {$a->{score}<=>$b->{score}} @structs; my $best_struct = pop @structs; push (@best_path, $next_node); $next_node = $best_struct->{next_node}; } else { $next_node = undef; } } return(@best_path); } #### sub prune_low_weight_edges { my $self = shift; my %params = @_; my $edge_weight_threshold = $params{edge_weight_threshold}; my @nodes = $self->get_all_nodes(); my @edges_to_prune; foreach my $node (@nodes) { my @prev_nodes = $node->get_all_prev_nodes(); foreach my $prev_node (@prev_nodes) { my $edge_ratio = $self->compute_edge_support_ratio($prev_node, $node); #print "EdgeRatio: $edge_ratio\n"; if ($edge_ratio < $edge_weight_threshold) { push (@edges_to_prune, [$prev_node, $node]); } } } if (@edges_to_prune) { foreach my $edges_to_prune (@edges_to_prune) { my ($prev_node, $node) = @$edges_to_prune; $self->prune_edge($prev_node, $node); } return(scalar(@edges_to_prune)); } else { return(0); } } #### sub prune_singletons { my $self = shift; my ($min_seq_length) = @_; my @nodes = $self->get_all_nodes(); foreach my $node (@nodes) { if ( (! $node->get_all_prev_nodes()) && (! $node->get_all_next_nodes()) ) { if (length ($node->get_sequence()) < $min_seq_length) { $self->delete_node_from_graph($node); } } } return; } #### sub compute_edge_support_ratio { my $self = shift; my ($prev_node, $node) = @_; ## ratio = reads in common / (all reads out of prev UNION all reads into node) my @reads_exiting_prev_node = $prev_node->get_reads_exiting_node(); my %exiting_reads = map { + $_ => 1 } @reads_exiting_prev_node; my @reads_entering_node = $node->get_reads_entering_node(); my %entering_reads = map { + $_ => 1 } @reads_entering_node; my %all_reads = map { + $_ => 1 } (@reads_exiting_prev_node, @reads_entering_node); my @all = keys %all_reads; my @shared; foreach my $read (@all) { if ($entering_reads{$read} && $exiting_reads{$read}) { push (@shared, $read); } } my $ratio = scalar(@shared) / scalar(@all); return($ratio); } #### sub prune_dangling_nodes { my $self = shift; my %params = @_; my $min_leaf_node_length = $params{min_leaf_node_length}; my $min_leaf_node_avg_cov = $params{min_leaf_node_avg_cov}; unless (defined ($min_leaf_node_length) && defined ($min_leaf_node_avg_cov) ) { confess "Error, must define values for both: min_leaf_node_length && min_leaf_node_avg_cov"; } my @nodes = $self->get_all_nodes(); my @nodes_to_prune; my $kmer_length = $self->{KmerLength}; foreach my $node (@nodes) { ## check to see if it's a dangling node. if ( (! $node->get_all_prev_nodes()) || (! $node->get_all_next_nodes()) ) { my $length = length($node->get_sequence()); my $cov = $node->get_count(); my $avg_cov = $cov / ($length - ($kmer_length - 1)); if ($avg_cov < $min_leaf_node_avg_cov && $length < $min_leaf_node_length) { push (@nodes_to_prune, $node); } } } if (@nodes_to_prune) { foreach my $node (@nodes_to_prune) { $self->prune_nodes_from_graph($node); } return(scalar(@nodes_to_prune)); # nodes pruned. } else { return(0); # no nodes pruned. } } 1; #EOM