#!/usr/bin/env perl use strict; use warnings; use FindBin; use lib ("$FindBin::RealBin/../../PerlLib", "$FindBin::RealBin/../../PerlLib/KmerGraphLib"); use Fasta_reader; use ColorGradient; use KmerNode; use KmerGraph; use Carp; use Getopt::Long qw(:config no_ignore_case bundling); no warnings 'recursion'; my $usage = <<_EOUSAGE_; ##################################################################################### # # Required: # # --reads fasta files containing reads (comma-separated list of files: fileA.reads,fileB.reads,... # basic black coloring in graph.) # and/or # # --misc_seqs fasta files containing reads (comma-separated list of files: fileA.reads,fileB.reads,...) # each sequence is colored differently and identified in the graph @ edges # and: # --assemble # and/or # --graph [filename] dot output file for viewing structure in GraphViz # # Optional: # -K Kmer length (default: 24) # -R maximum recursive search depth from terminus for extension (default: 5) # note can exponentially increases runtime # # # -L minimum assembled sequence length to report (default: 100) # -E minimum Kmer seed entropy (default: 1.5; note maximum entropy = 2 for nucleotide sequences) # --min_coverage minimum kmer coverage to report kmer from reads in graph. # # # -v verbose # # --no_compact do NOT compact the graph # # --min_edge_threshold default(0.1) # --min_leaf_node_length default(76) # --min_leaf_node_avg_cov default(1.2) # # -N max assemblies to report (default: infinity) # ##################################################################################### _EOUSAGE_ ; my $help_flag; my $fasta_file; my $KmerLength = 24; my $max_recurse_depth = 5; my $min_length = 100; my $min_cov = 1; our $VERBOSE =0; my $MIN_KMER_ENTROPY = 1.5; my $graph_file; my $assemble_flag; my $cds_file; my $iworm_file; my $MIN_EDGE_THRESHOLD = 0.1; my $MIN_LEAF_NODE_LENGTH = 76; my $MIN_LEAF_NODE_AVG_COV = 1.2; my $NO_COMPACT = 0; my $max_asmbls_report = -1; my $reads_files; my $misc_seqs_files; &GetOptions ( 'h' => \$help_flag, ## required 'misc_seqs=s' => \$misc_seqs_files, 'reads=s' => \$reads_files, 'K=i' => \$KmerLength, 'R=i' => \$max_recurse_depth, ## options 'L=i' => \$min_length, 'E=f' => \$MIN_KMER_ENTROPY, "min_coverage=i" => \$min_cov, "graph=s" => \$graph_file, "assemble" => \$assemble_flag, "iworm=s" => \$iworm_file, "CDS=s" => \$cds_file, "min_edge_threshold=f" => \$MIN_EDGE_THRESHOLD, "min_leaf_node_length=i" => \$MIN_LEAF_NODE_LENGTH, "min_leaf_node_avg_cov=f" => \$MIN_LEAF_NODE_AVG_COV, "no_compact" => \$NO_COMPACT, 'N=i' => \$max_asmbls_report, # hidden option for debugging 'v' => \$VERBOSE, ); if ($help_flag) { die $usage; } unless ( ($reads_files || $misc_seqs_files) && ($graph_file || $assemble_flag)) { die $usage; } our $SEE = $VERBOSE; main: { my $KmerGraph = KmerGraph->new($KmerLength); if ($reads_files) { my $seq_counter = 0; my @files = split(/,/, $reads_files); foreach my $file (@files) { my $fasta_reader = new Fasta_reader($file); while (my $seq_obj = $fasta_reader->next()) { my $acc = $seq_obj->get_accession(); my $sequence = $seq_obj->get_sequence(); $seq_counter++; print STDERR "\r-parsing seq($seq_counter) "; $KmerGraph->add_sequence_to_graph($acc, $sequence, 1, 'black'); } } } if ($misc_seqs_files) { my @seqs; my @files = split(/,/, $misc_seqs_files); foreach my $file (@files) { my $fr = new Fasta_reader($file); while (my $seq_obj = $fr->next()) { my $seq = $seq_obj->get_sequence(); my $acc = $seq_obj->get_accession(); if (length($seq) >= $min_length) { push (@seqs, [$acc, $seq]); } } } my @colors; if (scalar @seqs > 1) { @colors = &ColorGradient::get_RGB_gradient(scalar @seqs); @colors = &ColorGradient::convert_RGB_hex(@colors); } else { @colors = ('green'); } while (@seqs) { my $seq_info = shift @seqs; my ($acc, $seq) = @$seq_info; my $color = shift @colors; print STDERR "Adding additional seq trace: $acc, $color\n"; $KmerGraph->add_sequence_to_graph($acc, $seq, 0, $color); } } if ($min_cov > 1) { print STDERR "\n-removing kmer nodes below coverage: $min_cov\n"; $KmerGraph->purge_nodes_below_count($min_cov); } print STDERR "\nDone adding sequences\n"; unless ($NO_COMPACT) { my $round = 0; my ($compacted_graph_flag, $number_dangling_nodes_pruned, $number_pruned_edges); do { $compacted_graph_flag = $KmerGraph->compact_graph(); $number_dangling_nodes_pruned = $KmerGraph->prune_dangling_nodes(min_leaf_node_length => $MIN_LEAF_NODE_LENGTH, min_leaf_node_avg_cov => $MIN_LEAF_NODE_AVG_COV); $number_pruned_edges = $KmerGraph->prune_low_weight_edges(edge_weight_threshold => $MIN_EDGE_THRESHOLD); $round++; # no op print STDERR "Cycling through graph compaction, round: $round\n"; print STDERR "\tnodes_pruned: $number_dangling_nodes_pruned\n" . "\tedges_pruned: $number_pruned_edges\n"; } while ($compacted_graph_flag && ($number_dangling_nodes_pruned || $number_pruned_edges)); } print STDERR "done.\n"; if ($assemble_flag) { my @path = $KmerGraph->score_nodes(); my $assembled_seq = $KmerGraph->extract_sequence_from_path(@path); unless ($assembled_seq) { confess "Error, no assembled sequence form path: @path"; } my $score = &sum_node_counts(@path); my $counter = 0; if (length($assembled_seq) >= $min_length) { $counter++; my $length_normalized_score = sprintf("%.2f", $score / length($assembled_seq)); print ">assembly_$counter;$score;$length_normalized_score len: " . length($assembled_seq) . "\n$assembled_seq\n"; } while (! $KmerGraph->all_nodes_visited()) { if ($max_asmbls_report > 0 && $counter >= $max_asmbls_report) { last; } my @path = $KmerGraph->extract_path_from_unvisited_node(@path); #$KmerGraph->print_path(@path); #foreach my $node (@path) { # print $node->toString(); #} my $assembled_seq = $KmerGraph->extract_sequence_from_path(@path); # marks as visited. unless ($assembled_seq) { confess "Error, no sequence extracted from path: @path"; } if ( length($assembled_seq) >= $min_length) { $counter++; my $score = &sum_node_counts(@path); my $length_normalized_score = sprintf("%.2f", $score / length($assembled_seq)); print ">assembly_$counter;$score;$length_normalized_score len: " . length($assembled_seq) . "\n$assembled_seq\n"; } } } ## Write graph file in dot format for GraphViz if ($graph_file) { open (my $ofh, ">$graph_file") or die "Error, cannot write graph file to $graph_file "; print $ofh $KmerGraph->toGraphViz( no_short_singletons => 100); close $ofh; } exit(0); } #### sub sum_node_counts { my @path = @_; my $sum = 0; foreach my $node (@path) { my $count = $node->get_count(); $sum += $count; } return($sum); } #### sub compute_entropy { my ($string) = @_; my @chars = split(//, $string); my %char_counter; foreach my $char (@chars) { $char_counter{$char}++; } my $entropy = 0; my $num_chars = length($string); foreach my $char (keys %char_counter) { my $count = $char_counter{$char}; my $prob = $count/$num_chars; my $val = $prob * log(1/$prob)/log(2); $entropy += $val; } return($entropy); } #### sub get_highest_scoring_iworm_assembly { my ($iworm_file) = @_; my @entries; my $fasta_reader = new Fasta_reader($iworm_file); while (my $seq_obj = $fasta_reader->next()) { my $seq = $seq_obj->get_sequence(); my $acc = $seq_obj->get_accession(); my @parts = split(/;/, $acc); my $cov = pop @parts; my $score = $cov * length($seq); push (@entries, [$score, $seq]); } @entries = reverse sort {$a->[0]<=>$b->[0]} @entries; my $best_entry = shift @entries; return($best_entry->[1]); }