# -*- Mode: Perl; tab-width: 8; perl-indent-level: 2; indent-tabs-mode: nil -*- use strict; ############################################################################### # GTF ############################################################################### # The GTF object parses and stores all data from a gtf file. package GTF; use Carp; # GTF::new(hash) # This is the contructor for GTF objects. It takes one required argument # which is a hash containing the following optional fields: # gtf_filename: The filename of the gtf file to be loaded. If no filename # is given it just creates an empty object. # seq_filename: The filename of the sequence corresponding to this gtf file. # If given additional validity checks are made, additional stats # are kept and the transcript can be output. # tx_out_fh: The filehandle to output the genes transcripts to. Only works # If seq_filename is given. # conseq_filename: The filename of the conservation sequence corresponding to # this gtf file. If given additional stats are kept. # warning_fh: The filehandle to output gtf format warnings to. If none # is given warnings are disregarded. sub new { my ($info) = @_; my $gtf = bless {Genes => [], Transcripts => [], Exons => [], CDS => [], Inter => [], Inter_CNS => [], Filename => "", Sequence => "", Comments => "", Modified => 0}; $gtf->{Total_Conseq} = {0 => -1, 1 => -1, 2 => -1}; $gtf->{Total_Seq} = {A => -1, C => -1, G => -1, T => -1, N => -1}; if(defined($info->{seq_filename})){ if(-e $info->{seq_filename}){ $gtf->{Sequence} = $info->{seq_filename}; } else{ $gtf->{Sequence} = 0; print STDERR "GTF: $info->{seq_filename} does not exist.\n"; } } else{ $gtf->{Sequence} = 0; } if(defined($info->{strict})){ $gtf->{Strict} = $info->{strict}; } else{ $gtf->{Strict} = 0; } if(defined($info->{tx_out_fh})){ if(defined($gtf->{Sequence})){ $gtf->{Tx} = $info->{tx_out_fh}; } else{ $gtf->{Tx} = 0; print STDERR "GTF: Cannot ouput transcripts without the sequence.\n"; } } else{ $gtf->{Tx} = 0; } if(defined($info->{conseq_filename})){ if(-e $info->{conseq_filename}){ $gtf->{Conseq} = $info->{conseq_filename}; } else{ $info->{Conseq} = 0; print STDERR "GTF: $info->{conseq_filename} does not exist.\n"; } } else{ $info->{Conseq} = 0; } if((defined($info->{inframe_stops})) && ($info->{inframe_stops})){ $gtf->{Inframe_Stops} = 1; } else{ $gtf->{Inframe_Stops} = 0; } if((defined($info->{fix_gtf})) && ($info->{fix_gtf} == 1)){ $gtf->{Fix_GTF} = 1; } else{ $gtf->{Fix_GTF} = 0; } if(defined($info->{warning_skips})){ $gtf->{Warning_Skips} = $info->{warning_skips}; } else{ $gtf->{Warning_Skips} = []; } if(defined($info->{bad_list})){ $gtf->{Bad_List} = $info->{bad_list}; } else{ $gtf->{Bad_List} = []; } if(defined($info->{detailed_error_count})){ $gtf->{Detailed_Error_Count} = $info->{detailed_error_count}; } if(defined($info->{mark_ase})) { $gtf->{Mark_ASE} = $info->{mark_ase}; } else { $gtf->{Mark_ASE} = 0; } if(defined($info->{gtf_filename})){ if(-e $info->{gtf_filename}){ $gtf->{Filename} = $info->{gtf_filename}; if((defined($info->{no_check})) && ($info->{no_check})){ $gtf->_load_no_check($info->{gtf_filename}); } else{ $gtf->_parse_file($info->{warning_fh}); } if($gtf->{Mark_ASE}) { $gtf->mark_ase; } } else{ print STDERR "GTF: $info->{gtf_filename} does not exist.\n"; } } else{ $gtf->{Filename} = ""; } return $gtf; } # GTF::_update # This function resorts all lists. It is called when a list is requested. # It does nothing unless the $gtf->{Modified} value is 1. Sets # $gene->{Modified} to 0; sub _update{ my ($gtf) = @_; unless($gtf->{Modified}){ return; } $gtf->{Modified} = 0; my $genes = $gtf->{Genes}; my $txs = $gtf->{Transcripts}; my $cds = $gtf->{CDS}; my $inter = $gtf->{Inter}; $genes = [sort {$a->start <=> $b->start} @$genes]; $txs = [sort {$a->start <=> $b->start} @$txs]; $cds = [sort {$a->start <=> $b->start} @$cds]; $inter = [sort {$a->start <=> $b->start} @$inter]; $gtf->{Genes} = $genes; $gtf->{Transcripts} = $txs; $gtf->{CDS} = $cds; $gtf->{Inter} = $inter; } # GTF::genes() # This function returns an array of refernces to GTF::Gene objects. # The array contains one reference for each gene in the gtf file. # The array is sorted by the start position of each gene. sub genes{ my ($gtf) = @_; $gtf->_update; return $gtf->{Genes}; } sub transcripts{ my ($gtf) = @_; $gtf->_update; return $gtf->{Transcripts}; } # GTF::add_gene(gene_ref) # This function take a reference to a GTF::Gene object and adds it to the # list of genes stored by this object. sub add_gene{ my ($gtf,$gene) = @_; my $genes = $gtf->{Genes}; my $txs = $gtf->{Transcripts}; my $cds = $gtf->{CDS}; push @$genes, $gene; push @$txs, @{$gene->transcripts}; push @$cds, @{$gene->cds}; $gtf->{Modified} = 1; } # GTF::add_feature(feature_ref,seqname,source,strand,id) # This function will take a reference to an intergenic feature (such as # inter or inter_CNS) and add it to the appropriate feature list. # Pass a sequence name, the strand ("+" is OK for intergenic) # and an ID to identify the feature. # Note that the feature must be of the types inter or inter_CNS. sub add_feature{ my ($gtf,$feature,$seqname,$source,$strand,$id) = @_; die "Only inter or inter_CNS features may be directly added to a GTF.\n" if($feature->type ne "inter" && $feature->type ne "inter_CNS"); my $gene = GTF::Gene::new($id,$seqname,$source,$strand); my $tx = GTF::Transcript::new($id); $gene->add_transcript($tx); $tx->add_feature($feature); my $features; if($feature->type eq "inter") { $features = $gtf->{Inter} } elsif($feature->type eq "inter_CNS") { $features = $gtf->{Inter_CNS} } push @$features, $feature; $gtf->{Modified} = 1; } # GTF::set_genes(gene_list) # This function take a reference to a list of GTF::Gene objects and adds each to the # list of genes stored by this object. sub set_genes{ my ($gtf,$genes) = @_; $gtf->{Genes} = $genes; my @txs; my @cds; foreach my $gene (@$genes){ push @txs, @{$gene->transcripts}; push @cds, @{$gene->cds}; } $gtf->{Transcripts} = \@txs; $gtf->{CDS} = \@cds; $gtf->{Modified} = 1; } # GTF::remove_gene(gene_id) # This function takes a gene_id and removes the gene with that gene_id # from the list of genes stored by this object, if a gene with that gene_id # can be found. Otherwise does nothing. Returns true if a gene was removed # and false if no gene with that gene_id was found. sub remove_gene{ my ($gtf,$gene_id) = @_; my $genes = $gtf->{Genes}; my $removed = 0; my @new_genes; my @new_txs; my @new_cds; foreach my $gene (@$genes){ if($gene->gene_id eq $gene_id){ $removed++; } else{ push @new_genes, $gene; push @new_txs, @{$gene->transcripts}; push @new_cds, @{$gene->cds}; } } if($removed){ $gtf->{Genes} = \@new_genes; $gtf->{Transcripts} = \@new_txs; $gtf->{CDS} = \@new_cds; } return $removed; } # GTF::remove_feature(feature) # This function will remove a feature from the list of features. # Note the feature must be of type inter or inter_CNS. sub remove_feature { my($gtf,$feature) = @_; my $removed = 0; my $feature_key; if($feature->type eq "inter") { $feature_key = "Inter"; } elsif($feature->type eq "inter_CNS") { $feature_key = "Inter_CNS"; } my @new_features; foreach my $stored_feature (@{$gtf->{$feature_key}}) { if($stored_feature == $feature) { $removed++; } else { push @new_features, $stored_feature; } } if($removed) { $gtf->{$feature_key} = \@new_features; } return $removed; } # GTF::set_filename(filename) # Sets the filename to be returned by the filename function sub set_filename{ my ($gtf,$filename) = @_; $gtf->{Filename} = $filename; } # GTF::offset(ammount) # This function will offset all genes in this gene by the given ammount sub offset{ my ($gtf,$offset) = @_; unless(defined($offset)){ print STDERR "Undefined value passed to GTF::offset.\n"; return; } my $genes = $gtf->{Genes}; foreach my $gene (@$genes){ $gene->offset($offset); } $gtf->{Modified} = 1; } # GTF::reverse_complement(seq_length) # This function takes the length of the sequence this gtf file was based on and # and reverse complements everything in the file. So the positive strand becomes # the negative strand and the files starts at seq_length and moves downward to 0. sub reverse_complement{ my ($gtf, $seq_length) = @_; unless(defined($seq_length)){ print STDERR "Undefined value for seq_length passed to GTF::reverse_complement.\n"; } my $genes = $gtf->{Genes}; foreach my $gene(@$genes){ $gene->reverse_complement($seq_length); } $gtf->{Modified} = 1; } # GTF::cds() # This function returns an array of refernces to GTF::Feature objects. # The array contains one refernece to each CDS feature in the gtf file. # The array is sorted by the of each CDS. sub cds{ my ($gtf) = @_; $gtf->_update; return $gtf->{CDS}; } sub inter { my ($gtf) = @_; $gtf->_update; return $gtf->{Inter}; } sub inter_cns { my ($gtf) = @_; $gtf->_update; return $gtf->{Inter_CNS}; } # GTF::mark_ase() # If you are looking for alternative splicing events in your GTF file, # call this function to marke them. This function is separate from the # _update methods in order to keep the running time low. sub mark_ase { my ($gtf) = @_; #set the alternative splicing event flags for each feature #currently, only optional coding exons # The main idea is to check all pairs of transcripts in a gene, # and find any incident where there are two exons that have the same # end coord and two exons which have the same start coord # and that an exon in only one of the transcripts lies between them. # It is accomplished by sorting all the exons in both transcripts # and checking for a supercasette, i.e. # # ===|||||||||===||||||||======||||||||||==== # ==||||||||||=================||||||||||||== # <---------------> # supercassette # # Note that it doesn't matter if the opposite ends of the exon are equal. # rpz my @genes = @{$gtf->genes}; for my $gene(@{$gtf->genes}) { my @txs = (@{$gene->transcripts}); if(scalar(@txs) > 1) { for (my $i = 0; $i < scalar(@txs)-1; $i++) { next if(scalar(@{$txs[$i]->cds}) == 0); for (my $j = $i+1; $j < scalar(@txs); $j++) { next if(scalar(@{$txs[$j]->cds}) == 0); my @ocds = sort {$a->start <=> $b->start || $a->stop <=> $b->stop} (@{$txs[$i]->cds}, @{$txs[$j]->cds}); for(my $k = 4; $k < scalar(@ocds); $k++) { if( $ocds[$k-4]->stop == $ocds[$k-3]->stop && $ocds[$k-1]->start == $ocds[$k]->start) { if($ocds[$k-2]->length%3 == 0) { $ocds[$k-2]->set_ase("InframeOptional"); } else { $ocds[$k-2]->set_ase("FrameshiftOptional"); } } } } } } } } # GTF::infer_exons() # Skips infering if Exons already exist # Output is undefined if features overlap, except for start_codon # Currently knows about utr5, start,stop,cds, utr3 sub infer_exons { my ($gtf) = @_; $gtf->_update; for my $tx (@{$gtf->transcripts}) { next if (@{$tx->exons}); if ($tx->strand eq '+') { for ( @{$tx->utr5}, @{$tx->cds}, @{$tx->stop_codons}, @{$tx->utr3} ) { if (!@{$tx->exons} || ${$tx->exons}[-1]->stop + 1 < $_->start ) { $tx->add_feature( GTF::Feature::new('exon', $_->start, $_->stop, 0, 0) ); } elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) { ${$tx->exons}[-1]->set_stop($_->stop); } } } else { for ( @{$tx->utr3}, @{$tx->stop_codons}, @{$tx->cds}, @{$tx->utr5} ) { if (!@{$tx->exons} || ${$tx->exons}[-1]->stop + 1 < $_->start ) { $tx->add_feature(GTF::Feature::new('exon', $_->start, $_->stop, 0, 0)); } elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) { ${$tx->exons}[-1]->set_stop($_->stop); } } } } } # GTF::remove_exons() # remove exons from all transcripts sub remove_exons { my ($gtf) = @_; $gtf->_update; $_->{Exons} = [] for (@{$gtf->transcripts}); } # GTF::filename() # This function returns the filename of the gtf file. sub filename {shift->{Filename}} # GTF::comments() # This function returns all full line commentsfrom the gtf file. sub comments {shift->{Comments}}; # GTF::conservation_count() # Returns a hash containing counts for each conservation character (0,1,2) sub conservation_count {shift->{Total_Conseq}}; # GTF::sequence_count() # Returns a hash containing counts for each sequence character (A,C,T,G,N,X), # Where X is all other characters sub sequence_count {shift->{Total_Seq}}; # GTF::output_gtf_file([file_handle]); # This function moves through each gene in the gtf file and outputs # all data in valid gtf2 format. If takes and optional file_handle # as the only argument to which it outputs the data. If no argument is # given it outputs the data to stdout. sub output_gtf_file{ my ($gtf,$out_handle) = @_; $gtf->_update; unless(defined $out_handle){ $out_handle = \*STDOUT; } if($gtf->comments){ foreach my $comment (@{$gtf->comments}){ print $out_handle "$comment\n"; } } for my $gene (@{$gtf->genes}) { $gene->_update } my @top_level_features = sort {$a->start <=> $b->start} (@{$gtf->genes}, @{$gtf->inter_cns}, @{$gtf->inter}); for my $feature (@top_level_features) { $feature->output_gtf($out_handle); } } # GTF::output_gff_file([file_handle]); # This function moves through each gene in the gtf file and outputs # all data in valid GFF format. If takes and optional file_handle # as the only argument to which it outputs the data. If no argument is # given it outputs the data to stdout. sub output_gff_file{ my ($gtf,$out_handle) = @_; $gtf->_update; unless(defined $out_handle){ $out_handle = \*STDOUT; } if($gtf->genes){ foreach my $gene (@{$gtf->genes}){ $gene->output_gff($out_handle); } } } # GTF::_parse_file() # This function is used internally by the GTF constructor GTF::new # to read, parse, and store the GTF file; sub _parse_file{ my ($gtf,$warnings) = @_; unless(defined($warnings)){ if(open(WARN, ">/dev/null")){ $warnings = \*WARN; } else{ die "Could not open /dev/null for write\n"; } } my $fix_gtf = $gtf->{Fix_GTF}; my $strict = $gtf->{Strict}; #open file open(GTF, "<$gtf->{Filename}") or die "Could not open $gtf->{Filename}.\n"; my (%GeneObj,%TxObj); my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns); my $all_line_comments = []; #prepare error counts my $max_error_count = 5; if(defined($gtf->{Detailed_Error_Count})){ $max_error_count = $gtf->{Detailed_Error_Count}; } my @error_msgs = $gtf->_get_error_messages(); my @errors; my @bad_list; my $no_warn = $gtf->{Warning_Skips}; for(my $i = 0;$i <= $#error_msgs;$i++){ if($$no_warn[$i]){ $errors[$i] = $max_error_count; } else{ $errors[$i] = 0; } } #read the file my $line_num = 0; while(my $in_line = ){ $line_num++; chomp $in_line; if($in_line !~ /\S/){ next; } if($in_line =~ /^\s*\#/){ push @$all_line_comments, $in_line; next; } else{ #remove leading whitespace and get comments my $comments = ""; while($in_line =~ /^(.*)\#(.*)$/){ $in_line = $1; $comments = $2.$comments; } if($in_line =~ /^\s+(.*)$/){ $in_line = $1; } my @data = split /\s+/,$in_line; #verify line is correct length if($#data < 8){ if($errors[0] < $max_error_count){ print $warnings "Not enough fields on line $line_num.\n"; } $errors[0]++; next; } #check for correct whitespace my $field = 1; while(($field < 8)&&($in_line =~ /\S+(\s+)/g)){ unless($1 =~ /^\t$/){ if($errors[1] < $max_error_count){ print $warnings "Incorrect type of whitespace between fields ", "$field and ",$field + 1, " on line $line_num. ", "Should be tab.\n"; } $errors[1]++; } $field++; } #verify correct field $data[2] = lc($data[2]); unless(($data[2] eq "cds") || ($data[2] eq "exon") || ($data[2] eq "5utr") || ($data[2] eq "3utr") || ($data[2] eq "start_codon") || ($data[2] eq "stop_codon") || ($data[2] eq "sec") || ($data[2] eq "intron_cns") || ($data[2] eq "inter_cns") || ($data[2] eq "inter")) { if($errors[2] < $max_error_count){ print $warnings "Incorrect value for field, \"$data[2]\", ", "on line $line_num.\n"; } $errors[2]++; } if (($data[2] eq "cds") || ($data[2] eq "5utr") || ($data[2] eq "3utr") || ($data[2] eq "sec")) { $data[2] = uc($data[2]); } if ($data[2] eq "inter_cns") { $data[2] = "inter_CNS" } if ($data[2] eq "intron_cns") { $data[2] = "intron_CNS" } #verify correct format if($data[3] !~ /^\d*$/){ if($errors[3] < $max_error_count){ print $warnings "Non-numerical value for field, \"$data[3]\"", ", on line $line_num.\n"; } $errors[3]++; } #verify correct format if($data[4] !~ /^\d*$/){ if($errors[4] < $max_error_count){ print $warnings "Non-numerical value for field, \"$data[4]\"", ", on line $line_num.\n"; } $errors[4]++; } #verify start is < stop if($data[3] > $data[4]){ if($errors[38] < $max_error_count){ print $warnings "Start field is greater than stop field on line $line_num.\n"; } $errors[38]++; my $start = $data[3]; $data[3] = $data[4]; $data[4] = $start; } #verify correct format if($data[5] !~ /^-?\d*\.?\d*$/){ # should check for float if($data[5] ne '.'){ if($errors[5] < $max_error_count){ print $warnings "Value for field, \"$data[5]\", is ", "neither numerical nor \".\" on line $line_num\n"; } $errors[5]++; } } if(!$strict && $data[5] eq '.'){ $data[5] = 0; } #verify correct field unless(($data[6] eq "+")||($data[6] eq "-")|| ($data[6] eq ".")){ if($errors[6] < $max_error_count){ print $warnings "Value of field is \"$data[6]\" on line ", "$line_num. Should be \"+\", \"-\", or \".\".\n"; } $errors[6]++; if(!$strict){ if($data[6] eq "1"){ $data[6] = "+"; } elsif($data[6] eq "-1"){ $data[6] = "-"; } elsif($data[6] eq "0"){ $data[6] = "."; } } } #verify correct field unless(($data[7] eq "0")||($data[7] eq "1")|| ($data[7] eq "2")||($data[7] eq ".")){ if($errors[7] < $max_error_count){ print $warnings "Value of field is \"$data[7]\" on line ", "$line_num. Should be \"0\", \"1\", \"2\", or \".\"", ".\n"; } $errors[7]++; } #prepare and fields my $attribute_string = ""; if($in_line =~ /^\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t(.*)$/){ $attribute_string = "$1 "; } #check for equals if($attribute_string =~ / = /){ if($errors[30] < $max_error_count){ print $warnings "Illegal \'=\' character in field on ", "line $line_num\n"; } $errors[30]++; $attribute_string =~ s/=/ /g; } # check for tab character. if($attribute_string =~ /\t/){ if($errors[9] < $max_error_count){ print $warnings "Illegal tab character in field on ", "line $line_num.\n"; } $errors[9]++; $attribute_string =~ s/\t/ /g; } # make sure it has semicolon terminator after each attribute # otherwise fake it. unless($attribute_string =~ /;/){ if($errors[10] < $max_error_count){ print $warnings " field conatins no semicolons", " terminating each attribute on line $line_num.\n"; } $errors[10]++; my @att = split /\s+/, $attribute_string; $attribute_string = ""; my $att_count = $#att - (($#att+1) % 2); for(my $i = 0;$i <= $att_count;$i+=2){ $attribute_string .= $att[$i]." ".$att[$i+1]."; "; } } # parse field my $attributes = {}; while($attribute_string =~ /(\S+)(\s+)(\S+)(\s*)/g){ my $name = $1; my $space1 = $2; my $value = $3; my $space2 = $4; if($value =~ /^\"(\S*)\";$/){ $value = $1; } else{ if($value =~ /^(\S+);$/){ $value = $1; } else{ if($errors[10] < $max_error_count){ $value =~ /(.)$/; print $warnings "Bad terminator, \"$1\", after ", " name-value pair, $name $value, on ", "line $line_num. Should be \";\".\n"; } $errors[10]++; } if($value =~ /^\"(\S+)\"$/){ $value = $1; } else{ if($errors[11] < $max_error_count){ print $warnings " field missing quotes around", " values on line $line_num.\n"; } $errors[11]++; } } $attributes->{$name} = $value; unless($space1 =~ /^ $/){ if($errors[12] < $max_error_count){ print $warnings "Bad separator, $space1, between ", "name-value pair, $name $value, on line ", "$line_num.\n"; } $errors[12]++; } unless($space2 =~ /^ $/){ if($errors[12] < $max_error_count){ print $warnings "Bad separator, \"$space2\", between ", " name-value pairs on line $line_num. Should be", " \" \".\n$name $value\n"; } $errors[12]++; } } # check for gene_id and transcript_id attributes my $gid = "missing"; if(defined($attributes->{gene_id})){ $gid = $attributes->{gene_id}; } else{ if($errors[13] < $max_error_count){ print $warnings "gene_id not set in field on line ", "$line_num.\n"; } $errors[13]++; $attributes->{gene_id} = "missing"; } my $tid = "missing"; if(defined($attributes->{transcript_id})){ $tid = $attributes->{transcript_id}; } else{ if($errors[14] < $max_error_count){ print $warnings "transcript_id not set in field on line ", "$line_num.\n"; } $errors[14]++; $attributes->{transcript_id} = "missing"; } #create objects my $feature = GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]); if($feature->type eq "CDS"){ push @all_cds, $feature; } elsif($feature->type eq "inter") { push @all_inter, $feature; } elsif($feature->type eq "inter_CNS") { push @all_cns, $feature; } my $tx; my $gene; unless(defined($TxObj{$tid}) && length($tid)){ $tx = GTF::Transcript::new($tid); $TxObj{$tid} = $tx; unless($feature->type eq "inter" || $feature->type eq "inter_CNS") { push @all_txs, $tx; } unless(defined($GeneObj{$gid}) && length($gid)){ $gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]); $GeneObj{$gid} = $gene; unless($feature->type eq "inter" || $feature->type eq "inter_CNS") { push @all_genes, $gene; } } $gene = $GeneObj{$gid}; $gene->add_transcript($tx); } else{ $gene = $GeneObj{$gid}; } $tx = $TxObj{$tid}; #check that strand value is consistent with the strand of the tx unless($data[6] eq $tx->strand){ if($errors[20] < $max_error_count){ print $warnings "Inconsistent value across gene_id = ". $tx->gene_id.".\n"; } $errors[20]++; if (_check_errors_against_badlist($gtf,20)) { push @bad_list, $tx->id; } if($strict){ die; } #last;??? } $tx->add_feature($feature); } } close(GTF); #check each gene object foreach my $tx (@all_txs){ my $exons = $tx->exons; my $cds = $tx->cds; my $utr3 = $tx->utr3; my $utr5 = $tx->utr5; my $sec = $tx->sec; my $inter = $tx->inter; my $inter_cns = $tx->inter_cns; if($#$exons >= 0 && $#$utr3 == -1 && $#$utr5 == -1){ $tx->create_utr_objects_from_exons; } my $starts = $tx->start_codons; my $stops = $tx->stop_codons; my $strand = $tx->strand; #make sure this tx contains at least one CDS, inter_CNS, or inter feature if($#$cds == -1 && $#$inter_cns == -1 && $#$inter == -1){ if($errors[15] < $max_error_count){ print $warnings "Transcript ".$tx->id." contains no CDS, inter or inter_CNS features.\n"; } $errors[15]++; if (_check_errors_against_badlist($gtf,15)) { push @bad_list, $tx->id; } next; } #verify start/stop codon exists and has length 3 my $start_codon_start = -1; my $start_codon_stop = -1; if($#$starts == -1){ if($errors[17] < $max_error_count){ print $warnings "Missing start_codon for transcript \"".$tx->id."\".\n"; } $errors[17]++; if (_check_errors_against_badlist($gtf,17)) { push @bad_list, $tx->id; } } else{ my $length = 0; foreach my $start (@$starts){ $length += $start->stop - $start->start + 1; if(($start_codon_start < 0) || ($start_codon_start > $start->start)){ $start_codon_start = $start->start; } if(($start_codon_stop < 0) || ($start_codon_stop < $start->stop)){ $start_codon_stop = $start->stop; } } if($length != 3){ if($errors[16] < $max_error_count){ print $warnings "Start Codon length is not three in transcript \"".$tx->id."\".\n"; } $errors[16]++; if (_check_errors_against_badlist($gtf,16)) { push @bad_list, $tx->id; } } } my $stop_codon_start = -1; my $stop_codon_stop = -1; if($#$stops == -1){ if($errors[19] < $max_error_count){ print $warnings "Missing stop_codon for transcript \"".$tx->id."\".\n"; } $errors[19]++; if (_check_errors_against_badlist($gtf,19)) { push @bad_list, $tx->id; } } else{ my $length = 0; foreach my $stop (@$stops){ $length += $stop->stop - $stop->start + 1; if(($stop_codon_start < 0) || ($stop_codon_start > $stop->stop)){ $stop_codon_start = $stop->start; } if(($stop_codon_stop < 0) || ($stop_codon_stop < $stop->stop)){ $stop_codon_stop = $stop->stop; } } if($length != 3){ if($errors[18] < $max_error_count){ print $warnings "Stop Codon length is not three in gene \"".$tx->id."\".\n"; } $errors[18]++; if (_check_errors_against_badlist($gtf,18)) { push @bad_list, $tx->id; } } } #check that no cds or utr features overlap each other # also counts introns that are too short # introns shorter than 4 bp are in all likelihood # biologically impossible (at least 2 bp for each splice signal) my $p_exons; my $overlap = 0; my $zero_intron_count = 0; my $short_intron_count = 0; foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3) { my $last_stop; if ($#$p_exons > 0) { $last_stop = $$p_exons[0]->stop; } for(my $i = 1;$i <= $#$p_exons;$i++){ my $intron_length = $$p_exons[$i]->start - $last_stop - 1; if($intron_length < 0){ $overlap++; } elsif($intron_length < 4){ $zero_intron_count++; } elsif($intron_length < 20){ $short_intron_count++; } $last_stop = $$p_exons[$i]->stop; } } # check CDS - UTR junctions for overlaps # here, introns with length 0 are allowed because some # exons could be in part UTR and in part CDS if ($tx->strand eq "+") { my $ilen; if (($#$utr5 > -1) && ($#$starts > -1)){ $ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } if (($#$cds > -1) && ($#$stops > -1)){ # assumes stop codons are not included in CDS $ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } if (($#$stops > -1) && ($#$utr3 > -1)){ $ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } } else{ my $ilen; if (($#$utr3 > -1) && ($#$stops > -1)){ $ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } if (($#$stops > -1) && ($#$cds > -1)){ # assumes stop codon is not included in CDS $ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } if (($#$starts > -1) && ($#$utr5 > -1)){ $ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1; $overlap++ if ($ilen < 0); $zero_intron_count++ if (($ilen > 0) && ($ilen < 4)); } } if($overlap > 0){ if($errors[45] < $max_error_count){ print $warnings "Overlapping CDS or UTR features in transcript \"".$tx->id."\".\n"; } $errors[45]++; if (_check_errors_against_badlist($gtf,45)) { push @bad_list, $tx->id; } } if($zero_intron_count > 0){ if($errors[48] < $max_error_count){ print $warnings "$zero_intron_count intron(s) shorter than 4 bp found on in transcript \"".$tx->id."\".\n" ; } $errors[48]++; if (_check_errors_against_badlist($gtf,48)) { push @bad_list, $tx->id; } } if($short_intron_count > 0){ if($errors[49] < $max_error_count){ print $warnings "$short_intron_count short (<20bp) intron(s) found on in transcript \"".$tx->id."\".\n" ; } $errors[49]++; if (_check_errors_against_badlist($gtf,49)) { push @bad_list, $tx->id; } } #check that each cds is between the start and stop and that the initial and #terminal exons are placed correctly with respect to the start/stop codons if($start_codon_start >= 0){ if($strand eq '-'){ for(my $i = $#$cds;$i >= 0;$i--){ if($$cds[$i]->end > $start_codon_stop){ if($errors[21] < $max_error_count){ print $warnings "CDS before start codon in transcript_id = \"". "".$tx->id."\".\n"; } $errors[21]++; if (_check_errors_against_badlist($gtf,21)) { push @bad_list, $tx->id; } } } my $cds = $tx->initial_exon; if($cds->end != $start_codon_stop){ if($errors[33] < $max_error_count){ print $warnings "Start codon location not consistent with initial exon ". "locaiton in transcript_id = \"".$tx->id."\".\n"; } $errors[33]++; if (_check_errors_against_badlist($gtf,33)) { push @bad_list, $tx->id; } } if($cds->length < 3){ my $cds = $tx->cds; my $error = 0; for(my $i = 0;$i <= $#$starts;$i++){ if($i == $#$starts){ unless($$cds[$#$cds-$i]->start <= $$starts[$#$starts-$i]->start){ $error = 1; } } else{ unless($$cds[$#$cds-$i]->start == $$starts[$#$starts-$i]->start){ $error = 1; } } } if($error){ if($errors[47] < $max_error_count){ print $warnings "Start codon annotated in intron region on ". "transcript_id = \"".$tx->id."\".\n"; } $errors[47]++; if (_check_errors_against_badlist($gtf,47)) { push @bad_list, $tx->id; } my $pos = 0; my $start_pos = 0; my $cds_pos = $#$cds; #fix the start codon by splitting it across the cds while($pos < 3){ my $end; if($$cds[$cds_pos]->length < 3-$pos){ $end = $$cds[$cds_pos]->start; $pos += $$cds[$cds_pos]->length; } else{ $end = $$cds[$cds_pos]->end-2+$pos; $pos = 3; } if($start_pos <= $#$starts){ $$starts[$start_pos]->set_start($end); $$starts[$start_pos]->set_stop($$cds[$cds_pos]->end); } else{ my $sc = GTF::Feature::new("start_codon", $end, $$cds[$cds_pos]->end, ".", "."); $tx->add_feature($sc); } $cds_pos--; $start_pos++; } $starts = [sort {$a->start <=> $b->start} @$starts]; } } } else{ for(my $i = 0;$i <= $#$cds;$i++){ if($$cds[$i]->start < $start_codon_start){ if($errors[21] < $max_error_count){ print $warnings "CDS before start codon in transcript_id = \"". "".$tx->id."\".\n"; } $errors[21]++; if (_check_errors_against_badlist($gtf,21)) { push @bad_list, $tx->id; } } } my $cds = $tx->initial_exon; if($cds->start != $start_codon_start){ if($errors[33] < $max_error_count){ print $warnings "Start codon location not consistent with initial exon ". "locaiton in transcript_id = \"".$tx->id."\".\n"; } $errors[33]++; if (_check_errors_against_badlist($gtf,33)) { push @bad_list, $tx->id; } } if($cds->length < 3){ my $cds = $tx->cds; my $error = 0; for(my $i = 0; $i <= $#$starts;$i++){ if($i == $#$starts){ unless($$cds[$i]->stop >= $$starts[$i]->stop){ $error = 1; } } else{ unless($$cds[$i]->stop == $$starts[$i]->stop){ $error = 1; } } } if($error){ if($errors[47] < $max_error_count){ print $warnings "Start codon annotated in intron region on ". "transcript_id = \"".$tx->id."\".\n"; } $errors[47]++; if (_check_errors_against_badlist($gtf,47)) { push @bad_list, $tx->id; } my $pos = 0; my $start_pos = 0; my $cds_pos = 0; #fix the start codon by splitting it across the cds while($pos < 3){ my $end; if($$cds[$cds_pos]->length < 3-$pos){ $end = $$cds[$cds_pos]->stop; $pos += $$cds[$cds_pos]->length; } else{ $end = $$cds[$cds_pos]->start+2-$pos; $pos = 3; } if($start_pos <= $#$starts){ $$starts[$start_pos]->set_start($$cds[$cds_pos]->start); $$starts[$start_pos]->set_stop($end); } else{ my $sc = GTF::Feature::new("start_codon", $$cds[$cds_pos]->start, $end, ".", "."); $tx->add_feature($sc); } $cds_pos++; $start_pos++; } $starts = [sort {$a->start <=> $b->start} @$starts]; } } } } if($stop_codon_start >= 0){ if($strand =~ /\-/){ for(my $i = 0;$i <= $#$cds;$i++){ if($$cds[$i]->start < $stop_codon_start){ if($errors[22] < $max_error_count){ print $warnings "CDS after stop codon in transcript_id = \"". "".$tx->id."\".\n"; } $errors[22]++; if (_check_errors_against_badlist($gtf,22)) { push @bad_list, $tx->id; } } } my $cds = $tx->terminal_exon; if($cds->start == $stop_codon_start){ unless($cds->stop < $stop_codon_stop + 1){ $cds->set_start($stop_codon_stop + 1); } if($errors[46] < $max_error_count){ print $warnings "Stop codon included in terminal CDS transcript_id = ". "\"".$tx->id."\".\n"; } $errors[46]++; if (_check_errors_against_badlist($gtf,46)) { push @bad_list, $tx->id; } } if($cds->start != $stop_codon_stop + 1){ if($errors[34] < $max_error_count){ print $warnings "Stop codon location not consistent with terminal exon ". "locaiton in transcript_id = \"".$tx->id."\".\n"; } $errors[34]++; if (_check_errors_against_badlist($gtf,34)) { push @bad_list, $tx->id; } } } else{ for(my $i = $#$cds;$i >= 0;$i--){ if($$cds[$i]->end > $stop_codon_stop){ if($errors[22] < $max_error_count){ print $warnings "CDS after stop codon in transcript_id = \"".$tx->id."\".\n"; } $errors[22]++; if (_check_errors_against_badlist($gtf,22)) { push @bad_list, $tx->id; } } } my $cds = $tx->terminal_exon; if($cds->stop == $stop_codon_stop){ unless($cds->start > $stop_codon_start -1){ $cds->set_stop($stop_codon_start - 1); } if($errors[46] < $max_error_count){ print $warnings "Stop codon included in terminal CDS transcript_id = ". "\"".$tx->id."\".\n"; } $errors[46]++; if (_check_errors_against_badlist($gtf,46)) { push @bad_list, $tx->id; } } if($cds->stop != $stop_codon_start - 1){ if($errors[34] < $max_error_count){ print $warnings "Stop codon location not consistent with terminal exon ". "locaiton in transcript_id = \"".$tx->id."\".\n"; } $errors[34]++; if (_check_errors_against_badlist($gtf,34)) { push @bad_list, $tx->id; } } } } #check that complete transcripts have even number of codons if(($start_codon_start >= 0) && ($stop_codon_stop >= 0)){ my $len = 0; for(my $i = $#$cds;$i >= 0;$i--){ $len += $$cds[$i]->length; } unless(($len % 3) == 0){ if($errors[41] < $max_error_count){ print $warnings "Complete transcript length not a multiple of three". " on transcript_id = \"".$tx->id."\".\n"; } $errors[41]++; if (_check_errors_against_badlist($gtf,41)) { push @bad_list, $tx->id; } } } #check for consistent phase my $phase = -1; if($start_codon_start >= 0){ $phase = 0; } elsif($start_codon_stop >= 0){ my $len = 0; for(my $i = $#$cds;$i >= 0;$i--){ $len += $$cds[$i]->length; } $phase = $len % 3; } if($strand eq "-"){ if($phase == -1){ $phase = $$cds[$#$cds]->frame; if($phase eq "."){ my $pos = -1; for(my $i = $#$cds; $i >= 0;$i--){ unless($$cds[$i]->frame eq "."){ $pos = $i; $phase = $$cds[$i]->frame; last; } } if($pos >= 0){ for(my $i = $pos + 1; $i <= $#$cds;$i++){ my $len = $$cds[$i]->length + $phase; $phase = $len % 3; } } } } unless($phase eq "."){ my $bad_frame = 0; for(my $i = $#$cds;$i >= 0;$i--){ unless($phase eq $$cds[$i]->frame){ unless($$cds[$i]->frame eq "."){ $bad_frame = 1; } $$cds[$i]->set_frame($phase); } $phase = ($$cds[$i]->length - $phase) % 3; if($phase == 1){ $phase = 2; } elsif($phase == 2){ $phase = 1; } } if($bad_frame){ if($errors[42] < $max_error_count){ print $warnings "Inconsistent frame accross transcript_id = ". "\"".$tx->id."\".\n"; } $errors[42]++; if (_check_errors_against_badlist($gtf,42)) { push @bad_list, $tx->id; } } } } else{ if($phase == -1){ $phase = $$cds[0]->frame; if($phase eq "."){ my $pos = -1; for(my $i = 0; $i <= $#$cds;$i++){ unless($$cds[$i]->frame eq "."){ $pos = $i; $phase = $$cds[$i]->frame; last; } } if($pos >= 0){ for(my $i = $pos - 1; $i >= 0;$i--){ my $len = $$cds[$i]->length + $phase; $phase = $len % 3; } } } } unless($phase eq "."){ my $bad_frame = 0; for(my $i = 0; $i <= $#$cds;$i++){ unless($phase eq $$cds[$i]->frame){ unless($$cds[$i]->frame eq "."){ $bad_frame = 1; } $$cds[$i]->set_frame($phase); } $phase = ($$cds[$i]->length - $phase) % 3; if($phase == 1){ $phase = 2; } elsif($phase == 2){ $phase = 1; } } if($bad_frame){ if($errors[42] < $max_error_count){ print $warnings "Inconsistent frame accross transcript_id = ". "\"".$tx->id."\".\n"; } $errors[42]++; if (_check_errors_against_badlist($gtf,42)) { push @bad_list, $tx->id; } } } } # check selenocysteine (SEC) features here. must be inside a # CDS and have frame 0 if($#$sec >= 0){ foreach my $s (@$sec){ if($s->frame ne "0" && $s->frame ne "."){ #bad SEC frame if($errors[51] < $max_error_count){ print $warnings "Bad frame, ".$s->frame."on SEC feature on tx ".$tx->id.". Must of 0 or \".\"\n"; } $errors[51]++; if (_check_errors_against_badlist($gtf,51)) { push @bad_list, $tx->id; } } my $good = 0; foreach my $c (@$cds){ if($c->stop < $s->start){ next; } if($c->start <= $s->start && $c->stop >= $s->stop){ $good = 1; last; } if($c->start > $s->stop){ last; } } if($good == 0){ # SEC outside CDS if($errors[52] < $max_error_count){ print $warnings "SEC feature outside of CDS region on tx ".$tx->id.".\n"; } $errors[52]++; if (_check_errors_against_badlist($gtf,52)) { push @bad_list, $tx->id; } } } } } #if the conservation sequence is available store it's info; my $conseqname = $gtf->{Conseq}; my %total_conserv = (0 => 0,1 => 0,2 => 0); if($conseqname){ open(Conseq, $conseqname) or die "Could not open conservation sequence file: \"$conseqname\".\n"; my @conseq_info = stat(Conseq); my @comp; foreach my $tx (@all_txs){ my $cds = $tx->cds; my $exons = $tx->exons; my $starts = $tx->start_codons; my $stops = $tx->stop_codons; foreach my $start (@$starts){ push @comp,{feature => $start, "0" => 0, "1" => 0, "2" => 0}; } foreach my $stop (@$stops){ push @comp,{feature => $stop, "0" => 0, "1" => 0, "2" => 0}; } foreach my $exon (@$cds){ push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0}; } foreach my $exon (@$exons){ push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0}; } } @comp = sort {$a->{feature}->start <=> $b->{feature}->start} @comp; my $base = "X"; my $nuc_pos = 0; my $file_pos = 0; my $line = "X"; my $next_comp = 0; my $max_comps = $#comp; my %checking; while(($next_comp <= $max_comps) and ($file_pos < $conseq_info[7])){ $base = getc(Conseq); unless(defined($base)){ print $warnings "Features beyond end of conservation sequence.\n"; last; } $file_pos++; if($base =~ /\d/){ $nuc_pos++; if(defined($total_conserv{$base})){ $total_conserv{$base}++; } else{ $total_conserv{X}++; if($errors[43] < $max_error_count){ print $warnings "Bad conservation sequence character, $base. ". "Should be \'0\',\'1\', or \'2\'.\n"; } $errors[43]++; } #compile composition stats while(($next_comp <= $max_comps) and ($nuc_pos == $comp[$next_comp]->{feature}->start)){ $checking{$comp[$next_comp]} = $comp[$next_comp]; $next_comp++; } foreach my $id (keys %checking){ $checking{$id}->{$base}++; if($checking{$id}->{feature}->stop == $nuc_pos){ delete $checking{$id}; } } } } while($file_pos < $conseq_info[7]){ $base = getc(Conseq); $file_pos++; if($base =~ /\d/){ $nuc_pos++; if(defined($total_conserv{$base})){ $total_conserv{$base}++; } else{ $total_conserv{X}++; if($errors[43] < $max_error_count){ print $warnings "Bad conservation sequence character, $base. ". "Should be \'0\',\'1\', or \'2\'.\n"; } $errors[43]++; } } } close(Conseq); foreach my $info (@comp){ $info->{feature}->set_conseq($info->{0},$info->{1},$info->{2}); } } $gtf->{Total_Conseq} = \%total_conserv; #if the sequence is available use it to fix any problems my @inframe_stops; my $seqname = $gtf->{Sequence}; my %tx_phase; foreach my $tx (@all_txs){ $tx_phase{$tx->id} = -1; } my %total_seq = ('A' => 0,'C' => 0,'G' =>0,'T' => 0,'N' => 0,'X' => 0); #EVAN need to rewrite all this sequence loading stuff because it is fugly if($seqname){ open(SEQ, "<$seqname") or die "Could not open sequence file: \"$seqname\".\n"; my @seq_info = stat(SEQ); #try to fix things with the sequence info here #check start codons, stop codons, splice site, internal stop my %sequence; my %seq_pos; my @checks; foreach my $tx (@all_txs){ $sequence{$tx->id} = ""; $seq_pos{$tx->id} = 0; my $starts = $tx->start_codons; my $stops = $tx->stop_codons; my $utr3 = $tx->utr3; my $utr5 = $tx->utr5; my $cds = $tx->cds; if($#$cds == -1){ next; } my $exons = $tx->exons; foreach my $start (@$starts){ push @checks, {pos => $start->start, type => "start_sequence", tx => $tx, object => $start, stop => $start->stop}; } foreach my $stop (@$stops){ push @checks, {pos => $stop->start, type => "stop_sequence", tx => $tx, object => $stop, stop => $stop->stop}; } foreach my $exon (@$exons){ push @checks, {pos => $exon->start, type => "exon_sequence", tx => $tx, object => $exon, stop => $exon->stop}; } foreach my $coding (@$cds){ push @checks, {pos => $coding->start, type => "coding_sequence", tx => $tx, object => $coding, stop => $coding->stop}; } my($p_exons, $splice_type1, $splice_type2); if ($tx->strand eq "+"){ $splice_type1 = "donor_splice"; $splice_type2 = "acceptor_splice"; } else{ $splice_type1 = "acceptor_splice"; $splice_type2 = "donor_splice"; } foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3){ next if ($#$p_exons < 1); my($i, $subtype); $subtype = ($$p_exons[0]->type =~ /UTR/) ? "utr" : "cds"; for ($i = 1; $i <= $#$p_exons; $i++){ push @checks, {pos => $$p_exons[$i-1]->stop+2, type => $subtype."_".$splice_type1, tx => $tx, stop => 0}; push @checks, {pos => $$p_exons[$i]->start-1, type => $subtype."_".$splice_type2, tx => $tx, stop => 0}; } } # this assumes that the innermost start codon is included in CDS # therefore, no checks are run on start_codon - CDS junction # also, this allows zero-length introns at UTR5 - start_codon # CDS - stop_codon and UTR3 - stop_codon junctions if ($tx->strand eq "+"){ if (($#$utr5 > -1) && ($#$starts > -1)){ my $ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$utr5[$#$utr5]->stop+2, type => "utr_donor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$starts[0]->start-1, type => "utr_acceptor_splice", tx => $tx, stop => 0}; } } if (($#$cds > -1) && ($#$stops > -1)){ my $ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$cds[$#$cds]->stop+2, type => "cds_donor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$stops[0]->start-1, type => "cds_acceptor_splice", tx => $tx, stop => 0}; } } if (($#$stops > -1) && ($#$utr3 > -1)){ my $ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$stops[$#$stops]->stop+2, type => "utr_donor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$utr3[0]->start-1, type => "utr_acceptor_splice", tx => $tx, stop => 0}; } } if ($#{$stops} < 0){ push @checks, {pos => $$cds[$#{$cds}]->stop+1, type => "stop_sequence?", tx => $tx, stop => $$cds[$#{$cds}]->stop+3}; } if ($#{$starts} < 0){ if ($$cds[0]->start-3 > 0){ push @checks, {pos => $$cds[0]->start-3, type => "start_sequence?", tx => $tx, stop => $$cds[0]->start-1}; } else{ $sequence{$tx->id} = "NNN"; } } } else{ if (($#$utr3 > -1) && ($#$stops > -1)){ my $ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$utr3[$#$utr3]->stop+2, type => "utr_acceptor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$stops[0]->start-1, type => "utr_donor_splice", tx => $tx, stop => 0}; } } if (($#$stops > -1) && ($#$cds > -1)){ my $ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$stops[$#$stops]->stop+2, type => "cds_acceptor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$cds[0]->start-1, type => "cds_donor_splice", tx => $tx, stop => 0}; } } if (($#$starts > -1) && ($#$utr5 > -1)){ my $ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1; if ($ilen != 0){ push @checks, {pos => $$starts[$#$starts]->stop+2, type => "utr_acceptor_splice", tx => $tx, stop => 0}; push @checks, {pos => $$utr5[0]->start-1, type => "utr_donor_splice", tx => $tx, stop => 0}; } } if ($#{$stops} < 0){ if ($$cds[0]->start-3 > 0){ push @checks, {pos => $$cds[0]->start-3, type => "stop_sequence?", tx => $tx, stop => $$cds[0]->start-1}; } } if ($#{$starts} < 0){ push @checks, {pos => $$cds[$#{$cds}]->stop+1, type => "start_sequence?", tx => $tx, stop => $$cds[$#{$cds}]->stop+3}; } } } @checks = sort{$a->{pos} <=> $b->{pos} || $a->{stop} <=> $b->{stop}} @checks; my $file_pos = length(); my $base = "X"; my $nuc_pos = 0; my $line = "X"; my $next = 0; my $prev_base = "X"; my $max_checks = $#checks; my %checking; my $splice_type; while((($next <= $max_checks) || ((keys %checking) >= 0)) && ($file_pos < $seq_info[7])){ $base = getc(SEQ); unless(defined($base)){ print $warnings "Features beyond end of sequence.\n"; last; } $file_pos++; if($base =~ /\S/){ $base = uc($base); if((defined($total_seq{$base})) && ($base ne 'X')){ $total_seq{$base}++; } else{ $total_seq{X}++; if($errors[44] < $max_error_count){ print $warnings "Bad fasta sequence character, $base. Should be". " \'A\',\'C\',\'T\',\'G\',or \'N\'.\n"; } $errors[44]++; } $nuc_pos++; while(($next <= $max_checks) && ($checks[$next]{pos} == $nuc_pos)){ if($checks[$next]{type} =~ /sequence/){ $checking{$checks[$next]{tx}->id . "\t" . $checks[$next]{type}} = { check => $checks[$next], A => 0, C => 0, G => 0, T => 0, N => 0}; } elsif(($checks[$next]{type} =~ /^(\w+)_acceptor_splice$/) && ($splice_type = $1)) { if($checks[$next]{tx}->strand eq "-"){ unless(($prev_base eq "C") && ($base eq "T")){ if (($prev_base eq "G") && ($base eq "T")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on negative strand transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } else { if($errors[26] < $max_error_count){ print $warnings "Bad acceptor splice site sequence, $prev_base". "$base, on negative strand transcript \"". $checks[$next]{tx}->id. "\". Should be CT or GT.\n"; } $errors[26]++; if (_check_errors_against_badlist($gtf,26)) { push @bad_list, $checks[$next]{tx}->id; } } } } else{ unless(($prev_base eq "A") && ($base eq "G")){ if (($prev_base eq "A") && ($base eq "C")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } else { if($errors[26] < $max_error_count){ print $warnings "Bad acceptor splice site sequence, $prev_base". "$base, on transcript \"". $checks[$next]{tx}->id. "\". Should be AG or AC.\n"; } $errors[26]++; if (_check_errors_against_badlist($gtf,26)) { push @bad_list, $checks[$next]{tx}->id; } } } } } elsif(($checks[$next]{type} =~ /^(\w+)_donor_splice$/) && ($splice_type = $1)) { if($checks[$next]{tx}->strand eq "-"){ unless(($prev_base eq "A") && ($base eq "C")){ if (($prev_base eq "A") && ($base eq "T")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on negative strand transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } elsif (($prev_base eq "G") && ($base eq "C")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on negative strand transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } else { if ($errors[27] < $max_error_count){ print $warnings "Bad donor splice site sequence, $prev_base". "$base, on negative strand transcript \"". $checks[$next]{tx}->id. "\". Should be AC, GC, or AT.\n"; } $errors[27]++; if (_check_errors_against_badlist($gtf,27)) { push @bad_list, $checks[$next]{tx}->id; } } } } else{ unless(($prev_base eq "G") && ($base eq "T")){ if (($prev_base eq "G") && ($base eq "C")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } elsif (($prev_base eq "A") && ($base eq "T")) { if ($errors[50] < $max_error_count) { print $warnings "Possible non-canonical splice site sequence, ". "$prev_base$base, on transcript \"". $checks[$next]{tx}->id. "\".\n"; } $errors[50]++; if ((_check_errors_against_badlist($gtf,50)) && ($splice_type ne "utr")) { push @bad_list, $checks[$next]{tx}->id; } } else { if ($errors[27] < $max_error_count){ print $warnings "Bad donor splice site sequence, $prev_base". "$base, on transcript \"". $checks[$next]{tx}->id. "\". Should be GT, GC, or AT.\n"; } $errors[27]++; if (_check_errors_against_badlist($gtf,27)) { push @bad_list, $checks[$next]{tx}->id; } } } } } $next++; } foreach my $check (keys %checking){ $check =~ /^(.+)\t.+$/; my $tx_id = $1; unless(($checking{$check}{check}{type} =~ /exon/) || ($seq_pos{$tx_id} >= $nuc_pos)){ $sequence{$tx_id}.= uc($base); $seq_pos{$tx_id} = $nuc_pos; } $checking{$check}{uc($base)}++; if($checking{$check}{check}{stop} eq $nuc_pos){ my $object = $checking{$check}{check}{object}; if(defined($object)){ if($object->strand eq "-"){ $object->set_bases($checking{$check}{T}, $checking{$check}{G}, $checking{$check}{C}, $checking{$check}{A}, $checking{$check}{N}); } else{ $object->set_bases($checking{$check}{A}, $checking{$check}{C}, $checking{$check}{G}, $checking{$check}{T}, $checking{$check}{N}); } } delete $checking{$check}; } } $prev_base = $base; } } foreach my $tx (@all_txs){ my $cds = $tx->cds; unless($#$cds >= 0){ next; } my $sec = $tx->sec; my $starts = $tx->start_codons; my $stops = $tx->stop_codons; my $strand = $tx->strand; my $tx_id = $tx->id; my $phase = $$cds[0]->frame; my $old_phase = $phase; if($strand eq "-"){ $phase = $$cds[$#$cds]->frame; } my $seq = $sequence{$tx->id}; if($seq =~ /^$/){ print STDERR "No sequence for $tx_id\n"; next; } if($tx->strand eq "-"){ $seq = _rev_comp($seq); } my $start = 0; my $stop = 0; my $found_start = -1; my $found_stop = -1; my @seq = split //, $seq; #check some SEC stuff here; foreach my $s (@$sec){ my $pos = 0; if($tx->strand eq '-'){ foreach my $c (reverse @$cds){ if($c->start > $s->stop){ $pos += $c->stop-$c->start+1; } else{ $pos += $c->stop-$s->stop; last; } } if($#$stops == -1){ $pos += 3; } } else{ foreach my $c (@$cds){ if($c->stop < $s->start){ $pos += $c->stop-$c->start+1; } else{ $pos += $s->start-$c->start; last; } } if($#$starts == -1){ $pos += 3; } } my $codon = substr($seq,$pos,3); unless($codon eq "TGA"){ #bad SEC codon $error_msgs[53] = "SEC feature had bad sequence. Must be \"TGA\""; if($errors[53] < $max_error_count){ print $warnings "SEC feature had bad sequence, $codon, on tx, ".$tx->id. ". Must be \"TGA\"\n"; } $errors[53]++; if (_check_errors_against_badlist($gtf,53)) { push @bad_list, $tx->id; } } substr($seq,$pos,3) = "NNN"; } if($#$starts >= 0){ my $sc = substr($seq,0,3); unless($sc eq "ATG"){ if($errors[24] < $max_error_count){ print $warnings "Bad start codon sequence, $sc, on tx ". $tx->id.". Should be ATG.\n"; } $errors[24]++; if (_check_errors_against_badlist($gtf,24)) { push @bad_list, $tx->id; } } } else{ my $sc1 = substr($seq,0,3); my $sc2 = substr($seq,3,3); if($sc2 eq "ATG"){ $found_start = 3; $start = 3; } elsif($sc1 eq "ATG"){ $found_start = 0; } else{ $start = 3; } } if($#$stops >= 0){ my $sc = substr($seq,$#seq-2,3); unless(($sc eq "TAA") || ($sc eq "TAG") || ($sc eq "TGA")){ if($errors[25] < $max_error_count){ print $warnings "Bad stop codon sequence, $sc, on tx ". $tx->id.". Should be TAA, TAG, or TGA.\n"; } $errors[25]++; if (_check_errors_against_badlist($gtf,25)) { push @bad_list, $tx->id; } } } else{ my $sc1 = substr($seq,$#seq - 2,3); my $sc2 = substr($seq,$#seq - 5,3); if(($sc2 eq "TAA") || ($sc2 eq "TAG") || ($sc2 eq "TGA")){ $found_stop = $#seq - 5; $stop = 3; } elsif(($sc1 eq "TAA") || ($sc1 eq "TAG") || ($sc1 eq "TGA")){ $found_stop = $#seq - 2; } else{ $stop = 3; } } my @sc = (0,0,0); my $scphase = $phase; $scphase = 0 if($scphase eq '.'); for(my $i = $start;$i <= $#seq - $stop - 5;$i++){ my $codon = substr($seq,$i,3); if(($codon eq "TAA") || ($codon eq "TAG") || ($codon eq "TGA")){ $sc[($i+$scphase) % 3] = 1; } } if(($phase ne ".") && ($sc[0])){ if($errors[29] < $max_error_count){ print $warnings "In frame stop codon(s) found on transcript ". $tx->id."\n"; } $errors[29]++; if (_check_errors_against_badlist($gtf,29)) { push @bad_list, $tx->id; } push @inframe_stops, $tx->id; } else{ if(($sc[0] == 1) && ($sc[1] == 1) && ($sc[2] == 1)){ push @inframe_stops, $tx->id; if($errors[37] < $max_error_count){ print $warnings "Stop Codons in all frames on transcript ". $tx->id."\n"; } $errors[37]++; if (_check_errors_against_badlist($gtf,37)) { push @bad_list, $tx->id; } } } # add start or stop if possible if($found_stop != -1){ if(($phase eq ".") || ($phase == (($#seq + 1) % 3))){ if($sc[($#seq +1) % 3] == 0){ $phase = ($#seq + 1) % 3; if($errors[32] < $max_error_count){ print $warnings "Unannotated stop codon found on transcript ". $tx->id."\n"; } $errors[32]++; if (_check_errors_against_badlist($gtf,32)) { push @bad_list, $tx->id; } if($fix_gtf){ if($strand eq "+"){ my $terminal_exon = $$cds[$#$cds]; my $sc_start = $terminal_exon->stop + 1 - $stop; my $stop_codon = GTF::Feature::new("stop_codon",$sc_start, $sc_start + 2,0,0); $terminal_exon->set_stop($sc_start - 1); $tx->add_feature($stop_codon); } else{ my $terminal_exon = $$cds[0]; my $sc_start = $terminal_exon->start - 3 + $stop; my $stop_codon = GTF::Feature::new("stop_codon",$sc_start, $sc_start + 2,0,0); $terminal_exon->set_start($sc_start + 3); $tx->add_feature($stop_codon); } } } } } if($found_start != -1){ if(($phase eq ".") || ($phase == 0)){ if($sc[0] == 0){ if($errors[31] < $max_error_count){ print $warnings "Unannotated start codon found on transcript ". $tx->id."\n"; } $errors[31]++; if (_check_errors_against_badlist($gtf,31)) { push @bad_list, $tx->id; } if($fix_gtf){ if($strand eq "+"){ my $initial_exon = $$cds[0]; my $sc_start = $initial_exon->start - 3 + $start; my $start_codon = GTF::Feature::new("start_codon",$sc_start, $sc_start + 2,0,0); $initial_exon->set_start($sc_start); $tx->add_feature($start_codon); } else{ my $initial_exon = $$cds[$#$cds]; my $sc_start = $initial_exon->stop + 1 - $start; my $start_codon = GTF::Feature::new("start_codon",$sc_start, $sc_start + 2,0,0); $initial_exon->set_stop($sc_start + 2); $tx->add_feature($start_codon); } } } } } # set phase if unset here; if(($phase ne ".") && ($old_phase eq ".") && ($fix_gtf)){ if($strand eq "+"){ for(my $i = 0;$i <= $#$cds;$i++){ $$cds[$i]->set_frame($phase); $phase = ($$cds[$i]->length - $phase) % 3; if($phase == 1){ $phase = 2; } elsif($phase == 2){ $phase = 1; } } } else{ for(my $i = $#$cds;$i >= 0;$i--){ $$cds[$i]->set_frame($phase); $phase = ($$cds[$i]->length - $phase) % 3; if($phase == 1){ $phase = 2; } elsif($phase == 2){ $phase = 1; } } } } if($gtf->{Tx}){ my $tx_fh = $gtf->{Tx}; print $tx_fh ">".$tx->id ."\t".$tx->start."\t".$tx->stop."\n". substr($seq,$start,$#seq - $stop - $start + 1)."\n"; } } } $gtf->{Total_Seq} = \%total_seq; for(my $i = 0;$i <= $#error_msgs;$i++){ if(($errors[$i] > 0) && !($$no_warn[$i])){ print $warnings "\nWarnings encountered:\n", "Count\tDescription\n"; for(my $j = $i;$j <= $#error_msgs;$j++){ if(($errors[$j] > 0) && !($$no_warn[$j])){ print $warnings $errors[$j],"\t$error_msgs[$j]\n"; } } last; } } #sort genes, transcripts, cds and utr by start position @all_genes = sort {$a->start <=> $b->start} @all_genes; @all_txs = sort {$a->start <=> $b->start} @all_txs; @all_cds = sort {$a->start <=> $b->start} @all_cds; @all_inter = sort {$a->start <=> $b->start} @all_inter; @all_cns = sort {$a->start <=> $b->start} @all_cns; print $warnings "\nStatistics:\n"; print $warnings "\t", $#all_genes + 1, " genes with ", $#all_txs + 1, " transcripts containing ", $#all_cds + 1 , " cds.\n"; if($gtf->{Inframe_Stops}){ print $warnings "Inframe Stop Genes:\n"; foreach my $stop (@inframe_stops){ print $warnings "$stop\n"; } } if ($gtf->{Bad_List}) { print $warnings "Bad Genes:\n"; my %bad_out; foreach my $bad (@bad_list) { $bad_out{$bad}++; } my @out = keys %bad_out; while (@out) { print $warnings pop(@out),"\n"; } } close(WARN); close(GTF); $gtf->{Genes} = \@all_genes; $gtf->{Transcripts} = \@all_txs; $gtf->{CDS} = \@all_cds; $gtf->{Inter} = \@all_inter; $gtf->{Inter_CNS} = \@all_cns; $gtf->{Comments} = $all_line_comments; $gtf->{Parse_Errors} = \@errors; } # GTF::_get_error_messages() # This function returns an array of the error messages associated with # each error number. sub _get_error_messages{ my @error_msgs; $error_msgs[0] = "Not enough fields on line."; $error_msgs[1] = "Illegal type of whitespace between fields. Sould be tab."; $error_msgs[2] = "Illegal value for field. Should be ". "\'CDS\', \'exon\', \'start_codon\', or \'stop_codon\'."; $error_msgs[3] = "Illegal value for field. Should be numerical."; $error_msgs[4] = "Illegal value for field. Should be numerical."; $error_msgs[5] = "Illegal value for field. Should be numerical or \'.\'."; $error_msgs[6] = "Illegal value for field. ". "Should be \'+\', \'-\', or \'.\'."; $error_msgs[7] = "Illegal value for field. ". "Should be \'0\', \'1\', \'2\', or \'.\'."; $error_msgs[8] = "Illegal \'\#\' character in attribute name."; $error_msgs[9] = "Illegal tab character in field."; $error_msgs[10] = "Missing \';\' terminator after attribute in field."; $error_msgs[11] = "Missing \'\"\'s around attribute values in field."; $error_msgs[12] = "Incorrect separator between attribute name-value pairs in ". " field. Should be \' \'."; $error_msgs[13] = "Missing \'gene_id\' attribute in field."; $error_msgs[14] = "Missing \'transcript_id\' attribute in field."; $error_msgs[15] = "Transcript contains no CDS, inter or inter_CNS features."; $error_msgs[16] = "Start Codon length is not three."; $error_msgs[17] = "Transcript has no start codon."; $error_msgs[18] = "Stop Codon length is not three."; $error_msgs[19] = "Transcript has no stop codon."; $error_msgs[20] = "Inconsistent value across gene_id."; $error_msgs[21] = "CDS before start codon."; $error_msgs[22] = "CDS after stop codon."; $error_msgs[23] = "Multiple genes using the same transcript_id."; $error_msgs[24] = "Bad start codon sequence. Should be ATG."; $error_msgs[25] = "Bad stop codon sequence. Should be TAA, TAG, or TGA."; $error_msgs[26] = "Bad acceptor splice site sequence."; $error_msgs[27] = "Bad donor splice site sequence."; $error_msgs[28] = "In frame stop codon found."; $error_msgs[29] = "In frame stop codon in transcript."; $error_msgs[30] = "Illegal \'=\' character in field."; $error_msgs[31] = "Unannotated start codon found."; $error_msgs[32] = "Unannotated stop codon found."; $error_msgs[33] = "Start codon location inconsistent with initial exon location."; $error_msgs[34] = "Stop codon location inconsistent with terminal exon location."; $error_msgs[35] = "Start codon length is wrong."; $error_msgs[36] = "Start codon length is wrong."; $error_msgs[37] = "Transcript does not translate in any frame."; $error_msgs[38] = " field is greater than field."; $error_msgs[39] = "Wrong phase value."; $error_msgs[40] = "Missing phase value."; $error_msgs[41] = "Complete transcript length not multiple of 3."; $error_msgs[42] = "Inconsistant phase value accross transcript."; $error_msgs[43] = "Bad conservation sequence character. Should be \'0\',\'1\', ". "or \'2\'."; $error_msgs[44] = "Bad fasta sequence character. Should be \'A\',\'C\',\'T\',\'G\',". "or \'N\'."; $error_msgs[45] = "Overlapping CDS features in transcript."; $error_msgs[46] = "Stop codon included in terminal CDS."; $error_msgs[47] = "Start codon annotated in intron region."; $error_msgs[48] = "Transcript contains zero length intron(s)."; $error_msgs[49] = "Transcript contains short (<20bp) intron(s)."; $error_msgs[50] = "Possible non-canonical splice site sequence."; $error_msgs[51] = "Bad frame on SEC feature. Must be 0 or \".\""; $error_msgs[52] = "SEC feature outside of CDS feature."; $error_msgs[53] = "SEC feature had bad sequence. Must be \"TGA\""; return @error_msgs; } #this checks to see if the error is one of the errors that #we want to return the transcript name and remove sub _check_errors_against_badlist{ my ($gtf,$error_num) = @_; my $bad = $gtf->{Bad_List}; if($$bad[$error_num]) { return 1; } else { return 0; } } #this loads the gtf file without checking for errors sub _load_no_check{ my ($gtf,$warnings) = @_; my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns); my %TxObj; my %GeneObj; my $strict = $gtf->{Strict}; #open file open(GTF, "<$gtf->{Filename}") or die "Could not open $gtf->{Filename}.\n"; while(my $in_line = ){ chomp $in_line; if($in_line =~ /^(.*)\#/){ $in_line = $1; } if($in_line =~ /^\s+(.*)$/){ $in_line = $1; } if($in_line !~ /\S/){ next; } my @data = split /\t/,$in_line; if($#data != 8){ die "Wrong number of fields on line:\n$in_line\n"; } if(!$strict && $data[5] eq "."){ $data[5] = 0; } my %attributes; while($data[8] =~ /^\s*(\S+) \"(\S*)\";(.*)$/){ my $key = $1; my $val = $2; $data[8] = $3; $attributes{$key} = $val; } my $gid; if(defined($attributes{gene_id})){ $gid = $attributes{gene_id}; } else{ die "Missing gene_id\n"; } my $tid; if(defined($attributes{transcript_id})){ $tid = $attributes{transcript_id}; } else{ die "Missing transcript_id\n"; } #create objects my $feature = GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]); if($feature->type eq "CDS"){ push @all_cds, $feature; } elsif($feature->type eq "inter") { push @all_inter, $feature; } elsif($feature->type eq "inter_CNS") { push @all_cns, $feature; } my $tx; my $gene; unless(defined($TxObj{$tid}) && length($tid)){ $tx = GTF::Transcript::new($tid); $TxObj{$tid} = $tx; unless($feature->type eq "inter" || $feature->type eq "inter_CNS") { push @all_txs, $tx; } unless(defined($GeneObj{$gid}) && length($gid)){ $gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]); $GeneObj{$gid} = $gene; unless($feature->type eq "inter" || $feature->type eq "inter_CNS") { push @all_genes, $gene; } } $gene = $GeneObj{$gid}; $gene->add_transcript($tx); } else{ $gene = $GeneObj{$gid}; } $tx = $TxObj{$tid}; if($tx->strand ne $data[6]){ die "Features on transcript ".$tx->id." have different strands\n"; } $tx->add_feature($feature); } close(GTF); #sort genes, exons, and cds by start position @all_genes = sort {$a->start <=> $b->start} @all_genes; @all_txs = sort {$a->start <=> $b->start} @all_txs; @all_cds = sort {$a->start <=> $b->start} @all_cds; @all_inter = sort {$a->start <=> $b->start} @all_inter; @all_cns = sort {$a->start <=> $b->start} @all_cns; $gtf->{Genes} = \@all_genes; $gtf->{Transcripts} = \@all_txs; $gtf->{CDS} = \@all_cds; $gtf->{Inter_CNS} = \@all_cns; $gtf->{Inter} = \@all_inter; } sub _rev_comp{ my ($seq) = @_; $seq = uc $seq; my @bases = split //, $seq; my $out = ""; foreach my $base (@bases){ if($base eq 'A'){ $base = 'T'; } elsif($base eq 'C'){ $base = 'G'; } elsif($base eq 'G'){ $base = 'C'; } elsif($base eq 'T'){ $base = 'A'; } $out = $base.$out; } return $out; } ############################################################################### # GTF::Gene ############################################################################### # The GTF::Gene object stores all gtf data for one gene. A gene contains one # or more transcripts package GTF::Gene; # GTF::Gene::new(gene_id,seqname,source,strand) # This is the contrcutor for a GTF::Gene object. It has 4 required # parameters. The are: # gene_id - The gene_id for this gene # seqname - The field of the line. Should be a string # containing no whitespace. # source - The field of the line. Should be a string # containing no whitespace. # strand - The field of the line. Should be either # '+', '-', or '.'. sub new { my $gene = bless {}; ($gene->{Id},$gene->{Seqname}, $gene->{Source}, $gene->{Strand}) = @_; $gene->{Transcripts} = []; $gene->{CDS} = []; $gene->{Modified} = 0; $gene->{Start} = -1; $gene->{Stop} = -1; return $gene; } sub add_transcript{ my ($gene,$tx) = @_; my $txs = $gene->{Transcripts}; push @$txs, $tx; $tx->_set_gene($gene); $tx->_update; $gene->{Modified} = 1; } sub id {shift->{Id}} sub set_id{ my ($gene,$id) = @_; $gene->{Id} = $id; } # GTF::Feature::seqname() # This function returns the sequnce name, field, that this # feature came from as a string. sub seqname {shift->{Seqname}} # GTF::Feature::set_source() # This function sets the sequence name, field, of this line sub set_seqname{ my ($feature,$seqname) = @_; $feature->{Seqname} = $seqname; } # GTF::Feature::source() # This function returns the source, field, of this line as a # string. sub source {shift->{Source}} # GTF::Feature::set_source() # This function sets the source, field, of this line sub set_source{ my ($feature,$source) = @_; $feature->{Source} = $source; } # GTF::Feature::strand() # This function returns the strand, field, of this line as a # string. Should be '+', '-', or '.'. sub strand {shift->{Strand}} # GTF::Gene::id() # This function returns this transcript's unique transcript_id sub gene_id{ my ($gene) = @_; return $gene->{Id}; } # GTF::Gene::transcript_ids() # This function returns an array of all trnacsripts in this gene sub transcripts{ my ($gene) = @_; $gene->_update; return $gene->{Transcripts};# [sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$gene->{Transcripts}}]; } # GTF::Gene::remove_transcript() sub remove_transcript{ my($gene, $tx_id) = @_; my $txs = $gene->transcripts; my @new_txs = (); @new_txs = grep($_->id ne $tx_id, @$txs); $gene->{Transcripts} = \@new_txs if ($#new_txs < $#$txs); } sub cds{ my ($gene) = @_; my $txs = $gene->transcripts; my @cds; foreach my $tx (@$txs){ push @cds, @{$tx->cds}; } # employ a slow sort to avoid perl sort problems (see GTF::Gene::_update) rpz my $tmp; for(my $a = 0; $a <= $#cds; $a++){ for(my $b = $a+1; $b <= $#cds; $b++){ if(($cds[$a]->start > $cds[$b]->start) || (($cds[$a]->start == $cds[$b]->start) && ($cds[$a]->stop > $cds[$b]->stop))){ $tmp = $cds[$a]; $cds[$a] = $cds[$b]; $cds[$b] = $tmp; } } } return \@cds; } # GTF::Gene::copy() # This function returns a new object which is a copy of this object. Also # creates copies of all transcripts of this gene sub copy{ my ($gene) = @_; my $copy = GTF::Gene::new($gene->id,$gene->seqname,$gene->source,$gene->strand); my $txs = $gene->transcripts; my @new_txs; my $next = 0; foreach my $tx (@$txs){ $next++; my $ntx = $tx->copy; $copy->add_transcript($ntx); } return($copy); } # GTF::Gene::offset(ammount) # This function will offset all locations in this gene by the given ammount sub offset{ my ($gene,$offset) = @_; my $txs = $gene->transcripts; foreach my $tx (@$txs){ $tx->offset($offset); } } # GTF::Gene::output_gtf([file_handle]) # This function outputs the information for this gene in standard # gtf2 format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gtf{ my ($gene,$out_handle) = @_; unless(defined($out_handle)){ $out_handle = \*STDOUT; } my $txs = $gene->transcripts; foreach my $tx (@$txs){ $tx->output_gtf($out_handle); print $out_handle "\n"; } } # GTF::Gene::output_gff([file_handle]) # This function outputs the information for this gene in standard # GFF format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gff{ my ($gene,$out_handle) = @_; unless(defined($out_handle)){ $out_handle = \*STDOUT; } my $txs = $gene->transcripts; foreach my $tx (@$txs){ $tx->output_gff($out_handle); } } # GTF::Gene::reverse_complement(seq_length) # This function takes the length of the sequence this gene came from and # reverse complements everything in the gene. sub reverse_complement{ my ($gene, $seq_length) = @_; my $txs = $gene->transcripts; foreach my $tx (@$txs){ $tx->reverse_complement($seq_length); } $gene->{Modified} = 1; } # GTF::Gene::start() # This function return the lowest coord of any feature in any tx of this gene sub start{ my ($gene) = @_; $gene->_update; return $gene->{Start}; } # GTF::Gene::stop() # This function returns the highest coord of any feature in any tx of this gene sub stop{ my ($gene) = @_; $gene->_update; return $gene->{Stop}; } sub equals{ my ($gene,$compare) = @_; $gene->_update; unless($gene && $compare){ return 0; } unless(($gene->start == $compare->start) && ($gene->stop == $compare->stop) && ($gene->strand eq $compare->strand)){ return 0; } my $gene_txs = $gene->transcripts; my $compare_txs = $compare->transcripts; unless($#$gene_txs == $#$compare_txs){ return 0; } for(my $i = 0;$i <= $#$gene_txs;$i++){ unless($$gene_txs[$i]->equals($$compare_txs[$i])){ return 0; } } return 1; } sub length{ my ($gene) = @_; my $txs = $gene->transcripts; if($#$txs == -1){ return 0; } my $start = $$txs[0]->start; my $stop = $$txs[0]->stop; my $total = 0; foreach my $tx (@$txs){ if($tx->start < $start){ $start = $tx->start; } if($tx->stop > $stop){ $stop = $tx->stop; } } return($stop - $start + 1); } sub gc_percentage{ my ($gene) = @_; my $txs = $gene->transcripts; if($#$txs == -1){ return 0; } my $total = 0; foreach my $tx (@$txs){ $total += $tx->gc_percentage; } return($total/($#$txs+1)); } sub match_percentage{ my ($gene) = @_; my $txs = $gene->transcripts; if($#$txs == -1){ return 0; } my $total = 0; foreach my $tx (@$txs){ $total += $tx->match_percentage; } return($total/($#$txs+1)); } sub mismatch_percentage{ my ($gene) = @_; my $txs = $gene->transcripts; if($#$txs == -1){ return 0; } my $total = 0; foreach my $tx (@$txs){ $total += $tx->mismatch_percentage; } return($total/($#$txs+1)); } sub unaligned_percentage{ my ($gene) = @_; my $txs = $gene->transcripts; if($#$txs == -1){ return 0; } my $total = 0; foreach my $tx (@$txs){ $total += $tx->unaligned_percentage; } return($total/($#$txs+1)); } sub tag{ my ($gene) = @_; return $gene->{Tag}; } sub set_tag{ my ($gene,$tag) = @_; $gene->{Tag} = $tag; } sub _update{ my ($gene) = @_; unless($gene->{Modified}){ return; } $gene->{Modified} = 0; my $txs = $gene->{Transcripts}; # this slow sort is being done because the commented out sort below # causes crashes sometimes and I have no idea why since the uncommented # sort has no problems # -- the reason for the problems is that perl's sort routine does not # afford you the capability of modifying the $a or $b variable in # the usersub callback, since they are aliases. rpz $gene->{Start} = -1; $gene->{Stop} = -1; my $tmp; for(my $a = 0; $a <= $#$txs; $a++){ for(my $b = $a+1; $b <= $#$txs; $b++){ if(($$txs[$a]->start > $$txs[$b]->start) || (($$txs[$a]->start == $$txs[$b]->start) && ($$txs[$a]->stop > $$txs[$b]->stop))){ $tmp = $$txs[$a]; $$txs[$a] = $$txs[$b]; $$txs[$b] = $tmp; } } if($$txs[$a]->stop > $gene->{Stop}){ # highest stop may not be stop of last tx so we check them all $gene->{Stop} = $$txs[$a]->stop; } } if($#$txs != -1){ # lowest start is start of first tx $gene->{Start} = $$txs[0]->start; } } ############################################################################### # GTF::Transcript ############################################################################### # The GTF::Transcript object stores all data for a single transcript of a gtf # file. package GTF::Transcript; # GTF::Transcript::new(transcript_id) # This is the constructor for GTF::Feature objects. It takes 1 arguments. # transciprt_id - the id for this transcript sub new{ my $tx = bless {}; ($tx->{Id}) = @_; $tx->{Exons} = []; $tx->{CDS} = []; $tx->{UTR3} = []; $tx->{UTR5} = []; $tx->{Introns} = []; $tx->{Starts} = []; $tx->{Stops} = []; $tx->{SEC} = []; $tx->{Intron_CNS} = []; $tx->{Inter_CNS} = []; $tx->{Inter} = []; $tx->{Coding_Start} = -1; $tx->{Coding_Stop} = -1; $tx->{Coding_Length} = -1; $tx->{Gene} = -1; $tx->{Start} = -1; $tx->{Stop} = -1; $tx->{Modified} = 1; return $tx; } sub _set_gene{ my ($tx,$gene) = @_; $tx->{Gene} = $gene; } sub add_feature{ my ($tx,$feature) = @_; my $type = $feature->type; if($type eq 'CDS'){ push @{$tx->{CDS}}, $feature; $tx->{CDS} = [sort {$a->start <=> $b->start} @{$tx->{CDS}}]; } elsif($type eq '3UTR'){ push @{$tx->{UTR3}}, $feature; $tx->{UTR3} = [sort {$a->start <=> $b->start} @{$tx->{UTR3}}]; } elsif($type eq '5UTR'){ push @{$tx->{UTR5}}, $feature; $tx->{UTR5} = [sort {$a->start <=> $b->start} @{$tx->{UTR5}}]; } elsif($type eq 'start_codon'){ push @{$tx->{Starts}}, $feature; $tx->{Starts} = [sort {$a->start <=> $b->start} @{$tx->{Starts}}]; } elsif($type eq 'stop_codon'){ push @{$tx->{Stops}}, $feature; $tx->{Stops} = [sort {$a->start <=> $b->start} @{$tx->{Stops}}]; } elsif($type eq 'exon'){ push @{$tx->{Exons}}, $feature; $tx->{Exons} = [sort {$a->start <=> $b->start} @{$tx->{Exons}}]; } elsif($type eq 'SEC'){ push @{$tx->{SEC}}, $feature; $tx->{SEC} = [sort {$a->start <=> $b->start} @{$tx->{SEC}}]; } elsif($type eq 'intron_CNS'){ push @{$tx->{Intron_CNS}}, $feature; $tx->{Intron_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Intron_CNS}}]; } elsif($type eq 'inter_CNS'){ push @{$tx->{Inter_CNS}}, $feature; $tx->{Inter_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Inter_CNS}}]; } elsif($type eq 'inter'){ push @{$tx->{Inter}}, $feature; $tx->{Inter} = [sort {$a->start <=> $b->start} @{$tx->{Inter}}]; } $feature->set_transcript($tx); $tx->{Modified} = 1; if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;} } sub transfer_missing_utr{ my($dest, $src, $mode) = @_; my $dest_utr3 = $dest->utr3; my $dest_utr5 = $dest->utr5; my $src_utr3 = $src->utr3; my $src_utr5 = $src->utr5; $mode = "Full" if ($#_ == 1); # default if (($mode eq "3UTR") || ($mode eq "Full")) { if (($#$dest_utr3 == -1) && ($#$src_utr3 > -1)){ print "Transfering 3UTR from ".$src->id." to ".$dest->id."\n"; foreach my $src_exon (@$src_utr3) { my $utr_exon = $src_exon->copy; $utr_exon->set_transcript($dest); $dest->add_feature($utr_exon); } } } if (($mode eq "5UTR") || ($mode eq "Full")) { if (($#$dest_utr5 == -1) && ($#$src_utr5 > -1)){ print "Transfering 5UTR from ".$src->id." to ".$dest->id."\n"; foreach my $src_exon (@$src_utr5) { my $utr_exon = $src_exon->copy; $utr_exon->set_transcript($dest); $dest->add_feature($utr_exon); } } } } # copies the exon obejcts info into utr5 and utr3 objects # WARNING: don't call this if you already have utr5 and utr3 info sub create_utr_objects_from_exons{ my ($tx) = @_; my $exons = $tx->exons; my $start = $tx->coding_start; my $stop = $tx->coding_stop; if($start != -1){ my $type = "5UTR"; if($tx->strand eq "-"){ $type = "3UTR"; } for(my $i = 0; $i <= $#$exons; $i++){ if($$exons[$i]->start < $start){ my $utr = $$exons[$i]->copy; $utr->set_type($type); if($start < $utr->stop){ $utr->set_stop($start); } $tx->add_feature($utr); } else{ last; } } } if($stop != -1){ my $type = "3UTR"; if($tx->strand eq "-"){ $type = "5UTR"; } for(my $i = $#$exons; $i >= 0; $i--){ if($$exons[$i]->stop > $stop){ my $utr = $$exons[$i]->copy; $utr->set_type($type); if($stop > $utr->start){ $utr->set_start($stop); } $tx->add_feature($utr); } else{ last; } } } } sub exons{ my ($tx) = @_; $tx->_update; return $tx->{Exons}; } sub cds{ my ($tx) = @_; $tx->_update; return $tx->{CDS}; } sub utr3{ my ($tx) = @_; $tx->_update; return $tx->{UTR3}; } sub utr5{ my ($tx) = @_; $tx->_update; return $tx->{UTR5}; } sub introns{ my ($tx) = @_; $tx->_update; return $tx->{Introns}; } sub start_codons{ my ($tx) = @_; $tx->_update; return $tx->{Starts}; } sub stop_codons{ my ($tx) = @_; $tx->_update; return $tx->{Stops}; } sub sec{ my ($tx) = @_; $tx->_update; return $tx->{SEC}; } sub intron_cns{ my ($tx) = @_; $tx->_update; return $tx->{Intron_CNS}; } sub inter_cns{ my ($tx) = @_; $tx->_update; return $tx->{Inter_CNS}; } sub inter{ my ($tx) = @_; $tx->_update; return $tx->{Inter}; } sub seqname{ my ($tx) = @_; my $gene = $tx->gene; return $gene->seqname; } sub source{ my ($tx) = @_; my $gene = $tx->gene; return $gene->source; } sub strand{ my ($tx) = @_; my $gene = $tx->gene; return $gene->strand; } sub set_id{ my ($tx,$id) = @_; $tx->{Id} = $id; } sub id{ my ($tx) = @_; return $tx->{Id}; } sub gene_id{ my ($tx) = @_; my $gene = $tx->gene; return $gene->{Id}; } sub gene{ my ($tx) = @_; return $tx->{Gene}; } # GTF::Transcript::remove_exon(exon start pos) # Allows you to remove an exon from the list # Removes the all exons at the specified position sub remove_exon{ my ($tx, $pos) = @_; my $new_exons = []; my $removed = 0; foreach my $exon (@{$tx->{Exons}}){ if($exon->start != $pos){ push @$new_exons, $exon; } else{ $removed++; } } $tx->{Exons} = $new_exons; return $removed; } # GTF::Transcript::remove_cds(cds start pos) # Allows you to remove a cds from the list # Removes the all cdss at the specified position sub remove_cds{ my ($tx, $pos) = @_; my $new_cds = []; my $removed = 0; foreach my $cds (@{$tx->{CDS}}){ if($cds->start != $pos){ push @$new_cds, $cds; } else{ $removed++; } } $tx->{CDS} = $new_cds; return $removed; } # GTF::Transcript::length() # This function return the length of the tx from the start to the stop. sub length{ my ($tx) = @_; return($tx->stop - $tx->start + 1); } # GTF::Transcript::start() # This function returns the start of this tx (lowest coord). sub start{ my ($tx) = @_; $tx->_update; return $tx->{Start}; } # GTF::Transcript::stop() # This function return the end of the tx (highest coord). sub stop{ my ($tx) = @_; $tx->_update; return $tx->{Stop}; } # GTF::Transcript::coding_start() # This function returns the start position (lowest coord) of coding region of # this tx. Same as the start codon's field. sub coding_start{ my ($tx) = @_; $tx->_update; return $tx->{Coding_Start}; } # GTF::Transcript::coding_stop() # This function return the end position (highest coord) of the coding region of # the tx. Same as the stop codon's field. sub coding_stop{ my ($tx) = @_; $tx->_update; return $tx->{Coding_Stop}; } # GTF::Transcript::coding_length() # This function return the coding_length of the tx from the coding_start # to the coding_stop. sub coding_length{ my ($tx) = @_; $tx->_update; return $tx->{Coding_Length}; } sub score{ my ($tx) = @_; my $features = $tx->_all_features; my $score = 0; foreach my $feature (@$features){ if($feature->score ne "."){ $score += $feature->score; } } return $score; } # GTF::Transcript::initial_exon() # This function returns a CDS::Feature object representing the # initial_exon (cds) of this tx # NOTE: Isn't exon the wrong word for this? sub initial_exon{ my ($tx) = @_; my $starts = $tx->start_codons; if($#$starts == -1){ return 0; } $tx->_update; my $exons = $tx->{CDS}; if($tx->strand eq "-"){ return $$exons[$#{$exons}]; } else{ return $$exons[0]; } } # GTF::Transcript::terminal_exon() # This function returns a CDS::Feature object representing the # terminal exon (cds) of this tx # NOTE: Isn't exon the wrong word for this? sub terminal_exon{ my ($tx) = @_; my $stops = $tx->stop_codons; if($#$stops == -1){ return 0; } $tx->_update; my $exons = $tx->cds; if($tx->strand eq "-"){ return $$exons[0]; } else{ return $$exons[$#{$exons}]; } } sub equals{ my ($tx,$compare, $mode) = @_; $mode = "Full" if ($#_ == 1); # default unless($tx && $compare){ return 0; } unless(($tx->start == $compare->start) && ($tx->stop == $compare->stop) && ($tx->strand eq $compare->strand)){ return 0; } if (($mode eq "CDS") || ($mode eq "Full")) { my $tx_cds = $tx->cds; my $comp_cds = $compare->cds; unless($#$tx_cds == $#$comp_cds){ return 0; } for(my $i = 0;$i <= $#$tx_cds;$i++){ unless($$tx_cds[$i]->equals($$comp_cds[$i])){ return 0; } } my $tx_starts = $tx->start_codons; my $comp_starts = $compare->start_codons; unless($#$tx_starts == $#$comp_starts){ return 0; } for(my $i = 0;$i <= $#$tx_starts;$i++){ unless($$tx_starts[$i]->equals($$comp_starts[$i])){ return 0; } } my $tx_stops = $tx->stop_codons; my $comp_stops = $compare->stop_codons; unless($#$tx_stops == $#$comp_stops){ return 0; } for(my $i = 0;$i <= $#$tx_stops;$i++){ unless($$tx_stops[$i]->equals($$comp_stops[$i])){ return 0; } } } if (($mode eq "3UTR") || ($mode eq "Full")) { my $tx_utr3 = $tx->utr3; my $comp_utr3 = $compare->utr3; unless($#$tx_utr3 == $#$comp_utr3){ return 0; } for(my $i = 0;$i <= $#$tx_utr3;$i++){ unless($$tx_utr3[$i]->equals($$comp_utr3[$i])){ return 0; } } } if (($mode eq "5UTR") || ($mode eq "Full")) { my $tx_utr5 = $tx->utr5; my $comp_utr5 = $compare->utr5; unless($#$tx_utr5 == $#$comp_utr5){ return 0; } for(my $i = 0;$i <= $#$tx_utr5;$i++){ unless($$tx_utr5[$i]->equals($$comp_utr5[$i])){ return 0; } } } my $tx_exon = $tx->exons; my $comp_exon = $compare->exons; unless($#$tx_exon == $#$comp_exon){ return 0; } for(my $i = 0;$i <= $#$tx_exon;$i++){ unless($$tx_exon[$i]->equals($$comp_exon[$i])){ return 0; } } my $tx_sec = $tx->sec; my $comp_sec = $compare->sec; unless($#$tx_sec == $#$comp_sec){ return 0; } for(my $i = 0;$i <= $#$tx_sec;$i++){ unless($$tx_sec[$i]->equals($$comp_sec[$i])){ return 0; } } return 1; } # GTF::Transcript::offset(ammount) # This function will offset all locations in this tx by the given ammount sub offset{ my ($tx,$offset) = @_; unless(defined($offset)){ print STDERR "Undefined value passed to GTF::Gene::offset.\n"; return; } my $features = $tx->_all_features; foreach my $feature (@$features){ if(defined($feature)){ $feature->offset($offset); } } $tx->{Modified} = 1; if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;} } # GTF::Transcript::output_gtf([file_handle]) # This function outputs the information for this tx in standard # gtf2 format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gtf{ my ($tx,$out_handle) = @_; unless(defined $out_handle){ $out_handle = \*STDOUT; } my @features = sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features}; foreach my $feature (@features){ $feature->output_gtf($out_handle); } } # GTF::Transcript::output_gff([file_handle]) # This function outputs the information for this tx in standard # GFF format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gff{ my ($tx,$out_handle) = @_; unless(defined $out_handle){ $out_handle = \*STDOUT; } my @features = sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features}; foreach my $feature (@features){ $feature->output_gff($out_handle); } } # GTF::Transcript::reverse_complement(seq_length) # This function takes the length of the sequence this tx came from and # reverse complements everything in the gene. sub reverse_complement{ my ($tx, $seq_length) = @_; unless(defined($seq_length)){ print STDERR "Undefined value for seq_length passed to GTF::Gene::reverse_complement.\n"; } my $features = $tx->_all_features; foreach my $feature (@$features){ $feature->reverse_complement($seq_length); } } sub _all_features{ my ($tx) = @_; my @features; my $starts = $tx->start_codons; push @features, @$starts; my $stops = $tx->stop_codons; push @features, @$stops; my $exons = $tx->exons; push @features, @$exons; my $cds = $tx->cds; push @features, @$cds; my $utr3 = $tx->utr3; push @features, @$utr3; my $utr5 = $tx->utr5; push @features, @$utr5; my $sec = $tx->sec; push @features, @$sec; my $cns = $tx->intron_cns; push @features, @$cns; return \@features; } sub copy{ my ($tx) = @_; my $copy = GTF::Transcript::new($tx->id); my $starts = $tx->start_codons; foreach my $old (@$starts){ my $new = $old->copy; $copy->add_feature($new); } my $stops = $tx->stop_codons; foreach my $old (@$stops){ my $new = $old->copy; $copy->add_feature($new); } my $exons = $tx->exons; foreach my $old (@$exons){ my $new = $old->copy; $copy->add_feature($new); } my $cds = $tx->cds; foreach my $old (@$cds){ my $new = $old->copy; $copy->add_feature($new); } my $utr3 = $tx->utr3; foreach my $old (@$utr3){ my $new = $old->copy; $copy->add_feature($new); } my $utr5 = $tx->utr5; foreach my $old (@$utr5){ my $new = $old->copy; $copy->add_feature($new); } my $sec = $tx->sec; foreach my $old (@$sec){ my $new = $old->copy; $copy->add_feature($new); } return($copy); } sub gc_percentage{ my ($tx) = @_; my $features = $tx->_all_features; my $total = 0; my $percent = 0; foreach my $feature (@$features){ $percent += $feature->length * $feature->gc_percentage; $total += $feature->length; } $percent /= $total; return $percent; } sub match_percentage{ my ($tx) = @_; my $features = $tx->_all_features; my $total = 0; my $percent = 0; foreach my $feature (@$features){ $percent += $feature->length * $feature->match_percentage; $total += $feature->length; } $percent /= $total; return $percent; } sub mismatch_percentage{ my ($tx) = @_; my $features = $tx->_all_features; my $total = 0; my $percent = 0; foreach my $feature (@$features){ $percent += $feature->length * $feature->mismatch_percentage; $total += $feature->length; } $percent /= $total; return $percent; } sub unaligned_percentage{ my ($tx) = @_; my $features = $tx->_all_features; my $total = 0; my $percent = 0; foreach my $feature (@$features){ $percent += $feature->length * $feature->unaligned_percentage; $total += $feature->length; } $percent /= $total; return $percent; } sub tag{ my ($tx) = @_; return $tx->{Tag}; } sub set_tag{ my ($tx,$tag) = @_; $tx->{Tag} = $tag; } sub _update{ my ($tx) = @_; unless($tx->{Modified}){ return; } $tx->{Modified} = 0; my $starts = $tx->start_codons; my $stops = $tx->stop_codons; my $cds = $tx->cds; my $utr5 = $tx->utr5; my $utr3 = $tx->utr3; my $exons = $tx->exons; foreach my $utr (@$utr5){ $utr->set_subtype("UTR5"); } foreach my $utr (@$utr3){ $utr->set_subtype("UTR3"); } #set subtypes if($#$cds == 0){ if(($#$starts >= 0) && ($#$stops >= 0)){ $$cds[0]->set_subtype("Single"); } elsif($#$starts >= 0){ $$cds[0]->set_subtype("Initial"); } elsif($#$stops >= 0){ $$cds[0]->set_subtype("Terminal"); } else{ $$cds[0]->set_subtype("Internal"); } } elsif($#$cds >= 0){ my $strand = $tx->strand; my $starts = $tx->start_codons; my $stops = $tx->stop_codons; my $start = 0; my $stop = $#$cds; if($strand eq '-'){ if($#$starts >= 0){ $$cds[$#$cds]->set_subtype("Initial"); $stop--; } if($#$stops >= 0){ $$cds[0]->set_subtype("Terminal"); $start++; } } else{ if($#$starts >= 0){ $$cds[0]->set_subtype("Initial"); $start++; } if($#$stops >= 0){ $$cds[$#$cds]->set_subtype("Terminal"); $stop--; } } for(my $i = $start;$i <= $stop;$i++){ $$cds[$i]->set_subtype("Internal"); } } #get introns and coding length my $clen = 0; my @introns; for(my $i = 0;$i < $#$cds;$i++){ $clen += $$cds[$i]->stop - $$cds[$i]->start +1; my $start = $$cds[$i]->stop + 1; my $stop = $$cds[$i+1]->start - 1; if($start <= $stop){ my $intron = GTF::Feature::new("Intron",$start,$stop,0,'.'); $intron->set_subtype("Intron"); $intron->set_transcript($tx); push @introns, $intron; } } if($#$cds >= 0){ $clen += $$cds[$#$cds]->stop - $$cds[$#$cds]->start +1; } $tx->{Coding_Length} = $clen; $tx->{Introns} = \@introns; #get start/coding start my $start = -1; if(($#{$cds} >= 0) && (($start == -1) || ($start > $$cds[0]->start))){ $start = $$cds[0]->start; } $tx->{Coding_Start} = $start; if(($#{$starts} >= 0) && (($start == -1) || ($start > $$starts[0]->start))){ $start = $$starts[0]->start; } if(($#{$stops} >= 0) && (($start == -1) || ($start > $$stops[0]->start))){ $start = $$stops[0]->start; } if(($#{$exons} >= 0) && (($start == -1) || ($start > $$exons[0]->start))){ $start = $$exons[0]->start; } if(($#{$utr5} >= 0) && (($start == -1) || ($start > $$utr5[0]->start))){ $start = $$utr5[0]->start; } if(($#{$utr3} >= 0) && (($start == -1) || ($start > $$utr3[0]->start))){ $start = $$utr3[0]->start; } $tx->{Start} = $start; #get stop/coding stop my $stop = -1; if(($#{$cds} >= 0) && (($stop == -1) || ($stop < $$cds[$#$cds]->stop))){ $stop = $$cds[$#$cds]->stop; } $tx->{Coding_Stop} = $stop; if(($#{$starts} >= 0) && (($stop == -1) || ($stop < $$starts[$#$starts]->stop))){ $stop = $$starts[$#$starts]->stop; } if(($#{$stops} >= 0) && (($stop == -1) || ($stop < $$stops[$#$stops]->stop))){ $stop = $$stops[$#$stops]->stop; } if(($#{$exons} >= 0) && (($stop == -1) || ($stop < $$exons[$#$exons]->stop))){ $stop = $$exons[$#$exons]->stop; } if(($#{$utr5} >= 0) && (($stop == -1) || ($stop < $$utr5[$#$utr5]->stop))){ $stop = $$utr5[$#$utr5]->stop; } if(($#{$utr3} >= 0) && (($stop == -1) || ($stop < $$utr3[$#$utr3]->stop))){ $stop = $$utr3[$#$utr3]->stop; } $tx->{Stop} = $stop; } ############################################################################### # GTF::Feature ############################################################################### # The GTF::Feature object stores all data for a single feature of a gtf # file. This object basically stores all data for a single non-comment # line of a gtf file. package GTF::Feature; # GTF::Feature::new(feature, start, end, score, frame) # This is the constructor for GTF::Feature objects. It takes 5 required # arguments. They are: # type - The field of the line. Should be either # 'cds', 'exon', 'start_codon', or 'stop_codon'. # start - The field of the line. Should be a number. # end - The field of the line. Should be a number. # score - The field of the line. Should be a number or '.'. # frame - The field of the line. Should be either # '0', '1', '2', or '.'. sub new { my $feature = bless {}; ($feature->{Type}, $feature->{Start}, $feature->{End}, $feature->{Score}, $feature->{Frame}) = @_; my $start = $feature->{Start}; my $stop = $feature->{Stop}; if(defined($start) && defined($stop) && ($start > $stop)){ $feature->{Start} = $stop; $feature->{Stop} = $start; } $feature->{Transcript} = 0; $feature->{Seq} = 0; $feature->{Conseq} = 0; $feature->{Subtype} = 0; $feature->{ASE} = 0; # rpz - Alternative Splicing Event $feature->{Match} = -1; $feature->{Mismatch} = -1; $feature->{Unaligned} = -1; $feature->{GC} = -1; $feature->{A} = -1; $feature->{C} = -1; $feature->{G} = -1; $feature->{T} = -1; $feature->{N} = -1; $feature->{0} = -1; $feature->{1} = -1; $feature->{2} = -1; return $feature; } # GTF::Feature::copy() # This function returns a new object which is a copy of this object sub copy{ my ($feature) = @_; my $copy = GTF::Feature::new($feature->type,$feature->start,$feature->stop, $feature->score,$feature->frame); $copy->set_bases($feature->get_a_count,$feature->get_c_count,$feature->get_g_count, $feature->get_t_count,$feature->get_n_count); $copy->set_conseq($feature->get_match_count,$feature->get_mismatch_count, $feature->get_unaligned_count); return($copy); } # GTF::Feature::offset(ammount) # This function will offset all locations in this Feature by the given ammount sub offset{ my ($feature,$offset) = @_; unless(defined($offset)){ print STDERR "Undefined value passed to GTF::Feature::offset.\n"; return; } $feature->{Start} += $offset; $feature->{End} += $offset; } # GTF::Feature::reverse_complement(seq_length) # This function takes the length of the sequence this feature came from and # reverse complements it. sub reverse_complement{ my ($feature, $seq_length) = @_; unless(defined($seq_length)){ print STDERR "Undefined value for seq_length passed to ", "GTF::Feature::reverse_complement.\n"; } my $start = $seq_length - $feature->{Start} + 1; my $stop = $seq_length - $feature->{End} + 1; if($start < $stop){ $feature->{Start} = $start; $feature->{End} = $stop; } else{ $feature->{Start} = $stop; $feature->{End} = $start; } } # GTF::Feature::type() # This function returns the feature type, field of this line as # a string. Should be either 'CDS', 'exon', 'start_codon', or 'stop_codon'. sub type {shift->{Type}} sub set_type{ my ($feature,$type) = @_; $feature->{Type} = $type; } # GTF::Feature::type() # This function returns the feature sub type, currently inly implemented for # cds, should be 'Initial', 'Internal', 'Terminal', or 'Single' sub subtype {shift->{Subtype}} sub set_subtype{ my ($feature,$subtype) = @_; $feature->{Subtype} = $subtype; } # GTF::Feature::ase() # This function returns the feature's alternative splicing event, # if there is one. Currently the only implemented ASE is 'Optional' # rpz sub ase {shift->{ASE}} sub set_ase { my ($feature, $ASE) = @_; $feature->{ASE} = $ASE; } # GTF::Feature::length() # This function returns the length of this feature in base pairs # as a number. sub length{ my ($feature) = @_; return($feature->stop - $feature->start + 1); } # GTF::Feature::start() # This function returns the start location , field, of this feature # as a number. sub start {shift->{Start}} # GTF::Feature::set_start() # This function sets the start location , field, of this feature # as a number. sub set_start{ my ($feature,$start) = @_; $feature->{Start} = $start; } # GTF::Feature::end() # This function returns the end location, field, of this feature # as a number. sub end {shift->{End}} # GTF::Feature::stop() # This function returns the end location, field, of this feature # as a number. # Same as GTF::Feature::end(); sub stop {shift->{End}} # GTF::Feature::set_stop() # This function sets the stop location , field, of this feature # as a number. sub set_stop{ my ($feature,$stop) = @_; $feature->{End} = $stop; } # GTF::Feature::score() # This function returns the score, field, of this line. This will # be either a number or '.'. sub score {shift->{Score}} # GTF::Feature::frame() # This function returns the frame, field, of this line. Should be # '0', '1', '2', or '.'. sub frame {shift->{Frame}} # GTF::Feature::set_frame() # This function sets the frame, field, of this line. Should be # '0', '1', '2', or '.'. sub set_frame{ my ($feature,$frame) = @_; if(($frame eq '0') || ($frame eq '1') || ($frame eq '2') || ($frame eq '.')){ $feature->{Frame} = $frame; } else{ print STDERR "Bad Frame Value \'$frame\' should be \'0\', \'1\',". "\'2\', or \'.\'.\n"; } } # GTF::Feature::transcript_id() # This function returns the transcript_id of this exon's transcript sub transcript_id{ my ($feature) = @_; my $tx = $feature->{Transcript}; return $tx->id; } # GTF::Feature::transcript() # This function returns the transcript of this exon's transcript sub transcript{ my ($feature) = @_; return $feature->{Transcript}; } # GTF::Feature::transcript() # This function returns the transcript of this exon's transcript sub set_transcript{ my ($feature,$tx) = @_; $feature->{Transcript} = $tx; } # GTF::Feature::transcript_id() # This function returns the transcript_id of this exon's transcript sub gene_id{ my ($feature) = @_; my $tx = $feature->transcript; my $gene = $tx->gene; return $gene->id; } # GTF::Feature::transcript() # This function returns the transcript of this exon's transcript sub gene{ my ($feature) = @_; my $tx = $feature->{Transcript}; return $tx->gene; } # GTF::Feature::seqname() # This function returns the sequnce name, field, that this # feature came from as a string. sub seqname{ my ($feature) = @_; my $tx = $feature->{Transcript}; return $tx->seqname; } # GTF::Feature::source() # This function returns the source, field, of this line as a # string. sub source{ my ($feature) = @_; my $tx = $feature->{Transcript}; return $tx->source; } # GTF::Feature::strand() # This function returns the strand, field, of this line as a # string. Should be '+', '-', or '.'. sub strand{ my ($feature) = @_; my $tx = $feature->{Transcript}; return $tx->strand; } sub equals{ my ($feature,$compare) = @_; unless($feature && $compare){ return 0; } if(($feature->start == $compare->start) && ($feature->stop == $compare->stop) && ($feature->strand eq $compare->strand) && ($feature->type eq $compare->type)){ return 1; } return 0; } # GTF::Feature::output_gtf([file_handle]) # This function outputs the information for this feature in standard # gtf2 format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gtf{ my ($feature,$out_handle) = @_; unless(defined $out_handle){ $out_handle = \*STDOUT; } print $out_handle "".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t", "$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t", "$feature->{Frame}\tgene_id \"".$feature->gene_id."\"; transcript_id \"", $feature->transcript_id."\";\n"; } # GTF::Feature::output_gff([file_handle]) # This function outputs the information for this feature in standard # GFF format. If takes and optional file_handle as the only argument # to which it outputs the data. If no argument is given it outputs # the data to stdout. sub output_gff{ my ($feature,$out_handle) = @_; unless(defined $out_handle){ $out_handle = \*STDOUT; } print $out_handle "".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t", "$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t", "$feature->{Frame}\t".$feature->transcript_id."\n"; } # GTF::Feature::set_bases(A,C,G,T,N) # This funtion allows you to set the number of A,C,G,T nucleotides # in this feature sub set_bases{ my ($feature,$a,$c,$g,$t,$n) = @_; $feature->{A} = $a; $feature->{C} = $c; $feature->{G} = $g; $feature->{T} = $t; $feature->{N} = $n; $feature->{Seq} = 1; } # GTF::Feature::set_conseq(n0,n1,n2) # This funtion allows you to set the number of 0,1,2 in the # conservation sequnce for this feature sub set_conseq{ my ($feature,$n0,$n1,$n2) = @_; $feature->{0} = $n0; $feature->{1} = $n1; $feature->{2} = $n2; $feature->{Conseq} = 1; } # GTF::Feature::gc_percentage() # This funciton returns the gc_percentage for this feature if the # sequence was loaded otherwise it returns -1. sub gc_percentage{ my ($feature) = @_; if($feature->{GC} != -1){ return $feature->{GC}; } unless($feature->{Seq}){ return -1; } my $a = $feature->{A}; my $c = $feature->{C}; my $g = $feature->{G}; my $t = $feature->{T}; my $n = $feature->{N}; my $total = $a + $c + $g + $t + $n; my $gc = $g + $c; my $percent = $gc/$total; $feature->{GC} = $percent; return $percent; } # GTF::Feature::match_percentage() # This funciton returns the percentage of matches in the conservation sequence # for this feature if it was loaded otherwise it returns -1. sub match_percentage{ my ($feature) = @_; if($feature->{Match} != -1){ return $feature->{Match}; } unless($feature->{Conseq}){ return -1; } my $match = $feature->{1}; my $mismatch = $feature->{0}; my $unaligned = $feature->{2}; my $total = $match + $mismatch + $unaligned; my $percent = $match/$total; $feature->{Match} = $percent; return $percent; } # GTF::Feature::mismatch_percentage() # This funciton returns the percentage of mismatches in the conservation sequence # for this feature if it was loaded otherwise it returns -1. sub mismatch_percentage{ my ($feature) = @_; if($feature->{Mismatch} != -1){ return $feature->{Mismatch}; } unless($feature->{Conseq}){ return -1; } my $match = $feature->{1}; my $mismatch = $feature->{0}; my $unaligned = $feature->{2}; my $total = $match + $mismatch + $unaligned; my $percent = $mismatch/$total; $feature->{Mismatch} = $percent; return $percent; } # GTF::Feature::unaligned_percentage() # This funciton returns the percentage of unaligned nucs in the conservation sequence # for this feature if it was loaded otherwise it returns -1. sub unaligned_percentage{ my ($feature) = @_; if($feature->{Unaligned} != -1){ return $feature->{Unaligned}; } unless($feature->{Conseq}){ return -1; } my $match = $feature->{1}; my $mismatch = $feature->{0}; my $unaligned = $feature->{2}; my $total = $match + $mismatch + $unaligned; my $percent = $unaligned/$total; $feature->{Unaligned} = $percent; return $percent; } # GTF::Feature::get_a_count() # This funciton returns the number of A nucleotides in this features sequence # if the sequence was loaded, otherwise it returns -1. sub get_a_count{ my ($feature) = @_; unless($feature->{Seq}){ return -1; } return $feature->{A}; } # GTF::Feature::get_c_count() # This funciton returns the number of C nucleotides in this features sequence # if the sequence was loaded, otherwise it returns -1. sub get_c_count{ my ($feature) = @_; unless($feature->{Seq}){ return -1; } return $feature->{C}; } # GTF::Feature::get_g_count() # This funciton returns the number of G nucleotides in this features sequence # if the sequence was loaded, otherwise it returns -1. sub get_g_count{ my ($feature) = @_; unless($feature->{Seq}){ return -1; } return $feature->{G}; } # GTF::Feature::get_t_count() # This funciton returns the number of T nucleotides in this features sequence # if the sequence was loaded, otherwise it returns -1. sub get_t_count{ my ($feature) = @_; unless($feature->{Seq}){ return -1; } return $feature->{T}; } # GTF::Feature::get_n_count() # This funciton returns the number of N nucleotides in this features sequence # if the sequence was loaded, otherwise it returns -1. sub get_n_count{ my ($feature) = @_; unless($feature->{Seq}){ return -1; } return $feature->{N}; } # GTF::Feature::get_match_count() # This funciton returns the number of matched nucleotides in this features # conservation sequence if it was loaded, otherwise it returns -1. sub get_match_count{ my ($feature) = @_; unless($feature->{Conseq}){ return -1; } return $feature->{1}; } # GTF::Feature::get_mismatch_count() # This funciton returns the number of mismatched nucleotides in this features # conservation sequence if it was loaded, otherwise it returns -1. sub get_mismatch_count{ my ($feature) = @_; unless($feature->{Conseq}){ return -1; } return $feature->{0}; } # GTF::Feature::get_unaligned_count() # This funciton returns the number of unaligned nucleotides in this features # conservation sequence if it was loaded, otherwise it returns -1. sub get_unaligned_count{ my ($feature) = @_; unless($feature->{Conseq}){ return -1; } return $feature->{2}; } sub tag{ my ($feature) = @_; return $feature->{Tag}; } sub set_tag{ my ($feature,$tag) = @_; $feature->{Tag} = $tag; } 1; __END__