
BEGIN{
    $ENV{'PATH'}=$ENV{'PATH'}.":/local/workspace01/zhangju/script/perl";
    push(@INC,"/local/workspace01/zhangju/script/perl");
}

use strict;
use Bio::SeqIO;
use Bio::Annotation::Collection;
use Bio::Annotation::Comment;
sub ManualCuration_QC
{
	my $input_file = "C_termidtis_qaed.art";#"GenePRIMP.art";
	
	my $root_dir="/local/workspace01/zhangju/ct_annotation";
	
	my $embl_dir=$root_dir."/June5_2012SPlitfiles";
	#my $gbk_dir=$root_dir."/single_curated_gbk";
	#my $new_embl=$root_dir."/single_new_embl";
	
	
	my $seq_in  = Bio::SeqIO->new(
	                              -format => 'genbank',
	                              -file   => $input_file,
	                              );
	 
	my $seq;
	
	open(OUTFILE,">translation_issue5.txt");
	my %hash_locus=();
	while( $seq = $seq_in->next_seq() ) {
	
	    print OUTFILE "================\n";
	    print OUTFILE "File - ".$seq->display_id."\n\n";
	    my $embl_file=$embl_dir."/".$seq->display_id; #"/twc1a_scaffold00002.2";
	    my $embl_in  = Bio::SeqIO->new(
	                               -format => 'genbank',
	                               -file   => $embl_file,
				       );
	
	
	    my $embl_seq=$embl_in->next_seq();
	    
	    for my $feat_object ($embl_seq->get_SeqFeatures) {          
	       #print "primary tag: ", $feat_object->primary_tag, "\n"; 
	       if ($feat_object->primary_tag eq "CDS"){
	       	   my $start=$feat_object->location->start;
		   my $end=$feat_object->location->end;
		   my $complementary=$feat_object->strand;
		   my $seqobj=Bio::Seq->new( -seq => (substr $seq->seq(),$start-1,$end-$start+1),
		   			     -id => 'temp',
					     );
		   my $dnaobj=$seqobj;
		   if ($complementary==-1){
		   	$dnaobj=$seqobj->revcom;
		   }
		   my $protein=$dnaobj->translate(-codontable_id => 11);
		   my $pro_from_dna=$protein->seq();
					         
		   my $locus=""; 
		     
		   for my $tag ($feat_object->get_all_tags) { 
	               #print $tag."\n";
	           if ($tag eq "locus_tag"){
	          
				   my @arr=$feat_object->get_tag_values($tag);
				   $locus=$arr[0]."\n";
		 		   if(not exists $hash_locus{$locus}){
		 		   	   $hash_locus{$locus}=1
		 		   }else{
		 		   	   $hash_locus{$locus}=$hash_locus{$locus}+1
		 		   }
	           }
	           if ($tag eq "translation"){
	
				   my @arr1=$feat_object->get_tag_values($tag);
				   my $aa=$arr1[0];
				   if ($aa =~ m/\s/){
			               print OUTFILE $locus." tranlation with spaces, please verify\n";
				       #my @arr=$feat_object->get_tag_values($tag);
				       print OUTFILE $aa."\n";
				   }
				   if (not (($aa =~ m/\Q$pro_from_dna\E/) || ($pro_from_dna =~ m/\Q$aa\E/)) || abs(length($aa)-length($pro_from_dna))>5){
				       print OUTFILE $locus."\ttranslations mismatch, please verify\n\n";
				       print OUTFILE "stand - ".$complementary." coordinate (".$start."..".$end.")\n\n";
				       print OUTFILE $dnaobj->seq()."\n\n";
				       print OUTFILE "From dna direct translation: length - ".length($pro_from_dna)."  ".$pro_from_dna."\n\n";
				       print OUTFILE "Annotation translation: length - ".length($aa)."  ".$aa."\n\n";
				   }
		       }
		  }          
	       }       
	    }
	}
	while (my ($key,$value)=each (%hash_locus)){
		
		if ($value>1){
			print OUTFILE "locus - ".$key." is used ".$value." times\n";
		}
	}
	close(OUTFILE);
}
sub LocusTag_Update
{
	my $gbkFile="CTermitidtis_June1_2010.gbk";
	open GF, "<", $gbkFile;
	open NewGF, ">" , "CTermitidtis_June1_2012_CTER.gbk";
	while(<GF>){
		chomp($_);
		my $line=$_;
		if ($line =~ "/locus_tag=\"or"){
			$line =~ s/\/locus_tag=\"or/\/locus_tag=\"CTER_/;
		}
#		if ($line =~ "/locus_tag=\"Ct_"){
#			$line =~ s/\/locus_tag=\"Ct_000/\/locus_tag=\"CTER_/;
#			my $s1 = (substr $line, 0, length($line)-2);
#			$line= $s1 
#		}
		print NewGF $line."\n" 
	}
	
}
&ManualCuration_QC
#&LocusTag_Update

