#!/usr/bin/env python

###################################
# DELETE RRNA FROM GENBANK FILE   #
# THIS IS OPEN SOURCE CODE (YAY!) #
###################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#    Justin Zhang          - for providing me information on the SAM format,
#                            and test data to build and debug this program.
#    Dr. Brian Fristensky  - my work supervisor, and the man who introduced
#                            me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. Please also let me know of any bugs, or if you have any suggestions.
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!

import sys, os, os.path, re, collections
from Bio import GenBank
from Bio import SeqIO

def process_record(record, geneFile, rrnaFile, gtf_out, source = 'delRNA'):
	skipped = 0
	# the list of deletions (each entry is a tuple - start, end)
	rrnas   = list()

	# Grab the entire sequence
	seq = str(record.seq)

	# Find all of the rRNA sequences
	for feature in record.features:
		# extract the feature data
		start = int(feature.location.start.position)
		end   = int(feature.location.end.position)
		name  = "Unknown"

		# extract the name of the rRNA (if available)
		if 'gene' in feature.qualifiers:
			name = feature.qualifiers['gene'][0]
		elif 'product' in feature.qualifiers:
			name = feature.qualifiers['product'][0]

		if feature.type == 'rRNA':
			print "Found: " + name + " (" + str(start) + ", " + str(end) + ")"

			# append the rRNA to the rRNA list
			rrnas.append((start, end, name))

	for feature in sorted(rrnas, key=lambda tup: tup[1], reverse=True):
		# Extract all of the tuple information
		start = feature[0]
		end   = feature[1]
		name  = feature[2]

		# remove the rRNA from the output sequence
		fsq   = seq[start:(end + 1)]
		seq   = seq[0:start] + seq[end + 1:]

		# print the rRNA to the rRNA output file
		print >> rrnaFile, "> " + name
		print >> rrnaFile, fsq

	removed = 0
	skipped = 0
	if seq.rstrip(" \n\r\t") != "":
		# Write the GFF data
		for f in record.features:
			start  = int(f.location.start.position)
			end    = int(f.location.end.position)
			#strand = int(f.location.strand)

			# extract the gene id
			if 'locus_tag' in f.qualifiers:
				#use locul tag as gene_id/transcript_id
				gene_id = transcript_id = f.qualifiers['locus_tag'][0]
			elif 'gene' in f.qualifiers:
				gene_id = transcript_id = f.qualifiers['gene'][0]
			else:
				if f.type == "source":
					print "Skipping additional source feature: " + str(start) + ", " + str(end)
				elif f.type == "misc_feature" and 'note' in f.qualifiers:
					print "Skipping misc_feature: " + str(f.qualifiers['note'][0])
				else:
					print "Skipping (no gene id): " + str(f)
				skipped += 1
				continue

			if f.type != 'rRNA':
				#code strand as +/- (in genbank 1 or -1)
				if not f.strand:
					print "Skipped (invalid strand): " + str(gene_id)
					skipped += 1
					continue
				elif int(f.strand)>0:
					strand = '+'
				else:
					strand = '-'

				#generate comments field
				comments = 'gene_id "%s"; transcript_id "%s"' % ( gene_id, transcript_id )
				if 'gene' in f.qualifiers:
					comments += '; gene_id "%s"' % f.qualifiers['gene'][0]
				if 'protein_id' in f.qualifiers:
					comments += '; protein_id "%s"' % f.qualifiers['protein_id'][0]
			
				#add external IDs  
				if 'db_xref' in f.qualifiers:
					for extData in f.qualifiers['db_xref']:
						comments += '; db_xref "%s"' % extData
			
				# GFF format definition ()
				"""
				seqname - The name of the sequence. Must be a chromosome or scaffold.
				 - The program that generated this feature.
				feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
				start - The starting position of the feature in the sequence. The first base is numbered 1.
				end - The ending position of the feature (inclusive).
				score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ".".
				strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
				frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
				comments - gene_id "Em:U62317.C22.6.mRNA"; transcript_id "Em:U62317.C22.6.mRNA"; exon_number 1
				"""

				# correct start and end positions
				for feature in sorted(rrnas, key=lambda tup: tup[1], reverse=True):
					# Extract all of the tuple information
					rrna_start  = feature[0]
					rrna_end    = feature[1]
					rrna_length = rrna_end - rrna_start + 1

					# Current GFF gene comes after rRNA gene
					if rrna_end <= start and rrna_end < end:
						start -= rrna_length
						end   -= rrna_length
					# Current GFF gene comes inside rRNA gene
					elif rrna_start <= start and rrna_end >= end:
						continue
					# Otherwise, the current GFF gene comes before rRNA gene

				print "Kept: " + gene_id + " (" + f.type + "): moved from [" + str(f.location.start.position) + ", " + str(f.location.end.position) + "] to [" + str(start) + ", " + str(end) + "]"

				# print GFF output
				print >> gtf_out, '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s' % \
					( record.id, source, f.type, start+1, end, strand, comments )
			else:
				print "Removed: " + gene_id + " (" + str(start) + ", " + str(end) + "): " + f.type
				removed += 1

		# Write FASTA header
		print >> geneFile, "> " + record.id
		print >> geneFile, seq
	else:
		print "Record was completely rRNA!"
	return (skipped, removed)

if __name__=="__main__":
	if len(sys.argv) > 1:
		# process every file specified on the command line
		for filename in sys.argv[1:]:
			total   = 0
			removed = 0
			skipped = 0

			# Open GenBank file
			handle = open(filename, 'rU')

			# Open output FASTA file
			gtf_out  = open(os.path.splitext(filename)[0] + '_norrna.gtf', 'w')
			rrnaFile = open(os.path.splitext(filename)[0] + '_rrnas.fasta', 'w')
			geneFile = open(os.path.splitext(filename)[0] + '_norrna.fasta', 'w')
			
			for record in SeqIO.parse(handle, 'genbank'):
				askipped, aremoved = process_record(record, geneFile, rrnaFile, gtf_out)
				skipped += askipped
				removed += aremoved
				total += len(record.features)

			print "Removed: " + str(removed) + "/" + str(total) + " entries"
			print "Skipped: " + str(skipped) + "/" + str(total) + " entries"

			# Close the file handle
			gtf_out.close()
			rrnaFile.close()
			geneFile.close()
			handle.close()