#!/usr/bin/env python ################################### # DELETE RRNA FROM GENBANK FILE # # THIS IS OPEN SOURCE CODE (YAY!) # ################################### # LICENSE # This code is licensed under the Creative Commons 3.0 # Attribution + ShareAlike license - for details see: # http://creativecommons.org/licenses/by-sa/3.0/ # # DATE: February 27th, 2014 # # PROGRAM AUTHOR (PROGRAMMER) # Graham Alvare - home.cc.umanitoba.ca/~alvare # # CO-AUTHORS/ACKNOWLEDGEMENTS # Justin Zhang - for providing me information on the SAM format, # and test data to build and debug this program. # Dr. Brian Fristensky - my work supervisor, and the man who introduced # me to the wonderful field of bioinformatics. # # QUESTIONS & COMMENTS # If you have any questions, please contact me: alvare@cc.umanitoba.ca # I usually get back to people within 1-2 weekdays (weekends, I am slower) # # P.S. Please also let me know of any bugs, or if you have any suggestions. # I am generally happy to help create new tools, or modify my existing # tools to make them more useful. # # Happy usage! import sys, os, os.path, re, collections from Bio import GenBank from Bio import SeqIO def process_record(record, geneFile, rrnaFile, gtf_out, source = 'delRNA'): skipped = 0 # the list of deletions (each entry is a tuple - start, end) rrnas = list() # Grab the entire sequence seq = str(record.seq) # Find all of the rRNA sequences for feature in record.features: # extract the feature data start = int(feature.location.start.position) end = int(feature.location.end.position) name = "Unknown" # extract the name of the rRNA (if available) if 'gene' in feature.qualifiers: name = feature.qualifiers['gene'][0] elif 'product' in feature.qualifiers: name = feature.qualifiers['product'][0] if feature.type == 'rRNA': print "Found: " + name + " (" + str(start) + ", " + str(end) + ")" # append the rRNA to the rRNA list rrnas.append((start, end, name)) for feature in sorted(rrnas, key=lambda tup: tup[1], reverse=True): # Extract all of the tuple information start = feature[0] end = feature[1] name = feature[2] # remove the rRNA from the output sequence fsq = seq[start:(end + 1)] seq = seq[0:start] + seq[end + 1:] # print the rRNA to the rRNA output file print >> rrnaFile, "> " + name print >> rrnaFile, fsq removed = 0 skipped = 0 if seq.rstrip(" \n\r\t") != "": # Write the GFF data for f in record.features: start = int(f.location.start.position) end = int(f.location.end.position) #strand = int(f.location.strand) # extract the gene id if 'locus_tag' in f.qualifiers: #use locul tag as gene_id/transcript_id gene_id = transcript_id = f.qualifiers['locus_tag'][0] elif 'gene' in f.qualifiers: gene_id = transcript_id = f.qualifiers['gene'][0] else: if f.type == "source": print "Skipping additional source feature: " + str(start) + ", " + str(end) elif f.type == "misc_feature" and 'note' in f.qualifiers: print "Skipping misc_feature: " + str(f.qualifiers['note'][0]) else: print "Skipping (no gene id): " + str(f) skipped += 1 continue if f.type != 'rRNA': #code strand as +/- (in genbank 1 or -1) if not f.strand: print "Skipped (invalid strand): " + str(gene_id) skipped += 1 continue elif int(f.strand)>0: strand = '+' else: strand = '-' #generate comments field comments = 'gene_id "%s"; transcript_id "%s"' % ( gene_id, transcript_id ) if 'gene' in f.qualifiers: comments += '; gene_id "%s"' % f.qualifiers['gene'][0] if 'protein_id' in f.qualifiers: comments += '; protein_id "%s"' % f.qualifiers['protein_id'][0] #add external IDs if 'db_xref' in f.qualifiers: for extData in f.qualifiers['db_xref']: comments += '; db_xref "%s"' % extData # GFF format definition () """ seqname - The name of the sequence. Must be a chromosome or scaffold. - The program that generated this feature. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". start - The starting position of the feature in the sequence. The first base is numbered 1. end - The ending position of the feature (inclusive). score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ".". strand - Valid entries include '+', '-', or '.' (for don't know/don't care). frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. comments - gene_id "Em:U62317.C22.6.mRNA"; transcript_id "Em:U62317.C22.6.mRNA"; exon_number 1 """ # correct start and end positions for feature in sorted(rrnas, key=lambda tup: tup[1], reverse=True): # Extract all of the tuple information rrna_start = feature[0] rrna_end = feature[1] rrna_length = rrna_end - rrna_start + 1 # Current GFF gene comes after rRNA gene if rrna_end <= start and rrna_end < end: start -= rrna_length end -= rrna_length # Current GFF gene comes inside rRNA gene elif rrna_start <= start and rrna_end >= end: continue # Otherwise, the current GFF gene comes before rRNA gene print "Kept: " + gene_id + " (" + f.type + "): moved from [" + str(f.location.start.position) + ", " + str(f.location.end.position) + "] to [" + str(start) + ", " + str(end) + "]" # print GFF output print >> gtf_out, '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s' % \ ( record.id, source, f.type, start+1, end, strand, comments ) else: print "Removed: " + gene_id + " (" + str(start) + ", " + str(end) + "): " + f.type removed += 1 # Write FASTA header print >> geneFile, "> " + record.id print >> geneFile, seq else: print "Record was completely rRNA!" return (skipped, removed) if __name__=="__main__": if len(sys.argv) > 1: # process every file specified on the command line for filename in sys.argv[1:]: total = 0 removed = 0 skipped = 0 # Open GenBank file handle = open(filename, 'rU') # Open output FASTA file gtf_out = open(os.path.splitext(filename)[0] + '_norrna.gtf', 'w') rrnaFile = open(os.path.splitext(filename)[0] + '_rrnas.fasta', 'w') geneFile = open(os.path.splitext(filename)[0] + '_norrna.fasta', 'w') for record in SeqIO.parse(handle, 'genbank'): askipped, aremoved = process_record(record, geneFile, rrnaFile, gtf_out) skipped += askipped removed += aremoved total += len(record.features) print "Removed: " + str(removed) + "/" + str(total) + " entries" print "Skipped: " + str(skipped) + "/" + str(total) + " entries" # Close the file handle gtf_out.close() rrnaFile.close() geneFile.close() handle.close()