#!/usr/bin/env python

###################################
# SORTS A GFF FILE FOR CUFFLINKS  #
# THIS IS OPEN SOURCE CODE (YAY!) #
###################################

# LICENSE
# This code is licensed under the Creative Commons 3.0
# Attribution + ShareAlike license - for details see:
# http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
# Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
# Justin Zhang - for providing me information on the SAM format,
#                and test data to build and debug this program.
# Dr. Brian Fristensky - my work supervisor, and the man who introduced
#                        me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
# If you have any questions, please contact me: alvare@cc.umanitoba.ca
# I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
# P.S. Please also let me know of any bugs, or if you have any suggestions.
#      I am generally happy to help create new tools, or modify my existing
#      tools to make them more useful.
#
# Happy usage!

import sys
import os
import os.path
import csv
import re
import collections

# Nothing below references Biopython, so a missing installation must not
# stop the script from sorting; the names are still bound so the module
# namespace matches the original whenever Biopython is available.
try:
    from Bio import GenBank
    from Bio import SeqIO
except ImportError:
    GenBank = None
    SeqIO = None


def _sort_key(key):
    """Turn a record key into a sortable tuple with numeric coordinates.

    *key* is the ``(seqid, start, end, type, strand)`` tuple of raw text
    fields built by ``sort_gff_lines``.  Raises ``ValueError`` when start
    or end is not an integer.
    """
    seqid, start, end, feature, strand = key
    return (seqid, int(start), int(end), feature, strand)


def sort_gff_lines(lines, log=None):
    """De-duplicate and sort the records of one GFF file.

    Parameters:
        lines: iterable of text lines (an open file works); line
            terminators are optional and are stripped.
        log: writable text stream receiving the "Overlap" diagnostics;
            defaults to ``sys.stderr``.

    Returns:
        A list of lines without terminators: first every line starting
        with ``#`` in its original order, then the records ordered by
        seqid, start (numeric), end (numeric), type and strand.

    Two records sharing seqid, start, end, type and strand are treated as
    the same feature: identical rows are collapsed and, when they differ,
    the longer row is kept.  Blank lines are dropped.

    Raises:
        ValueError: a record has fewer than 7 tab-separated columns, or
            its start/end column is not an integer.
    """
    if log is None:
        log = sys.stderr

    comments = []
    records = {}

    for lineno, raw in enumerate(lines, 1):
        # Strip both terminator styles by hand so CRLF input works without
        # the 'U' open mode (which no longer exists in Python 3.11+).
        row = raw.rstrip('\r\n')

        if not row.strip():
            continue
        if row.startswith('#'):
            comments.append(row)
            continue

        # GFF columns are tab-delimited.
        cols = row.split('\t')
        if len(cols) < 7:
            raise ValueError(
                "line %d: expected at least 7 tab-separated columns, "
                "found %d: %r" % (lineno, len(cols), row))

        key = (cols[0], cols[3], cols[4], cols[2], cols[6])
        try:
            # Validate here so the error can name the offending line
            # instead of surfacing later from inside sorted().
            _sort_key(key)
        except ValueError:
            raise ValueError(
                "line %d: start and end must be integers: %r"
                % (lineno, row))

        if key not in records:
            records[key] = row
            continue

        log.write("Overlap: " + ','.join(key) + "\n")
        kept = records[key]
        if row == kept:
            log.write(" Duplicate!\n")
        elif len(row) > len(kept):
            log.write(" Replacing with: " + row + "\n")
            # The original only logged the replacement; actually do it.
            records[key] = row
        else:
            log.write(" Skipping!\n")

    ordered = [records[key] for key in sorted(records, key=_sort_key)]
    return comments + ordered


def main(argv):
    """Sort each GFF file named in *argv* and write the result to stdout.

    Files are processed independently, in the order given; duplicates are
    only detected within a single file.  Does nothing when *argv* is empty.
    """
    for filename in argv:
        with open(filename) as handle:
            sorted_lines = sort_gff_lines(handle)
        for line in sorted_lines:
            # One terminator per record; the old print statement added a
            # second newline after each row.
            sys.stdout.write(line + "\n")


if __name__ == "__main__":
    main(sys.argv[1:])