#!/usr/bin/env python
##############################################
# Bowtie Match Venn-Diagrammer (vdiagram.py) #
##############################################
# SYNOPSIS
#   This program reads multiple SAM files and creates a Venn-Diagram table comparing
#   the read IDs which aligned to both genomes (i.e. the purpose of this program is
#   to assist with multiple genome transcriptomic RNA-seq data analysis).
#
# USAGE
#   vdiagram.py <file1.sam> <file2.sam> [<file3.sam> ...]
#
#   Examples:
#     vdiagram.py condition1.sam condition2.sam
#     vdiagram.py condition1.sam condition2.sam condition3.sam
#     vdiagram.py condition1.sam condition2.sam condition3.sam condition4.sam
#
# INPUT FILES
#   This program reads SAM files generated by Bowtie.  The program examines
#   specifically the aligned reads contained within the file.
#
# PREREQUISITES
#   This program requires the following modules (the numbers in
#   parentheses are the version numbers I used; they are NEITHER the
#   minimum NOR the maximum version numbers supported):
#
#     Python (2.7.3)
#
# OUTPUT
#   A Venn Diagram table representing the data contained within the files.
#
###################################
# THIS IS OPEN SOURCE CODE (YAY!) #
###################################
# LICENSE
#   This code is licensed under the Creative Commons 3.0
#   Attribution + ShareAlike license - for details see:
#   http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#   Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#   Justin Zhang - for providing me information on the SAM format,
#       and test data to build and debug this program.
#   Dr. Brian Fristensky - my work supervisor, and the man who introduced
#       me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#   If you have any questions, please contact me: alvare@cc.umanitoba.ca
#   I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
# P.S. Please also let me know of any bugs, or if you have any suggestions.
#      I am generally happy to help create new tools, or modify my existing
#      tools to make them more useful.
#
# Happy usage!

import sys
import os
import os.path
import csv
import math
import itertools

# SAM FLAG bit that marks a record as unmapped (the read did not align).
_UNMAPPED_FLAG = 0x4


def _read_aligned_ids(filename):
    """Return the set of read IDs (first column) of the aligned reads in a file.

    Lines are split on tabs by hand rather than through csv.reader: SAM is not
    a quoted format, and with the 'excel-tab' dialect a field that starts with
    a double quote (e.g. a quality string) is parsed as a quoted field and
    swallows the records that follow it.

    The following lines do not contribute a read ID:
      * blank lines,
      * SAM header lines, which start with "@",
      * records whose FLAG (second column) has the unmapped bit (0x4) set.
    Records without a FLAG column, or with a non-numeric one, are kept, so a
    plain list of read IDs (one per line) is still accepted as input.
    """
    # Python 2 needs the "U" flag to get universal newlines; on Python 3 that
    # behaviour is the default for text files and newer releases reject "U".
    mode = "rU" if sys.version_info[0] < 3 else "r"
    read_ids = set()
    # the with-block guarantees the file is closed, even if reading fails
    with open(filename, mode) as fhandle:
        for line in fhandle:
            line = line.rstrip("\r\n")
            if not line or line.startswith("@"):
                continue
            fields = line.split("\t", 2)
            if len(fields) > 1:
                try:
                    flag = int(fields[1])
                except ValueError:
                    # not a SAM FLAG value; treat the line as a plain read ID
                    flag = 0
                if flag & _UNMAPPED_FLAG:
                    continue
            read_ids.add(fields[0])
    return read_ids


def _venn_rows(names, sets):
    """Yield one output row per non-empty combination of the data sets.

    names -- the data set names, in the order they should be combined
    sets  -- dictionary mapping each name to its set of read IDs

    Each row is ["filtered", <sorted names in the combination>, <inter>, <union>]:
      inter -- number of IDs present in every set of the combination and in
               none of the sets outside of it (one region of the Venn diagram)
      union -- number of IDs present in at least one set of the combination
    """
    # generate the combinations for every possible size (from 1 to all sets)
    for size in range(1, len(names) + 1):
        for combo in itertools.combinations(names, size):
            members = [sets[name] for name in combo]
            outsiders = [sets[name] for name in names if name not in combo]
            shared = set.intersection(*members)
            union = set.union(*members)
            # remove all data that also occurs outside of the combination
            exclusive = shared.difference(*outsiders)
            yield ["filtered", sorted(combo), len(exclusive), len(union)]


def main(argv=None):
    """Write the Venn-Diagram table for the SAM files named in argv to stdout.

    argv -- the full argument vector, program name first (defaults to sys.argv)

    When fewer than two file names are given, the usage instructions are
    printed to standard output instead of a table.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) <= 2:
        # print usage instructions (single-argument print() works on 2 and 3)
        print("Usage: vdiagram.py <file1.sam> <file2.sam> [<file3.sam> ...]")
        print("Output: a Venn-Diagram table (to standard output).")
        return

    # read in the datasets to analyze; a file named twice is only read once,
    # and the command-line order is kept so the row order is reproducible
    names = []
    sets = {}
    for filename in argv[1:]:
        if filename not in sets:
            sets[filename] = _read_aligned_ids(filename)
            names.append(filename)

    # process and write out the results
    csv_out = csv.writer(sys.stdout, dialect='excel-tab')
    csv_out.writerow(["status", "group", "inter", "union"])
    for row in _venn_rows(names, sets):
        csv_out.writerow(row)


if __name__ == "__main__":
    main()