#!/usr/bin/env python

##############################################
# Bowtie Match Venn-Diagrammer (vdiagram.py) #
##############################################
# SYNOPSIS
#    This program reads multiple SAM files and creates a Venn-Diagram table comparing
#    the read IDs which aligned to both genomes (i.e. the purpose of this program is
#    to assist with multiple genome transcriptomic RNA-seq data analysis).
#    
# USAGE
#    vdiagram.py <SAM_files>
#
#    Examples:
#       vdiagram.py condition1.sam condition2.sam
#       vdiagram.py condition1.sam condition2.sam condition3.sam
#       vdiagram.py condition1.sam condition2.sam condition3.sam condition4.sam
#
# INPUT FILES
#    This program reads SAM files generated by Bowtie.  The program examines
#    specifically the aligned reads contained within the file.
#
# PREREQUISITES
#    This program requires the following modules (the numbers in
#    parentheses are the version numbers I used; they are NEITHER the
#    minimum NOR the maximum version numbers supported):
#
#      Python    (2.7.3)
#
# OUTPUT
#    A Venn Diagram table representing the data contained within the files.
#
###################################
# THIS IS OPEN SOURCE CODE (YAY!) #
###################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#    Justin Zhang          - for providing me information on the SAM format,
#                            and test data to build and debug this program.
#    Dr. Brian Fristensky  - my work supervisor, and the man who introduced
#                            me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. Please also let me know of any bugs, or if you have any suggestions.
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!

import sys, os, os.path, csv, math
import itertools

def _read_aligned_ids(filename):
    """Return the set of read IDs (column 1) of the aligned records in a SAM file.

    Lines are split on tabs directly rather than through the csv module:
    the 'excel-tab' dialect treats '"' as a quote character, and '"' is a
    legal base-quality symbol, so a quality string beginning with '"' would
    make the csv reader swallow the following tabs/newlines and lose reads.

    Skipped lines:
      * blank lines;
      * SAM header lines (they start with '@', which can never start a
        read name);
      * records whose FLAG (column 2) has bit 0x4 set, i.e. reads that
        did not align.

    A line with no second column, or with a non-numeric second column, is
    kept, so plain one-ID-per-line files are still accepted.
    """
    ids = set()
    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11.
    # Stripping "\r\n" below keeps CRLF files working on Python 2 as well.
    with open(filename, "r") as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if not line or line.startswith("@"):
                continue
            fields = line.split("\t", 2)
            if len(fields) > 1:
                try:
                    flag = int(fields[1])
                except ValueError:
                    flag = 0
                if flag & 0x4:
                    # the read is flagged as unmapped
                    continue
            ids.add(fields[0])
    return ids


def _venn_rows(names, sets):
    """Yield one output row per non-empty combination of the data sets.

    names -- the data set names, in the order the rows should be generated
    sets  -- dictionary mapping each name to its set of read IDs

    Each row is ["filtered", <sorted names in the combination>, inter, union]:
      inter -- number of IDs present in EVERY set of the combination and in
               NONE of the remaining sets (one region of the Venn diagram)
      union -- number of IDs present in at least one set of the combination
    """
    for size in range(1, len(names) + 1):
        for combo in itertools.combinations(names, size):
            members = [sets[name] for name in combo]
            outsiders = [sets[name] for name in names if name not in combo]

            shared = set.intersection(*members)
            exclusive = shared.difference(*outsiders)
            union = set.union(*members)

            yield ["filtered", sorted(combo), len(exclusive), len(union)]


def main(argv=None, out=None):
    """Build the Venn-Diagram table for the SAM files named in argv[1:].

    argv -- full argument vector, program name first (default: sys.argv)
    out  -- writable text stream for the table (default: sys.stdout)

    At least two file names are required; otherwise the usage text is
    written to `out` and nothing is read.
    """
    if argv is None:
        argv = sys.argv
    if out is None:
        out = sys.stdout

    if len(argv) <= 2:
        # print usage instructions
        out.write("Usage:  vdiagram.py <sam_files>\n")
        out.write("Output: a Venn-Diagram table (to standard output).\n")
        return

    # read in the datasets to analyze; `names` remembers the command-line
    # order so that the rows come out in a deterministic order
    names = []
    sets = dict()
    for filename in argv[1:]:
        if filename in sets:
            # the same file given twice is still a single data set
            continue
        names.append(filename)
        sets[filename] = _read_aligned_ids(filename)

    # process and write out the results
    csv_out = csv.writer(out, dialect='excel-tab')
    csv_out.writerow(["status", "group", "inter", "union"])
    for row in _venn_rows(names, sets):
        csv_out.writerow(row)


if __name__ == "__main__":
    main()