#!/usr/bin/env python

################################################
# REMOVES ALL SPACES FROM THE NAMES OF ALL     #
# SEQUENCES IN A FASTA FILE                    #
# THIS IS OPEN SOURCE CODE (YAY!)              #
################################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#    Justin Zhang          - for providing me information on the SAM format,
#                            and test data to build and debug this program.
#    Dr. Brian Fristensky  - my work supervisor, and the man who introduced
#                            me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. Please also let me know of any bugs, or if you have any suggestions.
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!

import sys, os, os.path, re, collections
from Bio import SeqIO

def clean_fasta_lines(lines):
	"""Yield the output lines (without newlines) for a stream of FASTA lines.

	For every record two strings are produced: the header, written as
	"> " followed by the sequence name with every space removed (and the
	first remaining ">" inside the name dropped), and then the whole
	sequence joined onto a single line with spaces, tabs and line breaks
	removed.

	Behaviour carried over unchanged from the original script:
	  * text that appears before the first header line is discarded;
	  * a record whose name is empty after stripping ">" and spaces is
	    skipped, together with its sequence;
	  * the header is emitted with a space after ">" ("> name").

	lines -- any iterable of strings, e.g. an open text file.
	"""
	name = ""
	chunks = []
	for line in lines:
		if line.startswith(">"):
			# A new header closes the record collected so far.
			if name != "":
				yield "> " + name
				yield "".join(chunks)
			raw_name = line.strip("> ").rstrip("\n\r")
			name = raw_name.replace(" ", "").replace(">", "", 1)
			chunks = []
		else:
			# Collect the pieces and join once per record; repeated
			# string concatenation is quadratic on long sequences.
			chunks.append(line.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", ""))

	# Flush the final record, which has no following header to trigger it.
	if name != "":
		yield "> " + name
		yield "".join(chunks)


def fixed_path(filename):
	"""Return the output path for *filename*: same directory, "fixed_" prefix.

	The prefix is applied to the base name only, so "dir/a.fa" maps to
	"dir/fixed_a.fa" rather than the non-existent "fixed_dir/a.fa".
	"""
	folder, base = os.path.split(filename)
	return os.path.join(folder, "fixed_" + base)


def fix_fasta_file(filename):
	"""Write the space-free copy of the FASTA file *filename*.

	Returns the path of the file that was written.  Errors from opening,
	reading or writing the files (IOError/OSError) are not caught here.
	"""
	# Python 2 needs 'U' for universal newlines; Python 3 text mode already
	# translates newlines and rejects the 'U' flag from version 3.11 on.
	mode = "rU" if sys.version_info[0] < 3 else "r"
	out_name = fixed_path(filename)
	# Open the input first, so that a missing input file does not leave an
	# empty output file behind; "with" closes both handles even on errors.
	with open(filename, mode) as infile:
		with open(out_name, "w") as outfile:
			for out_line in clean_fasta_lines(infile):
				outfile.write(out_line + "\n")
	return out_name


def main(argv):
	"""Process every FASTA file named in argv[1:], or print the usage text."""
	if len(argv) > 1:
		for filename in argv[1:]:
			fix_fasta_file(filename)
	else:
		prog = os.path.basename(argv[0]) if argv else "script"
		print("Usage:  " + prog + " <fasta_files>")
		print("Output: for each FASTA file, a copy named fixed_<file> with all spaces removed from the sequence names")


if __name__=="__main__":
	main(sys.argv)