#!/usr/bin/env python
#This work is copyright Cedar McKay and Gabrielle Rocap, University of Washington.
#This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. 
#To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send a 
#letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.


#To install, move this file to your bin directory, adjust the above path to python, and install
#Biopython.


import sys
import os.path
from optparse import OptionParser
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import InsdcIO


#### Collect Input ####
#######################
usage="""Takes a GenBank or EMBL format file as input, and outputs a fasta file.
usage: %prog -i FILE [options]"""


parser = OptionParser(usage=usage, version="%prog 1.2")


parser.add_option("-i", "--in_file", metavar="FILE", dest="in_file", default=None,
                help="Specify the input FILE that you wish to convert")

parser.add_option("-m", "--file_format", metavar="FORMAT", dest="file_format", default='genbank',
                help="Specify the input file format. Specify 'genbank' or 'embl'. "
                "Default is genbank.")

parser.add_option("-o", "--out_file", metavar="FILE", dest="out_file", default=None,
                help="Specify the path and name of the output fasta file you wish to create. "
                "Default will be the same as the in_file, but with a 'fasta' suffix.")

parser.add_option("-s", "--sequence_type", dest="sequence_type", default="aa",
                help="Specify the kind of sequence you would like to extract. Options are 'aa' "
                "(feature amino acids), 'nt' (feature nucleotides), 'whole' (the entire "
                "sequence, not just sequence corresponding to features) and 'taa'    (amino acids "
                "translated on the fly, which generates amino acid sequence by translating the "
                "nucleotide sequence rather than extracting from the feature table)."
                "Default is 'aa'.") 

parser.add_option("-f", "--feature_type", dest="feature_type", default="CDS",
                help="Specify the type of feature that you would like to extract. This option "
                "accepts arbitrary text, and will fail if you input a non-existent feature name. "
                "Common options are 'CDS', 'rRNA', 'tRNA', or 'gene'. Default is 'CDS'.")

parser.add_option("-d", "--delimiter", dest="delimiter", default="spacepipe",
                help="Specify the character you wish to use to separate header elements. Options "
                "are 'tab', 'space', 'spacepipe', 'pipe', 'dash', or 'underscore'. "
                "Default is 'spacepipe'.")

parser.add_option("-q", "--qualifiers", dest="qualifiers", default="locus_tag,gene,product,location",
                help="Specify which qualifiers should make up the fasta header line. Takes comma "
                "separated list. Will accept any qualifier that appears in your genbank "
                "file, (e.g. 'note', 'protein_id', etc). Qualifiers appear in the header line in "
                "the order you list them. Use 'location_long' for the exact location information as it "
                "appears in the input file. Default is 'locus_tag,gene,product,location'.")

parser.add_option("-a", "--annotations", dest="annotations", default=None,
                help="Specify which record annotation should make up the header line. Takes "
                "comma separated list. Will accept any annotation that appears in your genbank "
                "file, (e.g. 'comment', 'taxonomy', accessions, etc). Only used with "
                "--sequence_type = whole. Default is 'organism'.")

parser.add_option("-u", "--user_header", dest="user_header", default=None,
                help="If you prefer to specify your own completely custom header line, you may "
                "specify it here. Should be speccified in single quotes. Only used with "
                "--sequence_type = whole.")

(options, args) = parser.parse_args()


#### Variables and Names ####
#############################
#Figure out some names and paths
if options.in_file:
    in_file = os.path.abspath(options.in_file)
else:   
    print "You must specify an in_file. Use '-h' for help."
    sys.exit()

(in_filePath, in_fileWholeName) = os.path.split(in_file)
(in_fileBase, in_fileExt) = os.path.splitext(in_fileWholeName)

#Figure out what our out_file is.
if options.out_file:
    out_file = os.path.join(in_filePath, options.out_file)
else:
    out_file = os.path.join(in_filePath, in_fileBase + '.fasta')
out_file = os.path.abspath(out_file)    

#Figure out what the user really wanted from delimiter:
delimiter = options.delimiter
if delimiter == 'space':
    delimiter = ' '
elif delimiter == 'spacepipe':
    delimiter = ' | '   
elif delimiter == 'pipe':
    delimiter = '|'
elif delimiter == 'dash':
    delimiter = '-'
elif delimiter == 'underscore':
    delimiter = '_'
elif delimiter == 'tab':
    delimiter = '   '
else:
    delimiter = ' | '
    
#Get the header_line user input, split on commas and turn into a list.
qualifier_list = options.qualifiers.split(',')
if options.annotations:
    annotation_list = options.annotations.split(',')
else:
    annotation_list = ['organism']

#Gather Remaining options
sequence_type = options.sequence_type
feature_type = options.feature_type
user_header = options.user_header
file_format = options.file_format


#Make sure no specified options conflict, or don't make sense
if user_header and sequence_type != 'whole':
    print "It doesn't make sense to set the user_header unless you are using the 'whole' " \
    "sequence_type. Use '-h' for help."
    sys.exit()

if file_format not in ['genbank', 'embl']:
    print "Must specify either 'genbank' or 'embl' format for the in_file. Use '-h' for help."
    sys.exit()

if options.annotations  and sequence_type != 'whole':
    print "It doesn't make sense to set the annotations unless you are using the 'whole' " \
    "sequence_type. Use '-h' for help."
    sys.exit()

    
#### Functions ####
###################
def build_header(feature, qualifier_list):
    header = []
    #First we have to handle the special case of location.
    if 'location' in qualifier_list:
        #Go through some pain to make location human readable by adding 1 to first position
        location = str(int(str(feature.location.nofuzzy_start))+1) + ":" + \
        str(feature.location.nofuzzy_end) 
        if feature.strand == 1:
            location = location + ' Forward'
        elif feature.strand == -1:
            location = location + ' Reverse'
        else:
            location = location + ' Could not determine strand'
    if 'location_long' in qualifier_list:
        location_long = InsdcIO._insdc_feature_location_string(feature, len(record.seq))
    #Now march through items in qualifier_list and get qualifiers, or special case each one.
    #Start with the special cases
    for item in qualifier_list:
        if item == 'location':
            header.append(location)
        elif item == 'location_long':
            header.append(location_long)
        else:
            if not feature.qualifiers.has_key(item) and item == 'gene':
                if feature.qualifiers.has_key('locus_tag'):
                    item = 'locus_tag'
                else:
                    item = 'db_xref'
            elif not feature.qualifiers.has_key(item) and item == "locus_tag":
                item == 'db_xref'
            #Finished with the special cases, now just getting plain old qualifiers
            if feature.qualifiers.has_key(item):
                header_part = feature.qualifiers[item][0]
                #Catch improper newline character in the middle of features.
                header_part = header_part.replace("\n"," ")
                #Catch inproper spaces in middle of feature and replace with single space.
                #No idea why this appears sometimes
                header_part = header_part.replace("                      ", " ") 
                if header_part == '':
                    header.append('None')
                else:
                    header.append(header_part)
            else:
                header_part = 'missing_%s_qualifer' % item
                header.append(header_part)
    return delimiter.join(header)


def genbank_to_fasta(record, sequence_type, qualifier_list):
    new_records = []
    for feature in record.features:
        if feature.type == feature_type: # What kind of feature to extract. Usually CDS or tRFLP
            if sequence_type == 'nt':
                temp_record = SeqRecord(feature.extract(record.seq), id = build_header(feature, qualifier_list),\
                description = '')
            elif sequence_type == 'taa':
                if feature.qualifiers.has_key("transl_table"):
                    translation_table = feature.qualifiers["transl_table"][0]
                else:
                    translation_table = 11
                temp_record = SeqRecord(feature.extract(record.seq).translate(table = translation_table),\
                id = build_header(feature, qualifier_list), description = '')
            elif sequence_type == 'aa':
                if feature.qualifiers.has_key("translation"):
                    temp_seq = Seq(feature.qualifiers["translation"][0], IUPAC.protein)
                else:               
                    if feature.qualifiers.has_key("transl_table"):
                        translation_table = feature.qualifiers["transl_table"][0]
                    else:
                        translation_table = 11
                    temp_seq = feature.extract(record.seq).translate(table = translation_table)
                temp_record = SeqRecord(temp_seq, id = build_header(feature, qualifier_list), \
                description = '')
            new_records.append(temp_record)
    return new_records


def genbank_to_fasta_whole(record, annotation_list, user_header, delimiter):
    if user_header:
        header = user_header
    else:
        header = []
        for item in annotation_list:
            if record.annotations.has_key(item):
                header_part = record.annotations[item]
                if type(header_part) == type([]): #Some attributes are lists. Must turn into string
                    header_part = ' : '.join(header_part)
                header_part = header_part.replace("\n"," ") #Catch improper newline character
                header_part = header_part.replace("                      ", " ")#Catch inproper spaces
                header.append(header_part)
            else:
                header_part = 'missing_%s_annotation' % item
                header.append(header_part)
        header = delimiter.join(header)
    temp_record = SeqRecord(record.seq, id = header, description = '')
    return [temp_record] #Return a list because will be used in a context requiring a list

#### Main ####
##############
in_file_handle = open (in_file, 'rU') #The 'U' option so we don't have to worry about line endings
out_file_handle = open (out_file, 'w')

record_iterator = SeqIO.parse(in_file_handle, file_format)

for record in record_iterator:
    print "Converting '%s' to fasta ..." % record.description
    if sequence_type in ['nt', 'aa', 'taa']:
        fasta_records = genbank_to_fasta(record, sequence_type, qualifier_list)
    elif sequence_type == 'whole': #whole records are handled specially
        fasta_records = genbank_to_fasta_whole(record, annotation_list, user_header, delimiter)
    else:
        print "Unrecognized sequence_type. Use '-h' for help."
        sys.exit()
    SeqIO.write(fasta_records, out_file_handle, 'fasta')

in_file_handle.close()
out_file_handle.close()