# Copyright 2009 by Cymon J. Cox.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Command line wrapper for the multiple alignment programme MAFFT.

http://align.bmr.kyushu-u.ac.jp/mafft/software/

Citations:

Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
multiple ncRNA alignment by incorporating structural information into a
MAFFT-based framework (describes RNA structural alignment methods)

Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent developments in
the MAFFT multiple sequence alignment program (outlines version 6)

Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an algorithm to
build an approximate tree from a large number of unaligned sequences (describes
the PartTree algorithm)

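Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT version 5:
improvement in accuracy of multiple sequence alignment (describes [ancestral
versions of] the G-INS-i, L-INS-i and E-INS-i strategies)

Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002) MAFFT: a
novel method for rapid multiple sequence alignment based on fast Fourier
transform (describes the FFT-NS-1, FFT-NS-2 and FFT-NS-i strategies)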

Last checked against version: 6.626b (2009/03/16)
"""

import os
from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline

class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    The constructor declares every supported MAFFT switch, option and
    positional argument in self.parameters and then hands the program name
    and any keyword arguments over to AbstractCommandline.__init__.

    Notes on the declared parameters:
     - Flags that take no value are declared as _Switch objects.
     - Flags that take a value are declared as _Option objects, each with a
       checker function for the value (None where no check is made).
     - "--6merpair" is also available under the alias "sixmerpair", since
       "6merpair" is not a valid Python identifier.
     - "input" is the only parameter declared as required; "input1" is the
       optional second alignment used by the mafft-profile command.
    """
    def __init__(self, cmd="mafft", **kwargs):
        """Declare the MAFFT parameters and initialise the base class.

        Arguments:
         - cmd - name (or path) of the MAFFT executable, default "mafft".
         - kwargs - forwarded unchanged to AbstractCommandline.__init__.
        """
        #Values accepted by --bl. Kept as strings; the checker below compares
        #against str(value) so that e.g. both "62" and 62 are accepted.
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = [
            #**** Algorithm ****
            #Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
            #i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], ["input"],
                    "Automatically select strategy. Default off."),
            #Distance is calculated based on the number of shared 6mers. Default: on
            _Switch(["--6merpair", "6merpair", "sixmerpair"], ["input"],
                    "Distance is calculated based on the number of shared "
                    "6mers. Default: on"),
            #All pairwise alignments are computed with the Needleman-Wunsch
            #algorithm. More accurate but slower than --6merpair. Suitable for a
            #set of globally alignable sequences. Applicable to up to ~200
            #sequences. A combination with --maxiterate 1000 is recommended (G-
            #INS-i). Default: off (6mer distance is used)
            _Switch(["--globalpair", "globalpair"], ["input"],
                    "All pairwise alignments are computed with the "
                    "Needleman-Wunsch algorithm. Default: off"),
            #All pairwise alignments are computed with the Smith-Waterman
            #algorithm. More accurate but slower than --6merpair. Suitable for a
            #set of locally alignable sequences. Applicable to up to ~200
            #sequences. A combination with --maxiterate 1000 is recommended (L-
            #INS-i). Default: off (6mer distance is used)
            _Switch(["--localpair", "localpair"], ["input"],
                    "All pairwise alignments are computed with the "
                    "Smith-Waterman algorithm. Default: off"),
            #All pairwise alignments are computed with a local algorithm with
            #the generalized affine gap cost (Altschul 1998). More accurate but
            #slower than --6merpair. Suitable when large internal gaps are
            #expected. Applicable to up to ~200 sequences. A combination with --
            #maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
            #distance is used)
            _Switch(["--genafpair", "genafpair"], ["input"],
                    "All pairwise alignments are computed with a local "
                    "algorithm with the generalized affine gap cost "
                    "(Altschul 1998). Default: off"),
            #All pairwise alignments are computed with FASTA (Pearson and Lipman
            #1988). FASTA is required. Default: off (6mer distance is used)
            _Switch(["--fastapair", "fastapair"], ["input"],
                    "All pairwise alignments are computed with FASTA "
                    "(Pearson and Lipman 1988). Default: off"),
            #Weighting factor for the consistency term calculated from pairwise
            #alignments. Valid when either of --globalpair, --localpair, --
            #genafpair, --fastapair or --blastpair is selected. Default: 2.7
            _Option(["--weighti", "weighti"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Weighting factor for the consistency term calculated "
                    "from pairwise alignments. Default: 2.7",
                    0),
            #Guide tree is built number times in the progressive stage. Valid
            #with 6mer distance. Default: 2
            _Option(["--retree", "retree"], ["input"],
                    lambda x: isinstance(x, int), 0,
                    "Guide tree is built number times in the progressive "
                    "stage. Valid with 6mer distance. Default: 2",
                    0),
            #Number cycles of iterative refinement are performed. Default: 0
            _Option(["--maxiterate", "maxiterate"], ["input"],
                    lambda x: isinstance(x, int), 0,
                    "Number cycles of iterative refinement are performed. "
                    "Default: 0",
                    0),
            #Use FFT approximation in group-to-group alignment. Default: on
            _Switch(["--fft", "fft"], ["input"],
                    "Use FFT approximation in group-to-group alignment. "
                    "Default: on"),
            #Do not use FFT approximation in group-to-group alignment. Default:
            #off
            _Switch(["--nofft", "nofft"], ["input"],
                    "Do not use FFT approximation in group-to-group "
                    "alignment. Default: off"),
            #Alignment score is not checked in the iterative refinement stage.
            #Default: off (score is checked)
            _Switch(["--noscore", "noscore"], ["input"],
                    "Alignment score is not checked in the iterative "
                    "refinement stage. Default: off (score is checked)"),
            #Use the Myers-Miller (1988) algorithm. Default: automatically
            #turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(["--memsave", "memsave"], ["input"],
                    "Use the Myers-Miller (1988) algorithm. Default: "
                    "automatically turned on when the alignment length "
                    "exceeds 10,000 (aa/nt)."),
            #Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
            #the 6mer distance. Recommended for a large number (> ~10,000) of
            #sequences are input. Default: off
            _Switch(["--parttree", "parttree"], ["input"],
                    "Use a fast tree-building method with the 6mer "
                    "distance. Default: off"),
            #The PartTree algorithm is used with distances based on DP. Slightly
            #more accurate and slower than --parttree. Recommended for a large
            #number (> ~10,000) of sequences are input. Default: off
            _Switch(["--dpparttree", "dpparttree"], ["input"],
                    "The PartTree algorithm is used with distances "
                    "based on DP. Default: off"),
            #The PartTree algorithm is used with distances based on FASTA.
            #Slightly more accurate and slower than --parttree. Recommended for
            #a large number (> ~10,000) of sequences are input. FASTA is
            #required. Default: off
            _Switch(["--fastaparttree", "fastaparttree"], ["input"],
                    "The PartTree algorithm is used with distances based "
                    "on FASTA. Default: off"),
            #The number of partitions in the PartTree algorithm. Default: 50
            _Option(["--partsize", "partsize"], ["input"],
                    lambda x: isinstance(x, int), 0,
                    "The number of partitions in the PartTree algorithm. "
                    "Default: 50",
                    0),
            #Do not make alignment larger than number sequences. Valid only with
            #the --*parttree options. Default: the number of input sequences
            #This flag takes a numeric value, so it must be an _Option (a
            #_Switch could never carry the number onto the command line).
            _Option(["--groupsize", "groupsize"], ["input"],
                    lambda x: isinstance(x, int), 0,
                    "Do not make alignment larger than number sequences. "
                    "Default: the number of input sequences",
                    0),
            #**** Parameter ****
            #Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(["--op", "op"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Gap opening penalty at group-to-group alignment. "
                    "Default: 1.53",
                    0),
            #Offset value, which works like gap extension penalty, for group-to-
            #group alignment. Default: 0.123
            _Option(["--ep", "ep"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Offset value, which works like gap extension penalty, "
                    "for group-to-group alignment. Default: 0.123",
                    0),
            #Gap opening penalty at local pairwise alignment. Valid when the --
            #localpair or --genafpair option is selected. Default: -2.00
            _Option(["--lop", "lop"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Gap opening penalty at local pairwise alignment. "
                    "Default: -2.00",
                    0),
            #Offset value at local pairwise alignment. Valid when the --
            #localpair or --genafpair option is selected. Default: 0.1
            _Option(["--lep", "lep"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Offset value at local pairwise alignment. "
                    "Default: 0.1",
                    0),
            #Gap extension penalty at local pairwise alignment. Valid when the -
            #-localpair or --genafpair option is selected. Default: -0.1
            _Option(["--lexp", "lexp"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Gap extension penalty at local pairwise alignment. "
                    "Default: -0.1",
                    0),
            #Gap opening penalty to skip the alignment. Valid when the --
            #genafpair option is selected. Default: -6.00
            _Option(["--LOP", "LOP"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Gap opening penalty to skip the alignment. "
                    "Default: -6.00",
                    0),
            #Gap extension penalty to skip the alignment. Valid when the --
            #genafpair option is selected. Default: 0.00
            _Option(["--LEXP", "LEXP"], ["input"],
                    lambda x: isinstance(x, float), 0,
                    "Gap extension penalty to skip the alignment. "
                    "Default: 0.00",
                    0),

            #BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            #number=30, 45, 62 or 80. Default: 62
            #Compare on str(x) so the integer 62 is accepted as well as "62".
            _Option(["--bl", "bl"], ["input"],
                    lambda x: str(x) in BLOSUM_MATRICES, 0,
                    "BLOSUM number matrix is used. Default: 62",
                    0),
            #JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            #Default: BLOSUM62
            _Option(["--jtt", "jtt"], ["input"], None, 0,
                    "JTT PAM number (Jones et al. 1992) matrix is used. "
                    "number>0. Default: BLOSUM62",
                    0),
            #Transmembrane PAM number (Jones et al. 1994) matrix is used.
            #number>0. Default: BLOSUM62
            #The value is a PAM number, not a file name, so it is checked as an
            #integer rather than with os.path.exists.
            _Option(["--tm", "tm"], ["input"],
                    lambda x: isinstance(x, int), 0,
                    "Transmembrane PAM number (Jones et al. 1994) "
                    "matrix is used. number>0. Default: BLOSUM62",
                    0),
            #Use a user-defined AA scoring matrix. The format of matrixfile is
            #the same to that of BLAST. Ignored when nucleotide sequences are
            #input. Default: BLOSUM62
            _Option(["--aamatrix", "aamatrix"], ["input"],
                    os.path.exists, 0,
                    "Use a user-defined AA scoring matrix. "
                    "Default: BLOSUM62",
                    0),
            #Incorporate the AA/nuc composition information into the scoring
            #matrix. Default: off
            _Switch(["--fmodel", "fmodel"], ["input"],
                    "Incorporate the AA/nuc composition information "
                    "into the scoring matrix. Default: off"),
            #**** Output ****
            #Output format: clustal format. Default: off (fasta format)
            _Switch(["--clustalout", "clustalout"], ["input"],
                    "Output format: clustal format. Default: off (fasta "
                    "format)"),
            #Output order: same as input. Default: on
            _Switch(["--inputorder", "inputorder"], ["input"],
                    "Output order: same as input. Default: on"),
            #Output order: aligned. Default: off (inputorder)
            _Switch(["--reorder", "reorder"], ["input"],
                    "Output order: aligned. Default: off (inputorder)"),
            #Guide tree is output to the input.tree file. Default: off
            _Switch(["--treeout", "treeout"], ["input"],
                    "Guide tree is output to the input.tree file. Default: off"),
            #Do not report progress. Default: off
            _Switch(["--quiet", "quiet"], ["input"],
                    "Do not report progress. Default: off"),
            #**** Input ****
            #Assume the sequences are nucleotide. Default: auto
            _Switch(["--nuc", "nuc"], ["input"],
                    "Assume the sequences are nucleotide. Default: auto"),
            #Assume the sequences are amino acid. Default: auto
            _Switch(["--amino", "amino"], ["input"],
                    "Assume the sequences are amino acid. Default: auto"),
            ###################### SEEDS #####################################
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in the
            # form: "mafft --seed align1 --seed align2 [etc] input"
            # Only a single --seed parameter is declared here (see the note
            # following it for why the extra seed parameters were dropped).
            _Option(["--seed", "seed"], ["input", "file"], os.path.exists, 0,
                    "Seed alignments given in alignment_n (fasta format) "
                    "are aligned with sequences in input.",
                    0),
            #The old solution of also defining extra parameters with
            #["--seed", "seed1"] etc worked, but clashes with the recent
            #code in the base class to look for duplicate parameters and raise
            #an error.  Perhaps that check should be ignored here, or maybe
            #we can handle this more elegantly...
            #TODO - Create an _OptionList parameter which allows a list to be
            #assigned to the value?
            ####################### END SEEDS  ################################
            #The input (must be FASTA format)
            _Argument(["input"], ["input"], os.path.exists, 1,
                      "Input file name"),
            ###################################################################
            #mafft-profile takes a second alignment input as an argument:
            #mafft-profile align1 align2
            _Argument(["input1"], ["input"], os.path.exists, 0,
                      "Second input file name for the mafft-profile command"),
            ]
        AbstractCommandline.__init__(self, cmd, **kwargs)