#!/usr/bin/env python

# Copyright (c) 2006, The Regents of the University of California, through
# Lawrence Berkeley National Laboratory (subject to receipt of any required
# approvals from the U.S. Dept. of Energy).  All rights reserved.
# This software is distributed under the new BSD Open Source License.
#
# Please see the LICENSE.txt file that should have been included
# as part of this package.

""" A tool for converting multiple sequence alignments from
one format to another.
"""

from corebio.utils.deoptparse import DeOptionParser
from optparse import OptionGroup

from corebio import seq_io
from corebio.seq import SeqList, Seq, nucleic_alphabet
import sys

__version__ = "1.0.0"

description = """
A tool for converting multiple sequence alignments from
one format to another.
"""


def main():
    """Command-line entry point.

    Parses ``sys.argv``, reads the sequences with the selected reader,
    applies the requested transformations in a fixed order (low-complexity
    masking, subsampling, reversal, complementation) and writes the result
    with the selected writer.

    Invalid command lines are reported through ``parser.error``.
    """
    # ------ Parse Command line ------
    parser = _build_option_parser()
    (opts, args) = parser.parse_args(sys.argv[1:])

    if args:
        parser.error("Unparsable arguments: %s " % args)

    frac = opts.subsample
    # Written as a negated chained comparison so that NaN is rejected too.
    if frac is not None and not (0.0 <= frac <= 1.0):
        parser.error(
            "--subsample FRACTION must be between 0 and 1, got %s" % frac)

    seqs = opts.reader.read(opts.fin)

    if opts.trans_seg:
        from corebio.transform import mask_low_complexity
        seqs = SeqList([mask_low_complexity(s) for s in seqs])

    if frac is not None:
        from random import random
        # Each sequence is kept independently with probability `frac`, so
        # the size of the subsample is itself random.
        seqs = SeqList([s for s in seqs if random() < frac])

    if opts.reverse:
        seqs = SeqList([s.reverse() for s in seqs])

    if opts.complement:
        # Re-wrap each sequence with the nucleic alphabet before calling
        # complement(), carrying the name and description across so the
        # converted output keeps its sequence identifiers.
        seqs = SeqList([
            Seq(s,
                alphabet=nucleic_alphabet,
                name=getattr(s, "name", None),
                description=getattr(s, "description", None)).complement()
            for s in seqs])

    opts.writer.write(opts.fout, seqs)


def _build_option_parser():
    """Build and return the command-line option parser.

    The returned ``DeOptionParser`` defines two option groups:

    * Input/Output Options: ``-f/--fin``, ``--format-in``, ``-o/--fout``
      and ``-F/--format-out``.
    * Transformations: ``--seg``, ``--subsample``, ``--reverse`` and
      ``--complement``.

    Output formats are limited to the entries of ``seq_io.formats`` that
    expose a ``write`` attribute.
    """
    parser = DeOptionParser(
        usage="%prog [options] < sequence_data.fa > converted_sequence_data",
        description=description,
        version=__version__,
        add_verbose_options=False
        )

    # ------ Input/Output ------
    io_grp = OptionGroup(parser, "Input/Output Options",)
    parser.add_option_group(io_grp)

    io_grp.add_option("-f", "--fin",
        dest="fin",
        action="store",
        type="file_in",
        default=sys.stdin,
        help="Sequence input file (default: stdin)",
        metavar="FILENAME")

    # The default reader is the seq_io module itself rather than one
    # specific format module.
    io_grp.add_option("", "--format-in",
        dest="reader",
        action="store",
        type="dict",
        default=seq_io,
        choices=seq_io.format_names(),
        help="Multiple sequence alignment format: (%s)" %
             ', '.join([f.names[0] for f in seq_io.formats]),
        metavar="FORMAT")

    io_grp.add_option("-o", "--fout",
        dest="fout",
        type="file_out",
        default=sys.stdout,
        help="Output file (default: stdout)",
        metavar="FILENAME")

    # Writers: only formats that implement write() can be output targets.
    out_formats = [f for f in seq_io.formats if hasattr(f, "write")]
    out_names = [f.names[0] for f in out_formats]
    out_choices = dict(zip(out_names, out_formats))

    io_grp.add_option("-F", "--format-out",
        dest="writer",
        action="store",
        type="dict",
        default=seq_io.fasta_io,
        choices=out_choices,
        help="Multiple sequence alignment output format: (%s) (Default: fasta)" %
             ', '.join(out_names),
        metavar="FORMAT")

    # ------ Transformations ------
    trans_grp = OptionGroup(parser, "Transformations",)
    parser.add_option_group(trans_grp)

    trans_grp.add_option("", "--seg",
        dest="trans_seg",
        action="store_true",
        default=False,
        help="Mask low complexity regions in protein sequences.",
        metavar="TRUE/FALSE")

    trans_grp.add_option("", "--subsample",
        dest="subsample",
        action="store",
        type="float",
        default=None,
        help="Return a random subsample of the sequences.",
        metavar="FRACTION")

    trans_grp.add_option("", "--reverse",
        dest="reverse",
        action="store_true",
        default=False,
        help="reverse sequences",
        metavar="TRUE/FALSE")

    trans_grp.add_option("", "--complement",
        dest="complement",
        action="store_true",
        default=False,
        help="complement DNA sequences",
        metavar="TRUE/FALSE")

    return parser


# Standard python voodoo for CLI
if __name__ == "__main__":
    main()