#!/usr/bin/env python # -------------------------------- WebLogo -------------------------------- # Copyright (c) 2003-2004 The Regents of the University of California. # Copyright (c) 2005 Gavin E. Crooks # Copyright (c) 2006-2011, The Regents of the University of California, through # Lawrence Berkeley National Laboratory (subject to receipt of any required # approvals from the U.S. Dept. of Energy). All rights reserved. # This software is distributed under the new BSD Open Source License. # # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # (1) Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # (2) Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and or other materials provided with the distribution. # # (3) Neither the name of the University of California, Lawrence Berkeley # National Laboratory, U.S. Dept. of Energy nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # Replicates README.txt """ WebLogo (http://code.google.com/p/weblogo/) is a tool for creating sequence logos from biological sequence alignments. It can be run on the command line, as a standalone webserver, as a CGI webapp, or as a python library. The main WebLogo webserver is located at http://weblogo.threeplusone.com Please consult the manual for installation instructions and more information: (Also located in the weblogolib/htdocs subdirectory.) http://weblogo.threeplusone.com/manual.html For help on the command line interface run ./weblogo --help To build a simple logo run ./weblogo < cap.fa > logo0.eps To run as a standalone webserver at localhost:8080 ./weblogo --serve To create a logo in python code: >>> from weblogolib import * >>> fin = open('cap.fa') >>> seqs = read_seq_data(fin) >>> data = LogoData.from_seqs(seqs) >>> options = LogoOptions() >>> options.title = "A Logo Title" >>> format = LogoFormat(data, options) >>> eps = eps_formatter( data, format) -- Distribution and Modification -- This package is distributed under the new BSD Open Source License. Please see the LICENSE.txt file for details on copyright and licensing. The WebLogo source code can be downloaded from http://code.google.com/p/weblogo/ WebLogo requires Python 2.5, 2.6 or 2.7, and the python array package 'numpy' (http://www.scipy.org/Download) Generating logos in PDF or bitmap graphics formats require that the ghostscript program 'gs' be installed. 
Scalable Vector Graphics (SVG) format also requires the program 'pdf2svg'. """ from __future__ import absolute_import, division, print_function import sys import copy import os from datetime import datetime from math import log, sqrt, exp from string import Template from subprocess import * # Avoid 'from numpy import *' since numpy has lots of names defined from numpy import array, asarray, float64, ones, zeros, int32,all,any, shape import numpy as na from .color import * from .colorscheme import * from .logomath import Dirichlet import corebio from corebio import seq_io from corebio.data import (amino_acid_composition, amino_acid_letters, dna_letters, rna_letters) from corebio.moremath import * from corebio.seq import (Alphabet, Seq, SeqList, unambiguous_dna_alphabet, unambiguous_rna_alphabet, unambiguous_protein_alphabet) from corebio.utils import (isfloat, find_command, ArgumentError, stdrepr, resource_string, resource_filename) from corebio._py3k import StringIO # ------ META DATA ------ __all__ = [ 'LogoOptions', 'description', '__version__', 'LogoFormat', 'LogoData', 'GhostscriptAPI', 'std_color_schemes', 'default_color_schemes', 'classic', 'std_units', 'std_sizes', 'std_alphabets', 'std_percentCG', 'pdf_formatter', 'jpeg_formatter', 'png_formatter', 'png_print_formatter', 'txt_formatter', 'eps_formatter', 'formatters', 'default_formatter', 'base_distribution', 'equiprobable_distribution', 'read_seq_data', 'color', 'colorscheme', 'logomath', ] description = "Create sequence logos from biological sequence alignments." __version__ = corebio.__version__ # These keywords are substituted by subversion. 
# The date and revision will only tell the truth after a branch or tag,
# since different files in trunk will have been changed at different times
release_date = "$Date: 2014-06-02 22:07:02 -0700 (Mon, 02 Jun 2014) $".split()[1]
release_build = "$Revision: 206 $".split()[1]
release_description = "WebLogo %s (%s)" % (__version__, release_date)


def cgi(htdocs_directory):
    """Run the WebLogo CGI application.

    Imports weblogolib._cgi lazily and hands htdocs_directory to its main().
    """
    import weblogolib._cgi

    weblogolib._cgi.main(htdocs_directory)


class GhostscriptAPI(object):
    """Interface to the command line program Ghostscript ('gs')"""

    # Output formats accepted by convert().
    formats = ('png', 'pdf', 'jpeg')

    def __init__(self, path=None):
        """Locate the Ghostscript executable.

        Tries 'gs' first and then 'gswin32c.exe', both via find_command.

        Raises:
            EnvironmentError: if neither executable can be found.
        """
        try:
            command = find_command('gs', path=path)
        except EnvironmentError:
            try:
                command = find_command('gswin32c.exe', path=path)
            except EnvironmentError:
                raise EnvironmentError(
                    "Could not find Ghostscript on path."
                    " There should be either a gs executable or a gswin32c.exe on your system's path")

        self.command = command

    def version(self):
        """Return the stripped output of ``gs --version``.

        The value is whatever the subprocess pipe yields ('str' on Python 2,
        'bytes' on Python 3).

        Raises:
            RuntimeError: if the Ghostscript process cannot be started.
        """
        args = [self.command, '--version']
        try:
            p = Popen(args, stdout=PIPE)
            (out, err) = p.communicate()
        except OSError:
            raise RuntimeError("Cannot communicate with ghostscript.")
        return out.strip()

    def convert(self, format, postscript, width, height, resolution=300):
        """Convert postscript text into another format using Ghostscript.

        Arguments:
            format     -- one of 'png', 'pdf' or 'jpeg'.
            postscript -- the postscript source, as a string.
            width      -- page width in points.
            height     -- page height in points.
            resolution -- dots per inch; only passed on for bitmap formats.

        Returns the converted data as read from Ghostscript's stdout
        (Python 2: 'str', Python 3: 'bytes').

        Raises:
            ValueError: if format is not supported.
            RuntimeError: if Ghostscript cannot be run or exits with a
                non-zero return code.
        """
        device_map = {'png': 'png16m', 'pdf': 'pdfwrite', 'jpeg': 'jpeg'}

        try:
            device = device_map[format]
        except KeyError:
            raise ValueError("Unsupported format.")

        args = [
            self.command,
            "-sDEVICE=%s" % device,
            "-dPDFSETTINGS=/printer",
            # "-q",  # Quiet: Do not dump messages to stdout.
            "-sstdout=%stderr",  # Redirect messages and errors to stderr
            # fix issue 36, problems with ghostscript 9.10
            "-dColorConversionStrategy=/LeaveColorUnchanged",
            "-sOutputFile=-",  # Stdout
            "-dDEVICEWIDTHPOINTS=%s" % str(width),
            "-dDEVICEHEIGHTPOINTS=%s" % str(height),
            "-dSAFER",  # For added security
            "-dNOPAUSE",
        ]

        # Resolution and antialiasing only apply to bitmap output. Test the
        # requested format: the Ghostscript device name for pdf is
        # 'pdfwrite', so comparing the device against 'pdf' never matched.
        if format != 'pdf':
            args.append("-r%s" % str(resolution))
            if resolution < 300:  # Antialias if resolution is less than 300 DPI
                args.append("-dGraphicsAlphaBits=4")
                args.append("-dTextAlphaBits=4")
                args.append("-dAlignToPixels=0")

        args.append("-")  # Read from stdin. Must be last argument.

        error_msg = "Unrecoverable error : Ghostscript conversion failed " \
                    "(Invalid postscript?). %s" % " ".join(args)

        try:
            p = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
            (out, err) = p.communicate(postscript.encode())
        except OSError:
            raise RuntimeError(error_msg)

        if p.returncode != 0:
            error_msg += '\nReturn code: %i\n' % p.returncode
            if err is not None:
                # stderr arrives as bytes on Python 3; decode it so that it
                # can be appended to the (text) error message.
                if isinstance(err, bytes):
                    err = err.decode('utf-8', 'replace')
                error_msg += err
            raise RuntimeError(error_msg)

        # Python 2: out is a 'str', python 3 out is 'bytes'
        return out
aa_composition = [amino_acid_composition[_k] for _k in unambiguous_protein_alphabet]


# ------ DATA ------

classic = ColorScheme([
    ColorGroup("G", "orange"),
    ColorGroup("TU", "red"),
    ColorGroup("C", "blue"),
    ColorGroup("A", "green"),
])

std_color_schemes = {
    "auto": None,  # Depends on sequence type
    "monochrome": monochrome,
    "base pairing": base_pairing,
    "classic": classic,
    "hydrophobicity": hydrophobicity,
    "chemistry": chemistry,
    "charge": charge,
}

default_color_schemes = {
    unambiguous_protein_alphabet: hydrophobicity,
    unambiguous_rna_alphabet: base_pairing,
    unambiguous_dna_alphabet: base_pairing,
}

# Conversion factors from nats to each unit. 'None' marks probability units,
# which are not a scaling of entropy.
std_units = {
    "bits": 1. / log(2),
    "nats": 1.,
    "digits": 1. / log(10),
    "kT": 1.,
    "kJ/mol": 8.314472 * 298.15 / 1000.,
    "kcal/mol": 1.987 * 298.15 / 1000.,
    "probability": None,
}

# The base stack width is set equal to 9pt Courier.
# (Courier has a width equal to 3/5 of the point size.)
# Check that can get 80 characters in journal page @small
# 40 characters in a journal column
std_sizes = {
    "small": 5.4,
    "medium": 5.4 * 2,
    "large": 5.4 * 3,
}

std_alphabets = {
    'protein': unambiguous_protein_alphabet,
    'rna': unambiguous_rna_alphabet,
    'dna': unambiguous_dna_alphabet,
}

std_percentCG = {
    'H. sapiens': 40.,
    'E. coli': 50.5,
    'S. cerevisiae': 38.,
    'C. elegans': 36.,
    'D. melanogaster': 43.,
    'M. musculus': 42.,
    'T. thermophilus': 69.4,
}

# Thermus thermophilus: Henne A, Bruggemann H, Raasch C, Wiezer A, Hartsch T,
# Liesegang H, Johann A, Lienard T, Gohl O, Martinez-Arias R, Jacobi C,
# Starkuviene V, Schlenczeck S, Dencker S, Huber R, Klenk HP, Kramer W,
# Merkl R, Gottschalk G, Fritz HJ: The genome sequence of the extreme
# thermophile Thermus thermophilus.
# Nat Biotechnol 2004, 22:547-53


class LogoOptions(object):
    """ A container for all logo formatting options. Not all of these
    are directly accessible through the CLI or web interfaces.

    To display LogoOption defaults:
    >>> from weblogolib import *
    >>> LogoOptions()

    All physical lengths are measured in points. (72 points per inch, 28.3
    points per cm)

    String attributes:
        o creator_text           -- Embedded as comment in figures.
        o logo_title
        o logo_label
        o unit_name              -- See std_units for options. (Default 'bits')
        o yaxis_label            -- Defaults to unit_name
        o xaxis_label
        o fineprint              -- Defaults to WebLogo name and version

    Boolean attributes:
        o show_yaxis
        o show_xaxis
        o show_ends
        o show_fineprint
        o show_errorbars         -- Draw errorbars (default: True)
        o show_boxes             -- Draw boxes around stack characters
                                    (default: False)
        o debug                  -- Draw extra graphics debugging information.
        o rotate_numbers         -- Draw xaxis numbers with vertical orientation?
        o scale_width            -- boolean, scale width of characters
                                    proportional to ungaps
        o pad_right              -- Make a single line logo the same width as
                                    multiline logos (default: False)
        o reverse_stacks         -- If true, draw stacks with largest letters
                                    on top (default: True)

    Other attributes:
        o alphabet
        o annotate               -- Per-position xaxis annotations. If None,
                                    LogoFormat generates position numbers.
        o stacks_per_line

        o yaxis_tic_interval
        o yaxis_minor_tic_ratio
        o yaxis_scale
        o xaxis_tic_interval
        o number_interval

        o shrink_fraction        -- Proportional shrinkage of characters if
                                    show_boxes is true.

        o errorbar_fraction
        o errorbar_width_fraction
        o errorbar_gray

        o resolution             -- Dots per inch (default: 96). Used for
                                    bitmapped output formats

        o default_color
        o color_scheme

        o stack_width            -- Default: std_sizes["medium"]
        o stack_aspect_ratio     -- Ratio of stack height to width (default: 5)

        o logo_margin            -- Default: 2 pts
        o stroke_width           -- Default: 0.5 pts
        o tic_length             -- Default: 5 pts
        o stack_margin           -- Default: 0.5 pts

        o small_fontsize         -- Small text font size in points
        o fontsize               -- Regular text font size in points
        o title_fontsize         -- Title text font size in points
        o number_fontsize        -- Font size for axis-numbers, in points.

        o text_font
        o logo_font
        o title_font

        o first_index
        o logo_start
        o logo_end
    """

    def __init__(self, **kwargs):
        """ Create a new LogoOptions instance.

        Keyword arguments override the defaults set below (they are applied
        with corebio.utils.update).

        >>> L = LogoOptions(logo_title = "Some Title String")
        >>> L.show_yaxis = False
        >>> repr(L)
        """
        self.alphabet = None

        self.creator_text = release_description

        self.logo_title = ""
        self.logo_label = ""
        self.stacks_per_line = 40

        self.unit_name = "bits"

        self.show_yaxis = True
        # yaxis_label default depends on other settings. See LogoFormat
        self.yaxis_label = None
        self.yaxis_tic_interval = 1.
        self.yaxis_minor_tic_ratio = 5
        self.yaxis_scale = None

        self.show_xaxis = True
        self.xaxis_label = ""
        self.xaxis_tic_interval = 1
        self.rotate_numbers = False
        self.number_interval = 5
        self.show_ends = False
        self.annotate = None

        self.show_fineprint = True
        self.fineprint = "WebLogo " + __version__

        self.show_boxes = False
        self.shrink_fraction = 0.5

        self.show_errorbars = True
        self.errorbar_fraction = 0.90
        self.errorbar_width_fraction = 0.25
        self.errorbar_gray = 0.75

        self.resolution = 96.  # Dots per inch

        self.default_color = Color.by_name("black")
        self.color_scheme = None
        # self.show_color_key = False  # NOT yet implemented

        self.debug = False

        self.logo_margin = 2
        self.stroke_width = 0.5
        self.tic_length = 5

        self.stack_width = std_sizes["medium"]
        self.stack_aspect_ratio = 5

        self.stack_margin = 0.5
        self.pad_right = False

        self.small_fontsize = 6
        self.fontsize = 10
        self.title_fontsize = 12
        self.number_fontsize = 8

        self.text_font = "ArialMT"
        self.logo_font = "Arial-BoldMT"
        self.title_font = "ArialMT"

        self.first_index = 1
        self.logo_start = None
        self.logo_end = None
        self.scale_width = True

        self.reverse_stacks = True  # If true, draw stacks with largest letters on top.

        from corebio.utils import update
        update(self, **kwargs)

    def __repr__(self):
        """Return a representation listing every attribute, sorted by name."""
        attributes = sorted(vars(self).keys())
        return stdrepr(self, attributes)

# End class LogoOptions


class LogoFormat(LogoOptions):
    """ Specifies the format of the logo. Requires LogoData and LogoOptions
    objects.

    >>> data = LogoData.from_seqs(seqs )
    >>> options = LogoOptions()
    >>> options.logo_title = "A Logo Title"
    >>> format = LogoFormat(data, options)

    Raises an ArgumentError if arguments are invalid.
    """

    def __init__(self, data, options=None):
        """ Create a new LogoFormat instance.

        Arguments:
            data    -- supplies 'alphabet' and 'length' (see LogoData).
            options -- optional LogoOptions; its attributes are copied over
                       the defaults before the derived values are computed.

        Raises:
            ArgumentError: if an option fails validation. The second
                argument of the error names the offending option.
        """
        LogoOptions.__init__(self)

        if options is not None:
            self.__dict__.update(options.__dict__)

        self.alphabet = data.alphabet
        self.seqlen = data.length

        # Derived parameters.
        self.show_title = False
        self.show_xaxis_label = False
        self.yaxis_minor_tic_interval = None
        self.lines_per_logo = None
        self.char_width = None  # Maximum character width. Stack width minus margins.
        self.line_margin_left = None
        self.line_margin_right = None
        self.line_margin_bottom = None
        self.line_margin_top = None
        self.title_height = None
        self.xaxis_label_height = None
        self.line_height = None
        self.line_width = None
        self.logo_height = None
        self.logo_width = None
        self.creation_date = None
        self.end_type = None

        self.stack_height = self.stack_width * self.stack_aspect_ratio

        # Attribute to test, test, error message
        arg_conditions = (
            ("stacks_per_line", lambda x: x > 0,
                "Stacks per line must be positive."),
            ("stack_width", lambda x: x > 0.0,
                "Stack width must be greater than zero."),
            ("stack_aspect_ratio", lambda x: x > 0,
                "Stack aspect ratio must be greater than zero."),
            ("fontsize", lambda x: x > 0,
                "Font sizes must be positive."),
            ("small_fontsize", lambda x: x > 0,
                "Font sizes must be positive."),
            ("title_fontsize", lambda x: x > 0,
                "Font sizes must be positive."),
            ("errorbar_fraction", lambda x: x >= 0.0 and x <= 1.0,
                "The visible fraction of the error bar must be between zero and one."),
            ("yaxis_tic_interval", lambda x: x >= 0.0,
                "The yaxis tic interval cannot be negative."),
            ("yaxis_minor_tic_interval", lambda x: not (x and x < 0.0),
                "Distances cannot be negative."),
            ("xaxis_tic_interval", lambda x: x > 0.0,
                "Tic interval must be greater than zero."),
            ("number_interval", lambda x: x > 0.0,
                "Invalid interval between numbers."),
            ("shrink_fraction", lambda x: x >= 0.0 and x <= 1.0,
                "Invalid shrink fraction."),
            ("stack_margin", lambda x: x > 0.0,
                "Invalid stack margin."),
            ("logo_margin", lambda x: x > 0.0,
                "Invalid logo margin."),
            ("stroke_width", lambda x: x > 0.0,
                "Invalid stroke width."),
            ("tic_length", lambda x: x > 0.0,
                "Invalid tic length."),
        )

        # Run arguments tests. The second, attribute argument to the
        # ArgumentError is used by the UI to provide user feedback.
        # FIXME: More validation
        for attribute, is_valid, message in arg_conditions:
            if not is_valid(getattr(self, attribute)):
                raise ArgumentError(message, attribute)

        # Inclusive upper and lower bounds
        # FIXME: Validate here. Move from eps_formatter
        if self.logo_start is None:
            self.logo_start = self.first_index

        if self.logo_end is None:
            self.logo_end = self.seqlen + self.first_index - 1

        self.total_stacks = self.logo_end - self.logo_start + 1

        if self.logo_start - self.first_index < 0:
            raise ArgumentError(
                "Logo range extends before start of available sequence.",
                'logo_range')

        if self.logo_end - self.first_index >= self.seqlen:
            raise ArgumentError(
                "Logo range extends beyond end of available sequence.",
                'logo_range')

        if self.logo_title:
            self.show_title = True
        if not self.fineprint:
            self.show_fineprint = False
        if self.xaxis_label:
            self.show_xaxis_label = True
        if self.yaxis_label is None:
            self.yaxis_label = self.unit_name
        if self.yaxis_label:
            self.show_yaxis_label = True
        else:
            # An explicitly empty yaxis label also switches off the ends.
            self.show_yaxis_label = False
            self.show_ends = False

        if not self.yaxis_scale:
            conversion_factor = std_units[self.unit_name]
            if conversion_factor:
                self.yaxis_scale = log(len(self.alphabet)) * conversion_factor
            else:
                self.yaxis_scale = 1.0  # probability units

        if self.yaxis_scale <= 0.0:
            raise ArgumentError("Invalid yaxis scale", 'yaxis_scale',)

        if self.yaxis_tic_interval >= self.yaxis_scale:
            self.yaxis_tic_interval /= 2.

        self.yaxis_minor_tic_interval \
            = float(self.yaxis_tic_interval) / self.yaxis_minor_tic_ratio

        if self.color_scheme is None:
            if self.alphabet in default_color_schemes:
                self.color_scheme = default_color_schemes[self.alphabet]
            else:
                self.color_scheme = monochrome

        self.lines_per_logo = 1 + ((self.total_stacks - 1) // self.stacks_per_line)

        # A single-line logo is shrunk to fit its stacks unless pad_right asks
        # for the full line width.
        if self.lines_per_logo == 1 and not self.pad_right:
            self.stacks_per_line = min(self.stacks_per_line, self.total_stacks)

        self.char_width = self.stack_width - 2 * self.stack_margin

        if self.show_yaxis:
            self.line_margin_left = self.fontsize * 3.0
        else:
            self.line_margin_left = 0

        if self.show_ends:
            self.line_margin_right = self.fontsize * 1.5
        else:
            self.line_margin_right = self.fontsize

        if self.show_xaxis:
            if self.rotate_numbers:
                self.line_margin_bottom = self.number_fontsize * 2.5
            else:
                self.line_margin_bottom = self.number_fontsize * 1.5
        else:
            self.line_margin_bottom = 4

        self.line_margin_top = 4

        if self.show_title:
            self.title_height = self.title_fontsize
        else:
            self.title_height = 0

        self.xaxis_label_height = 0.
        if self.show_xaxis_label:
            self.xaxis_label_height += self.fontsize
        if self.show_fineprint:
            self.xaxis_label_height += self.small_fontsize

        self.line_height = (self.stack_height + self.line_margin_top +
                            self.line_margin_bottom)
        self.line_width = (self.stack_width * self.stacks_per_line +
                           self.line_margin_left + self.line_margin_right)

        self.logo_height = int(2 * self.logo_margin + self.title_height
                               + self.xaxis_label_height
                               + self.line_height * self.lines_per_logo)
        self.logo_width = int(2 * self.logo_margin + self.line_width)

        self.creation_date = datetime.now().isoformat(' ')

        end_type = '-'
        end_types = {
            unambiguous_protein_alphabet: 'p',
            unambiguous_rna_alphabet: '-',
            unambiguous_dna_alphabet: 'd',
        }
        if self.show_ends and self.alphabet in end_types:
            end_type = end_types[self.alphabet]
        self.end_type = end_type

        if self.annotate is None:
            # Default annotation: label every number_interval'th position.
            self.annotate = []
            for i in range(self.seqlen):
                index = i + self.first_index
                if index % self.number_interval == 0:
                    self.annotate.append("%d" % index)
                else:
                    self.annotate.append("")

        if len(self.annotate) != self.seqlen:
            raise ArgumentError(
                "Annotations must be same length as sequences.",
                'annotate')

    # End __init__
# End class LogoFormat


# ------ Logo Formatters ------
# Each formatter is a function f(LogoData, LogoFormat).
# that draws a representation of the logo.
# The main graphical formatter is eps_formatter. A mapping 'formatters'
# containing all available formatters is located after the formatter
# definitions.

# Each formatter returns binary data. The eps and data formats can be decoded
# to strings, e.g.
#     eps_as_string = eps_data.decode()

def pdf_formatter(data, format):
    """ Generate a logo in PDF format."""
    eps = eps_formatter(data, format).decode()
    gs = GhostscriptAPI()
    return gs.convert('pdf', eps, format.logo_width, format.logo_height)


def _bitmap_formatter(data, format, device):
    """Render the logo as EPS and convert it with Ghostscript.

    'device' is the output format name handed to GhostscriptAPI.convert;
    the bitmap resolution is taken from format.resolution.
    """
    eps = eps_formatter(data, format).decode()
    gs = GhostscriptAPI()
    return gs.convert(device, eps,
                      format.logo_width, format.logo_height, format.resolution)


def jpeg_formatter(data, format):
    """ Generate a logo in JPEG format."""
    return _bitmap_formatter(data, format, device="jpeg")


def svg_formatter(data, format):
    """ Generate a logo in Scalable Vector Graphics (SVG) format.
    Requires the program 'pdf2svg' be installed.

    The logo is first rendered as PDF, written to a temporary file, and
    converted by running pdf2svg on it. Both temporary files are removed
    before returning.

    Raises:
        EnvironmentError: if 'pdf2svg' cannot be found.
    """
    pdf = pdf_formatter(data, format)

    try:
        command = find_command('pdf2svg')
    except EnvironmentError:
        raise EnvironmentError(
            "Scalable Vector Graphics (SVG) format requires the program 'pdf2svg'. "
            "Cannot find 'pdf2svg' on search path.")

    import tempfile
    fd_pdf, fname_pdf = tempfile.mkstemp(suffix=".pdf")
    fd_svg, fname_svg = tempfile.mkstemp(suffix=".svg")
    try:
        # pdf2svg writes to the svg path itself, so the descriptor returned
        # by mkstemp is not needed; close it rather than leak it.
        os.close(fd_svg)

        # The PDF is binary data: write it through the mkstemp descriptor in
        # binary mode and close it before pdf2svg reads the file.
        with os.fdopen(fd_pdf, 'wb') as fpdf:
            fpdf.write(pdf)

        args = [command, fname_pdf, fname_svg]
        p = Popen(args)
        p.communicate()

        with open(fname_svg, 'rb') as fsvg:
            return fsvg.read()
    finally:
        os.remove(fname_svg)
        os.remove(fname_pdf)


def png_formatter(data, format):
    """ Generate a logo in PNG format."""
    return _bitmap_formatter(data, format, device="png")


def png_print_formatter(data, format):
    """ Generate a logo in PNG format with print quality (600 DPI) resolution.

    NOTE: this sets format.resolution to 600 on the object passed in.
    """
    format.resolution = 600
    return _bitmap_formatter(data, format, device="png")


def txt_formatter(logodata, format):
    """ Create a text representation of the logo data.
    """
    return str(logodata).encode()


def eps_formatter(logodata, format):
    """ Generate a logo in Encapsulated Postscript (EPS)

    Fills the 'template.eps' resource with values taken from 'format' and
    with the per-stack drawing commands derived from 'logodata', and returns
    the result encoded as bytes.
    """
    substitutions = {}
    from_format = [
        "creation_date", "logo_width", "logo_height",
        "lines_per_logo", "line_width", "line_height",
        "line_margin_right", "line_margin_left", "line_margin_bottom",
        "line_margin_top", "title_height", "xaxis_label_height",
        "creator_text", "logo_title", "logo_margin",
        "stroke_width", "tic_length",
        "stacks_per_line", "stack_margin",
        "yaxis_label", "yaxis_tic_interval", "yaxis_minor_tic_interval",
        "xaxis_label", "xaxis_tic_interval", "number_interval",
        "fineprint", "shrink_fraction", "errorbar_fraction",
        "errorbar_width_fraction",
        "errorbar_gray", "small_fontsize", "fontsize",
        "title_fontsize", "number_fontsize", "text_font",
        "logo_font", "title_font",
        "logo_label", "yaxis_scale", "end_type",
        "debug", "show_title", "show_xaxis",
        "show_xaxis_label", "show_yaxis", "show_yaxis_label",
        "show_boxes", "show_errorbars", "show_fineprint",
        "rotate_numbers", "show_ends", "stack_height",
        "stack_width",
    ]

    for s in from_format:
        substitutions[s] = getattr(format, s)

    substitutions["shrink"] = str(format.show_boxes).lower()

    # --------- COLORS --------------
    def format_color(color):
        return " ".join(("[", str(color.red), str(color.green),
                         str(color.blue), "]"))

    substitutions["default_color"] = format_color(format.default_color)

    colors = []
    for group in format.color_scheme.groups:
        cf = format_color(group.color)
        for s in group.symbols:
            colors.append(" (" + s + ") " + cf)
    substitutions["color_dict"] = "\n".join(colors)

    data = []

    # Unit conversion. 'None' for probability units
    conv_factor = std_units[format.unit_name]

    data.append("StartLine")

    seq_from = format.logo_start - format.first_index
    seq_to = format.logo_end - format.first_index + 1

    # seq_index : zero based index into sequence data
    # stack_index : zero based index of visible stacks
    for seq_index in range(seq_from, seq_to):
        stack_index = seq_index - seq_from

        # Start a new line of stacks every stacks_per_line stacks.
        if stack_index != 0 and (stack_index % format.stacks_per_line) == 0:
            data.append("")
            data.append("EndLine")
            data.append("StartLine")
            data.append("")

        data.append("(%s) StartStack" % format.annotate[seq_index])

        if conv_factor:
            stack_height = logodata.entropy[seq_index] * conv_factor
        else:
            stack_height = 1.0  # Probability

        # Sort by frequency. If equal frequency then reverse alphabetic
        # (So sort reverse alphabetic first, then frequency; the second
        # sort is stable and so keeps the first ordering among ties.)
        # TODO: doublecheck this actually works
        s = list(zip(logodata.counts[seq_index], logodata.alphabet))
        s.sort(key=lambda x: x[1])
        s.reverse()
        s.sort(key=lambda x: x[0])
        if not format.reverse_stacks:
            s.reverse()

        C = float(sum(logodata.counts[seq_index]))
        if C > 0.0:
            fraction_width = 1.0
            if format.scale_width:
                fraction_width = logodata.weight[seq_index]
            for count, symbol in s:
                data.append(" %f %f (%s) ShowSymbol" %
                            (fraction_width, count * stack_height / C, symbol))

        # Draw error bar on top of logo.
        if logodata.entropy_interval is not None and conv_factor and C > 0.0:
            low, high = logodata.entropy_interval[seq_index]
            center = logodata.entropy[seq_index]
            low *= conv_factor
            high *= conv_factor
            center *= conv_factor
            # Clip the bar at the top of the y axis.
            if high > format.yaxis_scale:
                high = format.yaxis_scale

            down = (center - low)
            up = (high - center)
            data.append(" %f %f DrawErrorbar" % (down, up))

        data.append("EndStack")
        data.append("")

    data.append("EndLine")
    substitutions["logo_data"] = "\n".join(data)

    # Create and output logo
    template = resource_string(__name__, 'template.eps', __file__).decode()
    logo = Template(template).substitute(substitutions)

    return logo.encode()


# Map between output format names and logo formatter functions
formatters = {
    'eps': eps_formatter,
    'pdf': pdf_formatter,
    'png': png_formatter,
    'png_print': png_print_formatter,
    'jpeg': jpeg_formatter,
    'svg': svg_formatter,
    'logodata': txt_formatter,
}

default_formatter = eps_formatter


def parse_prior(composition, alphabet, weight=None):
    """ Parse a description of the expected monomer distribution of a sequence.

    Valid compositions:

    - None or 'none' : No composition specified
    - 'auto' or 'automatic': Use the typical average distribution
                    for proteins and an equiprobable distribution for
                    everything else.
    - 'equiprobable' : All monomers have the same probability.
    - a percentage, e.g. '45%' or a fraction '0.45': The fraction of CG bases
                    for nucleotide alphabets
    - a species name, e.g. 'E. coli', 'H. sapiens' : Use the average CG
                    percentage for the species' genome.
    - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in 'composition' is ignored.

    If 'weight' is None (and an alphabet is given) it defaults to the square
    root of the alphabet size. The returned array is the distribution scaled
    by the weight, or None if no composition was specified.

    Raises:
        ValueError: if the weight is negative, or the composition is
            malformed or incompatible with the alphabet.
    """
    if composition is None:
        return None
    comp = composition.strip()
    comp_lower = comp.lower()

    if comp_lower == 'none':
        return None

    if weight is None and alphabet is not None:
        weight = sqrt(float(len(alphabet)))

    if weight < 0:
        raise ValueError("Weight cannot be negative.")

    # startswith/endswith are used below (rather than indexing) so that an
    # empty string is reported as a malformed composition, not an IndexError.
    if comp_lower == 'equiprobable':
        prior = weight * equiprobable_distribution(len(alphabet))
    elif comp_lower == 'auto' or comp_lower == 'automatic':
        if alphabet == unambiguous_protein_alphabet:
            prior = weight * asarray(aa_composition, float64)
        else:
            prior = weight * equiprobable_distribution(len(alphabet))
    elif comp in std_percentCG:
        prior = weight * base_distribution(std_percentCG[comp])
    elif comp.endswith('%'):
        prior = weight * base_distribution(float(comp[:-1]))
    elif isfloat(comp):
        prior = weight * base_distribution(float(comp) * 100.)
    elif comp.startswith('{') and comp.endswith('}'):
        explicit = comp[1:-1]
        explicit = explicit.replace(',', ' ').replace("'", ' ') \
            .replace('"', ' ').replace(':', ' ').split()

        if len(explicit) != len(alphabet) * 2:
            raise ValueError("Explicit prior does not match length of alphabet")

        # -1 marks letters that have not been assigned a value yet.
        prior = - ones(len(alphabet), float64)
        try:
            # Tokens alternate: letter, value, letter, value, ...
            for letter, value in zip(explicit[0::2], explicit[1::2]):
                index = alphabet.ord(letter)
                prior[index] = float(value)
        except ValueError:
            raise ValueError("Cannot parse explicit composition")

        if any(prior == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        prior /= sum(prior)
        prior *= weight
    else:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if len(prior) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    return prior


def base_distribution(percentCG):
    """Return the (A, C, G, T) probabilities for a given percentage of CG."""
    A = (1. - (percentCG / 100.)) / 2.
    C = (percentCG / 100.) / 2.
    G = (percentCG / 100.) / 2.
    T = (1. - (percentCG / 100)) / 2.
    return asarray((A, C, G, T), float64)


def equiprobable_distribution(length):
    """Return an array of 'length' equal probabilities summing to one."""
    return ones((length), float64) / length


def read_seq_data(fin,
                  input_parser=seq_io.read,
                  alphabet=None,
                  ignore_lower_case=False,
                  max_file_size=0):
    """ Read sequence data from the input stream and return a seqs object.

    The environment variable WEBLOGO_MAX_FILE_SIZE overrides the max_file_size
    argument. Used to limit the load on the WebLogo webserver.

    Arguments:
        fin               -- the input stream.
        input_parser      -- callable that parses the stream into sequences.
        alphabet          -- if given, used to build the sequences' Alphabet;
                             otherwise the alphabet is guessed with
                             Alphabet.which.
        ignore_lower_case -- if true, each sequence is replaced by its mask().
        max_file_size     -- maximum number of characters to accept, or 0 for
                             no limit.

    Raises:
        IOError: if the input exceeds the maximum allowed size.
        ValueError: if no sequences could be read.
    """
    max_file_size = int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))

    # If max_file_size is set, or if fin==stdin (which is non-seekable), we
    # read the data and replace fin with a StringIO object.
    if max_file_size > 0:
        data = fin.read(max_file_size)
        more_data = fin.read(2)
        # Truthiness test: correct for both text ('') and bytes (b'') streams.
        if more_data:
            raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size)
        fin = StringIO(data)
    elif fin == sys.stdin:
        fin = StringIO(fin.read())

    fin.seek(0)
    seqs = input_parser(fin)

    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant. Do not count lower case letters.
        for i, s in enumerate(seqs):
            seqs[i] = s.mask()

    # Add alphabet to seqs.
    if alphabet:
        seqs.alphabet = Alphabet(alphabet)
    else:
        seqs.alphabet = Alphabet.which(seqs)
    return seqs


class LogoData(object):
    """The data needed to generate a sequence logo.

    - alphabet
    - length
    - counts  -- An array of character counts
    - entropy -- The relative entropy of each column
    - entropy_interval -- entropy confidence interval
    - weight  -- per-column total count divided by the largest column total
    """

    def __init__(self, length=None, alphabet=None, counts=None,
                 entropy=None, entropy_interval=None, weight=None):
        """Creates a new LogoData object"""
        self.length = length
        self.alphabet = alphabet
        self.counts = counts
        self.entropy = entropy
        self.entropy_interval = entropy_interval
        self.weight = weight

    @classmethod
    def from_counts(cls, alphabet, counts, prior=None):
        """Build a LogoData object from counts.

        Arguments:
            alphabet -- the alphabet the columns of 'counts' refer to.
            counts   -- two dimensional array; its shape is unpacked as
                        (sequence length, number of symbols).
            prior    -- optional prior pseudocounts. Without a prior (or with
                        an all-zero prior) no entropy interval is computed.

        Raises:
            ValueError: if every column of counts sums to zero.
        """
        # Counts is a Motif object?
        # counts = counts.array

        seq_length, A = counts.shape

        if prior is not None:
            prior = array(prior, float64)

        if prior is None or sum(prior) == 0.0:
            R = log(A)
            ent = zeros(seq_length, float64)
            entropy_interval = None
            for i in range(0, seq_length):
                C = sum(counts[i])
                # FIXME: fixup corebio.moremath.entropy()?
                if C == 0:
                    ent[i] = 0.0
                else:
                    ent[i] = R - entropy(counts[i])
        else:
            ent = zeros(seq_length, float64)
            entropy_interval = zeros((seq_length, 2), float64)

            # The normalized prior is the same for every column.
            normalized_prior = prior / sum(prior)

            for i in range(0, seq_length):
                alpha = array(counts[i], float64)
                alpha += prior

                posterior = Dirichlet(alpha)
                ent[i] = posterior.mean_relative_entropy(normalized_prior)
                entropy_interval[i][0], entropy_interval[i][1] = \
                    posterior.interval_relative_entropy(normalized_prior, 0.95)

        weight = array(na.sum(counts, axis=1), float)
        max_weight = max(weight)
        if max_weight == 0.0:
            raise ValueError('No counts.')
        weight /= max_weight

        return cls(seq_length, alphabet, counts, ent, entropy_interval, weight)

    @classmethod
    def from_seqs(cls, seqs, prior=None):
        """Build a LogoData object from a SeqList, a list of sequences.

        Raises:
            ValueError: if there is no sequence data.
            ArgumentError: if the sequences are not all the same length.
        """
        # --- VALIDATE DATA ---
        # check that at least one sequence of length at least 1 long
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise ValueError("No sequence data found.")

        # Check sequence lengths
        seq_length = len(seqs[0])
        for i, s in enumerate(seqs):
            # TODO: Redundant? Should be checked in SeqList?
            if seq_length != len(s):
                raise ArgumentError(
                    "Sequence number %d differs in length from the previous sequences" % (i + 1),
                    'sequences')

        # FIXME: Check seqs.alphabet?

        counts = seqs.profile()
        return cls.from_counts(seqs.alphabet, counts, prior)

    def __str__(self):
        """Return the logo data as a tab separated text table."""
        out = StringIO()
        print('## LogoData', file=out)
        # Positions are written as i + 1 below, i.e. one-based.
        print('# First column is position number, counting from one', file=out)
        print('# Subsequent columns are raw symbol counts', file=out)
        print('# Entropy is mean entropy measured in nats.', file=out)
        print('# Low and High are the 95% confidence limits.', file=out)
        print('# Weight is the fraction of non-gap symbols in the column.', file=out)
        print('#\t', file=out)
        # Show column names
        print('#', end='\t', file=out)
        for a in self.alphabet:
            print(a, end=' \t', file=out)
        print('Entropy\tLow\tHigh\tWeight', file=out)

        # Write the data table
        for i in range(self.length):
            print(i + 1, end=' \t', file=out)
            for c in self.counts[i]:
                print(c, end=' \t', file=out)
            print("%6.4f" % self.entropy[i], end=' \t', file=out)
            if self.entropy_interval is not None:
                print("%6.4f" % self.entropy_interval[i][0], end=' \t', file=out)
                print("%6.4f" % self.entropy_interval[i][1], end=' \t', file=out)
            else:
                print('\t', '\t', end='', file=out)
            if self.weight is not None:
                print("%6.4f" % self.weight[i], end='', file=out)
            print('', file=out)
        print('# End LogoData', file=out)

        return out.getvalue()