/* * Seq.java * * Created on August 28, 2008, 2:44 PM * * To change this template, choose Tools | Template Manager * and open the template in the editor. */ package org.biolegato.sequence.data; import org.biolegato.sequence.canvas.ColourMask; import java.io.Serializable; /** *

This class is used to contain all sequence related functions.

* *

The main functionality of this class is to wrap the following * parameters:
*
name, type, topology, direction, strandedness, description, * groupID, sequence, original, mask, protect_align, protect_ambig, * and protect_unambig
*

* *

Please see a description of each of these fields before making any major * edits to this class. Additionally, please note that the toString method * should not be edited lightly (because it is relied upon by other classes). *

* *

WHY PACKAGE ACCESS FOR MANY VARIABLES?

* *

Because the access is restricted to only the package level, there should * not be any major issues with regard to code bugs.

* * *

In addition, because these variables do not have to be validated * (they can be any value, including null), and the variables are only * changed by classes and methods within the package there is no reason for * accessor methods because it would only cause a slow-down * (i.e. additional overhead). In addition, the variables are only given * package access to minimize the amount of code that can change them.

* *

Please note that any variable defined in the scope of a class without the * word 'public' or 'private' preceding it, is defined as package scope. * Package scope means that only classes and methods in the same package can * alter or read the value directly.

* *

WHY TRANSIENT?

* * Certain parts of the object, such as the protection status and groupID, * are not necessary to store when serializing this object. To indicate * this to Java, we add the word 'transient' before each variable * declaration. ** * @author Graham Alvare * @author Brian Fristensky */ public final class Seq implements Cloneable, Serializable { /** * This enum is used for typing/storing sequence types. * This method of storage ensures proper type casting. * Feel free to add more types as necessary. */ public static enum Type { /** * Sequence type DNA */ DNA { /** * Prints a nicely formatted string representation for "DNA type" * enum object (Type.DNA). ** * @return "DNA" */ @Override public String toString() { return "DNA"; } }, /** * Sequence type RNA */ RNA { /** * Prints a nicely formatted string representation for "RNA type" * enum object (Type.RNA). ** * @return "RNA" */ @Override public String toString() { return "RNA"; } }, /** * Sequence type protein */ PROTEIN { /** * Prints a nicely formatted string representation for the * "protein type" enum object (Type.PROTEIN). ** * @return "Protein" */ @Override public String toString() { return "Protein"; } }, /** * Sequence type colour mask */ MASK { /** * Prints a nicely formatted string representation for the * "colour mask type" enum object (Type.MASK). ** * @return "Colour mask" */ @Override public String toString() { return "Colour mask"; } }, /** * Sequence type is text * (represented as '"') */ TEXT { /** * Prints a nicely formatted string representation for the * "text type" enum object (Type.TEXT). ** * @return "Text" */ @Override public String toString() { return "Text"; } }; } /** * Used for typing/storing sequence direction. * This is used for all sequences which may have direction. * This enum may be ignored if you are dealing with non-sequence * test types (such as text), or any type of test that either * doesn't have or doesn't need to distinguish direction. */ public static enum Direction { /** * Direction of the sequence goes from 3' to 5' */ FROM3TO5 { /** * Prints a nicely formatted string representation for the * "From 3' to 5'" enum object (Direction.FROM3TO5). ** * @return "From 3' to 5'" */ @Override public String toString() { return "From 3' to 5'"; } }, /** * Direction of the sequence goes from 5' to 3' */ FROM5TO3 { /** * Prints a nicely formatted string representation for the * "From 5' to 3'" enum object (Direction.FROM5TO3). ** * @return "From 5' to 3'" */ @Override public String toString() { return "From 5' to 3'"; } }; } /** * Used for typing/storing sequence topology. * This is used for all sequences which may have topology. * This enum may be ignored if you are dealing with non-sequence * test types (such as text), or any type of test that either * doesn't have or doesn't need to distinguish topology. */ public static enum Topology { /** * Linear topology */ LINEAR { /** * Prints a nicely formatted string representation for the * "linear topology" enum object (Topology.LINEAR). ** * @return "Linear" */ @Override public String toString() { return "Linear"; } }, /** * Circular topology */ CIRCULAR { /** * Prints a nicely formatted string representation for the * "circular topology" enum object (Topology.CIRCULAR). ** * @return "Circular" */ @Override public String toString() { return "Circular"; } }; } /** * Used for typing/storing sequence strandedness. * This is used for all sequences which may have strandedness. * This enum may be ignored if you are dealing with non-sequence * test types (such as text), or any type of test that either * doesn't have or doesn't need to distinguish strandedness. */ public static enum Strandedness { /** * Single stranded sequence */ SINGLE { /** * Prints a nicely formatted string representation for the * "single stranded" enum object (Strandedness.SINGLE). ** * @return "Single stranded" */ @Override public String toString() { return "Single stranded"; } }, /** * Double stranded sequence * (represented as 'D') */ DOUBLE { /** * Prints a nicely formatted string representation for the * "double stranded" enum object (Strandedness.DOUBLE). ** * @return "Double stranded" */ @Override public String toString() { return "Double stranded"; } }, /** * Mixed stranded sequence * (represented as 'M') */ MIXED { /** * Prints a nicely formatted string representation for the * "mixed stranded" enum object (Strandedness.MIXED). ** * @return "Mixed strandedness" */ @Override public String toString() { return "Mixed strandedness"; } }; } ////////////////////////////////// // NOTES ON THE VARIABLES BELOW // ////////////////////////////////// // WHY PACKAGE ACCESS? // ------------------- // because the access is restricted to only the package level, there should // not be any major issues with regard to code bugs // // http://java.dzone.com/articles/getter-setter-use-or-not-use-0 // http://www.artima.com/intv/sway2.html // // In addition, because these variables do not have to be validated // (they can be any value, including null), and the variables are only // changed by classes and methods within the package there is no reason for // accessor methods because it would only cause a slow-down // (i.e. additional overhead). In addition, the variables are only given // package access to minimize the amount of code that can change them. // // Please note that any variable defined in the scope of a class without the // word 'public' or 'private' preceding it, is defined as package scope. // Package scope means that only classes and methods in the same package can // alter or read the value directly. // // WHY TRANSIENT? // -------------- // Certain parts of the object, such as the protection status and groupID, // are not necessary to store when serializing this object. To indicate // this to Java, we add the word 'transient' before each variable // declaration. ////////////////////////////////// /** * This variable stores the numerical groupID for the sequence. */ transient int groupID = 0; /** * This variable stores the sequence type for the data. */ Type type = Type.DNA; /** * The short name of the sequence. */ String name; /** * A longer description of the sequence. */ String description = null; /** * The topology of the sequence data (linear or circular), where applicable. */ Topology topology = Topology.LINEAR; /** * The direction of the sequence data, where applicable. This applies only * to the direction of nucleotide sequences (e.g. 5' to 3' or 3' to 5'). */ Direction direction = Direction.FROM5TO3; /** * The strandedness of the sequence data (single, double, or mixed), * where applicable. */ Strandedness strandedness = Strandedness.SINGLE; /** * The actual sequence data for the current sequence. */ StringBuffer sequence; /** * The colour mask to display the sequence with. */ transient ColourMask mask = null; /** *

The protection status for alignment characters in the sequence.

* *

According to GDE (version 2.2), for nucleotide sequences, any character, * which is not an ambiguous or unambiguous character, is an alignment * character.

* *

Put simply, any character in a nucleotide sequence, other than: * B, D, H, I, K, M, N, R, S, V, W, Y, A, C, G, T, or U , is an * alignment character.

* *

For protein sequences (ref. GDE 2.2) only the following characters are * considered alignment characters: *

' ', '\n', '\t', '\r' '-'
(i.e. whitespace or dash characters) *

* *

Text sequences do not handle protection characters.

* *

All protections character processing is handled by the method: * Dataset.isProtectionsOn

* *

By default, this protection setting is disabled (false).

** * @see org.biolegato.sequence.data.Dataset#isProtectionsOn(org.biolegato.sequence.data.Seq.Type, boolean, boolean, boolean, char[], int, int) */ transient boolean protect_align = false; /** *

The protection status for ambiguous characters in the sequence.

* *

The ambiguous characters for nucleotide sequences (RNA or DNA), * according to GDE (version 2.2) are as follows: *

B, D, H, I, K, M, N, R, S, V, W, Y

* *

The ambiguous characters for protein sequences (ref. GDE 2.2) are: *

B, J, X, Z, *

* *

Text sequences do not handle protection characters.

* *

All protections character processing is handled by the method: * Dataset.isProtectionsOn

* *

By default, this protection setting is enabled (true), unless the * sequence provided to the constructor is empty (in which case the * protection setting will default to false, so the user can create/type a * new sequence into BioLegato).

** * @see org.biolegato.sequence.data.Dataset#isProtectionsOn(org.biolegato.sequence.data.Seq.Type, boolean, boolean, boolean, char[], int, int) */ transient boolean protect_ambig = true; /** *

The protection status for unambiguous characters in the sequence.

* *

The unambiguous characters in nucleotide sequences (RNA or DNA), * according to GDE (version 2.2) are as follows: *

A, C, G, T, U

* *

For protein sequences (ref. GDE 2.2), any character, which is not an * ambiguous or alignment character, is considered an unambiguous character. *

* *

Put simply, any character in a protein sequence, other than: *, * B, X, J, Z, whitespace or dash characters (' ', '\n', '\t', * '\r' '-'), is considered to be an unambiguous character * (regardless of whether it is actually a valid protein character -- e.g. * '+' is considered an unambiguous amino acid). Why? To * accommodate future and modified amino acids. For example, Selenocysteine * (AA code U) is not a standard amino acid, but is appropriately handled by * this algorithm. *

* *

Technically, one could use just every letter other than B, X, Y and Z * to handle the unambiguous characters (thereby skipping any possible non- * alphabetical expansions). This differs from the current approach, which * includes non-alphabet characters. There is some merit to this, and so if * one decides this approach is more valuable in the future, one could * simply change the code in Dataset. I have left the case statements for * each alphabetical character (above the "default" case) in the method * 'isProtectionsOn', in case such an algorithm is desired in the future. *

* *

Text sequences do not handle protection characters.

* *

All protections character processing is handled by the method: * Dataset.isProtectionsOn

* *

By default, this protection setting is enabled (true), unless the * sequence provided to the constructor is empty (in which case the * protection setting will default to false, so the user can create/type a * new sequence into BioLegato).

** * @see org.biolegato.sequence.data.Dataset#isProtectionsOn(org.biolegato.sequence.data.Seq.Type, boolean, boolean, boolean, char[], int, int) */ transient boolean protect_unambig = true; /** *

This variable stores the original header of the sequence if the * sequence was loaded from a GenBANK file. This is useful for preserving * header metadata, such as annotation features, CDS, etc. This variable is * left null for sequences which are not read from GenBANK files.

* *

In addition, if a sequence is modified, its GenBANK original header * (stored in this variable) is lost (set this variable to null) because the * header will no longer be applicable. For example, if part of a sequence * is deleted or changed, then the features in the sequence will no longer * be valid, and so we delete the original header from memory.

* *

In summary:
* GenBANK original headers are stored from GenBANK files read into * BioLegato. These headers are used for ensuring that CDS, annotation, and * other header data (when present) is not lost from unaltered GenBANK files * read into BioLegato.

*/ CharSequence original = null; /** * Used for serialization purposes. */ private static final long serialVersionUID = 7526472295622777024L; /////////////// //***********// //* METHODS *// //***********// /////////////// /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

In this case, the sequence object will be a nameless empty DNA * sequence.

*/ public Seq() { this(Type.DNA, "", new StringBuffer()); } /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

The only thing really noteworthy is that if the sequence provided is * empty, the character protection settings will all be set to false. This * functionality is handled by this specific constructor method (hence why * all other constructor methods, except the clone/copy Seq constructor * method, interface with this constructor).

* *

This constructor is called directly by BioLegato's GDE flat-file * parser, and also unifies all of the other Seq object constructors (except * the copy/clone Seq constructor).

** * @param type the type of data to store in the sequence object. * @param name the name of the sequence object. * @param sequence the text sequence to store in the sequence object. * @see org.biolegato.sequence.data.Seq#type * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#Seq() * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, java.lang.String) * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, org.biolegato.sequence.data.Seq.Direction, org.biolegato.sequence.data.Seq.Topology, org.biolegato.sequence.data.Seq.Strandedness) * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, org.biolegato.sequence.data.Seq.Direction, org.biolegato.sequence.data.Seq.Topology, org.biolegato.sequence.data.Seq.Strandedness, java.lang.CharSequence) * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, org.biolegato.sequence.data.Seq.Direction, org.biolegato.sequence.data.Seq.Topology, org.biolegato.sequence.data.Seq.Strandedness, int, java.lang.String) */ public Seq(Type type, String name, StringBuffer sequence) { // Copy the parameters specified in the constructor to the new object. this.name = name; this.type = type; this.sequence = sequence; // Set the default protections to false if the sequence is empty. if (this.sequence != null && sequence.length() == 0) { protect_align = false; protect_ambig = false; protect_unambig = false; } } /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

The only thing really noteworthy is that if the sequence provided is * empty, the character protection settings will all be set to false.

* *

This constructor is called by BioLegato's FastA file parser.

** * @param type the type of data to store in the sequence object. * @param name the name of the sequence object. * @param sequence the text sequence to store in the sequence object. * @param description the description of the sequence object. * @see org.biolegato.sequence.data.Seq#type * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#description * @see org.biolegato.sequence.data.FastAFile */ public Seq(Type type, String name, StringBuffer sequence, String description) { this(type, name, sequence); this.description = description; } /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

The only thing really noteworthy is that if the sequence provided is * empty, the character protection settings will all be set to false.

* *

This method unifies the Seq constructors called by the GenBANK and * GDE parsers.

** * @param type the type of data to store in the sequence object. * @param name the name of the sequence object. * @param sequence the text sequence to store in the sequence object. * @param direction the direction of the sequence (if nucleotide). * @param topology the topology of the sequence (if nucleotide). * @param strandedness the strandedness of the sequence (if nucleotide). * @see org.biolegato.sequence.data.Seq#type * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#direction * @see org.biolegato.sequence.data.Seq#topology * @see org.biolegato.sequence.data.Seq#strandedness * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, org.biolegato.sequence.data.Seq.Direction, org.biolegato.sequence.data.Seq.Topology, org.biolegato.sequence.data.Seq.Strandedness, java.lang.CharSequence) * @see org.biolegato.sequence.data.Seq#Seq(org.biolegato.sequence.data.Seq.Type, java.lang.String, java.lang.StringBuffer, org.biolegato.sequence.data.Seq.Direction, org.biolegato.sequence.data.Seq.Topology, org.biolegato.sequence.data.Seq.Strandedness, int, java.lang.String) */ public Seq(Type type, String name, StringBuffer sequence, Direction direction, Topology topology, Strandedness strandedness) { this(type, name, sequence); // Copy parameters specified in the constructor to the new object. this.topology = topology; this.direction = direction; this.strandedness = strandedness; } /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

The only thing really noteworthy is that if the sequence provided is * empty, the character protection settings will all be set to false.

* *

This constructor is called by BioLegato's GenBANK parser.

** * @param type the type of data to store in the sequence object. * @param name the name of the sequence object. * @param sequence the text sequence to store in the sequence object. * @param direction the direction of the sequence (if nucleotide). * @param topology the topology of the sequence (if nucleotide). * @param strandedness the strandedness of the sequence (if nucleotide). * @param original the original GenBANK header for the sequence data. * @see org.biolegato.sequence.data.Seq#type * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#direction * @see org.biolegato.sequence.data.Seq#topology * @see org.biolegato.sequence.data.Seq#strandedness * @see org.biolegato.sequence.data.Seq#original * @see org.biolegato.sequence.data.GenBankFile2008 */ public Seq(Type type, String name, StringBuffer sequence, Direction direction, Topology topology, Strandedness strandedness, CharSequence original) { this(type, name, sequence, direction, topology, strandedness); // Copy the GenBANK original header to the new object. this.original = original; } /** *

Constructs new instances of a sequence object. The names of the * constructor parameters passed correspond directly with the class fields * of the same names (i.e. the parameter 'name' corresponds directly to the * sequence object field 'name').

* *

The only thing really noteworthy is that if the sequence provided is * empty, the character protection settings will all be set to false.

* *

This constructor is called by BioLegato's GDE parser.

** * @param type the type of data to store in the sequence object. * @param name the name of the sequence object. * @param sequence the text sequence to store in the sequence object. * @param direction the direction of the sequence (if nucleotide). * @param topology the topology of the sequence (if nucleotide). * @param strandedness the strandedness of the sequence (if nucleotide). * @param groupID the group ID for the sequence object. * @param description the description of the sequence object. * @see org.biolegato.sequence.data.Seq#type * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#direction * @see org.biolegato.sequence.data.Seq#topology * @see org.biolegato.sequence.data.Seq#strandedness * @see org.biolegato.sequence.data.Seq#groupID * @see org.biolegato.sequence.data.Seq#description * @see org.biolegato.sequence.data.GDEFile */ public Seq(Type type, String name, StringBuffer sequence, Direction direction, Topology topology, Strandedness strandedness, int groupID, String description) { this(type, name, sequence, direction, topology, strandedness); // Copy parameters specified in the constructor to the new object. this.groupID = groupID; this.description = description; } /** * Constructs new instance of a sequence object by copying data from * another, pre-existing sequence object. ** * @param data the sequence to copy data from. * @see org.biolegato.sequence.data.Seq#clone() */ public Seq(Seq data) { // Copy data from the old object to the new object. this.name = data.name; this.type = data.type; this.topology = data.topology; this.direction = data.direction; this.strandedness = data.strandedness; this.protect_align = data.protect_align; this.protect_ambig = data.protect_ambig; this.protect_unambig = data.protect_unambig; this.sequence = data.sequence; this.original = data.original; } ////////////////////////// //**********************// //* SEQUENCE FUNCTIONS *// //**********************// ////////////////////////// /** * Return the group ID number of the sequence object. ** * @return the group ID number of the sequence object. * @see org.biolegato.sequence.data.Seq#groupID */ final int getGroupID() { return groupID; } /** * Return the name of the sequence object. ** * @return the name of the sequence object. * @see org.biolegato.sequence.data.Seq#name */ final String getName() { return name; } /** * Return the long description of the sequence object. ** * @return the long description of the sequence object. * @see org.biolegato.sequence.data.Seq#description */ final String getDescription() { return description; } /** * Return the type of data stored in the sequence object (DNA, RNA, etc.) ** * @return the type of data stored in the sequence object. * @see org.biolegato.sequence.data.Seq.Type * @see org.biolegato.sequence.data.Seq#type */ final Type getType() { return type; } /** * Return the direction of nucleotide data stored in the sequence object. * This is only applicable if the data stored in the sequence object is * nucleotide data (DNA or RNA). ** * @return the direction of nucleotide data stored in the sequence object. * @see org.biolegato.sequence.data.Seq.Direction * @see org.biolegato.sequence.data.Seq#direction */ final Direction getDirection() { return direction; } /** * Return the topology of nucleotide data stored in the sequence object. * This is only applicable if the data stored in the sequence object is * nucleotide data (DNA or RNA). ** * @return the topology of nucleotide data stored in the sequence object. * @see org.biolegato.sequence.data.Seq.Topology * @see org.biolegato.sequence.data.Seq#topology */ final Topology getTopology() { return topology; } /** * Return the strandedness of nucleotide data stored in the sequence object. * This is only applicable if the data stored in the sequence object is * nucleotide data (DNA or RNA). ** * @return the strandedness of nucleotide data stored in the object. * @see org.biolegato.sequence.data.Seq.Strandedness * @see org.biolegato.sequence.data.Seq#strandedness */ final Strandedness getStrandedness() { return strandedness; } /** * Return the original GenBANK header of the data stored in the sequence * object. This is only applicable if the data stored in the sequence * object was read from a GenBANK file, and has not been altered by the * user. The reasons for storing and using this information are outlined * in the field 'strandedness' in this same class. If no information is * present, a value of null is returned. ** * @return the original GenBANK header of the data stored in the sequence * object (null if either not present or applicable). * @see org.biolegato.sequence.data.Seq#original */ final CharSequence getOriginal() { return original; } /** * Return the raw nucleotide, amino acid, or text sequence data stored by * this sequence object. ** * @return the raw nucleotide, amino acid, or text sequence data * stored by this object. * @see org.biolegato.sequence.data.Seq#sequence */ final StringBuffer getSequence() { return sequence; } /** * Return the colour mask object to use for colouring the display of the * sequence data (see getSequence) stored in this sequence object. ** * @return the strandedness of nucleotide data stored in the object. * @see org.biolegato.sequence.data.Seq#mask * @see org.biolegato.sequence.data.Seq#sequence * @see org.biolegato.sequence.data.Seq#getSequence() */ final ColourMask getMask() { return mask; } /** *

Creates a string representation of the Seq and its fields * This representation is limited to '[GROUP#|_] NAME' because the toString * method will be called when the sequence is displayed in any JList.

* *

A couple of examples are as follows:

*
     *      1 HEXOKINASE
     *      1 PYRUVATE KINASE
     *      1 ENOLASE
     *      2 ACONITASE
     *      _ OTC
* *

* Where the glycolysis enzymes (HEXOKINASE, PYRUVATE KINASE, and ENOLASE) * are in group 1, the citric acid enzyme (ACONITASE) is in group 2, and * the urea cycle enzyme (OTC -- ornithine transcarbamylase) is not grouped. *

* *

Reliability and performance are the reasons why the string format * limitation of '[GROUP#|_] NAME' was imposed:

*
    *
  1. Reliability: the code will provide a simpler implementation for * displaying sequences in a list. Instead of writing a custom class * to display a list of sequences or using other possible complicated * implementations (e.g. writing multiple external methods to "wrap" * the sequence), the toString method and Java's intrinsic JList * provide a simple streamlined way to display a list of sequences. * Because the code is simpler, it should be more reliable.
  2. *
  3. Performance: because the code is simpler and can be accessed * directly by the Java API (no "wrapping" necessary), the amount of * Java code required to display the sequence list is minimal. * Additionally, JList's implementation is likely (but not necessarily) * optimized C/C++ code, which should (theoretically) be much faster * than any practical custom JComponent display list code.
  4. *
  5. Readability: smaller code is almost always more readable (as long as * it is documented properly).
  6. *
** * @return a string containing the name and group number of the sequence. * The format is as follows: '[GROUP#|_] NAME'. * @see org.biolegato.sequence.data.Seq#groupID * @see org.biolegato.sequence.data.Seq#name * @see org.biolegato.sequence.data.Dataset#getElementAt(int) */ @Override public String toString() { StringBuilder string = new StringBuilder(); // Append the group ID if not 0 (otherwise append '_ ') to the start of // the output string builder (beginning the output string). if (groupID > 0) { string.append(groupID).append(" "); } else { string.append("_ "); } // Append the name to the output string after the group ID (or '_ '). string.append(name); return string.toString(); } /** * Clones the current sequence object. ** * @return a cloned copy of this sequence object. */ @Override public Object clone() { return new Seq(this); } /** *

This function detects the type of a sequence (RNA, DNA, or protein). *

* *

Please note that this function's implementation is very simple and may * not be accurate in all cases. This function reads sequence data and * looks for the characters F, E, J, L, O, Q, X, Z, or U. If the characters * F, E, J, L, O, Q, X, or Z are encountered, the sequence is determined to * be an amino acid/protein sequence; however, if the character U is found, * the sequence is determined to be RNA. If both the character U and any * character from the protein set (F, E, J, L, O, Q, X, or Z) are found, * then the first of those characters found will be used to determine the * sequence type. Note that if none of those characters (F, E, J, L, O, Q, * X, Z, or U) are detected, then the sequence type will default to DNA.

** * @param data the sequence object to detect the type for. * @return the sequence type detected. */ public static Type detectType(StringBuffer data) { Seq.Type result = Seq.Type.DNA; // The current character in the sequence to examine. char test; // Create a new array to contain the sequence data to examine. char[] array = new char[data.length()]; // Extract the the sequence data into an array. data.getChars(0, data.length(), array, 0); // Iterate through the sequence array until we either reach the end of // the array, or until the sequence type is determined not to be DNA. for (int count = 0; count < array.length && result == Seq.Type.DNA; count++) { // Convert the test character to upper-case (this way we avoid // testing both upper and lower case letters for matches). test = Character.toUpperCase(array[count]); if (test == 'U') { // If the sequence contains the character U, it is likely // (but not necessarily) an RNA nucleotide sequence. result = Seq.Type.RNA; } else if (test == 'F' || test == 'E' || test == 'J' || test == 'L' || test == 'O' || test == 'Q' || test == 'X' || test == 'Z') { // If the sequence contains F, E, J, L, O, Q, X or Z, it is // likely (but not necessarily) an amino acid sequence. result = Seq.Type.PROTEIN; } } // Return the results of the type detection. return result; } }