/* * Dataset.java * * Created on September 30, 2008, 11:08 AM * * To change this template, choose Tools | Template Manager * and open the template in the editor. */ package org.biolegato.sequence.data; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import javax.swing.AbstractListModel; import org.biolegato.sequence.canvas.ColourMask; import org.biolegato.sequence.canvas.SequenceCanvas; /** *
The internal document format for BioLegato.
* *This document is structured as a linked list of sequences. Each character * has an offset based on its position within the list and its position within * its containing sequence. Sequences start at 0 (the first character in the first * sequence in the list), and end with the last character in the last sequence * within the list.
** * @author Graham Alvare * @author Brian Fristensky */ public class Dataset extends AbstractListModel { /** * The parent canvas for the Dataset sequence object container. */ private SequenceCanvas canvas; /** * This linked list used to store all of the height in the document. ** Each y is stored as a linked list of sequence wrappers. * Each sequence wrapper is characterized by a sequence and an index * offset. The index offset is used to determine the offset of the * sequence within the document. *
*/ private final ListInserts text into the Dataset sequence object container. This * function also calls calls all of the listeners which are listening to the * Dataset object. Mostly (ane even 'completely', at the time of this * writing), listeners are used for displaying the data in the sequence list * and the sequence text area. Note that the add method inserts a sequence * object, as a whole, into the Dataset, while the insert method inserts the * text of a sequence object into the middle of the already present sequence * object in the Dataset.
* *This function also obeys all sequence permissions/protections (as long * as the protect boolean is set to true). In the case of text insertion, * text may NOT be inserted if it contains a protected character. For * example, if the sequence is a protein sequence and the unambiguous * character protections are set to true, then the text "BZAZ" may not be * inserted because it contains one unambiguous character (A, for alanine); * however, the text "BZZZ" may be inserted, as long as the ambiguous * character protections are set to false. See Dataset.isProtectionsOn for * more information about character protections.
** * @param x the X-coordinate to insert the sequence text. * @param y the Y-coordinate to insert the sequence text. * @param text the sequence text to insert. * @param offset the offset within the array to insert. * @param length the length in characters of data from the array to insert. * @param protect whether to test the protections of the sequence, already * in the Dataset, before inserting the text. * @return true if the insertion was successful, otherwise false. */ public boolean insert(int x, int y, char[] text, int offset, int length, boolean protect) { Seq current; boolean result = false; // Ensure that the parameters passed to this function are appropriate. // (Particularly the Y-coordinate/line offset, and the sequence text.) if (text != null && y >= 0 && y <= getSize()) { current = getLine(y); // Check the sequence's protection settings if if (!protect || !Dataset.isProtectionsOn(current.type, current.protect_align, current.protect_ambig, current.protect_unambig, text, 0, text.length)) { // BRANCH // if we are at the end of the sequence, we are appending // characters to the sequence; otherwise, we are inserting // characters. This branch mainly has to do with performance // and avoiding possible exceptions. if (current.sequence.length() > x) { current.sequence.insert(x, text, offset, length); } else { current.sequence.append(text, offset, length); } // Deletes GenBank original copies of the sequence // this is because we are modifying the sequence, so we do not // want the original sequence to be exported current.original = null; // Call the canvas and notify it that the sequence length has // changed. This is essential for repainting the text area. SequenceCanvas.textLengthChanged(x, y, text.length); } } return result; } /** *Deletes a portion of a sequence from the Dataset sequence object * container. This function also calls calls all of the listeners which are * listening to the Dataset object. 
Mostly (and even 'completely', at the * time of this writing), listeners are used for displaying the data in the * sequence list and the sequence text area. Note that this method only * deletes text from within a sequence object that is already present in the * Dataset; it does not remove the sequence object itself from the Dataset.
* *This function also obeys all sequence permissions/protections (as long * as the protect boolean is set to true). In the case of text deletion, * text may NOT be deleted if it contains a protected character. For * example, if the sequence is a protein sequence and the unambiguous * character protections are set to true, then the text "BZAZ" may not be * deleted because it contains one unambiguous character (A, for alanine); * however, the text "BZZZ" may be deleted, as long as the ambiguous * character protections are set to false.
** * @param x the X-coordinate to begin the the sequence text deletion. * @param y the Y-coordinate to begin the the sequence text deletion. * @param length is the length (number of characters) of text to delete. * @param protect whether to test the protections of the sequence, already * in the Dataset, before deleting the text. * @return true if the deletion was successful, otherwise false. */ public boolean delete(int x, int y, int length, boolean protect) { Seq curr; char[] text = null; final int xend = x + length; boolean pseq = false; boolean result = false; // ensure that the deletion co-ordinates are appropriate. // this is to prevent exceptions such as retrieving sequences // which are out of bounds, and performing deletions on negative // co-ordinates. // // Please note that this if-clause tests for length > 0. There is an // else if clause below that handles length == 0, which counts as a // deletion (i.e. abstaining from action means deleting zero characters) if (y >= 0 && y <= getSize() && x >= 0 && length > 0) { curr = getLine(y); // ensure that the deletion does not exceed the sequence length if (xend <= curr.sequence.length()) { // do any character protection tests (if necessary) // to ensure that we are not deleting any protected characters if (protect) { // Create an array of the characters to be deleted. This // array will be used to test characters for protection // status. text = new char[length]; curr.sequence.getChars(x, xend, text, 0); // test protection status pseq = isProtectionsOn(curr.type, curr.protect_align, curr.protect_ambig, curr.protect_unambig, text, 0, xend + 1); } // if the characters deleted from the sequence are // not protected, then delete them. if (!pseq) { curr.sequence.delete(x, x + length); // Deletes GenBank original copies of the sequence // this is because we are modifying the sequence, so we do // not want the original sequence to be exported. 
curr.original = null; // Call the canvas and notify it that the sequence length // has changed (+ a positive length indicates an insertion; // - a negative length indicates a deletion) hence we negate // the length. SequenceCanvas.textLengthChanged(x, y, 0 - length); result = true; } } } else if (length == 0) { // if the length is zero, then technically we have deleted // zero characters by not performing any deletion. Therefore, // the function will return true if length == 0. result = true; } return result; } /** * Adds sequences to the Dataset sequence object container. This function * also calls calls all of the listeners which are listening to the Dataset * object. Mostly (ane even 'completely', at the time of this writing), * listeners are used for displaying the data in the sequence list and the * sequence text area. Note that the add method inserts a sequence object, * as a whole, into the Dataset, while the insert method inserts the text * of a sequence object into the middle of the already present sequence * object in the Dataset. ** * @param y the line index number to insert the sequences object at. * @param seqs the collection of sequence objects to insert. * @return true if the insertion was successful, otherwise false. */ public boolean addSequences(int y, CollectionRetrieves the text contained within a sequence object in the Dataset. * The sequence object to extract text from is specified by its * Y-coordinate. The offset and number of characters to extract from the * sequence are specified by 'offset' and 'length'. The destination for the * text extracted is the character array 'array'. If the Y-coordinate or * offset values are invalid, no data is extracted. Invalid Y-coordinates * are values which exceed the total number of sequences in the Dataset, or * values less than zero. Invalid offsets are those which exceed the number * of characters on the sequence "line", or offset values less than zero. *
* *In contrast, if the length value exceeds the number of characters in * the sequence, the sequence's length is used instead. Thus, the length * parameter may be slightly invalid (it may exceed the sequence length, but * NOT be less than zero).
** * @param array the destination for the characters to be copied to. * @param number the sequence "line number"/Y-coordinate to extract data. * @param offset the offset within the sequence to begin copying data from. * @param length the number of characters to attempt to copy. * @return the number of characters actually copied. */ public int getSequence(char[] array, int number, int offset, int length) { // The position within the sequence to end the sequence character // extraction. On failure, or an extraction length of zero, setting the // 'endpos' to the value of 'offset' will ensure that the calculation // for the number of characters copied will work out to zero. int endpos = offset; // Obtain the sequence object, within the Dataset, to extract sequence // characters from. If the index ('number') is invalid, this parameter // will be null; hence, we will test for null later in the code. Seq current = getLine(number); // Ensure that the sequence was successfully extracted from the Dataset // object. Also ensure that the array is not null, the offset is not // negative and the number of characters to extract (length) is greater // than zero (if zero, we don't extract any characters anyways). if (array != null && current != null && offset >= 0 && length > 0) { // Calculate the position within the sequence to end the extraction. // This will be limited to the maximum possible position in the // sequence to prevent overflow. endpos = Math.min(current.sequence.length(), offset + length); // Extract the characters from the sequence. if (endpos - offset > 0) { current.sequence.getChars(offset, endpos, array, 0); } } // Calculate the number of characters return endpos - offset; } /** * Called when a field in a sequence is modified. This method is currently * called from within the Dataset class, and by the SequenceWindow class. * This method sends events to all of the listener objects listening to this * Dataset object. 
** * @param index the sequence "line number"/Y-coordinate modified. * @see org.biolegato.sequence.data.SequenceWindow */
void sequenceChanged(final int index) {
    /* Forward the change to the list-model listeners through the inherited
       AbstractListModel.fireContentsChanged; the same value is passed as
       both the start and the end index, so only the row at 'index' is
       reported as changed. */
    fireContentsChanged(this, index, index);
}
/////////////// //***********// //* GENERAL *// //***********// /////////////// /** * Creates a new sequence-group comprised of the sequences specified by * Y-coordinate/"line numbers" in the array parameter 'sequences'. Please * note that this method first "ungroups" all of sequences passed to it * before creating a new group. This ensures that no sequence belongs to * more than one group at a time. ** * @param sequences the array of Y-coordinates corresponding to sequences * to include in the new group. */ public void group(int[] sequences) { Seq seq; int maxGroupStart = maxGroup; // Ensure that there is an entry in group2seq to add the sequence to. while (group2seq[maxGroup] != null) { // Increase the new group number pointer. maxGroup++; maxGroup %= (group2seq.length - 1); // ensures wraparound // Should virtually never be run (except if someone is using a lot // of groups). This code increases the maximum number of groups. if (maxGroup == maxGroupStart) { SetChecks a string against all of a sequence's protection settings. A * string of text is said to violate the protection settings (for deletion * or insertion purposes) if the text contains any characters which are * protected. Whether a character is protected is determined by the Seq * object's protection status settings. These protection status settings * are split into three variables (for 3 classes of characters):
*If a character from any of the three classes is matched in the text * passed to this function, and the matched character's protection * status is set to true, the text is barred from insertion or deletion. In * contrast, any character matched in the text whose protection * status is set to false is ignored. Thus, text consisting entirely of characters * whose protection status is false may be inserted or deleted. A * couple of examples will help demonstrate this:
*Please note that ON/OFF are interchangeable in this document * with the terms TRUE/FALSE, where ON is TRUE, and OFF is FALSE. *
* *Please also note that only DNA, RNA and PROTEIN sequences have * protection settings, thus any non DNA, RNA, or PROTEIN sequence type * tested will always return false, regardless of the text. An example * of a non-RNA/DNA/PROTEIN sequence is Seq.Type.TEXT.
** * @param type the type of the sequence to test. * @param protect_align the protection status of alignment characters. * @param protect_ambig the protection status of ambiguous characters. * @param protect_uambig the protection status of unambiguous characters. * @param test the sequence text to test. * @param start the index within 'text' to start the test. * @param end the index within 'text' to end the test. * @return true if the text violates the protection settings of the * sequence, and hence the text should not be inserted or deleted. * @see org.biolegato.sequence.data.Seq#protect_align * @see org.biolegato.sequence.data.Seq#protect_ambig * @see org.biolegato.sequence.data.Seq#protect_unambig * @see org.biolegato.sequence.data.Seq.Type#DNA * @see org.biolegato.sequence.data.Seq.Type#RNA * @see org.biolegato.sequence.data.Seq.Type#PROTEIN * @see org.biolegato.sequence.data.Seq.Type#MASK * @see org.biolegato.sequence.data.Seq.Type#TEXT * @see org.biolegato.sequence.data.Dataset#insert(int, int, char[], int, int, boolean) * @see org.biolegato.sequence.data.Dataset#delete(int, int, int, boolean) */ public static boolean isProtectionsOn(Seq.Type type, boolean protect_align, boolean protect_ambig, boolean protect_uambig, char[] test, int start, int end) { // The result of this function (whether the text tested can be inserted // or deleted into a given sequence, based on sequence protections). // By default this value is false (because types not recognized by this // function, such as TEXT will count as having no protection status by // this function). boolean protect = false; // Ensure that the end point does not exceed the test array's length. end = Math.min(end, test.length - 1); // Ensure that at least one proection setting is on, before testing. // Otherwise there is no reason to test (because if all protection // settings are off, then the text is unprotected and any character may // be inserted or deleted). 
if (protect_ambig || protect_uambig || protect_align) { // Branch based on whether the sequence is a nucleotide sequence, or // an amino acid/protein sequence. If the sequence is neither an // amino acid or nucleotide sequence, then this method will return // false (i.e. the text to be inserted or deleted is unprotected). if (type == Seq.Type.DNA || type == Seq.Type.RNA) { for (int count = start; !protect && count <= end; count++) { // Iterate through each character in the text string until // we either reach the end of the string, or a protected // character is found. Character types are based on a // combination of the sources listed below and a thorough // testing of character protections in GDE. // // SOURCES: // http://home.cc.umanitoba.ca/~psgendb/formats.html switch (Character.toLowerCase(test[count])) { case 'b': // G or T or C case 'd': // G or T or A case 'h': // A or C or T case 'i': // RESERVED(?) -- Copied behaviour from GDE. case 'k': // G or T case 'm': // A or C case 'n': // Any case 'r': // Purine (A or G) case 's': // G or C case 'v': // G or C or A case 'w': // A or T case 'y': // Pyrimidine (C or T) protect = protect_ambig; break; case 'a': // Adenosine (A) case 'c': // Cytosine (C) case 'g': // Guanine (G) case 't': // Thymine (T) case 'u': // Uracil (U) protect = protect_uambig; break; default: protect = protect_align; break; } } } else if (type == Seq.Type.PROTEIN) { for (int count = start; !protect && count < end; count++) { // Iterate through each character in the text string until // we either reach the end of the string, or a protected // character is found. Character types are based on a // combination of the sources listed below and a thorough // testing of character protections in GDE. 
// // SOURCES: // http://home.cc.umanitoba.ca/~psgendb/formats.html // http://www.ddbj.nig.ac.jp/sub/ref2-e.html // http://www.bioinformatics.org/sms/iupac.html switch (Character.toLowerCase(test[count])) { // Various standard alignment and whitespace characters. case ' ': case '\n': case '\t': case '\r': case '-': protect = protect_align; break; case 'b': // Aspartic acid or Asparagine (Asx) case 'j': // Leucine or isoleucine (Leu or Ile) case 'x': // UNKNOWN (ANY) case 'z': // Glutamic acid or Glutamine (Glx) case '*': // STOP protect = protect_ambig; break; // The current implementation treats every character // (not just letter) -- excluding NOT B, J, X, Z, *, a // whitespace character (space, tab, new-line, carriage // return), or a dash -- as an unambiguous sequence // character. To change this implementation to the // such that only letters are considered (and everything // which is not a letter is considered an alignment // gap), move the 'default:' case to the end of the // protect_align multiple-case statement. case 'a': // Alanine (Ala) case 'c': // Cysteine (Cys) case 'd': // Aspartic Acid (Asp) case 'e': // Gluamic Acid (Glu) case 'f': // Phenylalanine (Phe) case 'g': // Glycine (Gly) case 'h': // Histidine (His) case 'i': // Isoleucine (Ile) case 'k': // Lysine (Lys) case 'l': // Leucine (Leu) case 'm': // Methionine (Met) case 'n': // Asparagine (Asn) case 'o': // **Pyrrolysine (Pyl) -- NON-STANDARD AA! case 'q': // Glutamine (Gln) case 'p': // Proline (Pro) case 'r': // Arginine (Arg) case 's': // Serine (Ser) case 't': // Threonine (Thr) case 'u': // Selenocysteine (Sec) case 'v': // Valine (Val) case 'w': // Tryptophan (Trp) case 'y': // Tyrosine (Tyr) default: protect = protect_uambig; break; } } } } // Return the status of the protections test. return protect; } }