/* LineGroup.java
 *
 * created: Mon Oct 12 1998
 *
 * This file is part of Artemis
 *
 * Copyright (C) 1998,1999,2000  Genome Research Limited
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package uk.ac.sanger.artemis.io;

import java.io.Writer;
import java.io.IOException;
import java.util.Hashtable;

import uk.ac.sanger.artemis.util.LinePushBackReader;

/**
 *  This class corresponds to a group of associated lines in an EMBL entry.
 *  An example of a group of associated lines is all the lines in an entry
 *  that start with FT.
 *
 *  @author Kim Rutherford
 */

abstract class LineGroup extends EMBLObject {
  /**
   *  The tag used for unidentified input.
   **/
  final static private int UNKNOWN = 0;

  /**
   *  The tag for the end of entry line: "//"
   **/
  final static int END_OF_ENTRY = 1;
  final static String END_OF_ENTRY_STRING = "//";

  /**
   *  The tag for the start of sequence line
   **/
  final static int SEQUENCE = 2;
  final static String EMBL_SEQUENCE_STRING = "SQ";

  /**
   *  The tag for an EMBL feature table line
   **/
  final static int EMBL_FEATURE = 3;
  final static String EMBL_FEATURE_STRING = "FT";

  /**
   *  The tag for an EMBL feature header line (FH ...)
   **/
  final static int EMBL_FEATURE_HEADER = 4;
  final static String EMBL_FEATURE_HEADER_STRING = "FH";

  /**
   *  The tag for a GENBANK feature table line
   **/
  final static int GENBANK_FEATURE = 5;

  /**
   *  This is the tag for an EMBL LineGroup that we don't have a handler for.
   *  It will be stored in an object of type EmblMisc.
   **/
  final static int EMBL_MISC = 6;

  /**
   *  This is the tag for a Genbank LineGroup that we don't have a handler
   *  for.  It will be stored in an object of type GenbankMisc.
   **/
  final static int GENBANK_MISC = 7;

  /**
   *  This is the tag for a GFF LineGroup (generally a comment line) that we
   *  don't have a handler for.  It will be stored in an object of type
   *  GFFMisc.
   **/
  final static int GFF_MISC = 8;

  /**
   *  This is the tag for a GFF format line.
   **/
  final static int GFF_FEATURE = 9;

  /**
   *  This is the tag for lines generated by MSPcrunch -d
   **/
  final static int MSPCRUNCH_FEATURE = 10;

  /**
   *  This is the tag for lines generated by blast
   **/
  final static int BLAST_FEATURE = 11;

  /**
   *  The tag for files that look like binary.
   **/
  final static int BINARY_CHARACTERS = 12;

  /**
   *  The tag for BSML XML files.
   **/
  final static int BSML_XML = 13;

  /**
   *  The tag for AGAVE XML files.
   **/
  final static int AGAVE_XML = 14;

  /**
   *  The tag for GAME XML files.
   **/
  final static int GAME_XML = 15;

  /**
   *  Number of spaces that getLineType () requires before the feature key of
   *  a GENBANK feature table line (the key is read from index 5).
   **/
  final static private int GENBANK_KEY_INDENT = 5;

  /**
   *  Number of spaces that getLineType () requires at the start of a GENBANK
   *  feature continuation line.  On a key line the character just before
   *  this index (index 20) must be a space.
   **/
  final static private int GENBANK_LOCATION_INDENT = 21;

  /**
   *  This hash table contains the GENBANK start of line keywords (LOCUS,
   *  DEFINITION, FEATURES etc.).  It is used as a set: each keyword is
   *  stored as both key and value.
   **/
  private static final Hashtable genbank_hash = new Hashtable ();

  static {
    final String [] keywords = {
      "LOCUS", "DEFINITION", "ACCESSION", "NID", "VERSION", "KEYWORDS",
      "SOURCE", "REFERENCE", "PROJECT", "COMMENT", "FEATURES", "SEGMENT",
      "PRIMARY", "DBLINK", "DBSOURCE", "CONTIG"
    };

    for (int i = 0 ; i < keywords.length ; ++i) {
      genbank_hash.put (keywords[i], keywords[i]);
    }
  }

  /**
   *  Try to read and return a new LineGroup object from a stream.  Blank
   *  lines (empty, or holding only spaces and tabs) are skipped.  The first
   *  non-blank line is classified with getLineType (), pushed back onto the
   *  reader and then handed to the reader class for that line type.
   *  @param reader The stream to read from.
   *  @param entry Passed on to StreamSequenceFactory.makeStreamSequence ()
   *    when the next group is a sequence; not used otherwise.
   *  @return A new LineGroup object, or null if the stream is at the end of
   *    file or an end of entry line ("//") was read.  The end of entry line
   *    is consumed so that the next read starts on the following entry.
   *  @exception IOException Thrown if exception occurs while reading.
   *  @exception ReadFormatException Thrown if the format of the input is in
   *    error: the line looks like binary data or has a type that this method
   *    has no reader for (which includes the XML types).
   *  @exception InvalidRelationException Thrown if this Feature cannot contain
   *    a particular Qualifier.
   **/
  protected static LineGroup readNextLineGroup (final LinePushBackReader reader,
                                                final Entry entry)
      throws IOException, InvalidRelationException {
    String line;

    // read until we get to a non-blank line
    do {
      line = reader.readLine ();

      if (line == null) {
        return null;  // end of file
      }
    } while (isBlankLine (line));

    final int line_type = LineGroup.getLineType (line);

    // the readers below expect to see the first line of the group themselves
    reader.pushBack (line);

    switch (line_type) {
      case SEQUENCE:
        return StreamSequenceFactory.makeStreamSequence (reader, entry);
      case EMBL_FEATURE:
        return EmblStreamFeature.readFromStream (reader);
      case EMBL_FEATURE_HEADER:
        return new FeatureHeader (reader);
      case GENBANK_FEATURE:
        return GenbankStreamFeature.readFromStream (reader);
      case GFF_FEATURE:
        return GFFStreamFeature.readFromStream (reader);
      case BLAST_FEATURE:
        return BlastStreamFeature.readFromStream (reader);
      case MSPCRUNCH_FEATURE:
        return MSPcrunchStreamFeature.readFromStream (reader);
      case END_OF_ENTRY:
        // in this case we do want to read the line (which will be //) so that
        // the next call starts on the next entry
        reader.readLine ();
        return null;
      case EMBL_MISC:
        return new EmblMisc (reader);
      case GENBANK_MISC:
        return new GenbankMisc (reader);
      case GFF_MISC:
        return new GFFMisc (reader);
      case BINARY_CHARACTERS:
        throw new ReadFormatException ("cannot recognise format of binary file");
      default:
        throw new ReadFormatException ("reader got confused - " +
                                       "unknown line type",
                                       reader.getLineNumber ());
    }
  }

  /**
   *  Return true if and only if the argument is empty or contains nothing
   *  but space and tab characters.
   **/
  private static boolean isBlankLine (final String line) {
    for (int i = 0 ; i < line.length () ; ++i) {
      final char letter = line.charAt (i);

      if (letter != ' ' && letter != '\t') {
        return false;
      }
    }

    return true;
  }

  /**
   *  Return the line type of the line contained in the argument String.  The
   *  tests are tried in this order and the first match wins: XML prolog,
   *  '#' line, EMBL line, GENBANK feature line, GENBANK keyword line, GFF,
   *  blast, MSPcrunch, binary.  Anything left over is taken to be sequence.
   *  @param line The line to classify (must not be null).
   *  @return One of the tag constants of this class; never UNKNOWN.
   */
  protected static int getLineType (final String line) {
    // NOTE(review): all three XML tags (BSML_XML, AGAVE_XML, GAME_XML) are
    // rejected in the same way by readNextLineGroup (), so the choice of tag
    // is not observable inside this class - confirm which tag other callers
    // of this method expect for a "<?xml" prolog.
    if (line.startsWith ("<?xml")) {
      return GAME_XML;
    }

    // NOTE(review): '#' lines are taken to be the GFF comment lines that
    // GFF_MISC is documented to hold - confirm.
    if (line.startsWith ("#")) {
      return GFF_MISC;
    }

    if (isEmblLine (line)) {
      if (line.startsWith (EMBL_FEATURE_STRING)) {
        return EMBL_FEATURE;
      }

      if (line.startsWith (END_OF_ENTRY_STRING)) {
        return END_OF_ENTRY;
      }

      if (line.startsWith (EMBL_SEQUENCE_STRING)) {
        return SEQUENCE;
      }

      if (line.startsWith (EMBL_FEATURE_HEADER_STRING)) {
        return EMBL_FEATURE_HEADER;
      }

      // this covers all the lines in the header
      return EMBL_MISC;
    }

    if (isGenbankFeatureLine (line)) {
      return GENBANK_FEATURE;
    }

    if (getGenbankType (line) != UNKNOWN) {
      return GENBANK_MISC;
    }

    if (isGFFLine (line)) {
      return GFF_FEATURE;
    }

    if (isBlastLine (line)) {
      return BLAST_FEATURE;
    }

    if (isMSPcrunchLine (line)) {
      return MSPCRUNCH_FEATURE;
    }

    if (looksLikeBinary (line)) {
      return BINARY_CHARACTERS;
    }

    // default is sequence
    return SEQUENCE;
  }

  /**
   *  Return true if and only if the argument has the shape of an EMBL line:
   *  a two character line code (each character a letter or '/') that is
   *  followed by nothing, by padding spaces only (lines of three or four
   *  characters), or by a three character separator which is either three
   *  spaces or " * " (the latter marks an EMBL pre-submission line).
   *  Lines of three or four characters are tested without looking at index
   *  4, so short lines such as "ACG" are rejected rather than causing a
   *  StringIndexOutOfBoundsException.
   **/
  private static boolean isEmblLine (final String line) {
    final int length = line.length ();

    if (length < 2 ||
        !isEmblCodeChar (line.charAt (0)) ||
        !isEmblCodeChar (line.charAt (1))) {
      return false;
    }

    if (length == 2) {
      return true;
    }

    // every remaining form needs a space directly after the line code
    if (line.charAt (2) != ' ') {
      return false;
    }

    if (length == 3) {
      return true;
    }

    if (length == 4) {
      return line.charAt (3) == ' ';
    }

    // five or more characters: separator is "   " or " * "
    final char middle = line.charAt (3);

    return (middle == ' ' || middle == '*') && line.charAt (4) == ' ';
  }

  /**
   *  Return true if and only if the argument can be part of a two character
   *  EMBL line code, ie. it is a letter or '/'.
   **/
  private static boolean isEmblCodeChar (final char c) {
    return c == '/' || Character.isLetter (c);
  }

  /**
   *  Return true if and only if the argument looks like a line of a GENBANK
   *  feature table.  The line must be longer than 21 characters and be
   *  either a key line (five spaces, then a letter, digit or '-' at index 5,
   *  and a space at index 20) or a continuation line (21 spaces followed by
   *  something that is not white space).
   **/
  private static boolean isGenbankFeatureLine (final String line) {
    if (line.length () <= GENBANK_LOCATION_INDENT) {
      return false;
    }

    final char key_start = line.charAt (GENBANK_KEY_INDENT);

    final boolean is_key_line =
      hasLeadingSpaces (line, GENBANK_KEY_INDENT) &&
      (Character.isLetter (key_start) ||
       Character.isDigit (key_start) ||
       key_start == '-') &&
      line.charAt (GENBANK_LOCATION_INDENT - 1) == ' ';

    if (is_key_line) {
      return true;
    }

    return hasLeadingSpaces (line, GENBANK_LOCATION_INDENT) &&
           line.trim ().length () > 0;
  }

  /**
   *  Return true if and only if the first count characters of the line exist
   *  and are all spaces.
   **/
  private static boolean hasLeadingSpaces (final String line,
                                           final int count) {
    if (line.length () < count) {
      return false;
    }

    for (int i = 0 ; i < count ; ++i) {
      if (line.charAt (i) != ' ') {
        return false;
      }
    }

    return true;
  }

  /**
   *  Return true if and only if at least 30% of the characters of the
   *  argument are binary characters.  "binary" means an ISO control
   *  character (except for tab, carriage return and new line) or a character
   *  with a value of 128 or more.  This is supposed to approximate the
   *  Perl -B test.  An empty line is never binary.
   **/
  private static boolean looksLikeBinary (final String line) {
    if (line.length () == 0) {
      return false;
    }

    int count = 0;

    for (int i = 0 ; i < line.length () ; ++i) {
      final char this_char = line.charAt (i);

      final boolean is_allowed_control =
        this_char == '\t' || this_char == '\r' || this_char == '\n';

      if (Character.isISOControl (this_char) && !is_allowed_control ||
          this_char >= 128) {
        ++count;
      }
    }

    // long arithmetic so that very long lines cannot overflow the percentage
    return (long) count * 100 / line.length () >= 30;
  }

  /**
   *  Return true if and only if the given String appears to be a feature
   *  generated by MSPcrunch -d, ie. after trimming it starts with a digit
   *  and contains a space.
   **/
  private static boolean isMSPcrunchLine (final String line) {
    final String trim_line = line.trim ();

    return trim_line.length () > 0 &&
           Character.isDigit (trim_line.charAt (0)) &&
           trim_line.indexOf (' ') != -1;
  }

  /**
   *  Return true if and only if the given String appears to be a feature
   *  generated by blast, ie. it contains exactly 11 tab characters.  This
   *  method is easily fooled.
   **/
  private static boolean isBlastLine (final String line) {
    return countChars (line, '\t') == 11;
  }

  /**
   *  Return true if and only if the given String appears to be a GFF feature,
   *  ie. after trimming it contains between 7 and 10 tab characters.
   *  This method is easily fooled.
   **/
  private static boolean isGFFLine (final String line) {
    final int tab_count = countChars (line.trim (), '\t');

    return tab_count >= 7 && tab_count <= 10;
  }

  /**
   *  Return the number of occurrences of the character c in the String s.
   **/
  private static int countChars (final String s, final char c) {
    int count = 0;

    for (int index = s.indexOf (c) ;
         index > -1 ;
         index = s.indexOf (c, index + 1)) {
      ++count;
    }

    return count;
  }

  /**
   *  Return GENBANK_MISC if the given String starts with a letter and its
   *  first word (everything up to the first space, or the whole line if
   *  there is no space) is one of the GENBANK start of line keywords.
   *  Return UNKNOWN otherwise.
   **/
  private static int getGenbankType (final String line) {
    if (line.length () == 0 || !Character.isLetter (line.charAt (0))) {
      return UNKNOWN;
    }

    final int first_space = line.indexOf (' ');

    final String first_word;

    if (first_space == -1) {
      first_word = line;
    } else {
      first_word = line.substring (0, first_space);
    }

    if (genbank_hash.get (first_word) != null) {
      return GENBANK_MISC;
    } else {
      return UNKNOWN;
    }
  }

  /**
   *  Returns a String containing the contents of the line with the initial
   *  type string (two letters) and white space (three spaces) removed.
   *  The first five characters are dropped without being inspected.
   *  @param line The line to shorten.
   *  @return The line from index 5 onwards, or "" if the line has five
   *    characters or fewer.
   */
  public static String getRestOfLine (String line) {
    final int END_OF_SPACES = 5;

    if (line.length () > END_OF_SPACES) {
      return line.substring (END_OF_SPACES);
    } else {
      return "";
    }
  }

  /**
   *  Write the end of entry marker - "//" - followed by a new line.
   *  @param writer The stream to write to.
   **/
  public static void writeEndOfEMBLEntry (Writer writer) throws IOException {
    writer.write (END_OF_ENTRY_STRING + "\n");
  }

  /**
   *  Write the line "##FASTA" followed by a new line.
   *  @param writer The stream to write to.
   **/
  public static void writeStartOfGFFEntry (Writer writer) throws IOException {
    writer.write ("##FASTA\n");
  }

  /**
   *  Write this object to the given stream.
   *  @param out_stream The stream to write to.
   **/
  public abstract void writeToStream (final Writer out_stream)
      throws IOException;
}