/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package gff; import java.util.*; import java.io.*; import org.biojava.bio.*; import org.biojava.bio.symbol.*; import org.biojava.bio.seq.*; import org.biojava.bio.seq.io.*; import org.biojava.bio.seq.impl.*; import org.biojava.bio.program.gff.*; /** * Converts an EMBL file into a fasta file for the sequence and a GFF file for * the features. *

* This program demonstrates how to read in an EMBL file, how to write a fasta * file and how to write GFF. *

* Use:
* java EmblToGffFasta emblFile fastaOut gffOut * * @author Matthew Pocock */ public class EmblToGffFasta { public static void main(String [] args) throws Exception { if(args.length != 3) { throw new Exception("Use: EmblToGffFasta emblFile fastaOut gffOut"); } try { // make the files early to get exceptions about stupid file names File emblFile = new File(args[0]); File fastaFile = new File(args[1]); File gffFile = new File(args[2]); // reading embl stuff SequenceFormat eFormat = new EmblLikeFormat(); BufferedReader eReader = new BufferedReader( new InputStreamReader(new FileInputStream(emblFile))); SequenceBuilderFactory sFact = new EmblProcessor.Factory(SimpleSequenceBuilder.FACTORY); Alphabet alpha = DNATools.getDNA(); SymbolTokenization rParser = alpha.getTokenization("token"); // fasta stuff SequenceFormat fFormat = new FastaFormat(); OutputStream fastaOut = new FileOutputStream(fastaFile); // gff stuff GFFWriter writer = new GFFWriter( new PrintWriter(new OutputStreamWriter(new FileOutputStream(gffFile)))); SequencesAsGFF seqsAsGFF = new SequencesAsGFF(); // Loop over each sequence in the embl file. // Write the sequence to a fasta file // Write the features to a .gff file SequenceIterator seqI = new StreamReader(eReader, eFormat, rParser, sFact); while(seqI.hasNext()) { Sequence seq = seqI.nextSequence(); String de; Object desc = seq.getAnnotation().getProperty("DE"); if(desc instanceof String) { de = (String) desc; } else { de = null; for(Iterator i = ((Collection) desc).iterator(); i.hasNext(); ) { if(de == null) { de = i.next().toString(); } else { de = de + " " + i.next().toString(); } } } seq.getAnnotation().setProperty(FastaFormat.PROPERTY_DESCRIPTIONLINE, de); fFormat.writeSequence(seq, new PrintStream(fastaOut)); seqsAsGFF.processSequence(seq, writer); } } catch (Throwable t) { t.printStackTrace(); System.exit(1); } } }