/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package gff; import java.util.*; import java.io.*; import org.biojava.bio.*; import org.biojava.bio.symbol.*; import org.biojava.bio.seq.*; import org.biojava.bio.seq.io.*; import org.biojava.bio.seq.impl.*; import org.biojava.bio.program.gff.*; /** * Converts an EMBL file into a fasta file for the sequence and a GFF file for * the features. *
* This program demonstrates how to read in an EMBL file, how to write a fasta * file and how to write GFF. *
* Use:
* <pre>java gff.EmblToGffFasta emblFile fastaOut gffOut</pre>
*
* @author Matthew Pocock
*/
public class EmblToGffFasta {
  /**
   * Reads every sequence from an EMBL file, writes each sequence to a fasta
   * file and writes the features of each sequence to a GFF file.
   * <p>
   * All three files are closed before the program returns, whether or not the
   * conversion succeeded. Any failure during the conversion is reported with a
   * stack trace on standard error and the JVM exits with status 1.
   *
   * @param args exactly three entries: the EMBL input file, the fasta output
   *             file and the GFF output file
   * @throws Exception if the number of arguments is not three
   */
  public static void main(String [] args) throws Exception {
    if(args.length != 3) {
      throw new Exception("Use: EmblToGffFasta emblFile fastaOut gffOut");
    }

    // Declared outside the try so the finally block can release them.
    BufferedReader eReader = null;
    PrintStream fastaOut = null;
    PrintWriter gffOut = null;
    boolean failed = false;

    try {
      // make the files early to get exceptions about stupid file names
      File emblFile = new File(args[0]);
      File fastaFile = new File(args[1]);
      File gffFile = new File(args[2]);

      // reading embl stuff
      SequenceFormat eFormat = new EmblLikeFormat();
      eReader = new BufferedReader(
        new InputStreamReader(new FileInputStream(emblFile)));
      SequenceBuilderFactory sFact = new EmblProcessor.Factory(SimpleSequenceBuilder.FACTORY);
      Alphabet alpha = DNATools.getDNA();
      SymbolTokenization rParser = alpha.getTokenization("token");

      // fasta stuff: a single PrintStream is shared by all sequences instead
      // of wrapping the file stream again on every iteration
      SequenceFormat fFormat = new FastaFormat();
      fastaOut = new PrintStream(
        new BufferedOutputStream(new FileOutputStream(fastaFile)));

      // gff stuff: keep a handle on the PrintWriter so it can be closed
      gffOut = new PrintWriter(
        new OutputStreamWriter(new FileOutputStream(gffFile)));
      GFFWriter writer = new GFFWriter(gffOut);
      SequencesAsGFF seqsAsGFF = new SequencesAsGFF();

      // Loop over each sequence in the embl file.
      // Write the sequence to a fasta file
      // Write the features to a .gff file
      SequenceIterator seqI =
        new StreamReader(eReader, eFormat, rParser, sFact);
      while(seqI.hasNext()) {
        Sequence seq = seqI.nextSequence();

        // A sequence without a usable DE annotation keeps whatever
        // description line it already has rather than aborting the run.
        // NOTE(review): this relies on FastaFormat coping with an unset
        // description-line property - confirm against the FastaFormat docs.
        if(seq.getAnnotation().containsProperty("DE")) {
          String de = joinDescription(seq.getAnnotation().getProperty("DE"));
          if(de != null) {
            seq.getAnnotation().setProperty(FastaFormat.PROPERTY_DESCRIPTIONLINE, de);
          }
        }

        fFormat.writeSequence(seq, fastaOut);
        seqsAsGFF.processSequence(seq, writer);
      }

      // PrintStream and PrintWriter never throw IOException; checkError()
      // flushes and reports whether any write failed along the way.
      if(fastaOut.checkError()) {
        throw new IOException("Error while writing fasta file " + fastaFile);
      }
      if(gffOut.checkError()) {
        throw new IOException("Error while writing GFF file " + gffFile);
      }
    } catch (Throwable t) {
      t.printStackTrace();
      // Do not call System.exit here: it would skip the finally block.
      failed = true;
    } finally {
      if(gffOut != null) {
        gffOut.close();
      }
      if(fastaOut != null) {
        fastaOut.close();
      }
      if(eReader != null) {
        try {
          eReader.close();
        } catch (IOException ignored) {
          // Input was only read; a failure on close cannot lose any data.
        }
      }
    }

    if(failed) {
      System.exit(1);
    }
  }

  /**
   * Turns the value of a DE annotation into a single description string.
   *
   * @param desc the annotation value; may be a String, a Collection whose
   *             elements are joined with single spaces, or any other object
   * @return the description, or null if desc is null or an empty Collection
   */
  private static String joinDescription(Object desc) {
    if(desc == null) {
      return null;
    }
    if(desc instanceof String) {
      return (String) desc;
    }
    if(desc instanceof Collection) {
      StringBuffer joined = new StringBuffer();
      boolean first = true;
      for(Iterator i = ((Collection) desc).iterator(); i.hasNext(); ) {
        if(!first) {
          joined.append(' ');
        }
        joined.append(i.next().toString());
        first = false;
      }
      // An empty collection yields no description at all.
      return first ? null : joined.toString();
    }
    // Any other value type: fall back to its string form.
    return desc.toString();
  }
}