/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at Sep 8, 2007
 */
package org.biojava.bio.structure.server;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;

import org.biojava.bio.structure.Atom;
import org.biojava.bio.structure.Chain;
import org.biojava.bio.structure.Compound;
import org.biojava.bio.structure.DBRef;
import org.biojava.bio.structure.Group;
import org.biojava.bio.structure.PDBHeader;
import org.biojava.bio.structure.Structure;
import org.biojava.bio.structure.StructureImpl;
import org.biojava.bio.structure.StructureTools;
import org.biojava.bio.structure.io.PDBFileReader;
import org.biojava.bio.structure.io.StructureIOFile;

/**
 * Builds the two tab-separated index files of a {@link FlatFileInstallation}:
 * one line per PDB entry (the "PDB info" file) and one line per chain
 * (the "chain info" file).
 */
public class PrepareIndexFile {

    /** Location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_PDB_LOCATION = "/Users/andreas/WORK/PDB/mmCIF/";

    /** Date format shared with {@link FlatFileInstallation}. */
    private static final SimpleDateFormat dateFormat = FlatFileInstallation.dateFormat;

    /**
     * Command line entry point. Indexes the installation located at
     * {@code args[0]} if an argument is given, otherwise the built-in
     * default location.
     *
     * @param args optional: the directory containing the PDB files
     */
    public static void main(String[] args) {

        try {
            String location = DEFAULT_PDB_LOCATION;
            if (args != null && args.length > 0) {
                location = args[0];
            }

            File pdbLocation = new File(location);

            FlatFileInstallation installation = new FlatFileInstallation(pdbLocation);

            //File indexFile = new File("/Users/andreas/WORK/PDB/pdbindex.idx");
            //File chainFile = new File("/Users/andreas/WORK/PDB/chainindex.idx");
            //installation.setPDBInfoFile(indexFile);
            //installation.setChainInfoFile(chainFile);

            PrepareIndexFile prep = new PrepareIndexFile();
            prep.prepareIndexFileForInstallation(installation);

        } catch (Exception e) {
            // top-level boundary of a command line tool: report and exit
            e.printStackTrace();
        }
    }

    public PrepareIndexFile() {

    }

    /**
     * Prepare the index files for this installation. All PDB files found in
     * the installation's file path are parsed and summarized into the
     * installation's PDB info file and chain info file.
     *
     * @param installation the installation to index
     * @throws FileNotFoundException if one of the index files can not be opened for writing
     * @throws IOException if a structure file can not be read
     */
    public void prepareIndexFileForInstallation(FlatFileInstallation installation)
            throws FileNotFoundException, IOException {

        File[] pdbfiles = getAllPDB(installation.getFilePath());

        createPDBInfoList(pdbfiles,
                installation.getPDBInfoFile(),
                installation.getChainInfoFile());
    }

    /**
     * Parses a set of PDB files and writes summary info into two
     * tab-separated files.
     * <p>
     * The PDB info file has the columns:
     * pdbId, nrCAAtoms, technique, resolution, depDate, modDate, title,
     * classification, time, path.
     * <p>
     * The chain info file has the columns:
     * pdbId, chainId, seqResLength, atomLength, aminoLength, hetLength,
     * nucleotideLength, molName, molId, dbrefs.
     * <p>
     * NOTE(review): earlier documentation of this method states that these
     * files are required before a DBSearch can be performed; that dependency
     * is not visible in this class.
     * <p>
     * Both output files are closed before this method returns, also when
     * parsing fails.
     *
     * @param pdbfiles the PDB files to parse
     * @param outputFile the PDB info file to write
     * @param chainInfoFile the chain info file to write
     * @throws FileNotFoundException if an output file can not be opened
     * @throws IOException if a structure file can not be read
     */
    public void createPDBInfoList(File[] pdbfiles, File outputFile, File chainInfoFile)
            throws FileNotFoundException, IOException {

        PrintWriter pdbWriter = null;
        PrintWriter chainWriter = null;

        try {
            pdbWriter = new PrintWriter(new PrintStream(new FileOutputStream(outputFile)));
            chainWriter = new PrintWriter(new PrintStream(new FileOutputStream(chainInfoFile)));

            PDBFileReader pdbreader = new PDBFileReader();

            logPDBInfoFile(pdbWriter, chainWriter, pdbreader, pdbfiles);
        } finally {
            // closing an already closed PrintWriter is harmless; this covers
            // the case where opening the second file failed
            if (chainWriter != null) {
                chainWriter.close();
            }
            if (pdbWriter != null) {
                pdbWriter.close();
            }
        }
    }

    /**
     * Writes the header lines and then one PDB info line and the chain info
     * lines for every file. Both writers are flushed and closed when this
     * method returns, whether it completes normally or throws.
     *
     * @param pdbWriter receives one line per structure
     * @param chainWriter receives one line per chain
     * @param pdbreader the reader used to parse each file
     * @param pdbfiles the files to parse
     * @throws IOException if a structure file can not be read
     */
    protected void logPDBInfoFile(PrintWriter pdbWriter,
            PrintWriter chainWriter,
            StructureIOFile pdbreader,
            File[] pdbfiles) throws IOException {

        long loopStart;
        long loopEnd;

        try {
            pdbWriter.println("//pdbId\tnrCAAtoms\ttechnique\tresolution\tdepDate\tmodDate\ttitle\tclassification\ttime\tpath");
            chainWriter.println("//pdbId\tchainId\tseqResLength\tatomLength\taminoLength\thetLength\tnucleotideLength\tmolName\tmolId\tdbrefs");

            int l = pdbfiles.length;

            loopStart = System.currentTimeMillis();

            for (int i = 0; i < l; i++) {

                File f = pdbfiles[i];

                long startTime = System.currentTimeMillis();
                System.out.println("# " + i + " / " + l + " " + f);

                Structure s = pdbreader.getStructure(f);

                long stopTime = System.currentTimeMillis();

                logPDBInfo(pdbWriter, s, startTime, stopTime, f);
                logChainInfo(chainWriter, s);
            }

            loopEnd = System.currentTimeMillis();
        } finally {
            // close() flushes; done in finally so that a parse failure does
            // not leak the file handles or lose the lines written so far
            pdbWriter.close();
            chainWriter.close();
        }

        long time = ((loopEnd - loopStart) / (long) (1000 * 60));
        System.out.println("loop took: " + time + " minutes");
    }

    /**
     * Get the matching compound for this chain.
     *
     * @param compounds the compounds of the structure, may be null
     * @param c the chain to look up by name
     * @return the first compound whose chain id list contains the name of
     *         the chain, or null if there is none
     */
    private Compound getCompoundForChain(List<Compound> compounds, Chain c) {

        if (compounds == null) {
            return null;
        }

        for (Compound comp : compounds) {
            List<String> chainIds = comp.getChainId();
            if (chainIds == null) {
                continue;
            }
            if (chainIds.contains(c.getName())) {
                return comp;
            }
        }

        return null;
    }

    /**
     * Collects the database id codes of all DBRefs that belong to the chain.
     *
     * @param dbrefs the DBRefs of the structure, may be null
     * @param c the chain to collect the references for
     * @return the id codes joined by ":" (no leading or trailing separator);
     *         an empty string if no DBRef matches
     */
    private String getDBRefStringForChain(List<DBRef> dbrefs, Chain c) {

        StringBuilder buf = new StringBuilder();

        if (dbrefs == null) {
            return buf.toString();
        }

        boolean first = true;
        for (DBRef dbref : dbrefs) {
            // a DBRef without chain id can not belong to any chain
            if (dbref.getChainId() == null) {
                continue;
            }
            if (dbref.getChainId().equals(c.getName())) {
                // ":" is a separator between ids, not a terminator
                if (!first) {
                    buf.append(":");
                }
                buf.append(dbref.getDbIdCode());
                first = false;
            }
        }

        return buf.toString();
    }

    /**
     * Writes one tab-separated line per chain of the first model of the
     * structure.
     *
     * @param chainWriter the writer to print to
     * @param s the structure to describe
     */
    private void logChainInfo(PrintWriter chainWriter, Structure s) {

        String pdbCode = s.getPDBCode();

        List<Chain> chains = s.getChains(0);
        List<Compound> compounds = s.getCompounds();
        List<DBRef> dbrefs = s.getDBRefs();

        for (Chain c : chains) {

            List<Group> agr = c.getAtomGroups("amino");
            List<Group> hgr = c.getAtomGroups("hetatm");
            List<Group> ngr = c.getAtomGroups("nucleotide");

            Compound comp = getCompoundForChain(compounds, c);

            // "-" marks a chain for which no compound is known
            String molName = "-";
            String molId = "-";
            if (comp != null) {
                molName = comp.getMolName();
                molId = comp.getMolId();
            }

            String dbRefString = getDBRefStringForChain(dbrefs, c);

            String str = String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t",
                    pdbCode,
                    c.getName(),
                    c.getSeqResLength(),
                    c.getAtomLength(),
                    agr.size(),
                    hgr.size(),
                    ngr.size(),
                    molName,
                    molId,
                    dbRefString);

            chainWriter.println(str);
        }
    }

    /**
     * Writes one tab-separated line describing the structure. For NMR
     * structures only the first model is considered.
     * <p>
     * NOTE(review): the deposition and modification dates of the header are
     * formatted unconditionally; this presumably requires both to be
     * non-null - confirm against the parser.
     *
     * @param pdbWriter the writer to print to
     * @param s the structure to describe
     * @param startTime time in ms at which parsing of the file started
     * @param stopTime time in ms at which parsing of the file finished
     * @param path the file the structure was read from
     */
    private void logPDBInfo(PrintWriter pdbWriter,
            Structure s,
            long startTime,
            long stopTime,
            File path) {

        // only use the first model in NMR structures ...
        if (s.isNmr()) {
            List<Chain> chains = s.getModel(0);

            Structure newsubject = new StructureImpl();
            newsubject.setPDBCode(s.getPDBCode());
            newsubject.setHeader(s.getHeader());
            newsubject.setPDBHeader(s.getPDBHeader());
            newsubject.setDBRefs(s.getDBRefs());
            newsubject.addModel(chains);
            newsubject.setNmr(true);
            s = newsubject;
        }

        Atom[] ca = StructureTools.getAtomCAArray(s);

        PDBHeader header = s.getPDBHeader();

        // the parse time is reported in seconds
        String infoline = String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s",
                s.getPDBCode(),
                ca.length,
                header.getTechnique(),
                header.getResolution(),
                dateFormat.format(header.getDepDate()),
                dateFormat.format(header.getModDate()),
                header.getTitle(),
                header.getClassification(),
                ((stopTime - startTime) / (float) 1000),
                path.getAbsolutePath());

        pdbWriter.println(infoline);
    }

    /**
     * Get all PDB files from a directory. A file is considered a PDB file if
     * its name ends with ".pdb.gz" or ".ent.gz" (e.g. 'pdb1234.ent.gz').
     *
     * @param dir the directory that contains all PDB files
     * @return an array of PDB files
     * @throws IllegalArgumentException if dir is not a directory
     * @throws IllegalStateException if the content of dir can not be listed
     */
    public File[] getAllPDB(File dir) {

        if (!dir.isDirectory()) {
            throw new IllegalArgumentException("path is not a directory " + dir);
        }

        String[] all = dir.list();
        if (all == null) {
            // File.list() signals an I/O error by returning null
            throw new IllegalStateException("could not list content of directory " + dir);
        }

        List<File> pdbFiles = new ArrayList<File>();
        for (String file : all) {
            if (file.endsWith(".pdb.gz") || file.endsWith(".ent.gz")) {
                pdbFiles.add(new File(dir, file));
            }
        }

        return pdbFiles.toArray(new File[pdbFiles.size()]);
    }
}