/** * @author rli */ package oligo3; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Fasta extends InputFile { // Pattern to skip blank data line static Pattern blank_p = Pattern.compile("^\\s*"); // Pattern to validate sequence data static Pattern valid_p = Pattern.compile("([^ACGT]+)"); // Pattern to extract sequence data line static Pattern seq_pat = Pattern.compile("^\\s*\\w+"); // Pattern to extract fasta header line static Pattern header = Pattern.compile("^\\s*>\\s*\\S+.*"); // Pattern to extract GenBank header line static Pattern gbk_p0 = Pattern.compile("^\\s*>\\s*\\w+\\|.*"); // Fasta header patterns from GenBank databases static Pattern gbk_p1 = Pattern.compile( "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+(.+)\\s+(\\w{3,}_\\w*\\d{4})\\s+\\[.+\\s(\\S+)\\].*"); static Pattern gbk_p2 = Pattern.compile( "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+(.+)\\s+()\\[.+\\s(\\S+)\\].*"); static Pattern gbk_p3 = Pattern.compile( "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+()()\\[.+\\s(\\S+)\\].*"); static Pattern gbk_p4 = Pattern.compile( "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)()()().*"); // Fasta header patterns from JGI and other databases static Pattern jgi_p1 = Pattern.compile( "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})\\s+(.+)\\s+(\\d+\\.\\.\\d+\\([+-]\\)).*"); static Pattern jgi_p2 = Pattern.compile( "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})\\s+()(\\d+\\.\\.\\d+\\([+-]\\)).*"); static Pattern jgi_p3 = Pattern.compile( "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})()().*"); static Pattern jgi_p4 = Pattern.compile("^\\s*>\\s*(\\S+)()()().*"); // Patterns to grab last text of a filename static Pattern text1 = Pattern.compile("[\\.\\s_-]([^\\.\\s\\)_-]+)\\)*\\.\\w{2,5}$"); static Pattern text2 = Pattern.compile("[\\.\\s_-]([^\\.\\s\\)_-]+)\\)*$"); static Pattern text3 = Pattern.compile("([^\\.\\s\\(\\)_-]{1,4})$"); // One-time declaration and instantiation of variables of repeated use static int file_count = 0; // Count input fasta files static int seq_count = 0; // Count input DNA sequences static int dup_count = 0; // Count sequence duplicates static int uni_count = 0; // Count distinct sequences static int e = 0; // ArrayList index of last element static int i = 0; // ArrayList index of any element static boolean key = false; static StringBuilder sb; static String file = ""; static String line = ""; static String text = ""; static String ss = ""; static Matcher m; /** * @param seq_count should evaluate to zero when called to read a file. */ public static void readFasta(int seq_count) throws IOException { // Grab last text of the name of an input fasta file file = f.getName(); // Get fasta file name if ((m=text1.matcher(file)).find()) {} else if ((m=text2.matcher(file)).find()) {} else if ((m=text3.matcher(file)).find()) {} text = m.group(1); // Read in all lines from the current open file do { line = br.readLine(); if (line != null) { // Skip blank data line if (blank_p.matcher(line).matches()) continue; if (seq_pat.matcher(line).matches()) { // Fasta sequence line hits // Extract sequence block while sequence lines are matching BioSeq.getSeq(line); continue; // Proceed to next loop for sequence reading efficiency } else if (header.matcher(line).matches()) { // Fasta header line hits key = true; // Turn next StringBuilder constructor on seq_count++; // Count input DNA sequences from a file if (seq_count == 1) {BioSeq.getHeader(line); continue;} } else if (line.matches("^\\s*#.*")) { System.out.println("Line: " + line + " in File " + file + " is a comment."); } else { System.out.println("Error in File " + file + "! Header: " + line + " is empty!"); System.exit(1); } } // Process the last read sequence block after reading the current header if (key || (line == null)) { // Uppercase acgt bases, strip whitespaces off the sequence ss = sb.toString().toUpperCase().replaceAll("\\s", ""); // Check and validate nucleotide/base composition of sequence if (!BioSeq.validSeq(ss)) { if (line != null) BioSeq.getHeader(line); continue; // drop invalid sequence } // Detect duplicate sequence from each input i = BioSeq.findDupSeq(ss); if ((i < 0) || (e == 0)) { // is distinct sequence // Remove duplicate sequences by storing unique sequences only ORF.seq.add(e, ss); // Add sequence to seq list BioSeq.setSeqInfo(e); // Set gene information uni_count++; // count distinct sequences e++; // increment seq list size } else { // is duplicate sequence (against element i of seq list) // integrate header information for sequence at index i with e BioSeq.integrateSeqInfo(i, e); OutputFile.tags.println(ORF.gid.get(i)+"\t"+ORF.acn.get(i)+"\t" +ORF.gsb.get(i)+"\t"+ORF.loc.get(i)+"\t"+ORF.prd.get(i)); OutputFile.tags.println(ORF.gid.get(e)+"\t"+ORF.acn.get(e)+"\t" +ORF.gsb.get(e)+"\t"+ORF.loc.get(e)+"\t"+ORF.prd.get(e)+"\n"); dup_count++; // count duplicates } if (line != null) BioSeq.getHeader(line); } } while (line != null); file_count++; // Count input fasta files Fasta.seq_count += seq_count; // global sequence count for all files // Print local sequence count for each of input fasta files OutputFile.stats.println("Sequence count in File "+file+" = "+seq_count); if (br != null) br.close(); } public static void readNCBI() { // Read in fasta lines from the NCBI ftp-based file if ((m=gbk_p1.matcher(line)).matches()) {} else if ((m=gbk_p2.matcher(line)).matches()) {} // no gene symbol else if ((m=gbk_p3.matcher(line)).matches()) {} // no gene symbol & product name else if ((m=gbk_p4.matcher(line)).matches()) {} // no gene symbol, product name & organism name else { System.out.println("Error in File " + file + "! Header: " + line + " is not matchable."); System.exit(1); } // Construct a DNA sequence/gene ID ORF.gid.add(e, (Params.ORF_id_prefix.equals("")?m.group(7): Params.ORF_id_prefix)+"_"+String.format("%04d", e+1)); // Construct a gene accession number ORF.acn.add(e, m.group(1)); // Construct a gene symbol ORF.gsb.add(e, m.group(6).equals("")?"gene_symbol":m.group(6)); // Construct a gene locus (coordinates) ORF.loc.add(e, m.group(2).equals("c")?m.group(3)+".."+m.group(4) +"(-)":m.group(3)+".."+m.group(4)+"(+)"); // Construct a gene product name ORF.prd.add(e, m.group(5).equals("")?"product_name":m.group(5)); } public static void readJGI() { // Read in fasta lines from the JGI-based file if ((m=jgi_p1.matcher(line)).matches()) {} else if ((m=jgi_p2.matcher(line)).matches()) {} // no product name else if ((m=jgi_p3.matcher(line)).matches()) {} // no product name and gene locus else if ((m=jgi_p4.matcher(line)).matches()) {} // no gene symbol, product name and gene locus else { System.out.println("Error in File " + file + "! Header: " + line + " is not matchable!"); System.exit(1); } // Construct a DNA sequence/gene ID ORF.gid.add(e, Params.ORF_id_prefix.equals("")?m.group(1): Params.ORF_id_prefix+"_"+String.format("%04d", e+1)); // Construct a gene accession number ORF.acn.add(e, "gene_accession"); // Construct a gene symbol ORF.gsb.add(e, m.group(2).equals("")?"gene_symbol":m.group(2)); // Construct a gene locus (coordinates) ORF.loc.add(e, m.group(4).equals("")?"gene_locus":m.group(4)); // Construct a gene product name ORF.prd.add(e, m.group(3).equals("")?"product_name":m.group(3)); } // Read in all FASTA files from the sequence data repository public static void readFiles() throws IOException { OutputFile.tags.println("Information of identical sequences found" + " and purged (2nd one of pairwise orfs is removed):"); OutputFile.tags.println("========================================" + "==================================================\n"); System.out.println("\n"); for (String filename : RunJob.files) { System.out.println("Processing " + filename + "..."); if (OpenOkay(Params.input_files_dir, filename)) readFasta(0); } // Remove the redundant header information elements added if (ORF.gid.size() > ORF.seq.size()) { for (i=ORF.gid.size()-1; i >= ORF.seq.size(); i--) { ORF.gid.remove(i); ORF.acn.remove(i); ORF.gsb.remove(i); ORF.loc.remove(i); ORF.prd.remove(i); } } ORF.seq.trimToSize(); ORF.gid.trimToSize(); ORF.acn.trimToSize(); ORF.gsb.trimToSize(); ORF.loc.trimToSize(); ORF.prd.trimToSize(); OutputFile.stats.println("\nNumber of input files = " + file_count); OutputFile.stats.println("Number of input sequences = " + seq_count + "\n"); OutputFile.stats.println("Number of duplicate sequences = " + dup_count); OutputFile.stats.println("Number of distinct sequences = " + uni_count + "\n"); System.out.println("\nNumber of input files = " + file_count); System.out.println("Number of input sequences = " + seq_count); System.out.println("Number of duplicate sequences = " + dup_count); System.out.println("Number of distinct sequences = " + uni_count + "\n"); OutputFile.tags.println("\n"); OutputFile.tags.flush(); OutputFile.stats.flush(); } }