/** * This class provides commonly used utilities for biological sequence processing * @author rli */ package oligo3; public class BioSeq { static String se = ""; static String si = ""; static int i = 0; static int j = 0; public static void getSeq(String line) { // Build biological sequence from each fasta line if (Fasta.key) { // Initialize a new sequence Fasta.sb = new StringBuilder(line); // Turn StringBuilder constructor off Fasta.key = false; } else { // Concatenate sequence lines Fasta.sb.append(line); } } public static void getHeader(String line) { // Extract fasta header data while its line hits if (Fasta.gbk_p0.matcher(line).matches()) { Fasta.readNCBI(); // data from GenBank databases } else { Fasta.readJGI(); // data from JGI and other databases } } public static boolean validSeq(String seq) { // Validate biological sequence fetched from each sequence block if ((Fasta.m=Fasta.valid_p.matcher(seq)).find()) { OutputFile.stats.println("Error in File " + Fasta.file + "! Sequence block contains characters (" + Fasta.m.group(1) + ") other than ACGT letters, which precedes Header: " + Fasta.line); if (Params.sequence_validation > 0) { Fasta.ss = seq.substring(0, Fasta.m.start(1)); if (inFrame(Fasta.ss)) { OutputFile.stats.println("Sequence error" + " has been fixed and sequence retained."); } else { OutputFile.stats.println("Sequence error" + " cannot be fixed and sequence dropped."); return false; } } } return true; } public static int findDupSeq(String seq) { // Find duplicate sequence by global alignment if (Fasta.e > 0) { for (i = Fasta.e - 1; i >= 0; i--) { if (seq.equals(ORF.seq.get(i))) break; } return i; } return 0; } public static boolean inFrame(String dna) { // Check if DNA sequence is in-frame for (i=0, j=3; j <= dna.length(); i+=3, j+=3) { if ((ORF.m=ORF.stop_cd.matcher(dna).region(i, j)).find()) { return true; // stop codon found } } return false; // stop codon not found } public static void deleteSeq(int i) { // Build biological sequence from each fasta line ORF.gid.remove(i); ORF.acn.remove(i); ORF.gsb.remove(i); ORF.loc.remove(i); ORF.prd.remove(i); ORF.seq.remove(i); } public static void setSeqInfo(int e) { // Set gene information for each distinct sequence if (ORF.gsb.get(e).equals("gene_symbol") && ORF.loc.get(e).equals("gene_locus") && ORF.prd.get(e).equals("product_name")) { ORF.gsb.set(e, ORF.gid.get(e)); // Customize a sequence/orf/gene ID ORF.gid.set(e, Fasta.text+"_"+String.format("%04d", e+1)); } } public static void integrateSeqInfo(int i, int e) { // Integrate gene information at element index i with duplicate seq info at element index e if (ORF.loc.get(i).equals("gene_locus") && !ORF.loc.get(e).equals("gene_locus")) { ORF.loc.set(i, ORF.loc.get(e)); } if (ORF.acn.get(i).equals("gene_accession") && !ORF.acn.get(e).equals("gene_accession")) { ORF.acn.set(i, ORF.acn.get(e)); } if ((ORF.gsb.get(i).equals("gene_symbol") && !ORF.gsb.get(e).equals("gene_symbol")) || ( !ORF.gsb.get(i).matches("^\\w{3,}_\\w*\\d{4}$")&&ORF.gsb.get(e).matches("^\\w{3,}_\\w*\\d{4}$"))) { ORF.gsb.set(i, ORF.gsb.get(e)); } if ((ORF.prd.get(i).equals("product_name") && !ORF.prd.get(e).equals("product_name")) || ( ORF.prd.get(i).matches("(?i)hypothetical") && !ORF.prd.get(e).matches("(?i)hypothetical"))) { ORF.prd.set(i, ORF.prd.get(e)); } else if (!ORF.prd.get(e).matches("(?i)hypothetical") && !ORF.prd.get(e).equals("product_name")) { // strip differential characters off product names and lowercase them to make them comparable se = ORF.prd.get(e).toLowerCase().replaceAll("[-,\\s]", ""); si = ORF.prd.get(i).toLowerCase().replaceAll("[-,\\s]", ""); if (!se.equals(si)) { if (se.contains(si)) { // if se contains si ORF.prd.set(i, ORF.prd.get(e)); } else if (si.contains(se)) {} // do nothing if si contains se else { ORF.prd.set(i, ORF.prd.get(i)+"|"+ORF.prd.get(e)); // merge info delimited by | } } } } }