/** * * @author rli */ package oligo3; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; public class ORF { // Type-safe instantiation of resizable arraylists static ArrayList seq = new ArrayList(); static ArrayList gid = new ArrayList(); static ArrayList acn = new ArrayList(); static ArrayList gsb = new ArrayList(); static ArrayList loc = new ArrayList(); static ArrayList prd = new ArrayList(); // Pattern to validate ORF start codons static Pattern start_cd = Pattern.compile(Params.ORF_start_codons); // Pattern to validate ORF stop codons static Pattern stop_cd = Pattern.compile("TA[AG]|TGA"); // One-time declaration and instantiation of variables of repeated use static int seq_count = 0; // bad/okay sequence count static int bad_count = 0; // bad sequence count static int orf_count = 0; // ORF sequence count static int i = 0; static int j = 0; static Matcher m; // Remove pseudo-genes, get valid genes, and make ORF segmentation public static void getValidORF() { System.out.println("Checking component sequences..."); seq_count = bad_count = orf_count = 0; for (i = seq.size()-1; i >= 0; i--) { if ((m=start_cd.matcher(seq.get(i))).region(0, 3).find()) { // A start codon found if (BioSeq.inFrame(seq.get(i))) { // A stop codon found if (m.end() < Params.minimum_ORF_length) { OutputFile.tags.println("ORF "+gid.get(i)+": Shorter than oligo length (" +Params.oligo_probe_length+" bp)! Sequence length: "+seq.get(i).length() +" Stop codon: "+m.group()+" In-frame length: "+m.end()+" < " +Params.minimum_ORF_length); OutputFile.tags.println(seq.get(i)+"\n"); BioSeq.deleteSeq(i); seq_count++; } else { // has required in-frame length switch(Params.ORF_components) { case 1: // get ORF w/o start and stop codon seq.set(i, seq.get(i).substring(3, m.start())); break; case 2: // get ORF w/ start codon only seq.set(i, seq.get(i).substring(0, m.start())); break; case 3: // get ORF w/ both start and stop codon seq.set(i, seq.get(i).substring(0, m.end())); break; } orf_count++; } } else { // A stop codon not found OutputFile.tags.println("ORF "+gid.get(i)+": Stop codon not found!"); OutputFile.tags.println(seq.get(i)+"\n"); BioSeq.deleteSeq(i); bad_count++; } } else { // A start codon not found OutputFile.tags.println("ORF "+gid.get(i)+": Start codon not found!"); OutputFile.tags.println(seq.get(i)+"\n"); BioSeq.deleteSeq(i); bad_count++; } } OutputFile.stats.println("Number of genes < "+Params.oligo_probe_length+" bp = "+seq_count); OutputFile.stats.println("Number of pseudo-genes = " + bad_count); OutputFile.stats.println("Number of valid genes = " + orf_count+"\n"); OutputFile.tags.println("\n"); } // Remove identical orfs/genes public static void purgeDuplicateORF() { System.out.println("Checking identical component sequences..."); OutputFile.tags.println("Information of identical component sequences" + " found and purged (2nd one of pairwise sequences is removed):"); OutputFile.tags.println("============================================" + "=============================================================\n"); seq_count = orf_count = 0; for (i = seq.size()-1; i >= 0; i--) { if (i == 0) {orf_count++; break;} for (j = i-1; j >= 0; j--) { if (seq.get(i).equals(seq.get(j))) { // duplicate orf hit // integrate gene information for sequence at element index j with i BioSeq.integrateSeqInfo(j, i); OutputFile.tags.println(gid.get(j)+"\t"+acn.get(j)+"\t" +gsb.get(j)+"\t"+loc.get(j)+"\t"+prd.get(j)); OutputFile.tags.println(gid.get(i)+"\t"+acn.get(i)+"\t" +gsb.get(i)+"\t"+loc.get(i)+"\t"+prd.get(i)+"\n"); BioSeq.deleteSeq(i); seq_count++; break; } } if (j < 0) orf_count++; } OutputFile.stats.println("Number of identical genes = " + seq_count); OutputFile.stats.println("Number of distinct genes = " + orf_count+"\n"); } // Remove overlapping genes public static void purgeOverlapORF() { OutputFile.tags.println("\nIdentical subsequences found and purged" + " (2nd one of pairwise ORFs is an overlap and gets removed):"); OutputFile.tags.println("=======================================" + "===========================================================\n"); int len = 0; seq_count = orf_count = 0; for (i = seq.size()-1; i >= 0; i--) { System.out.println("Checking overlap ORF " + i+1 + "..."); if (i == 0) {orf_count++; break;} len = seq.get(i).length(); for (j = i-1; j >= 0; j--) { if (len <= seq.get(j).length()) { if (seq.get(j).contains(seq.get(i))) { // Overlap found OutputFile.tags.println(gid.get(j)+"\t"+seq.get(j)); OutputFile.tags.println(gid.get(i)+"\t"+seq.get(i)+"\n"); OutputFile.overlaps.println(gid.get(j)+"\t"+seq.get(i)); BioSeq.deleteSeq(i); seq_count++; break; } } else { if (seq.get(i).contains(seq.get(j))) { OutputFile.tags.println(gid.get(i)+"\t"+seq.get(i)); OutputFile.tags.println(gid.get(j)+"\t"+seq.get(j)+"\n"); OutputFile.overlaps.println(gid.get(i)+"\t"+seq.get(j)); BioSeq.deleteSeq(j); i--; seq_count++; } } } if (j < 0) orf_count++; } OutputFile.stats.println("Number of overlapping genes = " + seq_count); OutputFile.stats.println("Number of non-overlap genes = " + orf_count+"\n"); } // Remove similar orfs (with equal length) by local alignment public static void purgeSimilarORF() { OutputFile.tags.println("\nSimilar sequences found and purged (2nd one of" + " pairwise ORFs has a similarity and is removed; l=length of either ORF," + " n=number of unmatched bases, pct=percentage of mismatches):"); OutputFile.tags.println("================================================" + "======================================================================" + "===========================================================\n"); int len = 0, k = 0, n = 0; seq_count = orf_count = 0; float p = 0.0f; for (i = seq.size()-1; i >= 0; i--) { System.out.println("Checking similar ORF " + i+1 + "..."); if (i == 0) {orf_count++; break;} len = seq.get(i).length(); for (j = i-1; j >= 0; j--) { if (len == seq.get(j).length()) { n = 0; for (k = 0; k < len; k++) { if (seq.get(i).charAt(k) != seq.get(j).charAt(k)) { n++; } } p = n*1.0f/len; if (p <= Params.remove_similarity) { // similar ORF hits OutputFile.tags.printf("l="+len+"\tn="+n+"\tpct=%f%n",p); OutputFile.tags.println(gid.get(j)+"\t"+seq.get(j)); OutputFile.tags.println(gid.get(i)+"\t"+seq.get(i)+"\n"); BioSeq.deleteSeq(i); seq_count++; break; } } } if (j < 0) orf_count++; } OutputFile.stats.println("Number of similar genes = " + seq_count); OutputFile.stats.println("Number of singleton genes = "+orf_count+"\n"); } // Eliminate homologous genes (similar orfs with unequal length) public static void purgeHomologORF() { OutputFile.tags.println("\nHomologous sequences found and purged (2nd one" + " of pairwise ORFs is eliminated; l=length of shorter ORF, n=number of " + "unmatched bases, pct=percentage of mismatches):"); OutputFile.tags.println("================================================" + "======================================================================" + "=============================================\n"); int ilen = 0, jlen = 0, k = 0, n = 0; seq_count = orf_count = 0; float p = 0.0f; for (i = seq.size()-1; i >= 0; i--) { System.out.println("Checking homolog ORF " + i+1 + "..."); if (i == 0) {orf_count++; break;} ilen = seq.get(i).length(); for (j = i-1; j >= 0; j--) { jlen = seq.get(j).length(); if (ilen < jlen) { n = 0; for (k = 0; k < ilen; k++) { if (seq.get(i).charAt(k) != seq.get(j).charAt(k)) { n++; } } p = n*1.0f/ilen; if (p <= Params.eliminate_homology) { // homologous ORF hits OutputFile.tags.printf("l="+ilen+"\tn="+n+"\tpct=%f%n",p); OutputFile.tags.println(gid.get(j)+"\t"+seq.get(j)); OutputFile.tags.println(gid.get(i)+"\t"+seq.get(i)+"\n"); BioSeq.deleteSeq(i); seq_count++; break; } } else if (ilen > jlen) { n = 0; for (k = 0; k < jlen; k++) { if (seq.get(i).charAt(k) != seq.get(j).charAt(k)) { n++; } } p = n*1.0f/jlen; if (p <= Params.eliminate_homology) { // homologous ORF hits OutputFile.tags.printf("l="+jlen+"\tn="+n+"\tpct=%f%n",p); OutputFile.tags.println(gid.get(i)+"\t"+seq.get(i)); OutputFile.tags.println(gid.get(j)+"\t"+seq.get(j)+"\n"); BioSeq.deleteSeq(j); i--; seq_count++; } } } if (j < 0) orf_count++; } OutputFile.stats.println("Number of homologous genes = " + seq_count); OutputFile.stats.println("Number of non-homolog genes = "+orf_count+"\n"); } // Check if ORF oligos exist across all genes and eliminate oligo-free ORFs public static void purgeOligoFreeORF() { int len = 0, k = 0, e = seq.size()-1; seq_count = orf_count = 0; String oligo = ""; for (i = e; i >= 0; i--) { System.out.println("Checking if ORF# " + i+1 + " has an oligo..."); len = seq.get(i).length()-Params.oligo_probe_length; for (j = 0; j <= len; j++) { oligo = seq.get(i).substring(j, j+Params.oligo_probe_length); for (k = e; k >= 0; k--) { if (k != i) { if (seq.get(k).contains(oligo)) break; } } if (k < 0) { System.out.println("ORF "+gid.get(i)+" has oligo."); orf_count++; break; } } if (j > len) { System.out.println("ORF "+gid.get(i)+" has no oligo."); BioSeq.deleteSeq(i); e--; seq_count++; } } OutputFile.stats.println("Number of genes w/o oligos available = "+seq_count); OutputFile.stats.println("Number of genes w/ oligos available = "+orf_count+"\n"); } // Output quality sequences public static void OutputQualityORF() { for (i = 0; i < seq.size(); i++) { OutputFile.genes.println(">"+gid.get(i)+" "+acn.get(i) +" "+gsb.get(i)+" "+loc.get(i)+" "+prd.get(i)); // Print sequence block in lines of 60 bp for (String line : seq.get(i).split("(?<=\\G.{60})")) { OutputFile.genes.println(line); } } OutputFile.stats.println("Number of output unique genes = "+seq.size()); System.out.println("\nNumber of output unique genes = "+seq.size()+"\n"); } }