/**
 * @author rli
 */
package oligo3;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Fasta extends InputFile {

    // Pattern to skip blank data line
    static Pattern blank_p = Pattern.compile("^\\s*");
    // Pattern to validate sequence data
    static Pattern valid_p = Pattern.compile("([^ACGT]+)");
    // Pattern to extract sequence data line
    static Pattern seq_pat = Pattern.compile("^\\s*\\w+");
    // Pattern to extract fasta header line
    static Pattern header = Pattern.compile("^\\s*>\\s*\\S+.*");
    // Pattern to extract GenBank header line
    static Pattern gbk_p0 = Pattern.compile("^\\s*>\\s*\\w+\\|.*");
    // Fasta header patterns from GenBank databases
    static Pattern gbk_p1 = Pattern.compile(
    "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+(.+)\\s+(\\w{3,}_\\w*\\d{4})\\s+\\[.+\\s(\\S+)\\].*");
    static Pattern gbk_p2 = Pattern.compile(
    "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+(.+)\\s+()\\[.+\\s(\\S+)\\].*");
    static Pattern gbk_p3 = Pattern.compile(
    "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)\\s+()()\\[.+\\s(\\S+)\\].*");
    static Pattern gbk_p4 = Pattern.compile(
    "^\\s*>\\s*\\w+\\|(.+)\\|:(\\w??)(\\d+)-(\\d+)()()().*");
    // Fasta header patterns from JGI and other databases
    static Pattern jgi_p1 = Pattern.compile(
    "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})\\s+(.+)\\s+(\\d+\\.\\.\\d+\\([+-]\\)).*");
    static Pattern jgi_p2 = Pattern.compile(
    "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})\\s+()(\\d+\\.\\.\\d+\\([+-]\\)).*");
    static Pattern jgi_p3 = Pattern.compile(
    "^\\s*>\\s*(\\S+)\\s+(\\w{3,}_\\w*\\d{4})()().*");
    static Pattern jgi_p4 = Pattern.compile("^\\s*>\\s*(\\S+)()()().*");
    // Patterns to grab last text of a filename
    static Pattern text1 = Pattern.compile("[\\.\\s_-]([^\\.\\s\\)_-]+)\\)*\\.\\w{2,5}$");
    static Pattern text2 = Pattern.compile("[\\.\\s_-]([^\\.\\s\\)_-]+)\\)*$");
    static Pattern text3 = Pattern.compile("([^\\.\\s\\(\\)_-]{1,4})$");

    // One-time declaration and instantiation of variables of repeated use
    static int file_count = 0; // Count input fasta files
    static int seq_count = 0; // Count input DNA sequences
    static int dup_count = 0; // Count sequence duplicates
    static int uni_count = 0; // Count distinct sequences
    static int e = 0; // ArrayList index of last element
    static int i = 0; // ArrayList index of any element
    static boolean key = false;
    static StringBuilder sb;
    static String file = "";
    static String line = "";
    static String text = "";
    static String ss = "";
    static Matcher m;

    /**
     * @param seq_count should evaluate to zero when called to read a file.
     */
    public static void readFasta(int seq_count) throws IOException {
      // Grab last text of the name of an input fasta file
      file = f.getName(); // Get fasta file name
      if ((m=text1.matcher(file)).find()) {}
      else if ((m=text2.matcher(file)).find()) {}
      else if ((m=text3.matcher(file)).find()) {}
      text = m.group(1);
      // Read in all lines from the current open file
      do {
        line = br.readLine();
        if (line != null) {
          // Skip blank data line
          if (blank_p.matcher(line).matches()) continue;
          if (seq_pat.matcher(line).matches()) { // Fasta sequence line hits
             // Extract sequence block while sequence lines are matching
             BioSeq.getSeq(line);
             continue; // Proceed to next loop for sequence reading efficiency
          } else if (header.matcher(line).matches()) { // Fasta header line hits
            key = true; // Turn next StringBuilder constructor on
            seq_count++; // Count input DNA sequences from a file
            if (seq_count == 1) {BioSeq.getHeader(line); continue;}
          } else if (line.matches("^\\s*#.*")) {
            System.out.println("Line: " + line + " in File " + file + " is a comment.");
          } else {
            System.out.println("Error in File " + file + "! Header: " + line + " is empty!");
            System.exit(1);
          }
        }
        // Process the last read sequence block after reading the current header
        if (key || (line == null)) {
           // Uppercase acgt bases, strip whitespaces off the sequence
           ss = sb.toString().toUpperCase().replaceAll("\\s", "");
           // Check and validate nucleotide/base composition of sequence
           if (!BioSeq.validSeq(ss)) {
              if (line != null) BioSeq.getHeader(line);
              continue; // drop invalid sequence
           }
           // Detect duplicate sequence from each input
           i = BioSeq.findDupSeq(ss);
           if ((i < 0) || (e == 0)) { // is distinct sequence
              // Remove duplicate sequences by storing unique sequences only
              ORF.seq.add(e, ss); // Add sequence to seq list
              BioSeq.setSeqInfo(e); // Set gene information
              uni_count++; // count distinct sequences
              e++; // increment seq list size
           } else { // is duplicate sequence (against element i of seq list)
             // integrate header information for sequence at index i with e
             BioSeq.integrateSeqInfo(i, e);
             OutputFile.tags.println(ORF.gid.get(i)+"\t"+ORF.acn.get(i)+"\t"
             +ORF.gsb.get(i)+"\t"+ORF.loc.get(i)+"\t"+ORF.prd.get(i));
             OutputFile.tags.println(ORF.gid.get(e)+"\t"+ORF.acn.get(e)+"\t"
             +ORF.gsb.get(e)+"\t"+ORF.loc.get(e)+"\t"+ORF.prd.get(e)+"\n");
             dup_count++; // count duplicates
           }
           if (line != null) BioSeq.getHeader(line);
        }
      } while (line != null);
      file_count++; // Count input fasta files
      Fasta.seq_count += seq_count; // global sequence count for all files
      // Print local sequence count for each of input fasta files
      OutputFile.stats.println("Sequence count in File "+file+" = "+seq_count);
      if (br != null) br.close();
    }

    public static void readNCBI() {
      // Read in fasta lines from the NCBI ftp-based file
      if ((m=gbk_p1.matcher(line)).matches()) {}
      else if ((m=gbk_p2.matcher(line)).matches()) {} // no gene symbol
      else if ((m=gbk_p3.matcher(line)).matches()) {} // no gene symbol & product name
      else if ((m=gbk_p4.matcher(line)).matches()) {} // no gene symbol, product name & organism name
      else {
       System.out.println("Error in File " + file + "! Header: " + line + " is not matchable.");
       System.exit(1);
      }
      // Construct a DNA sequence/gene ID
      ORF.gid.add(e, (Params.ORF_id_prefix.equals("")?m.group(7):
      Params.ORF_id_prefix)+"_"+String.format("%04d", e+1));
      // Construct a gene accession number
      ORF.acn.add(e, m.group(1));
      // Construct a gene symbol
      ORF.gsb.add(e, m.group(6).equals("")?"gene_symbol":m.group(6));
      // Construct a gene locus (coordinates)
      ORF.loc.add(e, m.group(2).equals("c")?m.group(3)+".."+m.group(4)
      +"(-)":m.group(3)+".."+m.group(4)+"(+)");
      // Construct a gene product name
      ORF.prd.add(e, m.group(5).equals("")?"product_name":m.group(5));
    }

    public static void readJGI() {
      // Read in fasta lines from the JGI-based file
      if ((m=jgi_p1.matcher(line)).matches()) {}
      else if ((m=jgi_p2.matcher(line)).matches()) {} // no product name
      else if ((m=jgi_p3.matcher(line)).matches()) {} // no product name and gene locus
      else if ((m=jgi_p4.matcher(line)).matches()) {} // no gene symbol, product name and gene locus
      else {
       System.out.println("Error in File " + file + "! Header: " + line + " is not matchable!");
       System.exit(1);
      }
      // Construct a DNA sequence/gene ID
      ORF.gid.add(e, Params.ORF_id_prefix.equals("")?m.group(1):
      Params.ORF_id_prefix+"_"+String.format("%04d", e+1));
      // Construct a gene accession number
      ORF.acn.add(e, "gene_accession");
      // Construct a gene symbol
      ORF.gsb.add(e, m.group(2).equals("")?"gene_symbol":m.group(2));
      // Construct a gene locus (coordinates)
      ORF.loc.add(e, m.group(4).equals("")?"gene_locus":m.group(4));
      // Construct a gene product name
      ORF.prd.add(e, m.group(3).equals("")?"product_name":m.group(3));
    }

    // Read in all FASTA files from the sequence data repository
    public static void readFiles() throws IOException {
      OutputFile.tags.println("Information of identical sequences found"
      + " and purged (2nd one of pairwise orfs is removed):");
      OutputFile.tags.println("========================================"
      + "==================================================\n");
      System.out.println("\n");
      for (String filename : RunJob.files) {
        System.out.println("Processing " + filename + "...");
        if (OpenOkay(Params.input_files_dir, filename)) readFasta(0);
      }
      // Remove the redundant header information elements added
      if (ORF.gid.size() > ORF.seq.size()) {
        for (i=ORF.gid.size()-1; i >= ORF.seq.size(); i--) {
          ORF.gid.remove(i);
          ORF.acn.remove(i);
          ORF.gsb.remove(i);
          ORF.loc.remove(i);
          ORF.prd.remove(i);
        }
      }
      ORF.seq.trimToSize();
      ORF.gid.trimToSize();
      ORF.acn.trimToSize();
      ORF.gsb.trimToSize();
      ORF.loc.trimToSize();
      ORF.prd.trimToSize();
      OutputFile.stats.println("\nNumber of input files = " + file_count);
      OutputFile.stats.println("Number of input sequences = " + seq_count + "\n");
      OutputFile.stats.println("Number of duplicate sequences = " + dup_count);
      OutputFile.stats.println("Number of distinct sequences = " + uni_count + "\n");
      System.out.println("\nNumber of input files = " + file_count);
      System.out.println("Number of input sequences = " + seq_count);
      System.out.println("Number of duplicate sequences = " + dup_count);
      System.out.println("Number of distinct sequences = " + uni_count + "\n");
      OutputFile.tags.println("\n");
      OutputFile.tags.flush();
      OutputFile.stats.flush();
    }
}