#include "phylip.h"
#include "seq.h"

/* version 3.6. (c) Copyright 1993-2004 by the University of Washington.
   Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe.
   Permission is granted to copy and use this program provided no fee is
   charged for it and provided that this copyright notice is not removed. */

#define iterationsd     100   /* number of iterates of EM for each distance */

typedef struct valrec {
  double rat, ratxv, z1, y1, z1zz, z1yy, z1xv;
} valrec;


Char infilename[FNMLNGTH], outfilename[FNMLNGTH], catfilename[FNMLNGTH], weightfilename[FNMLNGTH];
long sites, categs, weightsum, datasets, ith, rcategs;
boolean freqsfrom, jukes, kimura, logdet, gama, invar, similarity, lower, f84,
        weights, progress, ctgry, mulsets, justwts, firstset, baddists;
boolean matrix_flags;       /* Matrix output format */
node **nodep;
double xi, xv, ttratio, ttratio0, freqa, freqc, freqg, freqt, freqr, freqy,
        freqar, freqcy, freqgr, freqty, cvi, invarfrac, sumrates, fracchange;
steptr oldweight;
double rate[maxcategs];
double **d;
double sumweightrat;                  /* these values were propagated  */
double *weightrat;                    /* to global values from         */
valrec tbl[maxcategs];                /* function makedists.           */


#ifndef OLDC
/* function  prototypes */
void   getoptions(void);
void   allocrest(void);
void   reallocsites(void);
void   doinit(void);
void   inputcategories(void);
void   printcategories(void);
void   inputoptions(void);
void   dnadist_sitesort(void);
void   dnadist_sitecombine(void);
void   dnadist_sitescrunch(void);
void   makeweights(void);
void   dnadist_makevalues(void);
void   dnadist_empiricalfreqs(void);
void   getinput(void);
void   inittable(void);
double lndet(double (*a)[4]);
void   makev(long, long, double *);
void   makedists(void);
void   writedists(void);
/* function  prototypes */
#endif


void getoptions()
{
  /* interactively set options */
  long loopcount, loopcount2;
  Char ch, ch2;
  boolean ttr;
  char *str = NULL;

  ctgry = false;
  categs = 1;
  cvi = 1.0;
  rcategs = 1;
  rate[0] = 1.0;
  freqsfrom = true;
  gama = false;
  invar = false;
  invarfrac = 0.0;
  jukes = false;
  justwts = false;
  kimura = false;
  logdet = false;
  f84 = true;
  lower = false;
  matrix_flags = MAT_MACHINE;
  similarity = false;
  ttratio = 2.0;
  ttr = false;
  weights = false;
  printdata = false;
  dotdiff = true;
  progress = true;
  interleaved = true;
  loopcount = 0;
  for (;;) {
    cleerhome();
    printf("\nNucleic acid sequence Distance Matrix program,");
    printf(" version %s\n\n",VERSION);
    printf("Settings for this run:\n");
    printf("  D  Distance (F84, Kimura, Jukes-Cantor, LogDet)?  %s\n",
           kimura ? "Kimura 2-parameter" :
           jukes  ? "Jukes-Cantor"       :
           logdet ? "LogDet"             :
           similarity ? "Similarity table" : "F84");
    if (kimura || f84 || jukes) {
      printf("  G          Gamma distributed rates across sites?  ");
      if (gama)
        printf("Yes\n");
      else {
        if (invar)
          printf("Gamma+Invariant\n");
        else
          printf("No\n");
      }
    }
    if (kimura || f84) {
      printf("  T                 Transition/transversion ratio?");
      if (!ttr)
        printf("  2.0\n");
      else
        printf("%8.4f\n", ttratio);
    }
    if (!logdet && !similarity && !gama && !invar) {
      printf("  C            One category of substitution rates?");
      if (!ctgry || categs == 1)
        printf("  Yes\n");
      else
        printf("  %ld categories\n", categs);
    }
    printf("  W                         Use weights for sites?");
    if (weights)
      printf("  Yes\n");
    else
      printf("  No\n");
    if (f84)
      printf("  F                Use empirical base frequencies?  %s\n",
             (freqsfrom ? "Yes" : "No"));
    printf("  L                       Form of distance matrix?  ");
    switch (matrix_flags) {
      case MAT_MACHINE:
        str = "Square";
        break;
      case MAT_LOWERTRI:
        str = "Lower-triangular";
        break;
      case MAT_HUMAN:
        str = "Human-readable";
        break;
      default:  /* shouldn't happen */
        assert( 0 );
        str = "(unknown)";
    }
    puts(str);
    printf("  M                    Analyze multiple data sets?");
    if (mulsets)
      printf("  Yes, %2ld %s\n", datasets,
               (justwts ? "sets of weights" : "data sets"));
    else
      printf("  No\n");
    printf("  I                   Input sequences interleaved?  %s\n",
           (interleaved ? "Yes" : "No, sequential"));
    printf("  0            Terminal type (IBM PC, ANSI, none)?  %s\n",
           ibmpc ? "IBM PC" : ansi  ? "ANSI"   : "(none)");
    printf("  1             Print out the data at start of run  %s\n",
           (printdata ? "Yes" : "No"));
    printf("  2           Print indications of progress of run  %s\n",
           (progress ? "Yes" : "No"));
    printf("\n  Y to accept these or type the letter for one to change\n");
#ifdef WIN32
    phyFillScreenColor();
#endif
    fflush(stdout);
    scanf("%c%*[^\n]", &ch);
    getchar();
    uppercase(&ch);
    if  (ch == 'Y')
           break;
    if ((f84 && (strchr("CFGWLDTMI012",ch) != NULL)) ||
        (kimura && (strchr("CGWLDTMI012",ch) != NULL)) ||
        (jukes && (strchr("CGWLDMI012",ch) != NULL)) ||
        ((logdet || similarity) && (strchr("WLDMI012",ch)) != NULL) ||
        (ctgry && (strchr("CFWLDTMI012",ch) != NULL))) {
     switch (ch) {

     case 'D':
       if (kimura) {
         kimura = false;
         jukes = true;
         freqsfrom = false;
       } else if (f84) {
         f84 = false;
         kimura = true;
         freqsfrom = false;
       } else if (logdet) {
         logdet = false;
         similarity = true;
       } else if (similarity) {
         similarity = false;
         f84 = true;
         freqsfrom = true;
       } else {
         jukes = false;
         logdet = true;
         freqsfrom = false;
       }
       break;

     case 'G':
       if (!(gama || invar))
         gama = true;
       else {
         if (gama) {
           gama = false;
           invar = true;
         } else {
           if (invar)
             invar = false;
         }
       }
       break;


     case 'C':
       ctgry = !ctgry;
       if (ctgry) {
         initcatn(&categs);
         initcategs(categs, rate);
       }
       break;

     case 'F':
       freqsfrom = !freqsfrom;
       if (!freqsfrom)
         initfreqs(&freqa, &freqc, &freqg, &freqt);
       break;

     case 'W':
       weights = !weights;
       break;

     case 'L':
       /* square -> lower-triangular -> machine-readable */
       switch ( matrix_flags ) {
         case MAT_HUMAN:
           matrix_flags = MAT_MACHINE;
           break;
         case MAT_LOWERTRI:
           matrix_flags = MAT_HUMAN;
           break;
         case MAT_MACHINE:
           matrix_flags = MAT_LOWERTRI;
           break;
         default:
           assert(0);
           matrix_flags = MAT_HUMAN;
       }
       break;

     case 'T':
       ttr = !ttr;
       if (ttr)
        initratio(&ttratio);
       break;

     case 'M':
       mulsets = !mulsets;
       if (mulsets) {
          printf("Multiple data sets or multiple weights?");
          loopcount2 = 0;
          do {
            printf(" (type D or W)\n");
#ifdef WIN32
            phyFillScreenColor();
#endif
            fflush(stdout);
            scanf("%c%*[^\n]", &ch2);
            uppercase(&ch2);
            getchar();
            countup(&loopcount2, 10);
          } while ((ch2 != 'W') && (ch2 != 'D'));
          justwts = (ch2 == 'W');
          if (justwts)
            justweights(&datasets);
          else
            initdatasets(&datasets);
        }
       break;

     case 'I':
       interleaved = !interleaved;
       break;

     case '0':
        initterminal(&ibmpc, &ansi);
       break;

     case '1':
       printdata = !printdata;
       break;

     case '2':
       progress = !progress;
       break;
     }
   } else {
      if (strchr("CFGWLDTMI012",ch) == NULL)
        printf("Not a possible option!\n");
      else
        printf("That option not allowed with these settings\n");
      printf("\nPress Enter or Return key to continue\n");
      getchar();
    }
    countup(&loopcount, 100);
  }

  /* Prevent similarity matrices from easily being used as input to other
   * programs */
  if (similarity) {
    if (matrix_flags == MAT_MACHINE)
      matrix_flags = MAT_HUMAN;
    else if (matrix_flags == MAT_LOWERTRI)
      matrix_flags = MAT_LOWER | MAT_HUMAN;
  }

  if (gama || invar) {
    loopcount = 0;
    do {
      printf(
"\nCoefficient of variation of substitution rate among sites (must be positive)\n");
      printf(
  " In gamma distribution parameters, this is 1/(square root of alpha)\n");
#ifdef WIN32
      phyFillScreenColor();
#endif
      fflush(stdout);
      scanf("%lf%*[^\n]", &cvi);
      getchar();
      countup(&loopcount, 10);
    } while (cvi <= 0.0);
    cvi = 1.0 / (cvi * cvi);
  }
  if (invar) {
    loopcount = 0;
    do {
      printf("Fraction of invariant sites?\n");
      fflush(stdout);
      scanf("%lf%*[^\n]", &invarfrac);
      getchar();
      countup (&loopcount, 10);
    } while ((invarfrac <= 0.0) || (invarfrac >= 1.0));
  }
    if (!printdata)
    return;
    fprintf(outfile, "\nNucleic acid sequence Distance Matrix program,");
    fprintf(outfile, " version %s\n\n",VERSION);
}  /* getoptions */


void allocrest()
{
  long i;

  y = (Char **)Malloc(spp*sizeof(Char *));
  nodep = (node **)Malloc(spp*sizeof(node *));
  for (i = 0; i < spp; i++) {
    y[i] = (Char *)Malloc(sites*sizeof(Char));
    nodep[i] = (node *)Malloc(sizeof(node));
  }
  d = (double **)Malloc(spp*sizeof(double *));
  for (i = 0; i < spp; i++)
    d[i] = (double*)Malloc(spp*sizeof(double));
  nayme = (naym *)Malloc(spp*sizeof(naym));
  category = (steptr)Malloc(sites*sizeof(long));
  oldweight = (steptr)Malloc(sites*sizeof(long));
  weight = (steptr)Malloc(sites*sizeof(long));
  alias = (steptr)Malloc(sites*sizeof(long));
  ally = (steptr)Malloc(sites*sizeof(long));
  location = (steptr)Malloc(sites*sizeof(long));
  weightrat = (double *)Malloc(sites*sizeof(double));
} /* allocrest */


void reallocsites() 
{/* The amount of sites can change between runs 
     this function reallocates all the variables 
     whose size depends on the amount of sites */
  long i;

  for (i = 0; i < spp; i++) {
    free(y[i]);  
    y[i] = (Char *)Malloc(sites*sizeof(Char));
  }
  free(category);
  free(oldweight);
  free(weight);
  free(alias);
  free(ally);
  free(location);
  free(weightrat);
  
  category = (steptr)Malloc(sites*sizeof(long));
  oldweight = (steptr)Malloc(sites*sizeof(long));
  weight = (steptr)Malloc(sites*sizeof(long));
  alias = (steptr)Malloc(sites*sizeof(long));
  ally = (steptr)Malloc(sites*sizeof(long));
  location = (steptr)Malloc(sites*sizeof(long));
  weightrat = (double *)Malloc(sites*sizeof(double));  
} /* reallocsites */


void doinit()
{
  /* initializes variables */

  inputnumbers(&spp, &sites, &nonodes, 1);
  getoptions();
  if (printdata)
    fprintf(outfile, "%2ld species, %3ld  sites\n", spp, sites);
  allocrest();
}  /* doinit */


void inputcategories()
{
  /* reads the categories for each site */
  long i;
  Char ch;

  for (i = 1; i < nmlngth; i++)
    gettc(infile);
  for (i = 0; i < sites; i++) {
    do {
      if (eoln(infile))
        scan_eoln(infile);
      ch = gettc(infile);
    } while (ch == ' ');
    category[i] = ch - '0';
  }
  scan_eoln(infile);
}  /* inputcategories */


void printcategories()
{ /* print out list of categories of sites */
  long i, j;

  fprintf(outfile, "Rate categories\n\n");
  for (i = 1; i <= nmlngth + 3; i++)
    putc(' ', outfile);
  for (i = 1; i <= sites; i++) {
    fprintf(outfile, "%ld", category[i - 1]);
    if (i % 60 == 0) {
      putc('\n', outfile);
      for (j = 1; j <= nmlngth + 3; j++)
        putc(' ', outfile);
    } else if (i % 10 == 0)
      putc(' ', outfile);
  }
  fprintf(outfile, "\n\n");
}  /* printcategories */


void inputoptions()
{
  /* read options information */
  long i;

  if (!firstset && !justwts) {
    samenumsp(&sites, ith);
    reallocsites();
  }
  for (i = 0; i < sites; i++) {
    category[i] = 1;
    oldweight[i] = 1;
  }
  if (justwts || weights)
    inputweights(sites, oldweight, &weights);
  if (printdata)
    putc('\n', outfile);
  if (jukes && printdata)
    fprintf(outfile, "  Jukes-Cantor Distance\n");
  if (kimura && printdata)
    fprintf(outfile, "  Kimura 2-parameter Distance\n");
  if (f84 && printdata)
    fprintf(outfile, "  F84 Distance\n");
  if (similarity)
    fprintf(outfile, "  \n  Table of similarity between sequences\n");
  if (firstset && printdata && (kimura || f84))
    fprintf(outfile, "\nTransition/transversion ratio = %10.6f\n", ttratio);
  if (ctgry && categs > 1) {
    inputcategs(0, sites, category, categs, "DnaDist");
    if (printdata)
      printcategs(outfile, sites, category, "Site categories");
  } else if (printdata && (categs > 1)) {
    fprintf(outfile, "\nSite category   Rate of change\n\n");
    for (i = 1; i <= categs; i++)
      fprintf(outfile, "%12ld%13.3f\n", i, rate[i - 1]);
    putc('\n', outfile);
    printcategories();
  }
  if ((jukes || kimura || logdet) && freqsfrom) {
    printf(" WARNING: CANNOT USE EMPIRICAL BASE FREQUENCIES");
    printf(" WITH JUKES-CANTOR, KIMURA, JIN/NEI OR LOGDET DISTANCES\n");
    exxit(-1);
  }
  if (jukes)
    ttratio = 0.5000001;
  if (weights && printdata)
    printweights(outfile, 0, sites, oldweight, "Sites");
}  /* inputoptions */


void dnadist_sitesort()
{
  /* Shell sort of sites lexicographically */
  long gap, i, j, jj, jg, k, itemp;
  boolean flip, tied;

  gap = sites / 2;
  while (gap > 0) {
    for (i = gap + 1; i <= sites; i++) {
      j = i - gap;
      flip = true;
      while (j > 0 && flip) {
        jj = alias[j - 1];
        jg = alias[j + gap - 1];
        tied = (oldweight[jj - 1] == oldweight[jg - 1]);
        flip = (oldweight[jj - 1] < oldweight[jg - 1] ||
                (tied && category[jj - 1] > category[jg - 1]));
        tied = (tied && category[jj - 1] == category[jg - 1]);
        k = 1;
        while (k <= spp && tied) {
          flip = (y[k - 1][jj - 1] > y[k - 1][jg - 1]);
          tied = (tied && y[k - 1][jj - 1] == y[k - 1][jg - 1]);
          k++;
        }
        if (!flip)
          break;
        itemp = alias[j - 1];
        alias[j - 1] = alias[j + gap - 1];
        alias[j + gap - 1] = itemp;
        j -= gap;
      }
    }
    gap /= 2;
  }
}  /* dnadist_sitesort */


void dnadist_sitecombine()
{
  /* combine sites that have identical patterns */
  long i, j, k;
  boolean tied;

  i = 1;
  while (i < sites) {
    j = i + 1;
    tied = true;
    while (j <= sites && tied) {
      tied = (oldweight[alias[i - 1] - 1] == oldweight[alias[j - 1] - 1] &&
              category[alias[i - 1] - 1] == category[alias[j - 1] - 1]);
      k = 1;
      while (k <= spp && tied) {
        tied = (tied &&
            y[k - 1][alias[i - 1] - 1] == y[k - 1][alias[j - 1] - 1]);
        k++;
      }
      if (!tied)
        break;
      ally[alias[j - 1] - 1] = alias[i - 1];
      j++;
    }
    i = j;
  }
}  /* dnadist_sitecombine */


void dnadist_sitescrunch()
{
  /* move so one representative of each pattern of
     sites comes first */
  long i, j, itemp;
  boolean done, found, completed;

  done = false;
  i = 1;
  j = 2;
  while (!done) {
    if (ally[alias[i - 1] - 1] != alias[i - 1]) {
      if (j <= i)
        j = i + 1;
      if (j <= sites) {
        do {
          found = (ally[alias[j - 1] - 1] == alias[j - 1]);
          j++;
          completed = (j > sites);
          if (j <= sites)
            completed = (oldweight[alias[j - 1] - 1] == 0);
        } while (!(found || completed));
        if (found) {
          j--;
          itemp = alias[i - 1];
          alias[i - 1] = alias[j - 1];
          alias[j - 1] = itemp;
        } else
          done = true;
      } else
        done = true;
    }
    i++;
    done = (done || i >= sites);
  }
}  /* dnadist_sitescrunch */


void makeweights()
{
  /* make up weights vector to avoid duplicate computations */
  long i;

  for (i = 1; i <= sites; i++) {
    alias[i - 1] = i;
    ally[i - 1] = i;
    weight[i - 1] = 0;
  }
  dnadist_sitesort();
  dnadist_sitecombine();
  dnadist_sitescrunch();
  endsite = 0;
  for (i = 1; i <= sites; i++) {
    if (ally[i - 1] == i && oldweight[i - 1] > 0)
      endsite++;
  }
  for (i = 1; i <= endsite; i++)
    location[alias[i - 1] - 1] = i;
  weightsum = 0;
  for (i = 0; i < sites; i++)
    weightsum += oldweight[i];
  sumrates = 0.0;
  for (i = 0; i < sites; i++)
    sumrates += oldweight[i] * rate[category[i] - 1];
  for (i = 0; i < categs; i++)
    rate[i] *= weightsum / sumrates;
  for (i = 0; i < sites; i++)
    weight[location[ally[i] - 1] - 1] += oldweight[i];
}  /* makeweights */


void dnadist_makevalues()
{
  /* set up fractional likelihoods at tips */
  long i, j, k;
  bases b;

  for (i = 0; i < spp; i++) {
    nodep[i]->x = (phenotype)Malloc(endsite*sizeof(ratelike));
    for (j = 0; j < endsite; j++)
      nodep[i]->x[j]  = (ratelike)Malloc(rcategs*sizeof(sitelike));
  }
  for (k = 0; k < endsite; k++) {
    j = alias[k];
    for (i = 0; i < spp; i++) {
      for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
        nodep[i]->x[k][0][(long)b - (long)A] = 0.0;
      switch (y[i][j - 1]) {

      case 'A':
        nodep[i]->x[k][0][0] = 1.0;
        break;

      case 'C':
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        break;

      case 'G':
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        break;

      case 'T':
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'U':
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'M':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        break;

      case 'R':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        break;

      case 'W':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'S':
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        break;

      case 'Y':
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'K':
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'B':
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'D':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'H':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)T - (long)A] = 1.0;
        break;

      case 'V':
        nodep[i]->x[k][0][0] = 1.0;
        nodep[i]->x[k][0][(long)C - (long)A] = 1.0;
        nodep[i]->x[k][0][(long)G - (long)A] = 1.0;
        break;

      case 'N':
        for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
          nodep[i]->x[k][0][(long)b - (long)A] = 1.0;
        break;

      case 'X':
        for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
          nodep[i]->x[k][0][(long)b - (long)A] = 1.0;
        break;

      case '?':
        for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
          nodep[i]->x[k][0][(long)b - (long)A] = 1.0;
        break;

      case 'O':
        for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
          nodep[i]->x[k][0][(long)b - (long)A] = 1.0;
        break;

      case '-':
        for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1))
          nodep[i]->x[k][0][(long)b - (long)A] = 1.0;
        break;
      }
    }
  }
}  /* dnadist_makevalues */


void dnadist_empiricalfreqs()
{
  /* Get empirical base frequencies from the data */
  long i, j, k;
  double sum, suma, sumc, sumg, sumt, w;

  freqa = 0.25;
  freqc = 0.25;
  freqg = 0.25;
  freqt = 0.25;
  for (k = 1; k <= 8; k++) {
    suma = 0.0;
    sumc = 0.0;
    sumg = 0.0;
    sumt = 0.0;
    for (i = 0; i < spp; i++) {
      for (j = 0; j < endsite; j++) {
        w = weight[j];
        sum = freqa * nodep[i]->x[j][0][0];
        sum += freqc * nodep[i]->x[j][0][(long)C - (long)A];
        sum += freqg * nodep[i]->x[j][0][(long)G - (long)A];
        sum += freqt * nodep[i]->x[j][0][(long)T - (long)A];
        suma += w * freqa * nodep[i]->x[j][0][0] / sum;
        sumc += w * freqc * nodep[i]->x[j][0][(long)C - (long)A] / sum;
        sumg += w * freqg * nodep[i]->x[j][0][(long)G - (long)A] / sum;
        sumt += w * freqt * nodep[i]->x[j][0][(long)T - (long)A] / sum;
      }
    }
    sum = suma + sumc + sumg + sumt;
    freqa = suma / sum;
    freqc = sumc / sum;
    freqg = sumg / sum;
    freqt = sumt / sum;
  }
}  /* dnadist_empiricalfreqs */


void getinput()
{
  /* reads the input data */
  inputoptions();
  if ((!freqsfrom) && !logdet && !similarity) {
    if (kimura || jukes) {
      freqa = 0.25;
      freqc = 0.25;
      freqg = 0.25;
      freqt = 0.25;
    }
    getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy,
                   &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange,
                   freqsfrom, printdata);
    if (freqa < 0.00000001) {
      freqa = 0.000001;
      freqc = 0.999999*freqc;
      freqg = 0.999999*freqg;
      freqt = 0.999999*freqt;
    }
    if (freqc < 0.00000001) {
      freqa = 0.999999*freqa;
      freqc = 0.000001;
      freqg = 0.999999*freqg;
      freqt = 0.999999*freqt;
    }
    if (freqg < 0.00000001) {
      freqa = 0.999999*freqa;
      freqc = 0.999999*freqc;
      freqg = 0.000001;
      freqt = 0.999999*freqt;
    }
    if (freqt < 0.00000001) {
      freqa = 0.999999*freqa;
      freqc = 0.999999*freqc;
      freqg = 0.999999*freqg;
      freqt = 0.000001;
    }
  }
  if (!justwts || firstset)
    inputdata(sites);
  makeweights();
  dnadist_makevalues();
  if (freqsfrom) {
    dnadist_empiricalfreqs();
    getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy,
                   &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange,
                   freqsfrom, printdata);
  }
}  /* getinput */


void inittable()
{
  /* Define a lookup table. Precompute values and store in a table */
  long i;

  for (i = 0; i < categs; i++) {
    tbl[i].rat = rate[i];
    tbl[i].ratxv = rate[i] * xv;
  }
}  /* inittable */


double lndet(double (*a)[4])
{
  long i, j, k;
  double temp, ld;

  /*Gauss-Jordan reduction -- invert matrix a in place,
         overwriting previous contents of a.  On exit, matrix a
         contains the inverse, lndet contains the log of the determinant */
  ld = 1.0;
  for (i = 0; i < 4; i++) {
    ld *= a[i][i];
    temp = 1.0 / a[i][i];
    a[i][i] = 1.0;
    for (j = 0; j < 4; j++)
      a[i][j] *= temp;
    for (j = 0; j < 4; j++) {
      if (j != i) {
        temp = a[j][i];
        a[j][i] = 0.0;
        for (k = 0; k < 4; k++)
          a[j][k] -= temp * a[i][k];
      }
    }
  }
  if (ld <= 0.0)
    return(99.0);
  else
    return(log(ld));
}  /* lndet */


void makev(long m, long n, double *v)
{
  /* compute one distance */
  long i, j, k, l, it, num1, num2, idx;
  long numerator = 0, denominator = 0;
  double sum, sum1, sum2, sumyr, lz, aa, bb, cc, vv=0,
         p1, p2, p3, q1, q2, q3, tt, delta = 0, slope,
         xx1freqa, xx1freqc, xx1freqg, xx1freqt;
  double *prod, *prod2, *prod3;
  boolean quick, jukesquick, kimquick, logdetquick, overlap;
  bases b;
  node *p, *q;
  sitelike xx1, xx2;
  double basetable[4][4];  /* for quick logdet */
  double basefreq1[4], basefreq2[4];

  p = nodep[m - 1];
  q = nodep[n - 1];

  /* check for overlap between sequences */
  overlap = false;
  for(i=0 ; i < sites ; i++){
    if((strchr("NX?O-",y[m-1][i])==NULL) && 
       (strchr("NX?O-",y[n-1][i])==NULL)){
      overlap = true;
      break;
    }
  }
  if(!overlap){
    printf("\nWARNING: NO OVERLAP BETWEEN SEQUENCES %ld AND %ld; -1.0 WAS WRITTEN\n", m, n);
    baddists = true;
    return;
  }

  quick = (!ctgry || categs == 1);
  if (jukes || kimura || logdet || similarity) {
    numerator = 0;
    denominator = 0;
    for (i = 0; i < endsite; i++) {
      memcpy(xx1, p->x[i][0], sizeof(sitelike));
      memcpy(xx2, q->x[i][0], sizeof(sitelike));
      sum = 0.0;
      sum1 = 0.0;
      sum2 = 0.0;
      for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) {
        sum1 += xx1[(long)b - (long)A];
        sum2 += xx2[(long)b - (long)A];
        sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A];
      }
      quick = (quick && (sum1 == 1.0 || sum1 == 4.0) &&
               (sum2 == 1.0 || sum2 == 4.0));
      if (sum1 == 1.0 && sum2 == 1.0) {
        numerator += (long)(weight[i] * sum);
        denominator += weight[i];
      }
    }
  }
  jukesquick = ((jukes || similarity) && quick);
  kimquick = (kimura && quick);
  logdetquick = (logdet && quick);
  if (logdet && !quick) {
    printf(" WARNING: CANNOT CALCULATE LOGDET DISTANCE\n");
    printf("  WITH PRESENT PROGRAM IF PARTIALLY AMBIGUOUS NUCLEOTIDES\n");
    printf("  -1.0 WAS WRITTEN\n");
    baddists = true;
  }
  if (jukesquick && jukes && (numerator * 4 <= denominator)) {
    printf("\nWARNING: INFINITE DISTANCE BETWEEN ");
    printf(" SPECIES %3ld AND %3ld\n", m, n);
    printf("  -1.0 WAS WRITTEN\n");
    baddists = true;
  }
  if (jukesquick && invar
      && (4 * (((double)numerator / denominator) - invarfrac)
          <= (1.0 - invarfrac))) {
    printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n);
    printf(" TOO LARGE FOR INVARIABLE SITES\n");
    printf("  -1.0 WAS WRITTEN\n");
    baddists = true;
  }
  if (jukesquick) {
    if (!gama && !invar)
      vv = -0.75 * log((4.0*((double)numerator / denominator) - 1.0) / 3.0);
    else if (!invar)
        vv = 0.75 * cvi * (exp(-(1/cvi)*
           log((4.0 * ((double)numerator / denominator) - 1.0) / 3.0)) - 1.0);
      else
        vv = 0.75 * cvi * (exp(-(1/cvi)*
           log((4.0 * ((double)numerator / denominator - invarfrac)/
                        (1.0-invarfrac) - 1.0) / 3.0)) - 1.0);
  }
  if (kimquick) {
    num1 = 0;
    num2 = 0;
    denominator = 0;
    for (i = 0; i < endsite; i++) {
      memcpy(xx1, p->x[i][0], sizeof(sitelike));
      memcpy(xx2, q->x[i][0], sizeof(sitelike));
      sum = 0.0;
      sum1 = 0.0;
      sum2 = 0.0;
      for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) {
        sum1 += xx1[(long)b - (long)A];
        sum2 += xx2[(long)b - (long)A];
        sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A];
      }
      sumyr = (xx1[0] + xx1[(long)G - (long)A])
            * (xx2[0] + xx2[(long)G - (long)A]) +
              (xx1[(long)C - (long)A] + xx1[(long)T - (long)A]) *
              (xx2[(long)C - (long)A] + xx2[(long)T - (long)A]);
      if (sum1 == 1.0 && sum2 == 1.0) {
        num1 += (long)(weight[i] * sum);
        num2 += (long)(weight[i] * (sumyr - sum));
        denominator += weight[i];
      }
    }
    tt = ((1.0 - (double)num1 / denominator)-invarfrac)/(1.0-invarfrac);
    if (tt > 0.0) {
      delta = 0.1;
      tt = delta;
      it = 0;
      while (fabs(delta) > 0.0000002 && it < iterationsd) {
        it++;
        if (!gama) {
          p1 = exp(-tt);
          p2 = exp(-xv * tt) - exp(-tt);
          p3 = 1.0 - exp(-xv * tt);
        } else {
          p1 = exp(-cvi * log(1 + tt / cvi));
          p2 = exp(-cvi * log(1 + xv * tt / cvi))
              - exp(-cvi * log(1 + tt / cvi));
          p3 = 1.0 - exp(-cvi * log(1 + xv * tt / cvi));
        }
        q1 = p1 + p2 / 2.0 + p3 / 4.0;
        q2 = p2 / 2.0 + p3 / 4.0;
        q3 = p3 / 2.0;
        q1 = q1 * (1.0-invarfrac) + invarfrac;
        q2 *= (1.0 - invarfrac);
        q3 *= (1.0 - invarfrac);
        if (!gama && !invar)
          slope = 0.5 * exp(-tt) * (num2 / q2 - num1 / q1) +
                  0.25 * xv * exp(-xv * tt) *
                 ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1);
        else
          slope = 0.5 * (1 / (1 + tt / cvi)) * exp(-cvi * log(1 + tt / cvi)) *
                  (num2 / q2 - num1 / q1) + 0.25 * (xv / (1 + xv * tt / cvi)) *
                    exp(-cvi * log(1 + xv * tt / cvi)) *
                 ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1);
        slope *= (1.0-invarfrac);
        if (slope < 0.0)
          delta = fabs(delta) / -2.0;
        else
          delta = fabs(delta);
        tt += delta;
      }
    }
    if ((delta >= 0.1) && (!similarity)) {
      printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n);
      if (invar)
        printf(" TOO LARGE FOR INVARIABLE SITES\n");
      else
        printf(" TOO LARGE TO ESTIMATE DISTANCE\n");
      printf("  -1.0 WAS WRITTEN\n");
      baddists = true;
    }
    vv = fracchange * tt;
  }
  if (!(jukesquick || kimquick || logdet)) {
    prod = (double *)Malloc(sites*sizeof(double));
    prod2 = (double *)Malloc(sites*sizeof(double));
    prod3 = (double *)Malloc(sites*sizeof(double));
    for (i = 0; i < endsite; i++) {
      memcpy(xx1, p->x[i][0], sizeof(sitelike));
      memcpy(xx2, q->x[i][0], sizeof(sitelike));
      xx1freqa = xx1[0] * freqa;
      xx1freqc = xx1[(long)C - (long)A] * freqc;
      xx1freqg = xx1[(long)G - (long)A] * freqg;
      xx1freqt = xx1[(long)T - (long)A] * freqt;
      sum1 = xx1freqa + xx1freqc + xx1freqg + xx1freqt;
      sum2 = freqa * xx2[0] + freqc * xx2[(long)C - (long)A] +
             freqg * xx2[(long)G - (long)A] + freqt * xx2[(long)T - (long)A];
      prod[i] = sum1 * sum2;
      prod2[i] = (xx1freqa + xx1freqg) *
                 (xx2[0] * freqar + xx2[(long)G - (long)A] * freqgr) +
          (xx1freqc + xx1freqt) *
          (xx2[(long)C - (long)A] * freqcy + xx2[(long)T - (long)A] * freqty);
      prod3[i] = xx1freqa * xx2[0] + xx1freqc * xx2[(long)C - (long)A] +
         xx1freqg * xx2[(long)G - (long)A] + xx1freqt * xx2[(long)T - (long)A];
    }
    tt = 0.1;
    delta = 0.1;
    it = 1;
    while (it < iterationsd && fabs(delta) > 0.0000002) {
      slope = 0.0;
      if (tt > 0.0) {
        lz = -tt;
        for (i = 0; i < categs; i++) {
          if (!gama) {
            tbl[i].z1 = exp(tbl[i].ratxv * lz);
            tbl[i].z1zz = exp(tbl[i].rat * lz);
          }
          else {
            tbl[i].z1 = exp(-cvi*log(1.0-tbl[i].ratxv * lz/cvi));
            tbl[i].z1zz = exp(-cvi*log(1.0-tbl[i].rat * lz/cvi));
          }
          tbl[i].y1 = 1.0 - tbl[i].z1;
          tbl[i].z1yy = tbl[i].z1 - tbl[i].z1zz;
          tbl[i].z1xv = tbl[i].z1 * xv;
        }
        for (i = 0; i < endsite; i++) {
          idx = category[alias[i] - 1];
          cc = prod[i];
          bb = prod2[i];
          aa = prod3[i];
          if (!gama && !invar)
            slope += weightrat[i] * (tbl[idx - 1].z1zz * (bb - aa) +
                                     tbl[idx - 1].z1xv * (cc - bb)) /
                         (aa * tbl[idx - 1].z1zz + bb * tbl[idx - 1].z1yy +
                          cc * tbl[idx - 1].y1);
          else
            slope += (1.0-invarfrac) * weightrat[i] * (
                    ((tbl[idx-1].rat)/(1.0-tbl[idx-1].rat * lz/cvi))
                       * tbl[idx - 1].z1zz * (bb - aa) +
                    ((tbl[idx-1].ratxv)/(1.0-tbl[idx-1].ratxv * lz/cvi))
                       * tbl[idx - 1].z1 * (cc - bb)) /
                (aa * ((1.0-invarfrac)*tbl[idx - 1].z1zz + invarfrac)
                  + bb * (1.0-invarfrac)*tbl[idx - 1].z1yy
                  + cc * (1.0-invarfrac)*tbl[idx - 1].y1);
        }
      }
      if (slope < 0.0)
        delta = fabs(delta) / -2.0;
      else
        delta = fabs(delta);
      tt += delta;
      it++;
    }
    if ((delta >= 0.1) && (!similarity)) {
      printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n);
      if (invar)
        printf(" TOO LARGE FOR INVARIABLE SITES\n");
      else
        printf(" TOO LARGE TO ESTIMATE DISTANCE\n");
      printf("  -1.0 WAS WRITTEN\n");
      baddists = true;
    }
    vv = tt * fracchange;
    free(prod);
    free(prod2);
    free(prod3);
  }
  if (logdetquick) {  /* compute logdet when no ambiguous nucleotides */
    for (i = 0; i < 4; i++) {
      basefreq1[i] = 0.0;
      basefreq2[i] = 0.0;
      for (j = 0; j < 4; j++)
        basetable[i][j] = 0.0;
    }
    for (i = 0; i < endsite; i++) {
      k = 0;
      while (p->x[i][0][k] == 0.0)
        k++;
      basefreq1[k] += weight[i];
      l = 0;
      while (q->x[i][0][l] == 0.0)
        l++;
      basefreq2[l] += weight[i];
      basetable[k][l] += weight[i];
    }
    vv = lndet(basetable);
    if (vv == 99.0) {
      printf("\nNegative or zero determinant for distance between species");
      printf(" %ld and %ld\n", m, n);
      printf("  -1.0 WAS WRITTEN\n");
      baddists = true;
    }
    vv = -0.25*(vv - 0.5*(log(basefreq1[0])+log(basefreq1[1])
                                        +log(basefreq1[2])+log(basefreq1[3])
                                        +log(basefreq2[0])+log(basefreq2[1])
                                        +log(basefreq2[2])+log(basefreq2[3])));
  }
  if (similarity) {
    if (denominator < 1.0) {
      printf("\nWARNING: SPECIES %3ld AND %3ld HAVE NO BASES THAT", m, n);
      printf(" CAN BE COMPARED\n");
      printf("  -1.0 WAS WRITTEN\n");
      baddists = true;
  }
    vv = (double)numerator / denominator;
  }
  *v = vv;
}  /* makev */


void makedists()
{
  /* compute distance matrix */
  long i, j;
  double v;

  inittable();
  for (i = 0; i < endsite; i++)
    weightrat[i] = weight[i] * rate[category[alias[i] - 1] - 1];
  if (progress) {
    printf("Distances calculated for species\n");
#ifdef WIN32
    phyFillScreenColor();
#endif
  }
  for (i = 0; i < spp; i++)
    if (similarity)
      d[i][i] = 1.0;
    else
      d[i][i] = 0.0;
  baddists = false;
  for (i = 1; i < spp; i++) {
    if (progress) {
      printf("    ");
      for (j = 0; j < nmlngth; j++)
        putchar(nayme[i - 1][j]);
      printf("   ");
    }
    for (j = i + 1; j <= spp; j++) {
      makev(i, j, &v);
      v = fabs(v);     
      if ( baddists == true ) {
        v = -1;
        baddists = false;
      }
      d[i - 1][j - 1] = v;
      d[j - 1][i - 1] = v;
      if (progress) {
        putchar('.');
        fflush(stdout);
      }
    }
    if (progress) {
      putchar('\n');
#ifdef WIN32
      phyFillScreenColor();
#endif
    }
  }
  if (progress) {
    printf("    ");
    for (j = 0; j < nmlngth; j++)
      putchar(nayme[spp - 1][j]);
    putchar('\n');
  }
  for (i = 0; i < spp; i++) {
    for (j = 0; j < endsite; j++)
      free(nodep[i]->x[j]);
    free(nodep[i]->x);
  }
}  /* makedists */


void writedists()
{
  /* write out distances */
  char **names;

  names = stringnames_new();
  output_matrix_d(outfile, d, spp, spp, names, names, matrix_flags);
  stringnames_delete(names);

  if (progress)
    printf("\nDistances written to file \"%s\"\n\n", outfilename);
}  /* writedists */


int main(int argc, Char *argv[])
{  /* DNA Distances by Maximum Likelihood */
#ifdef MAC
  argc = 1;                /* macsetup("Dnadist","");        */
  argv[0] = "Dnadist";
#endif
  init(argc, argv);
  openfile(&infile,INFILE,"input file","r",argv[0],infilename);
  openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename);

  ibmpc = IBMCRT;
  ansi = ANSICRT;
  mulsets = false;
  datasets = 1;
  firstset = true;
  doinit();
  ttratio0 = ttratio;
  if (ctgry)
    openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename);
  if (weights || justwts)
    openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename);
  for (ith = 1; ith <= datasets; ith++) {
    ttratio = ttratio0;
    getinput();
    if (ith == 1)
      firstset = false;
    if (datasets > 1 && progress)
      printf("Data set # %ld:\n\n",ith);
    makedists();
    writedists();
  }
  FClose(infile);
  FClose(outfile);
#ifdef MAC
  fixmacfile(outfilename);
#endif
  printf("Done.\n\n");
#ifdef WIN32
  phyRestoreConsoleAttributes();
#endif
  return 0;
}  /* DNA Distances by Maximum Likelihood */