#include #include #include #include #include #include #include "clustalw.h" #define MAXERRS 10 /* * Prototypes */ static void create_tree(treeptr ptree, treeptr parent); static void create_node(treeptr pptr, treeptr parent); static treeptr insert_node(treeptr pptr); static void skip_space(FILE *fd); static treeptr avail(void); static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist); static treeptr reroot(treeptr ptree, sint nseqs); static treeptr insert_root(treeptr p, float diff); static float calc_root_mean(treeptr root, float *maxdist); static float calc_mean(treeptr nptr, float *maxdist, sint nseqs); static void order_nodes(void); static sint calc_weight(sint leaf); static void group_seqs(treeptr p, sint *next_groups, sint nseqs); static void mark_group1(treeptr p, sint *groups, sint n); static void mark_group2(treeptr p, sint *groups, sint n); static void save_set(sint n, sint *groups); static void clear_tree_nodes(treeptr p); /* * Global variables */ extern Boolean interactive; extern Boolean distance_tree; extern Boolean usemenu; extern sint debug; extern double **tmat; extern sint **sets; extern sint nsets; extern char **names; extern sint *seq_weight; extern Boolean no_weights; char ch; FILE *fd; treeptr *lptr; treeptr *olptr; treeptr *nptr; treeptr *ptrs; sint nnodes = 0; sint ntotal = 0; Boolean rooted_tree = TRUE; static treeptr seq_tree,root; static sint *groups, numseq; void calc_seq_weights(sint first_seq, sint last_seq, sint *sweight) { sint i, nseqs; sint temp, sum, *weight; /* If there are more than three sequences.... */ nseqs = last_seq-first_seq; if ((nseqs >= 2) && (distance_tree == TRUE) && (no_weights == FALSE)) { /* Calculate sequence weights based on Phylip tree. */ weight = (sint *)ckalloc((last_seq+1) * sizeof(sint)); for (i=first_seq; i= 2) { /* If there are more than three sequences.... */ groups = (sint *)ckalloc((nseqs+1) * sizeof(sint)); group_seqs(root, groups, nseqs); groups=ckfree((void *)groups); } else { groups = (sint *)ckalloc((nseqs+1) * sizeof(sint)); for (i=0;ileaf distances for the left and right branches of the tree. */ if (distance_tree == FALSE) { if (rooted_tree == FALSE) { error("input tree is unrooted and has no distances.\nCannot align sequences"); return((sint)0); } } if (rooted_tree == FALSE) { root = reroot(seq_tree, last_seq-first_seq+1); } else { root = seq_tree; } /* calculate the 'order' of each node. */ order_nodes(); if (numseq >= 2) { /* If there are more than three sequences.... */ /* assign the sequence nodes (in the same order as in the alignment file) */ for (i=first_seq; i MAXNAMES) warning("name %s is too long for PHYLIP tree format (max %d chars)", names[i+1],MAXNAMES); for (k=0; k< strlen(names[i+1]) && k0x40) && (c<0x5b)) c=c | 0x20; if (c == ' ') c = '_'; name2[k] = c; } name2[k]='\0'; found = FALSE; for (j=0; jname) && kname[k]; if ((c>0x40) && (c<0x5b)) c=c | 0x20; name1[k] = c; } name1[k]='\0'; if (strcmp(name1, name2) == 0) { olptr[i] = lptr[j]; found = TRUE; } } if (found == FALSE) { error("tree not compatible with alignment:\n%s not found", name2); return((sint)0); } } } return((sint)1); } static void create_tree(treeptr ptree, treeptr parent) { treeptr p; sint i, type; float dist; char name[MAXNAMES+1]; /* is this a node or a leaf ? */ skip_space(fd); ch = (char)getc(fd); if (ch == '(') { /* this must be a node.... */ type = NODE; name[0] = '\0'; ptrs[ntotal] = nptr[nnodes] = ptree; nnodes++; ntotal++; create_node(ptree, parent); p = ptree->left; create_tree(p, ptree); if ( ch == ',') { p = ptree->right; create_tree(p, ptree); if ( ch == ',') { ptree = insert_node(ptree); ptrs[ntotal] = nptr[nnodes] = ptree; nnodes++; ntotal++; p = ptree->right; create_tree(p, ptree); rooted_tree = FALSE; } } skip_space(fd); ch = (char)getc(fd); } /* ...otherwise, this is a leaf */ else { type = LEAF; ptrs[ntotal++] = lptr[numseq++] = ptree; /* get the sequence name */ name[0] = ch; ch = (char)getc(fd); i = 1; while ((ch != ':') && (ch != ',') && (ch != ')')) { if (i < MAXNAMES) name[i++] = ch; ch = (char)getc(fd); } name[i] = '\0'; if (ch != ':') { distance_tree = FALSE; dist = 0.0; } } /* get the distance information */ dist = 0.0; if (ch == ':') { skip_space(fd); fscanf(fd,"%f",&dist); skip_space(fd); ch = (char)getc(fd); } set_info(ptree, parent, type, name, dist); } static void create_node(treeptr pptr, treeptr parent) { treeptr t; pptr->parent = parent; t = avail(); pptr->left = t; t = avail(); pptr->right = t; } static treeptr insert_node(treeptr pptr) { treeptr newnode; newnode = avail(); create_node(newnode, pptr->parent); newnode->left = pptr; pptr->parent = newnode; set_info(newnode, pptr->parent, NODE, "", 0.0); return(newnode); } static void skip_space(FILE *fd) { int c; do c = getc(fd); while(isspace(c)); ungetc(c, fd); } static treeptr avail(void) { treeptr p; p = ckalloc(sizeof(stree)); p->left = NULL; p->right = NULL; p->parent = NULL; p->dist = 0.0; p->leaf = 0; p->order = 0; p->name[0] = '\0'; return(p); } void clear_tree(treeptr p) { clear_tree_nodes(p); nptr=ckfree((void *)nptr); ptrs=ckfree((void *)ptrs); lptr=ckfree((void *)lptr); olptr=ckfree((void *)olptr); } static void clear_tree_nodes(treeptr p) { if (p==NULL) p = root; if (p->left != NULL) { clear_tree_nodes(p->left); } if (p->right != NULL) { clear_tree_nodes(p->right); } p->left = NULL; p->right = NULL; p=ckfree((void *)p); } static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist) { p->parent = parent; p->leaf = pleaf; p->dist = pdist; p->order = 0; strcpy(p->name, pname); if (p->leaf == TRUE) { p->left = NULL; p->right = NULL; } } static treeptr reroot(treeptr ptree, sint nseqs) { treeptr p, rootnode, rootptr; float diff, mindiff = 0.0, mindepth = 1.0, maxdist; sint i; Boolean first = TRUE; /* find the difference between the means of leaf->node distances on the left and on the right of each node */ rootptr = ptree; for (i=0; iparent == NULL) diff = calc_root_mean(p, &maxdist); else diff = calc_mean(p, &maxdist, nseqs); if ((diff == 0) || ((diff > 0) && (diff < 2 * p->dist))) { if ((maxdist < mindepth) || (first == TRUE)) { first = FALSE; rootptr = p; mindepth = maxdist; mindiff = diff; } } } /* insert a new node as the ancestor of the node which produces the shallowest tree. */ if (rootptr == ptree) { mindiff = rootptr->left->dist + rootptr->right->dist; rootptr = rootptr->right; } rootnode = insert_root(rootptr, mindiff); diff = calc_root_mean(rootnode, &maxdist); return(rootnode); } static treeptr insert_root(treeptr p, float diff) { treeptr newp, prev, q, t; float dist, prevdist,td; newp = avail(); t = p->parent; prevdist = t->dist; p->parent = newp; dist = p->dist; p->dist = diff / 2; if (p->dist < 0.0) p->dist = 0.0; if (p->dist > dist) p->dist = dist; t->dist = dist - p->dist; newp->left = t; newp->right = p; newp->parent = NULL; newp->dist = 0.0; newp->leaf = NODE; if (t->left == p) t->left = t->parent; else t->right = t->parent; prev = t; q = t->parent; t->parent = newp; while (q != NULL) { if (q->left == prev) { q->left = q->parent; q->parent = prev; td = q->dist; q->dist = prevdist; prevdist = td; prev = q; q = q->left; } else { q->right = q->parent; q->parent = prev; td = q->dist; q->dist = prevdist; prevdist = td; prev = q; q = q->right; } } /* remove the old root node */ q = prev; if (q->left == NULL) { dist = q->dist; q = q->right; q->dist += dist; q->parent = prev->parent; if (prev->parent->left == prev) prev->parent->left = q; else prev->parent->right = q; prev->right = NULL; } else { dist = q->dist; q = q->left; q->dist += dist; q->parent = prev->parent; if (prev->parent->left == prev) prev->parent->left = q; else prev->parent->right = q; prev->left = NULL; } return(newp); } static float calc_root_mean(treeptr root, float *maxdist) { float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff; treeptr p; sint i; sint nl, nr; sint direction; /* for each leaf, determine whether the leaf is left or right of the root. */ dist = (*maxdist) = 0; nl = nr = 0; for (i=0; i< numseq; i++) { p = lptr[i]; dist = 0.0; while (p->parent != root) { dist += p->dist; p = p->parent; } if (p == root->left) direction = LEFT; else direction = RIGHT; dist += p->dist; if (direction == LEFT) { lsum += dist; nl++; } else { rsum += dist; nr++; } if (dist > (*maxdist)) *maxdist = dist; } lmean = lsum / nl; rmean = rsum / nr; diff = lmean - rmean; return(diff); } static float calc_mean(treeptr nptr, float *maxdist, sint nseqs) { float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff; treeptr p, *path2root; float *dist2node; sint depth = 0, i,j , n = 0; sint nl , nr; sint direction, found; path2root = (treeptr *)ckalloc(nseqs * sizeof(treeptr)); dist2node = (float *)ckalloc(nseqs * sizeof(float)); /* determine all nodes between the selected node and the root; */ depth = (*maxdist) = dist = 0; nl = nr = 0; p = nptr; while (p != NULL) { path2root[depth] = p; dist += p->dist; dist2node[depth] = dist; p = p->parent; depth++; } /* *nl = *nr = 0; for each leaf, determine whether the leaf is left or right of the node. (RIGHT = descendant, LEFT = not descendant) */ for (i=0; i< numseq; i++) { p = lptr[i]; if (p == nptr) { direction = RIGHT; dist = 0.0; } else { direction = LEFT; dist = 0.0; /* find the common ancestor. */ found = FALSE; n = 0; while ((found == FALSE) && (p->parent != NULL)) { for (j=0; j< depth; j++) if (p->parent == path2root[j]) { found = TRUE; n = j; } dist += p->dist; p = p->parent; } if (p == nptr) direction = RIGHT; } if (direction == LEFT) { lsum += dist; lsum += dist2node[n-1]; nl++; } else { rsum += dist; nr++; } if (dist > (*maxdist)) *maxdist = dist; } dist2node=ckfree((void *)dist2node); path2root=ckfree((void *)path2root); lmean = lsum / nl; rmean = rsum / nr; diff = lmean - rmean; return(diff); } static void order_nodes(void) { sint i; treeptr p; for (i=0; iorder++; p = p->parent; } } } static sint calc_weight(sint leaf) { treeptr p; float weight = 0.0; p = olptr[leaf]; while (p->parent != NULL) { weight += p->dist / p->order; p = p->parent; } weight *= 100.0; return((sint)weight); } static void group_seqs(treeptr p, sint *next_groups, sint nseqs) { sint i; sint *tmp_groups; tmp_groups = (sint *)ckalloc((nseqs+1) * sizeof(sint)); for (i=0;ileft != NULL) { if (p->left->leaf == NODE) { group_seqs(p->left, next_groups, nseqs); for (i=0;ileft, tmp_groups, nseqs); } } if (p->right != NULL) { if (p->right->leaf == NODE) { group_seqs(p->right, next_groups, nseqs); for (i=0;iright, tmp_groups, nseqs); } save_set(nseqs, tmp_groups); } for (i=0;i= 2) { /* for each leaf, determine all nodes between the leaf and the root; */ for (i = 0;idist; dist2node[depth] = dist; p = p->parent; depth++; } /* for each pair.... */ for (j=0; j < i; j++) { p = olptr[j]; dist = 0.0; /* find the common ancestor. */ found = FALSE; n = 0; while ((found == FALSE) && (p->parent != NULL)) { for (k=0; k< depth; k++) if (p->parent == path2root[k]) { found = TRUE; n = k; } dist += p->dist; p = p->parent; } dmat[i][j] = dist + dist2node[n-1]; } } nerrs = 0; for (i=0;i 1.0) { if (dmat[i][j] > 1.1 && nerrs0) { strcpy(err_mess,"The following sequences are too divergent to be aligned:\n"); for (i=0;i