#include #include #include #include #include "clustalw.h" /* * Prototypes */ /* * Global Variables */ extern double **tmat; extern Boolean no_weights; extern sint debug; extern sint max_aa; extern sint nseqs; extern sint profile1_nseqs; extern sint nsets; extern sint **sets; extern sint divergence_cutoff; extern sint *seq_weight; extern sint output_order, *output_index; extern Boolean distance_tree; extern char seqname[]; extern sint *seqlen_array; extern char **seq_array; sint malign(sint istart,char *phylip_name) /* full progressive alignment*/ { static sint *aligned; static sint *group; static sint ix; sint *maxid, max, sum; sint *tree_weight; sint i,j,set,iseq=0; sint status,entries; lint score = 0; info("Start of Multiple Alignment"); /* get the phylogenetic tree from *.ph */ if (nseqs >= 2) { status = read_tree(phylip_name, (sint)0, nseqs); if (status == 0) return((sint)0); } /* calculate sequence weights according to branch lengths of the tree - weights in global variable seq_weight normalised to sum to 100 */ calc_seq_weights((sint)0, nseqs, seq_weight); /* recalculate tmat matrix as percent similarity matrix */ status = calc_similarities(nseqs); if (status == 0) return((sint)0); /* for each sequence, find the most closely related sequence */ maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); for (i=1;i<=nseqs;i++) { maxid[i] = -1; for (j=1;j<=nseqs;j++) if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j]; } /* group the sequences according to their relative divergence */ if (istart == 0) { sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) ); for(i=0;i<=nseqs;i++) sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); create_sets((sint)0,nseqs); info("There are %d groups",(pint)nsets); /* clear the memory used for the phylogenetic tree */ if (nseqs >= 2) clear_tree(NULL); /* start the multiple alignments......... */ info("Aligning..."); /* first pass, align closely related sequences first.... */ ix = 0; aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); for (i=0;i<=nseqs;i++) aligned[i] = 0; for(set=1;set<=nsets;++set) { entries=0; for (i=1;i<=nseqs;i++) { if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff)) { entries++; if (aligned[i] == 0) { if (output_order==INPUT) { ++ix; output_index[i] = i; } else output_index[++ix] = i; aligned[i] = 1; } } } if(entries > 0) score = prfalign(sets[set], aligned); else score=0.0; /* negative score means fatal error... exit now! */ if (score < 0) { return(-1); } if ((entries > 0) && (score > 0)) info("Group %d: Sequences:%4d Score:%d", (pint)set,(pint)entries,(pint)score); else info("Group %d: Delayed", (pint)set); } for (i=0;i<=nseqs;i++) sets[i]=ckfree((void *)sets[i]); sets=ckfree(sets); } else { /* clear the memory used for the phylogenetic tree */ if (nseqs >= 2) clear_tree(NULL); aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); ix = 0; for (i=1;i<=istart+1;i++) { aligned[i] = 1; ++ix; output_index[i] = i; } for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; } /* second pass - align remaining, more divergent sequences..... */ /* if not all sequences were aligned, for each unaligned sequence, find it's closest pair amongst the aligned sequences. */ group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); for (i=0;i max)) { max = maxid[i]; iseq = i; } } /* align this sequence to the existing alignment */ /* weight sequences with percent identity with profile*/ /* OR...., multiply sequence weights from tree by percent identity with new sequence */ if(no_weights==FALSE) { for (j=0;j= 2) { status = read_tree(phylip_name, (sint)0, nseqs); if (status == 0) return(0); } /* calculate sequence weights according to branch lengths of the tree - weights in global variable seq_weight normalised to sum to 100 */ calc_seq_weights((sint)0, nseqs, seq_weight); tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); for (i=0;i= 2) clear_tree(NULL); aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); ix = 0; for (i=1;i<=istart+1;i++) { aligned[i] = 1; ++ix; output_index[i] = i; } for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; /* for each unaligned sequence, find it's closest pair amongst the aligned sequences. */ group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); while (ix < nseqs) { if (ix > 0) { for (i=1;i<=nseqs;i++) { if (aligned[i] == 0) { maxid[i] = -1; for (j=1;j<=nseqs;j++) if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0)) maxid[i] = tmat[i][j]; } } } /* find the most closely related sequence to those already aligned */ max = -1; for (i=1;i<=nseqs;i++) { if ((aligned[i] == 0) && (maxid[i] > max)) { max = maxid[i]; iseq = i; } } /* align this sequence to the existing alignment */ entries = 0; for (j=1;j<=nseqs;j++) if (aligned[j] != 0) { group[j] = 1; entries++; } else if (iseq==j) { group[j] = 2; entries++; } aligned[iseq] = 1; /* EITHER....., set sequence weights equal to percent identity with new sequence */ /* for (j=0;j1) for (j=0;j 1) { fprintf(stdout,"new weights\n"); for (j=0;j=0) && (c1= 2) { status = read_tree(p1_tree_name, (sint)0, profile1_nseqs); if (status == 0) return(0); } /* calculate sequence weights according to branch lengths of the tree - weights in global variable seq_weight normalised to sum to 100 */ p1_weight = (sint *) ckalloc( (profile1_nseqs) * sizeof(sint) ); calc_seq_weights((sint)0, profile1_nseqs, p1_weight); /* clear the memory for the phylogenetic tree */ if (profile1_nseqs >= 2) clear_tree(NULL); if (nseqs-profile1_nseqs >= 2) { status = read_tree(p2_tree_name, profile1_nseqs, nseqs); if (status == 0) return(0); } p2_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); calc_seq_weights(profile1_nseqs,nseqs, p2_weight); /* clear the memory for the phylogenetic tree */ if (nseqs-profile1_nseqs >= 2) clear_tree(NULL); /* convert tmat distances to similarities */ for (i=1;i 1) { fprintf(stdout,"new weights\n"); for (j=0;j