/******************************COPYRIGHT NOTICE*******************************/ /* (c) Centro de Regulacio Genomica */ /* and */ /* Cedric Notredame */ /* 12 Aug 2014 - 22:07. */ /*All rights reserved. */ /*This file is part of T-COFFEE. */ /* */ /* T-COFFEE is free software; you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation; either version 2 of the License, or */ /* (at your option) any later version. */ /* */ /* T-COFFEE is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* */ /* You should have received a copy of the GNU General Public License */ /* along with Foobar; if not, write to the Free Software */ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /*............................................... */ /* If you need some more information */ /* cedric.notredame@europe.com */ /*............................................... */ /******************************COPYRIGHT NOTICE*******************************/ // #include "kmeans.h" #include #include #include "km_coffee_header.h" //this function is declared in Vector.c double l2norm(Vector *vec, size_t size); /* void plusplus_init(const vector > > &vecs, unsigned int k, vector > > ¢ers, size_t start, size_t end) { centers.clear(); size_t n_points = vecs.size(); size_t center = rand() % n_points; double *distances = new double[n_points]; size_t i; for (i = 0; i < n_points; ++i) distances[i] = sq_dist(*(vecs[center]), *(vecs[i])); if (start == end) k=4; if (k==5) k=3; //TODO delete[] distances; }*/ /* void random_init(const vector > > &vecs, unsigned int k, vector > > ¢ers, size_t start, size_t end) { centers.clear(); size_t id, j, n_vecs=end-start, found = 0, dimension = vecs[0]->size(); map taken; Vector *p, *vec; while (found < k) { id = start + rand() % n_vecs; if (taken.count(id)) { taken.insert(pair(id,true)); p = new Vector(dimension); centers.push_back(boost::shared_ptr >(p)); vec = &(*(vecs[id])); for (j=0; jdim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=my_malloc(k*sizeof(Vector*)); size_t i,j; for (i = 0; i vecs[i]=my_malloc(sizeof(Vector)); centers->vecs[i]->data=my_calloc(dim,sizeof(double)); } // Calculate sum of distances from each point to each other double *sum_dists=my_calloc(end-start, sizeof(double)); double dist; for (i=start; ivecs[i], vec_set->vecs[j], dim); sum_dists[i-start] += dist; sum_dists[j-start] += dist; } } // Find minimum to every other and set as center double min_dist = DBL_MAX; size_t n_vecs = end-start, index=0; for (i=0; ivecs[0]->data; vec = vec_set->vecs[index]->data; for (j=0; jvecs[i], vec_set->vecs[index], dim); qsort (sum_dists, end-start, sizeof(double), aha_compare); int n_points = ceil(n_vecs/k); double y=0; for (i=0; ivecs[l], vec_set->vecs[min_index], dim); if (tmp_dist < min_dist) min_dist = tmp_dist; } ++min_index; printf("ARG: %li\n", min_index); dist+=min_dist*min_dist; } printf("%li\n", min_index); p = centers->vecs[i]->data; vec = vec_set->vecs[min_index]->data; for (j=0; j = y>D(x1)2+D(x2)2+...+ // D(x(i-1))2 // 9. Add xi to C // 10. Repeat steps 5-8 until k centers*/ // // } typedef struct { size_t x; double y; } init_pair; int init_compare (const void * a, const void * b) { return ( ((init_pair*)a)->y - ((init_pair*)b)->y ); } VectorSet * spss_like_init(const VectorSet *vec_set, unsigned int k, size_t start, size_t end) { VectorSet *centers = (VectorSet*)my_malloc(sizeof(VectorSet)); centers->dim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=(Vector**)my_malloc(k*sizeof(Vector*)); size_t i,j; for (i = 0; i vecs[i]=(Vector*)my_malloc(sizeof(Vector)); centers->vecs[i]->data=(double*)my_calloc(dim,sizeof(double)); } // Calculate sum of distances from each point to each other init_pair *dists=(init_pair*)my_malloc((end-start)* sizeof(init_pair)); init_pair *keep = dists; int points=end-start; for (i=0; ivecs[i], vec_set->vecs[j], dim); dists[i].y += dist; dists[j].y += dist; } } // Find minimum to every other and set as center double min_dist = DBL_MAX; size_t n_vecs = end-start, index=0; for (i=0; ivecs[0]->data; vec = vec_set->vecs[index]->data; for (j=0; jvecs[i-1], vec_set->vecs[dists[j].x], dim); if (tmp_dist < dists[j].y) dists[j].y = tmp_dist; } qsort (dists, points, sizeof(init_pair), init_compare); min_index=dists[n_points].x; // printf("T: %li\n", min_index); p = centers->vecs[i]->data; vec = vec_set->vecs[min_index]->data; for (j=0; jx - ((pair*)b)->x ); } int compare_pair_y (const void * a, const void * b) { return ( ((pair*)a)->y - ((pair*)b)->y ); } VectorSet * plusplus_like_init(const VectorSet *vec_set, unsigned int k, size_t start, size_t end) { VectorSet *centers = (VectorSet*)my_malloc(sizeof(VectorSet)); centers->dim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=(Vector**)my_malloc(k*sizeof(Vector*)); size_t i,j; for (i = 0; ivecs[i]=(Vector*)my_malloc(sizeof(Vector)); centers->vecs[i]->data=(double*)my_malloc(dim*sizeof(double)); centers->vecs[i]->id=i; } size_t n_vecs = end-start; pair *dists = (pair*)my_malloc(sizeof(pair) * n_vecs); for (i=0; ivecs[i], vec_set->vecs[j], dim); if (dist < dists[i].y) dists[i].y=dist; if (dist < dists[j].y) dists[j].y=dist; } } qsort (dists, n_vecs, sizeof(pair), compare_pair_y); size_t med_id = dists[n_vecs/2].x; double *p, *vec; p = centers->vecs[0]->data; vec = vec_set->vecs[med_id]->data; for (j=0; jvecs[i-1], vec_set->vecs[j], dim); if (tmp_dist < dists[j].y) dists[j].y=tmp_dist; } qsort (dists+i, n_vecs-i, sizeof(pair), compare_pair_y); p = centers->vecs[i]->data; vec = vec_set->vecs[dists[i+((n_vecs-i)/2)].x]->data; for (j=0; jdim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=(Vector**)my_malloc(k*sizeof(Vector*)); size_t i,j; for (i = 0; i vecs[i]=(Vector*)my_malloc(sizeof(Vector)); centers->vecs[i]->data=(double*)my_calloc(dim,sizeof(double)); centers->vecs[i]->id=i; } //get vector with largest l2norm double max_norm =-1, tmp; size_t index =0; for (i=start; ivecs[i], dim); if (tmp>max_norm) { max_norm=tmp; index=i; } } double *p, *vec; p = centers->vecs[0]->data; vec = vec_set->vecs[index]->data; for (j=0; jvecs[l], vec_set->vecs[j], dim); if (tmp_dist < min_dist) min_dist = tmp_dist; } if (min_dist > max_dist) { max_dist=min_dist; index=j; } } // printf("INDEX: %i\n", index); p = centers->vecs[i]->data; vec = vec_set->vecs[index]->data; for (j=0; jdim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=(Vector**)my_malloc(k*sizeof(Vector*)); size_t i; for (i = 0; i vecs[i]=(Vector*)my_malloc(sizeof(Vector)); centers->vecs[i]->data=(double*)my_calloc(dim,sizeof(double)); } double *p, *vec; size_t j,pos=start; int step = (end-start)/k; for (i = 0; i < k; ++i) { p = centers->vecs[i]->data; vec = vec_set->vecs[pos]->data; for (j=0; jdim=vec_set->dim; centers->n_vecs=k; size_t dim = vec_set->dim; centers->vecs=(Vector**)my_malloc(k*sizeof(Vector*)); size_t i; for (i = 0; i vecs[i]=(Vector*)my_malloc(sizeof(Vector)); centers->vecs[i]->data=(double*)my_calloc(dim,sizeof(double)); } size_t j,x = start+k; double *p, *vec; for (i = start; i < x; ++i) { p = centers->vecs[i-start]->data; vec = vec_set->vecs[i]->data; for (j=0; jn_vecs); } int my_assignment_sort (const void *i, const void *j) { return (*(Vector**)i)->assignment - (*(Vector**)j)->assignment; } void delKM_node(KM_node *node) { free(node->children); free(node); } KM_node* hierarchical_kmeans(VectorSet *vecs, unsigned int k, unsigned int k_leaf, const char *init, double error_threshold) { Stack *todo =Stack_init(); // Initialize tree KM_node *root = (KM_node*)my_malloc(sizeof(KM_node)); root->children=(KM_node**)malloc(k*sizeof(KM_node)); KM_node *current, *tmp; root->n_children=0; root->start=0; root->end=vecs->n_vecs; root->id=0; push(todo, root); size_t node_id = 0; int use_k; size_t start, end, i, old_index, old_assignment; while (todo->size != 0) { current =(KM_node*) todo->last; pop(todo); start = current->start; end = current->end; use_k = k;//(((end-start)/k) >k) ? k : max(2,(end-start)/k); kmeans_sub(vecs, use_k, init, error_threshold, start, end); qsort(&vecs->vecs[current->start], current->end-current->start, sizeof(Vector*), my_assignment_sort); old_index=start; old_assignment=vecs->vecs[start]->assignment; for (i=start; ivecs[i]->assignment != old_assignment) { tmp=(KM_node*)my_malloc(sizeof(KM_node)); tmp->children=(KM_node**)malloc(k*sizeof(KM_node*)); tmp->n_children=0; current->children[current->n_children++]=tmp; tmp->start=old_index; tmp->end=i; tmp->id = ++node_id; if (i - old_index > (k_leaf))//(k*1.5)) push(todo, tmp); old_index=i; old_assignment=vecs->vecs[i]->assignment; } } tmp=(KM_node*)my_malloc(sizeof(KM_node)); tmp->children=(KM_node**)malloc(k*sizeof(KM_node*)); tmp->n_children=0; current->children[current->n_children++]=tmp; tmp->start=old_index; tmp->end=i; tmp->id=++node_id; if ((vecs->vecs[start]->assignment != vecs->vecs[end-1]->assignment) && (i - old_index > (k_leaf))) push(todo, tmp); } delStack(todo); return root; } KM_node* hierarchical_kmeans2(VectorSet *vecs, unsigned int k, unsigned int k_leaf, const char *init, double error_threshold) { // Stack *todo =Stack_init(); size_t node_id = 0; int use_k; size_t n_vecs, i, old_assignment; VectorSet *centers; KM_node **node_array=(KM_node**)malloc(vecs->n_vecs*sizeof(KM_node*)); // each sequence gets a node for (i=0; in_vecs; ++i) { node_array[i]=(KM_node*)my_malloc(sizeof(KM_node)); node_array[i]->start=i; node_array[i]->end=i+1; node_array[i]->id = ++node_id; node_array[i]->n_children=0; } // cluster the sequences so that each cluster on average has k sub nodes int tmp_node_id; KM_node *tmp; size_t max_reserve, old_id; while (vecs->n_vecs > k) { n_vecs= vecs->n_vecs; use_k = std::max(2, (int)(n_vecs/k)); //use_k = max(2, (int)(n_vecs/k)); centers=kmeans_sub_eq_size(vecs, use_k, init, error_threshold, 0, n_vecs); qsort(&vecs->vecs[0], n_vecs, sizeof(Vector*), my_assignment_sort); old_assignment=-1; tmp_node_id=-1; old_id=0; KM_node **tmp_nodes=(KM_node**)malloc(use_k*sizeof(KM_node*)); VectorSet *tmpSet = (VectorSet*)my_malloc(sizeof(VectorSet)); tmpSet->dim=vecs->dim; tmpSet->n_vecs=0; tmpSet->vecs=(Vector**)my_malloc(use_k*sizeof(Vector*)); for (i=0; in_vecs; ++i) { if (vecs->vecs[i]->assignment != old_assignment) { // printf("%li\n", i - old_id); old_id=i; tmp=(KM_node*)my_malloc(sizeof(KM_node)); tmp_nodes[++tmp_node_id]=tmp; tmpSet->vecs[tmp_node_id]=centers->vecs[vecs->vecs[i]->assignment]; tmpSet->vecs[tmp_node_id]->id=tmp_node_id; ++tmpSet->n_vecs; max_reserve=k; tmp->children=(KM_node**)malloc(max_reserve*sizeof(KM_node*)); tmp->n_children=0; tmp->children[tmp->n_children++]=node_array[vecs->vecs[i]->id]; tmp->id = ++node_id; old_assignment=vecs->vecs[i]->assignment; } else { if (tmp->n_children == max_reserve) { max_reserve += 5; tmp->children=(KM_node**)realloc(tmp->children, max_reserve*sizeof(KM_node*)); } tmp->children[tmp->n_children++]=node_array[vecs->vecs[i]->id]; } } for (i=0; i<=tmp_node_id; ++i) node_array[i] = tmp_nodes[i]; free(tmp_nodes); vecs=tmpSet; tmpSet->n_vecs=++tmp_node_id; } // add the root node KM_node *root; if (vecs->n_vecs==1) root=node_array[0]; else { root=(KM_node*)my_malloc(sizeof(KM_node)); root->children=(KM_node**)malloc(use_k*sizeof(KM_node*)); root->n_children=0; for (i=0; in_vecs; ++i) root->children[root->n_children++]=node_array[vecs->vecs[i]->id]; } free(node_array); root->id=0; return root; } void kmeans_sub(const VectorSet *vecs, unsigned int k, const char *init, double error_threshold, size_t start, size_t end) { // printf("%li-%li\n", start, end); // determine first centers VectorSet *centers; if (!strcmp(init, "distributed")) centers = distributed_init(vecs, k, start, end); else if (!strcmp(init, "kkz")) centers = kkz_init(vecs, k, start, end); else if (!strcmp(init, "spss")) centers = spss_like_init(vecs, k, start, end); else if (!strcmp(init, "first")) centers = first_init(vecs, k, start); else if (!strcmp(init, "plusplus")) centers = plusplus_like_init(vecs, k, start, end); else { printf("%s\n", init); printf("Unknown initialization value!"); exit(1); } size_t *n_inside = (size_t*)malloc(k*sizeof(size_t)); size_t i; for (i=0; idim; size_t j; Vector *center_tmp; int chunk = 30; int round=0; do { ++round; old_error = error; error=0; #pragma omp parallel shared(start, end, vecs, chunk, k, centers, n_inside) private(id, new_center_id, min_dist, center_id, tmp_dist) { #pragma omp for schedule(dynamic,chunk) reduction(+:error) nowait for (id = start; id < end; ++id) { min_dist = DBL_MAX; new_center_id=-1; for (center_id = 0; center_id < k; ++center_id) { tmp_dist = sqrt(km_sq_dist(vecs->vecs[id], centers->vecs[center_id],dimension)); // tmp_dist = km_angle_dist(vecs->vecs[id]->data, centers->vecs[center_id]->data,dimension); // tmp_dist = km_common(vecs->vecs[id]->data, centers->vecs[center_id]->data,dimension); if (tmp_dist < min_dist) { new_center_id = center_id; min_dist = tmp_dist; } } ++n_inside[new_center_id]; if ((round != 1) && (vecs->vecs[id]->assignment != new_center_id)) --n_inside[vecs->vecs[id]->assignment]; vecs->vecs[id]->assignment=new_center_id; error += min_dist; } } chunk=10; // set centers to 0 double *tmp,*tmp2; for (i=0; ivecs[center_id]->data; for (j=0; jvecs[id]->data; tmp2 = centers->vecs[vecs->vecs[id]->assignment]->data; ++n_inside[vecs->vecs[id]->assignment]; for (j=0; jvecs[center_id]; if (n_inside[center_id]) { for (j=0; jdata[j] /= n_inside[center_id]; } } } // for (i=0; i error_threshold); free(n_inside); delVecSet(centers); } VectorSet * kmeans_sub_eq_size(const VectorSet *vecs, unsigned int k, const char *init, double error_threshold, size_t start, size_t end) { // determine first centers VectorSet *centers; if (!strcmp(init, "distributed")) centers = distributed_init(vecs, k, start, end); else if (!strcmp(init, "kkz")) centers = kkz_init(vecs, k, start, end); else if (!strcmp(init, "spss")) centers = spss_like_init(vecs, k, start, end); else if (!strcmp(init, "plusplus")) centers = plusplus_like_init(vecs, k, start, end); else if (!strcmp(init, "first")) centers = first_init(vecs, k, start); else { printf("%s\n", init); printf("Unknown initialization value!"); exit(1); } size_t *n_inside = (size_t*)malloc(k*sizeof(double)); size_t i; for (i=0; idim; size_t j; Vector *center_tmp; size_t *nums =(size_t*) my_malloc(k*sizeof(size_t)); int chunk = 30; int round=0; do { ++round; old_error = error; error=0; for (i = 0; ivecs[id], centers->vecs[center_id],dimension))*(n_inside[center_id]*0.2)-log(n_inside[center_id]); // tmp_dist = km_sq_dist(vecs->vecs[id], centers->vecs[center_id],dimension); // tmp_dist = km_muscle_dist(vecs->vecs[id], centers->vecs[center_id],dimension, 3); // tmp_dist = km_angle_dist(vecs->vecs[id]->data, centers->vecs[center_id]->data,dimension); // tmp_dist = 1-(km_common(vecs->vecs[id]->data, centers->vecs[center_id]->data,dimension)/vecs->vecs[id]->seq_len); if (tmp_dist < min_dist) { new_center_id = center_id; min_dist = tmp_dist; } } ++n_inside[new_center_id]; if ((round != 1) && (vecs->vecs[id]->assignment != new_center_id)) --n_inside[vecs->vecs[id]->assignment]; vecs->vecs[id]->assignment=new_center_id; error += min_dist; #pragma omp atomic ++nums[new_center_id]; } // } chunk=10; // set centers to 0 double *tmp,*tmp2; for (i=0; ivecs[center_id]->data; for (j=0; jvecs[id]->data; tmp2 = centers->vecs[vecs->vecs[id]->assignment]->data; ++n_inside[vecs->vecs[id]->assignment]; for (j=0; jvecs[center_id]; if (nums[center_id]) { for (j=0; jdata[j] /= nums[center_id]; } } } // for (i=0; i error_threshold); free(nums); // delVecSet(centers); return centers; } double km_common(const double *vec1, const double *vec2, size_t dim) { double common = 0; size_t i; for (i=0; i0) && (vec2[i] >0)) ++common; } return common; }