#include "Python.h" #include #include #include #include "cluster.h" /* ========================================================================= */ /* -- Helper routines ------------------------------------------------------ */ /* ========================================================================= */ #if PY_MAJOR_VERSION < 3 static char extract_single_character(PyObject* object, const char variable[], const char allowed[]) { char c = '\0'; const char* data; Py_ssize_t n; if (PyString_Check(object)) { n = PyString_GET_SIZE(object); if (n == 1) { data = PyString_AS_STRING(object); c = data[0]; } } else if (PyUnicode_Check(object)) { n = PyUnicode_GET_SIZE(object); if (n == 1) { Py_UNICODE* u = PyUnicode_AS_UNICODE(object); Py_UNICODE ch = u[0]; if (ch < 128) c = (char) ch; } } else { PyErr_Format(PyExc_ValueError, "%s should be a string", variable); return 0; } if (!c) { PyErr_Format(PyExc_ValueError, "%s should be a single character", variable); return 0; } else if (!strchr(allowed, c)) { PyErr_Format(PyExc_ValueError, "unknown %s function specified (should be one of '%s')", variable, allowed); return 0; } return c; } #else static char extract_single_character(PyObject* object, const char variable[], const char allowed[]) { Py_UCS4 ch; Py_ssize_t n; if (!PyUnicode_Check(object)) { PyErr_Format(PyExc_ValueError, "%s should be a string", variable); return 0; } if (PyUnicode_READY(object) == -1) return 0; n = PyUnicode_GET_LENGTH(object); if (n != 1) { PyErr_Format(PyExc_ValueError, "%s should be a single character", variable); return 0; } ch = PyUnicode_READ_CHAR(object, 0); if (ch < 128) { const char c = ch; if (strchr(allowed, c)) return c; } PyErr_Format(PyExc_ValueError, "unknown %s function specified (should be one of '%s')", variable, allowed); return 0; } #endif static int distance_converter(PyObject* object, void* pointer) { char c; c = extract_single_character(object, "dist", "ebcauxsk"); if (c == 0) return 0; *((char*)pointer) = c; return 1; } static int method_treecluster_converter(PyObject* object, void* pointer) { char c; c = extract_single_character(object, "method", "csma"); if (c == 0) return 0; *((char*)pointer) = c; return 1; } static int method_kcluster_converter(PyObject* object, void* pointer) { char c; c = extract_single_character(object, "method", "am"); if (c == 0) return 0; *((char*)pointer) = c; return 1; } static int method_clusterdistance_converter(PyObject* object, void* pointer) { char c; c = extract_single_character(object, "method", "amsxv"); if (c == 0) return 0; *((char*)pointer) = c; return 1; } /* -- data ----------------------------------------------------------------- */ typedef struct { int nrows; int ncols; double** values; Py_buffer view; } Data; static int data_converter(PyObject* object, void* pointer) { Data* data = pointer; int nrows; int ncols; int i; double** values; Py_buffer* view = &data->view; const char* p; Py_ssize_t stride; const int flag = PyBUF_ND | PyBUF_STRIDES; /* data should be initialized to 0 before calling this function. */ if (object == Py_None) return 1; if (PyObject_GetBuffer(object, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "data matrix has unexpected format."); return 0; } if (view->ndim != 2) { PyErr_Format(PyExc_ValueError, "data matrix has incorrect rank (%d expected 2)", view->ndim); return 0; } if (view->itemsize != sizeof(double)) { PyErr_SetString(PyExc_RuntimeError, "data matrix has incorrect data type"); return 0; } nrows = (int) view->shape[0]; ncols = (int) view->shape[1]; if (nrows != view->shape[0] || ncols != view->shape[1]) { PyErr_Format(PyExc_ValueError, "data matrix is too large (dimensions = %zd x %zd)", view->shape[0], view->shape[1]); return 0; } if (nrows < 1 || ncols < 1) { PyErr_SetString(PyExc_ValueError, "data matrix is empty"); return 0; } stride = view->strides[0]; if (view->strides[1] != view->itemsize) { PyErr_SetString(PyExc_RuntimeError, "data is not contiguous"); return 0; } values = malloc(nrows*sizeof(double*)); if (!values) { PyErr_NoMemory(); return 0; } for (i = 0, p = view->buf; i < nrows; i++, p += stride) values[i] = (double*)p; data->values = values; data->nrows = nrows; data->ncols = ncols; return 1; } static void free_data(Data* data) { double** values = data->values; Py_buffer* view = &data->view; PyBuffer_Release(view); if (values) free(values); } /* -- mask ----------------------------------------------------------------- */ typedef struct { int** values; Py_buffer view; } Mask; static int mask_converter(PyObject* object, void* pointer) { Mask* mask = pointer; int nrows; int ncols; int i; int** values; Py_buffer* view = &mask->view; const char* p; Py_ssize_t stride; const int flag = PyBUF_ND | PyBUF_STRIDES; /* data should be initialized to 0 before calling this function. */ if (object == Py_None) return 1; if (PyObject_GetBuffer(object, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "mask has unexpected format."); return 0; } if (view->ndim != 2) { PyErr_Format(PyExc_ValueError, "mask has incorrect rank (%d expected 2)", view->ndim); return 0; } if (view->itemsize != sizeof(int)) { PyErr_SetString(PyExc_RuntimeError, "mask has incorrect data type"); return 0; } nrows = (int) view->shape[0]; ncols = (int) view->shape[1]; if (nrows != view->shape[0] || ncols != view->shape[1]) { PyErr_Format(PyExc_ValueError, "mask is too large (dimensions = %zd x %zd)", view->shape[0], view->shape[1]); return 0; } stride = view->strides[0]; if (view->strides[1] != view->itemsize) { PyErr_SetString(PyExc_RuntimeError, "mask is not contiguous"); return 0; } values = malloc(nrows*sizeof(int*)); if (!values) { PyErr_NoMemory(); return 0; } for (i = 0, p = view->buf; i < nrows; i++, p += stride) values[i] = (int*)p; mask->values = values; return 1; } static void free_mask(Mask* mask) { int** values = mask->values; Py_buffer* view = &mask->view; PyBuffer_Release(view); if (values) free(values); } /* -- 1d array ------------------------------------------------------------- */ static int vector_converter(PyObject* object, void* pointer) { Py_buffer* view = pointer; int ndata; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; if (PyObject_GetBuffer(object, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "unexpected format."); return 0; } if (view->ndim != 1) { PyErr_Format(PyExc_ValueError, "incorrect rank (%d expected 1)", view->ndim); return 0; } if (view->itemsize != sizeof(double)) { PyErr_SetString(PyExc_RuntimeError, "array has incorrect data type"); return 0; } ndata = (int) view->shape[0]; if (ndata != view->shape[0]) { PyErr_Format(PyExc_ValueError, "array is too large (size = %zd)", view->shape[0]); return 0; } return 1; } static int vector_none_converter(PyObject* object, void* pointer) { if (object == Py_None) return 1; return vector_converter(object, pointer); } /* -- clusterid ------------------------------------------------------------ */ static int check_clusterid(Py_buffer clusterid) { int i, j; int *p = clusterid.buf; int nclusters = 0; int* number; const int nitems = (int) clusterid.shape[0]; if (nitems != clusterid.shape[0]) { PyErr_Format(PyExc_ValueError, "clusterid array is too large (size = %zd)", clusterid.shape[0]); return 0; } for (i = 0; i < nitems; i++) { j = p[i]; if (j > nclusters) nclusters = j; if (j < 0) { PyErr_SetString(PyExc_ValueError, "negative cluster number found"); return 0; } } nclusters++; /* -- Count the number of items in each cluster --------------------- */ number = calloc(nclusters, sizeof(int)); if (!number) { PyErr_NoMemory(); return 0; } for (i = 0; i < nitems; i++) { j = p[i]; number[j]++; } for (j = 0; j < nclusters; j++) if (number[j] == 0) break; free(number); if (j < nclusters) { PyErr_Format(PyExc_ValueError, "cluster %d is empty", j); return 0; } return nclusters; } /* -- distance ----------------------------------------------------------- */ typedef struct { int n; double** values; Py_buffer* views; Py_buffer view; } Distancematrix; static int _convert_list_to_distancematrix(PyObject* list, Distancematrix* distances) { int i; double** values; Py_buffer* view; Py_buffer* views; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; const int n = (int) PyList_GET_SIZE(list); if (n != PyList_GET_SIZE(list)) { PyErr_SetString(PyExc_ValueError, "distance matrix is too large"); return 0; } values = malloc(n*sizeof(double*)); if (!values) { PyErr_NoMemory(); return 0; } distances->values = values; views = malloc(n*sizeof(Py_buffer)); if (!views) { PyErr_NoMemory(); return 0; } distances->views = views; view = views; for (i = 0; i < n; i++, view++) { PyObject* item = PyList_GET_ITEM(list, i); view->len = -1; if (PyObject_GetBuffer(item, view, flag) == -1) { PyErr_Format(PyExc_RuntimeError, "failed to parse row %d.", i); break; } if (view->ndim != 1) { PyErr_Format(PyExc_ValueError, "row %d has incorrect rank (%d expected 1)", i, view->ndim); break; } if (view->itemsize != sizeof(double)) { PyErr_Format(PyExc_RuntimeError, "row %d has incorrect data type", i); break; } if (view->shape[0] != i) { PyErr_Format(PyExc_RuntimeError, "row %d has incorrect size %zd (expected %d)", i, view->shape[0], i); break; } values[i] = view->buf; } if (i < n) { for ( ; i >= 0; i--, view--) PyBuffer_Release(view); return 0; } distances->n = n; distances->view.len = 0; distances->views = views; distances->values = values; return 1; } static int _convert_array_to_distancematrix(PyObject* array, Distancematrix* distances) { int i; int n; double** values; double* p; Py_buffer* view = &distances->view; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; if (PyObject_GetBuffer(array, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "distance matrix has unexpected format."); return 0; } if (view->len == 0) { PyErr_SetString(PyExc_RuntimeError, "distance matrix is empty"); return 0; } if (view->itemsize != sizeof(double)) { PyErr_SetString(PyExc_RuntimeError, "distance matrix has an incorrect data type"); return 0; } if (view->ndim == 1) { int m = (int) view->shape[0]; if (m != view->shape[0]) { PyErr_Format(PyExc_ValueError, "distance matrix is too large (size = %zd)", view->shape[0]); return 0; } n = (int)(1+sqrt(1+8*m)/2); /* rounds to (1+sqrt(1+8*m))/2 */ if (n*n-n != 2 * m) { PyErr_SetString(PyExc_ValueError, "distance matrix has unexpected size."); return 0; } distances->n = n; values = malloc(n*sizeof(double*)); if (!values) { PyErr_NoMemory(); return 0; } distances->values = values; for (p = view->buf, i = 0; i < n; p += i, i++) values[i] = p; } else if (view->ndim == 2) { n = (int) view->shape[0]; if (n != view->shape[0]) { PyErr_Format(PyExc_ValueError, "distance matrix is too large (size = %zd)", view->shape[0]); return 0; } distances->n = n; if (view->shape[1] != n) { PyErr_SetString(PyExc_ValueError, "distance matrix is not square."); return 0; } values = malloc(n*sizeof(double*)); if (!values) { PyErr_NoMemory(); return 0; } distances->values = values; for (p = view->buf, i = 0; i < n; p += n, i++) values[i] = p; } else { PyErr_Format(PyExc_ValueError, "data matrix has incorrect rank (%d; expected 1 or 2)", view->ndim); return 0; } return 1; } static int distancematrix_converter(PyObject* argument, void* pointer) { Distancematrix* distances = pointer; if (argument == Py_None) return 1; if (PyList_Check(argument)) { return _convert_list_to_distancematrix(argument, distances); } else { return _convert_array_to_distancematrix(argument, distances); } } static void free_distancematrix(Distancematrix* distances) { double** values = distances->values; if (values == NULL) return; else { int i; const int n = distances->n; Py_buffer* views = distances->views; if (views) { for (i = 0; i < n; i++) PyBuffer_Release(&views[i]); free(views); } else PyBuffer_Release(&distances->view); free(values); } } /* -- celldata ------------------------------------------------------------- */ typedef struct { int nx; int ny; int nz; double*** values; Py_buffer view; } Celldata; static int celldata_converter(PyObject* argument, void* pointer) { int i, n; double* p; double** pp; double*** ppp; int nx; int ny; int nz; Celldata* celldata = pointer; Py_buffer* view = &celldata->view; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; if (PyObject_GetBuffer(argument, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "celldata array has unexpected format."); return 0; } nx = (int) view->shape[0]; ny = (int) view->shape[1]; nz = (int) view->shape[2]; if (nx != view->shape[0] || ny != view->shape[1] || nz != view->shape[2]) { PyBuffer_Release(view); PyErr_SetString(PyExc_RuntimeError, "celldata array too large"); return 0; } if (view->itemsize != sizeof(double)) { PyErr_SetString(PyExc_RuntimeError, "celldata array has incorrect data type"); return 0; } pp = malloc(nx*ny*sizeof(double*)); ppp = malloc(nx*sizeof(double**)); if (!pp || !ppp) { if (pp) free(pp); if (ppp) free(ppp); PyErr_NoMemory(); return 0; } p = view->buf; n = nx * ny; for (i = 0; i < n; i++, p += nz) pp[i] = p; for (i = 0; i < nx; i++, pp += ny) ppp[i] = pp; celldata->values = ppp; celldata->nx = nx; celldata->ny = ny; celldata->nz = nz; return 1; } static void free_celldata(Celldata* celldata) { double*** ppp = celldata->values; if (!ppp) return; free(ppp[0]); free(ppp); PyBuffer_Release(&celldata->view); } /* -- index ---------------------------------------------------------------- */ static int index_converter(PyObject* argument, void* pointer) { Py_buffer* view = pointer; int n; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; if (PyObject_GetBuffer(argument, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "unexpected format."); return 0; } if (view->ndim != 1) { PyErr_Format(PyExc_ValueError, "incorrect rank (%d expected 1)", view->ndim); return 0; } if (view->itemsize != sizeof(int)) { PyErr_SetString(PyExc_RuntimeError, "argument has incorrect data type"); return 0; } n = (int) view->shape[0]; if (n != view->shape[0]) { PyErr_Format(PyExc_ValueError, "array size is too large (size = %zd)", view->shape[0]); return 0; } return 1; } /* -- index2d ------------------------------------------------------------- */ static int index2d_converter(PyObject* argument, void* pointer) { Py_buffer* view = pointer; int n; const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS; if (PyObject_GetBuffer(argument, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "unexpected format."); return 0; } if (view->ndim != 2) { PyErr_Format(PyExc_ValueError, "incorrect rank (%d expected 2)", view->ndim); return 0; } if (view->itemsize != sizeof(int)) { PyErr_SetString(PyExc_RuntimeError, "argument has incorrect data type"); PyBuffer_Release(view); return 0; } n = (int) view->shape[0]; if (n != view->shape[0]) { PyErr_Format(PyExc_ValueError, "array size is too large (size = %zd)", view->shape[0]); return 0; } if (view->shape[1] != 2) { PyErr_Format(PyExc_ValueError, "expected 2 columns (found %zd columns)", view->shape[0]); return 0; } return 1; } /* ========================================================================= */ /* -- Classes -------------------------------------------------------------- */ /* ========================================================================= */ typedef struct { PyObject_HEAD Node node; } PyNode; static int PyNode_init(PyNode *self, PyObject *args, PyObject *kwds) { int left, right; double distance = 0.0; static char *kwlist[] = {"left", "right", "distance", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|d", kwlist, &left, &right, &distance)) return -1; self->node.left = left; self->node.right = right; self->node.distance = distance; return 0; } static PyObject* PyNode_repr(PyNode* self) { char string[64]; sprintf(string, "(%d, %d): %g", self->node.left, self->node.right, self->node.distance); #if PY_MAJOR_VERSION >= 3 return PyUnicode_FromString(string); #else return PyString_FromString(string); #endif } static char PyNode_left__doc__[] = "integer representing the first member of this node"; static PyObject* PyNode_getleft(PyNode* self, void* closure) { int left = self->node.left; #if PY_MAJOR_VERSION >= 3 return PyLong_FromLong((long)left); #else return PyInt_FromLong((long)left); #endif } static int PyNode_setleft(PyNode* self, PyObject* value, void* closure) { long left = PyLong_AsLong(value); if (PyErr_Occurred()) return -1; self->node.left = (int) left; return 0; } static char PyNode_right__doc__[] = "integer representing the second member of this node"; static PyObject* PyNode_getright(PyNode* self, void* closure) { int right = self->node.right; #if PY_MAJOR_VERSION >= 3 return PyLong_FromLong((long)right); #else return PyInt_FromLong((long)right); #endif } static int PyNode_setright(PyNode* self, PyObject* value, void* closure) { long right = PyLong_AsLong(value); if (PyErr_Occurred()) return -1; self->node.right = (int) right; return 0; } static PyObject* PyNode_getdistance(PyNode* self, void* closure) { return PyFloat_FromDouble(self->node.distance); } static int PyNode_setdistance(PyNode* self, PyObject* value, void* closure) { const double distance = PyFloat_AsDouble(value); if (PyErr_Occurred()) return -1; self->node.distance = distance; return 0; } static char PyNode_distance__doc__[] = "the distance between the two members of this node\n"; static PyGetSetDef PyNode_getset[] = { {"left", (getter)PyNode_getleft, (setter)PyNode_setleft, PyNode_left__doc__, NULL}, {"right", (getter)PyNode_getright, (setter)PyNode_setright, PyNode_right__doc__, NULL}, {"distance", (getter)PyNode_getdistance, (setter)PyNode_setdistance, PyNode_distance__doc__, NULL}, {NULL} /* Sentinel */ }; static char PyNode_doc[] = "A Node object describes a single node in a hierarchical clustering tree.\n" "The integer attributes 'left' and 'right' represent the two members that\n" "make up this node; the floating point attribute 'distance' contains the\n" "distance between the two members of this node.\n"; static PyTypeObject PyNodeType = { PyVarObject_HEAD_INIT(NULL, 0) "_cluster.Node", /* tp_name */ sizeof(PyNode), /* tp_basicsize */ 0, /* tp_itemsize */ 0, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ (reprfunc)PyNode_repr, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ PyNode_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ 0, /* tp_methods */ 0, /* tp_members */ PyNode_getset, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)PyNode_init, /* tp_init */ }; typedef struct { PyObject_HEAD Node* nodes; int n; } PyTree; static void PyTree_dealloc(PyTree* self) { if (self->n) free(self->nodes); Py_TYPE(self)->tp_free((PyObject*)self); } static PyObject* PyTree_new(PyTypeObject *type, PyObject* args, PyObject* kwds) { int i, j; int n; Node* nodes; PyObject* arg = NULL; int* flag; PyTree* self; self = (PyTree *)type->tp_alloc(type, 0); if (!self) return NULL; if (!PyArg_ParseTuple(args, "|O", &arg)) { Py_DECREF(self); return NULL; } if (arg == NULL) { self->n = 0; self->nodes = NULL; return (PyObject*)self; } if (!PyList_Check(arg)) { Py_DECREF(self); PyErr_SetString(PyExc_TypeError, "Argument should be a list of Node objects"); return NULL; } n = (int) PyList_GET_SIZE(arg); if (n != PyList_GET_SIZE(arg)) { Py_DECREF(self); PyErr_Format(PyExc_ValueError, "List is too large (size = %zd)", PyList_GET_SIZE(arg)); return NULL; } if (n < 1) { Py_DECREF(self); PyErr_SetString(PyExc_ValueError, "List is empty"); return NULL; } nodes = malloc(n*sizeof(Node)); if (!nodes) { Py_DECREF(self); return PyErr_NoMemory(); } for (i = 0; i < n; i++) { PyNode* p; PyObject* row = PyList_GET_ITEM(arg, i); if (!PyType_IsSubtype(Py_TYPE(row), &PyNodeType)) { free(nodes); Py_DECREF(self); PyErr_Format(PyExc_TypeError, "Row %d in list is not a Node object", i); return NULL; } p = (PyNode*)row; nodes[i] = p->node; } /* --- Check if this is a bona fide tree ------------------------------- */ flag = malloc((2*n+1)*sizeof(int)); if (!flag) { free(nodes); Py_DECREF(self); return PyErr_NoMemory(); } for (i = 0; i < 2*n+1; i++) flag[i] = 0; for (i = 0; i < n; i++) { j = nodes[i].left; if (j < 0) { j = -j-1; if (j >= i) break; } else j += n; if (flag[j]) break; flag[j] = 1; j = nodes[i].right; if (j < 0) { j = -j-1; if (j >= i) break; } else j += n; if (flag[j]) break; flag[j] = 1; } free(flag); if (i < n) { /* break encountered */ free(nodes); Py_DECREF(self); PyErr_SetString(PyExc_ValueError, "Inconsistent tree"); return NULL; } self->n = n; self->nodes = nodes; return (PyObject*)self; } static PyObject* PyTree_str(PyTree* self) { int i; const int n = self->n; char string[128]; Node node; PyObject* line; PyObject* output; #if PY_MAJOR_VERSION >= 3 PyObject* temp; output = PyUnicode_FromString(""); #else output = PyString_FromString(""); #endif for (i = 0; i < n; i++) { node = self->nodes[i]; sprintf(string, "(%d, %d): %g", node.left, node.right, node.distance); if (i < n-1) strcat(string, "\n"); #if PY_MAJOR_VERSION >= 3 line = PyUnicode_FromString(string); #else line = PyString_FromString(string); #endif if (!line) { Py_DECREF(output); return NULL; } #if PY_MAJOR_VERSION >= 3 temp = PyUnicode_Concat(output, line); if (!temp) { Py_DECREF(output); Py_DECREF(line); return NULL; } output = temp; #else PyString_ConcatAndDel(&output, line); if (!output) { Py_DECREF(line); return NULL; } #endif } return output; } static int PyTree_length(PyTree *self) { return self->n; } static PyObject* PyTree_subscript(PyTree* self, PyObject* item) { if (PyIndex_Check(item)) { PyNode* result; Py_ssize_t i; i = PyNumber_AsSsize_t(item, PyExc_IndexError); if (i == -1 && PyErr_Occurred()) return NULL; if (i < 0) i += self->n; if (i < 0 || i >= self->n) { PyErr_SetString(PyExc_IndexError, "tree index out of range"); return NULL; } result = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0); if (!result) return PyErr_NoMemory(); result->node = self->nodes[i]; return (PyObject*) result; } else if (PySlice_Check(item)) { Py_ssize_t i, j; Py_ssize_t start, stop, step, slicelength; #if PY_MAJOR_VERSION < 3 if (PySlice_GetIndicesEx((PySliceObject*)item, self->n, &start, &stop, &step, &slicelength) == -1) return NULL; #else if (PySlice_GetIndicesEx(item, self->n, &start, &stop, &step, &slicelength) == -1) return NULL; #endif if (slicelength == 0) return PyList_New(0); else { PyNode* node; PyObject* result = PyList_New(slicelength); if (!result) return PyErr_NoMemory(); for (i = 0, j = start; i < slicelength; i++, j += step) { node = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0); if (!node) { Py_DECREF(result); return PyErr_NoMemory(); } node->node = self->nodes[j]; PyList_SET_ITEM(result, i, (PyObject*)node); } return result; } } else { PyErr_Format(PyExc_TypeError, "tree indices must be integers, not %.200s", item->ob_type->tp_name); return NULL; } } static PyMappingMethods PyTree_mapping = { (lenfunc)PyTree_length, /* mp_length */ (binaryfunc)PyTree_subscript, /* mp_subscript */ }; static char PyTree_scale__doc__[] = "mytree.scale()\n" "\n" "Scale the node distances in the tree such that they are all between one\n" "and zero.\n"; static PyObject* PyTree_scale(PyTree* self) { int i; const int n = self->n; Node* nodes = self->nodes; double maximum = DBL_MIN; for (i = 0; i < n; i++) { double distance = nodes[i].distance; if (distance > maximum) maximum = distance; } if (maximum != 0.0) for (i = 0; i < n; i++) nodes[i].distance /= maximum; Py_INCREF(Py_None); return Py_None; } static char PyTree_cut__doc__[] = "mytree.cut(nclusters) -> array\n" "\n" "Divide the elements in a hierarchical clustering result mytree into\n" "clusters, and return an array with the number of the cluster to which each\n" "element was assigned. The number of clusters is given by nclusters.\n"; static PyObject* PyTree_cut(PyTree* self, PyObject* args) { int ok = -1; int nclusters; const int n = self->n + 1; Py_buffer indices = {0}; if (!PyArg_ParseTuple(args, "O&i", index_converter, &indices, &nclusters)) goto exit; if (nclusters < 1) { PyErr_SetString(PyExc_ValueError, "requested number of clusters should be positive"); goto exit; } if (nclusters > n) { PyErr_SetString(PyExc_ValueError, "more clusters requested than items available"); goto exit; } if (indices.shape[0] != n) { PyErr_SetString(PyExc_RuntimeError, "indices array inconsistent with tree"); goto exit; } ok = cuttree(n, self->nodes, nclusters, indices.buf); exit: PyBuffer_Release(&indices); if (ok == -1) return NULL; if (ok == 0) return PyErr_NoMemory(); Py_INCREF(Py_None); return Py_None; } static char PyTree_sort__doc__[] = "mytree.sort(order) -> array\n" "\n" "Sort a hierarchical clustering tree by switching the left and right\n" "subnode of nodes such that the elements in the left-to-right order of the\n" "tree tend to have increasing order values.\n" "\n" "Return the indices of the elements in the left-to-right order in the\n" "hierarchical clustering tree, such that the element with index indices[i]\n" "occurs at position i in the dendrogram.\n"; static PyObject* PyTree_sort(PyTree* self, PyObject* args) { int ok = -1; Py_buffer indices = {0}; const int n = self->n; Py_buffer order = {0}; if (n == 0) { PyErr_SetString(PyExc_ValueError, "tree is empty"); return NULL; } if (!PyArg_ParseTuple(args, "O&O&", index_converter, &indices, vector_converter, &order)) goto exit; if (indices.shape[0] != n + 1) { PyErr_SetString(PyExc_RuntimeError, "indices array inconsistent with tree"); goto exit; } ok = sorttree(n, self->nodes, order.buf, indices.buf); exit: PyBuffer_Release(&order); PyBuffer_Release(&indices); if (ok == -1) return NULL; if (ok == 0) return PyErr_NoMemory(); Py_INCREF(Py_None); return Py_None; } static PyMethodDef PyTree_methods[] = { {"scale", (PyCFunction)PyTree_scale, METH_NOARGS, PyTree_scale__doc__}, {"cut", (PyCFunction)PyTree_cut, METH_VARARGS, PyTree_cut__doc__}, {"sort", (PyCFunction)PyTree_sort, METH_VARARGS, PyTree_sort__doc__}, {NULL} /* Sentinel */ }; static char PyTree_doc[] = "Tree objects store a hierarchical clustering solution.\n" "Individual nodes in the tree can be accessed with tree[i], where i is\n" "an integer. Whereas the tree itself is a read-only object, tree[:]\n" "returns a list of all the nodes, which can then be modified. To create\n" "a new Tree from this list, use Tree(list).\n" "See the description of the Node class for more information."; static PyTypeObject PyTreeType = { PyVarObject_HEAD_INIT(NULL, 0) "_cluster.Tree", /* tp_name */ sizeof(PyTree), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)PyTree_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ &PyTree_mapping, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ (reprfunc)PyTree_str, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ PyTree_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ PyTree_methods, /* tp_methods */ NULL, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ (newfunc)PyTree_new, /* tp_new */ }; /* ========================================================================= */ /* -- Methods -------------------------------------------------------------- */ /* ========================================================================= */ /* version */ static char version__doc__[] = "version() -> string\n" "\n" "Return the version number of the C Clustering Library as a string.\n"; static PyObject* py_version(PyObject* self) { #if PY_MAJOR_VERSION >= 3 return PyUnicode_FromString( CLUSTERVERSION ); #else return PyString_FromString( CLUSTERVERSION ); #endif } /* kcluster */ static char kcluster__doc__[] = "kcluster(data, nclusters, mask, weight, transpose, npass, method,\n" " dist, clusterid) -> None\n" "\n" "This function implements k-means clustering.\n" "\n" "Arguments:\n" "\n" " - data: nrows x ncols array containing the data to be clustered\n" "\n" " - nclusters: number of clusters (the 'k' in k-means)\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j] == 0, then data[i,j] is missing.\n" "\n" " - weight: the weights to be used when calculating distances\n" " - transpose:\n" "\n" " - if equal to 0, rows are clustered;\n" " - if equal to 1, columns are clustered.\n" "\n" " - npass: number of times the k-means clustering algorithm is\n" " performed, each time with a different (random) initial\n" " condition. If npass == 0, then the assignments in clusterid\n" " are used as the initial condition.\n" "\n" " - method: specifies how the center of a cluster is found:\n" "\n" " - method == 'a': arithmetic mean\n" " - method == 'm': median\n" "\n" " - dist: specifies the distance function to be used:\n" "\n" " - dist == 'e': Euclidean distance\n" " - dist == 'b': City Block distance\n" " - dist == 'c': Pearson correlation\n" " - dist == 'a': absolute value of the correlation\n" " - dist == 'u': uncentered correlation\n" " - dist == 'x': absolute uncentered correlation\n" " - dist == 's': Spearman's rank correlation\n" " - dist == 'k': Kendall's tau\n" "\n" " - clusterid: array in which the final clustering solution will be\n" " stored (output variable). If npass == 0, then clusterid is also used\n" " as an input variable, containing the initial condition from which\n" " the EM algorithm should start. In this case, the k-means algorithm\n" " is fully deterministic.\n" "\n"; static PyObject* py_kcluster(PyObject* self, PyObject* args, PyObject* keywords) { int nclusters = 2; int nrows, ncols; int nitems; int ndata; Data data = {0}; Mask mask = {0}; Py_buffer weight = {0}; int transpose = 0; int npass = 1; char method = 'a'; char dist = 'e'; Py_buffer clusterid = {0}; double error; int ifound = 0; static char* kwlist[] = {"data", "nclusters", "mask", "weight", "transpose", "npass", "method", "dist", "clusterid", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iO&O&iiO&O&O&", kwlist, data_converter, &data, &nclusters, mask_converter, &mask, vector_converter, &weight, &transpose, &npass, method_kcluster_converter, &method, distance_converter, &dist, index_converter, &clusterid)) goto exit; if (!data.values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } if (data.nrows != mask.view.shape[0] || data.ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } nrows = data.nrows; ncols = data.ncols; ndata = transpose ? nrows : ncols; nitems = transpose ? ncols : nrows; if (weight.shape[0] != ndata) { PyErr_Format(PyExc_RuntimeError, "weight has incorrect size %zd (expected %d)", weight.shape[0], ndata); goto exit; } if (nclusters < 1) { PyErr_SetString(PyExc_ValueError, "nclusters should be positive"); goto exit; } if (nitems < nclusters) { PyErr_SetString(PyExc_ValueError, "more clusters than items to be clustered"); goto exit; } if (npass < 0) { PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer"); goto exit; } else if (npass == 0) { int n = check_clusterid(clusterid); if (n == 0) goto exit; if (n != nclusters) { PyErr_SetString(PyExc_RuntimeError, "more clusters requested than found in clusterid"); goto exit; } } kcluster(nclusters, nrows, ncols, data.values, mask.values, weight.buf, transpose, npass, method, dist, clusterid.buf, &error, &ifound); exit: free_data(&data); free_mask(&mask); PyBuffer_Release(&weight); PyBuffer_Release(&clusterid); if (ifound) return Py_BuildValue("di", error, ifound); return NULL; } /* end of wrapper for kcluster */ /* kmedoids */ static char kmedoids__doc__[] = "kmedoids(distance, nclusters, npass, clusterid) -> error, nfound\n" "\n" "This function implements k-medoids clustering.\n" "\n" "Arguments:\n" " - distance: The distance matrix between the elements. There are three\n" " ways in which you can pass a distance matrix:\n" "\n" " 1. a 2D Numerical Python array (in which only the left-lower\n" " part of the array will be accessed);\n" " 2. a 1D Numerical Python array containing the distances\n" " consecutively;\n" " 3. a list of rows containing the lower-triangular part of\n" " the distance matrix.\n" "\n" " Examples are:\n" "\n" " >>> from numpy import array\n" " >>> distance = array([[0.0, 1.1, 2.3],\n" " ... [1.1, 0.0, 4.5],\n" " ... [2.3, 4.5, 0.0]])\n" " >>> # (option #1)\n" " >>> distance = array([1.1, 2.3, 4.5])\n" " >>> # (option #2)\n" " >>> distance = [array([]),\n" " ... array([1.1]),\n" " ... array([2.3, 4.5])]\n" " >>> # (option #3)\n" "\n" " These three correspond to the same distance matrix.\n" "\n" " - nclusters: number of clusters (the 'k' in k-medoids)\n" "\n" " - npass: number of times the k-medoids clustering algorithm is\n" " performed, each time with a different (random) initial\n" " condition. If npass == 0, then the assignments in clusterid\n" " are used as the initial condition.\n" "\n" " - clusterid: array in which the final clustering solution will be\n" " stored (output variable). If npass == 0, then clusterid is also used\n" " as an input variable, containing the initial condition from which\n" " the EM algorithm should start. In this case, the k-medoids algorithm\n" " is fully deterministic.\n" "\n" "Return values:\n" " - error: the within-cluster sum of distances for the returned k-means\n" " clustering solution;\n" " - nfound: the number of times this solution was found.\n"; static PyObject* py_kmedoids(PyObject* self, PyObject* args, PyObject* keywords) { int nclusters = 2; Distancematrix distances = {0}; Py_buffer clusterid = {0}; int npass = 1; double error; int ifound = -2; static char* kwlist[] = {"distance", "nclusters", "npass", "clusterid", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iiO&", kwlist, distancematrix_converter, &distances, &nclusters, &npass, index_converter, &clusterid)) goto exit; if (npass < 0) { PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer"); goto exit; } else if (npass == 0) { int n = check_clusterid(clusterid); if (n == 0) goto exit; if (n != nclusters) { PyErr_SetString(PyExc_RuntimeError, "more clusters requested than found in clusterid"); goto exit; } } if (nclusters <= 0) { PyErr_SetString(PyExc_ValueError, "nclusters should be a positive integer"); goto exit; } if (distances.n < nclusters) { PyErr_SetString(PyExc_ValueError, "more clusters requested than items to be clustered"); goto exit; } kmedoids(nclusters, distances.n, distances.values, npass, clusterid.buf, &error, &ifound); exit: free_distancematrix(&distances); PyBuffer_Release(&clusterid); switch (ifound) { case -2: return NULL; case -1: return PyErr_NoMemory(); case 0: /* should not occur */ PyErr_SetString(PyExc_RuntimeError, "error in kmedoids input arguments"); return NULL; default: return Py_BuildValue("di", error, ifound); } } /* end of wrapper for kmedoids */ /* treecluster */ static char treecluster__doc__[] = "treecluster(tree, data, mask, weight, transpose, dist, method,\n" " distancematrix) -> None\n" "\n" "This function implements the pairwise single, complete, centroid, and\n" "average linkage hierarchical clustering methods.\n" "\n" "Arguments:\n" " - tree: an empty Tree object; its nodes will be filled by treecluster\n" " to describe the hierarchical clustering result. See the description\n" " of the Tree class for more information.\n" "\n" " - data: nrows x ncols array containing the data to be clustered.\n" " Either data or distancematrix (see below) should be None.\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j]==0, then data[i,j] is missing.\n" "\n" " - weight: the weights to be used when calculating distances.\n" "\n" " - transpose:\n" "\n" " - if equal to 0, rows are clustered;\n" " - if equal to 1, columns are clustered.\n" "\n" " - dist: specifies the distance function to be used:\n" "\n" " - dist == 'e': Euclidean distance\n" " - dist == 'b': City Block distance\n" " - dist == 'c': Pearson correlation\n" " - dist == 'a': absolute value of the correlation\n" " - dist == 'u': uncentered correlation\n" " - dist == 'x': absolute uncentered correlation\n" " - dist == 's': Spearman's rank correlation\n" " - dist == 'k': Kendall's tau\n" "\n" " - method: specifies which linkage method is used:\n" "\n" " - method == 's': Single pairwise linkage\n" " - method == 'm': Complete (maximum) pairwise linkage (default)\n" " - method == 'c': Centroid linkage\n" " - method == 'a': Average pairwise linkage\n" "\n" " - distancematrix: The distance matrix between the elements.\n" " Either data (see above) or distancematrix should be None.\n" " There are three ways in which you can pass a distance matrix:\n" "\n" " 1. a 2D Numerical Python array (in which only the left-lower\n" " part of the array will be accessed);\n" " 2. a 1D Numerical Python array containing the distances\n" " consecutively;\n" " 3. a list of rows containing the lower-triangular part of\n" " the distance matrix.\n" "\n" " Examples are:\n" "\n" " >>> from numpy import array\n" " >>> distance = array([[0.0, 1.1, 2.3],\n" " ... [1.1, 0.0, 4.5],\n" " ... [2.3, 4.5, 0.0]])\n" " >>> # option 1.\n" " >>> distance = array([1.1, 2.3, 4.5])\n" " >>> # option 2.\n" " >>> distance = [array([]),\n" " ... array([1.1]),\n" " ... array([2.3, 4.5])]\n" " >>> # option 3.\n" "\n" " These three correspond to the same distance matrix.\n" "\n" " PLEASE NOTE:\n" " As the treecluster routine may shuffle the values in the\n" " distance matrix as part of the clustering algorithm, be sure\n" " to save this array in a different variable before calling\n" " treecluster if you need it later.\n" "\n" "Either data or distancematrix should be None. If distancematrix is None,\n" "the hierarchical clustering solution is calculated from the values in\n" "the argument data. Instead if data is None, the hierarchical clustering\n" "solution is calculated from the distance matrix.\n" "Pairwise centroid-linkage clustering can be calculated only from the data\n" "and not from the distance matrix.\n" "Pairwise single-, maximum-, and average-linkage clustering can be\n" "calculated from either the data or from the distance matrix.\n"; static PyObject* py_treecluster(PyObject* self, PyObject* args, PyObject* keywords) { Data data = {0}; Mask mask = {0}; Py_buffer weight = {0}; int transpose = 0; char dist = 'e'; char method = 'm'; Distancematrix distances = {0}; PyTree* tree = NULL; Node* nodes; int nitems; static char* kwlist[] = {"tree", "data", "mask", "weight", "transpose", "method", "dist", "distancematrix", NULL }; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O!O&O&O&iO&O&O&", kwlist, &PyTreeType, &tree, data_converter, &data, mask_converter, &mask, vector_none_converter, &weight, &transpose, method_treecluster_converter, &method, distance_converter, &dist, distancematrix_converter, &distances)) goto exit; if (tree->n != 0) { PyErr_SetString(PyExc_RuntimeError, "expected an empty tree"); goto exit; } if (data.values != NULL && distances.values != NULL) { PyErr_SetString(PyExc_ValueError, "use either data or distancematrix, do not use both"); goto exit; } if (data.values == NULL && distances.values == NULL) { PyErr_SetString(PyExc_ValueError, "neither data nor distancematrix was given"); goto exit; } if (data.values) /* use the values in data, not the distance matrix */ { int nrows; int ncols; int ndata; if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } if (!weight.buf) { PyErr_SetString(PyExc_RuntimeError, "weight is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } ndata = transpose ? nrows : ncols; nitems = transpose ? ncols : nrows; if (weight.shape[0] != ndata) { PyErr_Format(PyExc_RuntimeError, "weight has incorrect size %zd (expected %d)", weight.shape[0], ndata); goto exit; } nodes = treecluster(nrows, ncols, data.values, mask.values, weight.buf, transpose, dist, method, NULL); } else { /* use the distance matrix instead of the values in data */ if (!strchr("sma", method)) { PyErr_SetString(PyExc_ValueError, "argument method should be 's', 'm', or 'a' " "when specifying the distance matrix"); goto exit; } nitems = distances.n; nodes = treecluster(nitems, nitems, 0, 0, 0, transpose, dist, method, distances.values); } if (!nodes) { PyErr_NoMemory(); goto exit; } tree->nodes = nodes; tree->n = nitems-1; exit: free_data(&data); free_mask(&mask); if (weight.buf) PyBuffer_Release(&weight); free_distancematrix(&distances); if (tree == NULL || tree->n == 0) return NULL; Py_INCREF(Py_None); return Py_None; } /* end of wrapper for treecluster */ /* somcluster */ static char somcluster__doc__[] = "somcluster(clusterid, celldata, data, mask, weight, transpose,\n" " inittau, niter, dist) -> None\n" "\n" "This function implements a self-organizing map on a rectangular grid.\n" "\n" "Arguments:\n" " - clusterid: array with two columns, with the number of rows equal\n" " to the number of items being clustered. Upon return, each row\n" " in the array contains the x and y coordinates of the cell in the\n" " the rectangular SOM grid to which the item was assigned.\n" "\n" " - celldata: array with dimensions nxgrid x nygrid x number of columns\n" " if rows are being clustered, or nxgrid x nygrid x number of rows\n" " if columns are being clustered, where nxgrid is the horizontal\n" " dimension of the rectangular SOM map and nygrid is the vertical\n" " dimension of the rectangular SOM map.\n" " Upon return, each element [ix, iy] of this array contains the\n" " data for the centroid of the cluster in the SOM grid cell with\n" " coordinates [ix, iy].\n" "\n" " - data: nrows x ncols array containing the data to be clustered.\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j] == 0, then data[i,j] is missing.\n" "\n" " - weight: the weights to be used when calculating distances\n" "\n" " - transpose:\n" "\n" " - if equal to 0, rows are clustered;\n" " - if equal to 1, columns are clustered.\n" "\n" " - inittau: the initial value of tau (the neighborbood function)\n" "\n" " - niter: the number of iterations\n" "\n" " - dist: specifies the distance function to be used:\n" "\n" " - dist == 'e': Euclidean distance\n" " - dist == 'b': City Block distance\n" " - dist == 'c': Pearson correlation\n" " - dist == 'a': absolute value of the correlation\n" " - dist == 'u': uncentered correlation\n" " - dist == 'x': absolute uncentered correlation\n" " - dist == 's': Spearman's rank correlation\n" " - dist == 'k': Kendall's tau\n"; static PyObject* py_somcluster(PyObject* self, PyObject* args, PyObject* keywords) { int nrows; int ncols; int ndata; Data data = {0}; Mask mask = {0}; Py_buffer weight = {0}; int transpose = 0; double inittau = 0.02; int niter = 1; char dist = 'e'; Py_buffer indices = {0}; Celldata celldata = {0}; PyObject* result = NULL; static char* kwlist[] = {"clusterids", "celldata", "data", "mask", "weight", "transpose", "inittau", "niter", "dist", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&idiO&", kwlist, index2d_converter, &indices, celldata_converter, &celldata, data_converter, &data, mask_converter, &mask, vector_converter, &weight, &transpose, &inittau, &niter, distance_converter, &dist)) goto exit; if (niter < 1) { PyErr_SetString(PyExc_ValueError, "number of iterations (niter) should be positive"); goto exit; } if (!data.values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } ndata = transpose ? nrows : ncols; if (weight.shape[0] != ndata) { PyErr_Format(PyExc_RuntimeError, "weight has incorrect size %zd (expected %d)", weight.shape[0], ndata); goto exit; } if (celldata.nz != ndata) { PyErr_Format(PyExc_RuntimeError, "the celldata array size is not consistent with the data " "(last dimension is %d; expected %d)", celldata.nz, ndata); goto exit; } somcluster(nrows, ncols, data.values, mask.values, weight.buf, transpose, celldata.nx, celldata.ny, inittau, niter, dist, celldata.values, indices.buf); Py_INCREF(Py_None); result = Py_None; exit: free_data(&data); PyBuffer_Release(&weight); free_celldata(&celldata); PyBuffer_Release(&indices); return result; } /* end of wrapper for somcluster */ /* clusterdistance */ static char clusterdistance__doc__[] = "clusterdistance(data, mask, weight, index1, index2, dist, method,\n" " transpose) -> distance between two clusters\n" "\n" "Arguments:\n" "\n" " - data: nrows x ncols array containing the data values.\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j] == 0, then data[i,j] is missing.\n" "\n" " - weight: the weights to be used when calculating distances\n" "\n" " - index1: 1D array identifying which items belong to the first\n" " cluster.\n" "\n" " - index2: 1D array identifying which items belong to the second\n" " cluster.\n" "\n" " - dist: specifies the distance function to be used:\n" "\n" " - dist == 'e': Euclidean distance\n" " - dist == 'b': City Block distance\n" " - dist == 'c': Pearson correlation\n" " - dist == 'a': absolute value of the correlation\n" " - dist == 'u': uncentered correlation\n" " - dist == 'x': absolute uncentered correlation\n" " - dist == 's': Spearman's rank correlation\n" " - dist == 'k': Kendall's tau\n" "\n" " - method: specifies how the distance between two clusters is defined:\n" "\n" " - method == 'a': the distance between the arithmetic means of the\n" " two clusters\n" " - method == 'm': the distance between the medians of the two\n" " clusters\n" " - method == 's': the smallest pairwise distance between members\n" " of the two clusters\n" " - method == 'x': the largest pairwise distance between members of\n" " the two clusters\n" " - method == 'v': average of the pairwise distances between\n" " members of the clusters\n" "\n" " - transpose:\n" "\n" " - if equal to 0: clusters of rows are considered;\n" " - if equal to 1: clusters of columns are considered.\n" "\n"; static PyObject* py_clusterdistance(PyObject* self, PyObject* args, PyObject* keywords) { double distance; int nrows; int ncols; int ndata; Data data = {0}; Mask mask = {0}; Py_buffer weight = {0}; char dist = 'e'; char method = 'a'; int transpose = 0; Py_buffer index1 = {0}; Py_buffer index2 = {0}; PyObject* result = NULL; static char* kwlist[] = {"data", "mask", "weight", "index1", "index2", "method", "dist", "transpose", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&O&O&i", kwlist, data_converter, &data, mask_converter, &mask, vector_converter, &weight, index_converter, &index1, index_converter, &index2, method_clusterdistance_converter, &method, distance_converter, &dist, &transpose)) goto exit; if (!data.values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; ndata = transpose ? nrows : ncols; if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } if (weight.shape[0] != ndata) { PyErr_Format(PyExc_RuntimeError, "weight has incorrect size %zd (expected %d)", weight.shape[0], ndata); goto exit; } distance = clusterdistance(nrows, ncols, data.values, mask.values, weight.buf, (int) index1.shape[0], (int) index2.shape[0], index1.buf, index2.buf, dist, method, transpose); if (distance < -0.5) /* Actually -1.0; avoiding roundoff errors */ PyErr_SetString(PyExc_IndexError, "index out of range"); else result = PyFloat_FromDouble(distance); exit: free_data(&data); free_mask(&mask); PyBuffer_Release(&weight); PyBuffer_Release(&index1); PyBuffer_Release(&index2); return result; } /* end of wrapper for clusterdistance */ /* clustercentroids */ static char clustercentroids__doc__[] = "clustercentroids(data, mask, clusterid, method, transpose) -> cdata, cmask\n" "\n" "The clustercentroids routine calculates the cluster centroids, given to\n" "which cluster each element belongs. The centroid is defined as either\n" "the mean or the median over all elements for each dimension.\n" "\n" "Arguments:\n" " - data: nrows x ncols array containing the data values.\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j] == 0, then data[i,j] is missing.\n" "\n" " - clusterid: array containing the cluster number for each item.\n" " The cluster number should be non-negative.\n" "\n" " - method: specifies whether the centroid is calculated from the\n" " arithmetic mean (method == 'a', default) or the median\n" " (method == 'm') over each dimension.\n" "\n" " - transpose: if equal to 0, row clusters are considered;\n" " if equal to 1, column clusters are considered.\n" "\n" " - cdata: 2D array containing, upon return, the cluster centroids.\n" " If transpose == 0, then the dimensions of cdata should be\n" " nclusters x ncols.\n" " If transpose == 1, then the dimensions of cdata should be \n" " nrows x nclusters.\n" "\n" " - cmask: 2D array of integers describing, upon return, which elements\n" " in cdata, if any, are missing.\n"; static PyObject* py_clustercentroids(PyObject* self, PyObject* args, PyObject* keywords) { int nrows; int ncols; int nclusters; Data data = {0}; Mask mask = {0}; Data cdata = {0}; Mask cmask = {0}; Py_buffer clusterid = {0}; char method = 'a'; int transpose = 0; int ok = -1; static char* kwlist[] = {"data", "mask", "clusterid", "method", "transpose", "cdata", "cmask", NULL }; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&iO&O&", kwlist, data_converter, &data, mask_converter, &mask, index_converter, &clusterid, method_kcluster_converter, &method, &transpose, data_converter, &cdata, mask_converter, &cmask)) goto exit; if (!data.values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } nclusters = check_clusterid(clusterid); if (nclusters == 0) goto exit; if (transpose == 0) nrows = nclusters; else ncols = nclusters; if (cdata.nrows != nrows) { PyErr_Format(PyExc_RuntimeError, "cdata has incorrect number of rows (%d, expected %d)", cdata.nrows, nrows); goto exit; } if (cdata.ncols != ncols) { PyErr_Format(PyExc_RuntimeError, "cdata has incorrect number of columns (%d, expected %d)", cdata.ncols, ncols); goto exit; } if (cmask.view.shape[0] != nrows) { PyErr_Format(PyExc_RuntimeError, "cmask has incorrect number of rows (%zd, expected %d)", cmask.view.shape[0], nrows); goto exit; } if (cmask.view.shape[1] != ncols) { PyErr_Format(PyExc_RuntimeError, "cmask has incorrect number of columns " "(%zd, expected %d)", cmask.view.shape[1], ncols); goto exit; } ok = getclustercentroids(nclusters, data.nrows, data.ncols, data.values, mask.values, clusterid.buf, cdata.values, cmask.values, transpose, method); exit: free_data(&data); free_mask(&mask); free_data(&cdata); free_mask(&cmask); PyBuffer_Release(&clusterid); if (ok == -1) return NULL; if (ok == 0) return PyErr_NoMemory(); Py_INCREF(Py_None); return Py_None; } /* end of wrapper for clustercentroids */ /* distancematrix */ static char distancematrix__doc__[] = "distancematrix(data, mask, weight, transpose, dist, distancematrix)\n" " -> None\n" "\n" "This function calculuates the distance matrix between the data values.\n" "\n" "Arguments:\n" "\n" " - data: nrows x ncols array containing the data values.\n" "\n" " - mask: nrows x ncols array of integers, showing which data are\n" " missing. If mask[i,j] == 0, then data[i,j] is missing.\n" "\n" " - weight: the weights to be used when calculating distances.\n" "\n" " - transpose: if equal to 0: the distances between rows are\n" " calculated;\n" " if equal to 1, the distances beteeen columns are calculated.\n" "\n" " - dist: specifies the distance function to be used:\n" "\n" " - dist == 'e': Euclidean distance\n" " - dist == 'b': City Block distance\n" " - dist == 'c': Pearson correlation\n" " - dist == 'a': absolute value of the correlation\n" " - dist == 'u': uncentered correlation\n" " - dist == 'x': absolute uncentered correlation\n" " - dist == 's': Spearman's rank correlation\n" " - dist == 'k': Kendall's tau\n" "\n" " - distancematrix: Upon return, the distance matrix as a list of 1D\n" " arrays. The number of columns in each row is equal to the row number\n" " (i.e., len(distancematrix[i]) == i).\n" " An example of the return value is:\n" "\n" " matrix = [[],\n" " array([1.]),\n" " array([7., 3.]),\n" " array([4., 2., 6.])]\n" "\n" "This corresponds to the distance matrix:\n" "\n" " [0.\t1.\t7.\t4.]\n" " [1.\t0.\t3.\t2.]\n" " [7.\t3.\t0.\t6.]\n" " [4.\t2.\t6.\t0.]\n"; static PyObject* py_distancematrix(PyObject* self, PyObject* args, PyObject* keywords) { PyObject* list; Distancematrix distances = {0}; Data data = {0}; Mask mask = {0}; Py_buffer weight = {0}; int transpose = 0; char dist = 'e'; int nrows, ncols, ndata; PyObject* result = NULL; /* -- Read the input variables --------------------------------------- */ static char* kwlist[] = {"data", "mask", "weight", "transpose", "dist", "distancematrix", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&iO&O!", kwlist, data_converter, &data, mask_converter, &mask, vector_converter, &weight, &transpose, distance_converter, &dist, &PyList_Type, &list)) goto exit; if (!data.values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } if (!mask.values) { PyErr_SetString(PyExc_RuntimeError, "mask is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) { PyErr_Format(PyExc_ValueError, "mask has incorrect dimensions (%zd x %zd, expected %d x %d)", mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols); goto exit; } ndata = (transpose == 0) ? ncols : nrows; if (weight.shape[0] != ndata) { PyErr_Format(PyExc_RuntimeError, "weight has incorrect size %zd (expected %d)", weight.shape[0], ndata); goto exit; } if (_convert_list_to_distancematrix(list, &distances) == 0) goto exit; distancematrix(nrows, ncols, data.values, mask.values, weight.buf, dist, transpose, distances.values); Py_INCREF(Py_None); result = Py_None; exit: free_data(&data); free_mask(&mask); PyBuffer_Release(&weight); free_distancematrix(&distances); return result; } /* end of wrapper for distancematrix */ /* pca */ static char pca__doc__[] = "pca(data, columnmean, coordinates, pc, eigenvalues) -> None\n" "\n" "This function calculates the principal component decomposition\n" "of the values in data.\n" "\n" "Arguments:\n" "\n" " - data: nrows x ncols array containing the data values.\n" "\n" " - columnmean: array of size nrows) in which the mean of each column\n" " will be sorted.\n" "\n" " - coordinates: nrows x nmin array in which the coordinates of the\n" " data along the principal components will be stored;\n" " nmin is min(nrows, ncols).\n" "\n" " - pc : the principal components as an nmin x ncols array, where nmin\n" " is min(nrows, ncols).\n" "\n" " - eigenvalues: array of size min(nrows, ncols), in which the\n" " eigenvalues will be stored, sorted by the magnitude\n" " of the eigenvalues, with the largest eigenvalues\n" " appearing first.\n" "\n" "Adding the column means to the dot product of the coordinates and the\n" "principal components, i.e.\n" "\n" " columnmean + dot(coordinates, pc)\n" "\n" "recreates the data matrix.\n"; static PyObject* py_pca(PyObject* self, PyObject* args) { Py_buffer eigenvalues = {0}; double** u; double** v; Data data = {0}; Data pc = {0}; Data coordinates = {0}; Py_buffer mean = {0}; int nrows, ncols; int nmin; int error = -2; double* p; double** values; int i, j; if (!PyArg_ParseTuple(args, "O&O&O&O&O&", data_converter, &data, vector_converter, &mean, data_converter, &coordinates, data_converter, &pc, vector_converter, &eigenvalues)) goto exit; values = data.values; if (!values) { PyErr_SetString(PyExc_RuntimeError, "data is None"); goto exit; } nrows = data.nrows; ncols = data.ncols; if (mean.shape[0] != ncols) { PyErr_Format(PyExc_RuntimeError, "columnmean has inconsistent size %zd (expected %d)", mean.shape[0], ncols); goto exit; } nmin = nrows < ncols ? nrows : ncols; if (pc.nrows != nmin || pc.ncols != ncols) { PyErr_Format(PyExc_RuntimeError, "pc has inconsistent size %zd x %zd (expected %d x %d)", mean.shape[0], mean.shape[1], nmin, ncols); goto exit; } if (coordinates.nrows != nrows || coordinates.ncols != nmin) { PyErr_Format(PyExc_RuntimeError, "coordinates has inconsistent size %zd x %zd (expected %d x %d)", mean.shape[0], mean.shape[1], nrows, nmin); goto exit; } if (nrows >= ncols) { u = coordinates.values; v = pc.values; } else { /* nrows < ncolums */ u = pc.values; v = coordinates.values; } /* -- Calculate the mean of each column ------------------------------ */ p = mean.buf; for (j = 0; j < ncols; j++) { p[j] = 0.0; for (i = 0; i < nrows; i++) p[j] += values[i][j]; p[j] /= nrows; } /* -- Subtract the mean of each column ----------------------------- */ for (i = 0; i < nrows; i++) for (j = 0; j < ncols; j++) u[i][j] = values[i][j] - p[j]; /* -- Perform the principal component analysis ----------------------- */ error = pca(nrows, ncols, u, v, eigenvalues.buf); /* ------------------------------------------------------------------- */ exit: free_data(&data); free_data(&pc); free_data(&coordinates); PyBuffer_Release(&eigenvalues); if (error == 0) { Py_INCREF(Py_None); return Py_None; } if (error == -1) return PyErr_NoMemory(); else if (error > 0) PyErr_SetString(PyExc_RuntimeError, "Singular value decomposition failed to converge"); return NULL; } /* end of wrapper for pca */ /* ========================================================================= */ /* -- The methods table ---------------------------------------------------- */ /* ========================================================================= */ static struct PyMethodDef cluster_methods[] = { {"version", (PyCFunction) py_version, METH_NOARGS, version__doc__}, {"kcluster", (PyCFunction) py_kcluster, METH_VARARGS | METH_KEYWORDS, kcluster__doc__ }, {"kmedoids", (PyCFunction) py_kmedoids, METH_VARARGS | METH_KEYWORDS, kmedoids__doc__ }, {"treecluster", (PyCFunction) py_treecluster, METH_VARARGS | METH_KEYWORDS, treecluster__doc__ }, {"somcluster", (PyCFunction) py_somcluster, METH_VARARGS | METH_KEYWORDS, somcluster__doc__ }, {"clusterdistance", (PyCFunction) py_clusterdistance, METH_VARARGS | METH_KEYWORDS, clusterdistance__doc__ }, {"clustercentroids", (PyCFunction) py_clustercentroids, METH_VARARGS | METH_KEYWORDS, clustercentroids__doc__ }, {"distancematrix", (PyCFunction) py_distancematrix, METH_VARARGS | METH_KEYWORDS, distancematrix__doc__ }, {"pca", (PyCFunction) py_pca, METH_VARARGS | METH_KEYWORDS, pca__doc__ }, {NULL, NULL, 0, NULL} /* sentinel */ }; /* ========================================================================= */ /* -- Initialization ------------------------------------------------------- */ /* ========================================================================= */ #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "_cluster", "C Clustering Library", -1, cluster_methods, NULL, NULL, NULL, NULL }; PyObject * PyInit__cluster(void) #else void init_cluster(void) #endif { PyObject *module; PyNodeType.tp_new = PyType_GenericNew; if (PyType_Ready(&PyNodeType) < 0) #if PY_MAJOR_VERSION >= 3 return NULL; #else return; #endif if (PyType_Ready(&PyTreeType) < 0) #if PY_MAJOR_VERSION >= 3 return NULL; #else return; #endif #if PY_MAJOR_VERSION >= 3 module = PyModule_Create(&moduledef); if (module == NULL) return NULL; #else module = Py_InitModule4("_cluster", cluster_methods, "C Clustering Library", NULL, PYTHON_API_VERSION); if (module == NULL) return; #endif Py_INCREF(&PyTreeType); Py_INCREF(&PyNodeType); PyModule_AddObject(module, "Tree", (PyObject*) &PyTreeType); PyModule_AddObject(module, "Node", (PyObject*) &PyNodeType); #if PY_MAJOR_VERSION >= 3 return module; #endif }