/******************************COPYRIGHT NOTICE*******************************/ /* (c) Centro de Regulacio Genomica */ /* and */ /* Cedric Notredame */ /* 12 Aug 2014 - 22:07. */ /*All rights reserved. */ /*This file is part of T-COFFEE. */ /* */ /* T-COFFEE is free software; you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation; either version 2 of the License, or */ /* (at your option) any later version. */ /* */ /* T-COFFEE is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* */ /* You should have received a copy of the GNU General Public License */ /* along with Foobar; if not, write to the Free Software */ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /*............................................... */ /* If you need some more information */ /* cedric.notredame@europe.com */ /*............................................... */ /******************************COPYRIGHT NOTICE*******************************/ // #include "km_coffee.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "io_lib_header.h" #include "util_lib_header.h" #include "define_header.h" #include "km_coffee_header.h" /** * \brief Writes the sequences to the file * \param node The node containing the data to write * \param vecs The vector set * \param seq_set The sequence Set * \param name_f The file to write the sequence into. */ void write_files(KM_node *node, int *vecs, const SeqSet *seq_set, char *name_f) { size_t end = node->end; FILE * tmp_F = fopen(name_f, "w"); size_t i; for (i = node->start; i%s\n%s\n", seq_set->seqs[vecs[i]]->name, seq_set->seqs[vecs[i]]->seq); } fclose(tmp_F); } /** * \brief Frees all memory occupied by this structure * \param root The root of the tree to delete. */ void del_tree(KM_node* root) { Stack *to_do = Stack_init(); Node_pair *tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=root; tmp->id = 0; push(to_do, tmp); KM_node* current; size_t child; while (to_do->size != 0) { current = ((Node_pair*)to_do->last)->node; child = ((Node_pair*)to_do->last)->id; if (current->n_children==0) { free(pop(to_do)); delKM_node(current); } else { if (child == current->n_children) { delKM_node(current); free(pop(to_do)); } else { ++((Node_pair*)to_do->last)->id; tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=current->children[child]; tmp->id = 0; push(to_do, tmp); } } } delStack(to_do); } /** * \brief Traverses the tree and calls t_coffee to produce the alignments * \param root The root of the tree. * \param vecs The vector set. * \param seq_set The sequence set. * \param out_f The output file. * \param n_core The number of cores to use (OPEN-MP for kmeans/fork in T-Coffee) * \param gapopen The gapopening costs * \param gapext The gapextension costs * \param method The method to use in the alignment */ void traverse_km_tree(KM_node* root, int *vecs, const SeqSet *seq_set, char *out_f, int n_cores, int gapopen, int gapext, char *method) { Stack *to_do =Stack_init(); Node_pair *tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=root; tmp->id = 0; push(to_do, tmp); KM_node* current; size_t child; size_t j; // size_t pos; char command[1000]; while (to_do->size != 0) { current = ((Node_pair*)to_do->last)->node; child = ((Node_pair*)to_do->last)->id; if (current->n_children==0) { if(current->end-current->start > 2) { sprintf(command, "%li",current->id); write_files(current, vecs, seq_set, command); // sprintf(command,"clustalo --guidetree-out co_tree.dnd -i %li --force >/dev/null 2>/dev/null", current->id); // if (system(command)) // { // printf("%s\n",command); // exit(1); // } if (current->id!=0) sprintf(command,"t_coffee -in %li -output fasta_aln -outfile %li.fa -n_core %i -gapopen %i -gapext %i -method %s -quiet >/dev/null 2>/dev/null", current->id, current->id, n_cores, gapopen, gapext, method); else sprintf(command,"t_coffee -in %li -output fasta_aln -outfile %s -n_core %i -gapopen %i -gapext %i -method %s -quiet >/dev/null 2>/dev/null ", current->id, out_f, n_cores, gapopen, gapext, method); if (system(command)) { printf("ERROR when running: %s\n",command); exit(1); } } else if(current->end-current->start > 1) { sprintf(command, "%li",current->id); write_files(current, vecs, seq_set, command); if (current->id!=0) sprintf(command,"t_coffee -in %li -output fasta_aln -outfile %li.fa -n_core %i -gapopen %i -gapext %i -method %s -quiet >/dev/null 2>/dev/null", current->id, current->id, n_cores, gapopen, gapext, method); else sprintf(command,"t_coffee -in %li -output fasta_aln -outfile %s -n_core %i -gapopen %i -gapext %i -method %s -quiet >/dev/null 2>/dev/null ", current->id, out_f, n_cores, gapopen, gapext, method); printf("%s\n", command); if (system(command)) { printf("ERROR when running: %s\n",command); exit(1); } } else { sprintf(command, "%li.fa",current->id); write_files(current, vecs, seq_set, command); } tmp =(Node_pair*) pop(to_do); } else { if (child == current->n_children) { if (current->id!=0) sprintf(command, "t_coffee -output fasta_aln -method %s -quiet -outfile %li.fa -n_core %i -gapopen %i -profile FILE::prf.fa ", method, current->id,n_cores, gapopen); else sprintf(command, "t_coffee -output fasta_aln -method %s -quiet -outfile %s -n_core %i -gapopen %i -profile FILE::prf.fa ", method, out_f, n_cores, gapopen); FILE *prf_F = my_fopen("prf.fa", "w"); for(j=0; jn_children;++j) { fprintf(prf_F, "%li.fa\n", current->children[j]->id); } fclose(prf_F); printf("%s\n", command); if (system(command)) { printf("ERROR when running: %s\n",command); exit(1); } pop(to_do); } else { ++((Node_pair*)to_do->last)->id; tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=current->children[child]; tmp->id = 0; push(to_do, tmp); } } } exit(1); } int my_seq_sort (const void *i, const void *j) { return strcmp((*(Seq**)i)->seq,(*(Seq**)j)->seq); } void print_km_tree(KM_node *root, int *vecs, const SeqSet *seq_set, char *out_f) { FILE *out_F = fopen(out_f, "w"); Stack *to_do =Stack_init(); Node_pair *tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=root; tmp->id = 0; push(to_do, tmp); KM_node* current; size_t child; size_t k; // size_t pos; // char command[1000]; while (to_do->size != 0) { current = ((Node_pair*)to_do->last)->node; child = ((Node_pair*)to_do->last)->id; if (current->n_children==0) { if (current->end-current->start >1) fprintf(out_F, "("); for (k=current->start; kend; ++k) { fprintf(out_F, "%s", seq_set->seqs[vecs[k]]->name); if (kend-1) fprintf(out_F, ","); } tmp =(Node_pair*) pop(to_do); if (current->end-current->start >1) fprintf(out_F, ")"); } else { if (child == 0) { fprintf(out_F, "("); } if (child == current->n_children) { fprintf(out_F, ")"); pop(to_do); } else { if (child != 0) fprintf(out_F, ","); ++((Node_pair*)to_do->last)->id; tmp = (Node_pair*)my_malloc(sizeof(Node_pair)); tmp->node=current->children[child]; tmp->id = 0; push(to_do, tmp); } } } fprintf(out_F, ";"); } typedef struct { size_t id; float sim; } sorter; int my_val_compare (const void *i, const void *j) { return (*(Vector**)j)->val < (*(Vector**)j)->val; } /* KM_node* simple_clust(VectorSet *vecSet, unsigned int k) { size_t n_vecs = vecSet->n_vecs; size_t i, max_id=0, max_val=0; Vector **vecs = vecSet->vecs; size_t dim=vecSet->dim; // determine longest_seq for (i=0; iseq_len > max_val) { max_val=vecs[i]->seq_len; max_id=i; } } size_t num_nodes= ceil(n_vecs*1.0/k); KM_node **nodes = (KM_node**)malloc(sizeof(KM_node*)*num_nodes); size_t j; //determine leaves size_t node_id=0; for (j=0; jval=km_sq_dist(vecs[max_id], vecs[i],dim); qsort (vecs, n_vecs, sizeof(Vector*), my_val_compare); nodes[node_id] = malloc(sizeof(KM_node)); nodes[node_id]->children = NULL; nodes[node_id]->start = node_id*k; nodes[node_id]->end = (n_vecs>k )? (node_id+1)*k: (node_id*k)+n_vecs; nodes[node_id]->n_children=0; nodes[node_id]->id=++node_id; n_vecs -= k; max_id =0; vecs+=k; } // generate inner nodes j=num_nodes; KM_node *tmp; size_t pos=0; size_t overall_pos=0; // printf("1: %li %li\n", num_nodes, node_id); while (1) { for(i=0; ichildren=malloc(sizeof(KM_node*)*num_nodes); tmp->children[0]=nodes[i]; tmp->n_children=1; tmp->id=++node_id; nodes[overall_pos]=tmp; pos=0; ++overall_pos; } else { tmp->children[++pos]=nodes[i]; ++tmp->n_children; } } overall_pos=0; if (num_nodes==1) break; num_nodes=ceil(num_nodes*1.0/k); } // printf("%li\n", node_id); free(nodes); tmp->id=0; return tmp; }*/ /* typedef struct KM_node{ size_t start; size_t end; struct KM_node **children; size_t id; size_t n_children; } KM_node;*/ int km_coffee_align3(char *seq_f, int k, int k_leaf, char *method, char *aln_f, int n_cores, int gapopen, int gapext, char *init) { char *use_as_temp = get_tmp_4_tcoffee(); #ifdef _OPENMP omp_set_num_threads(n_cores); #endif SeqSet *seq_set = read_fasta(seq_f); qsort(seq_set->seqs, seq_set->n_seqs, sizeof(Seq*), my_seq_sort); srand(time(0)); short j = -1; short i; /**************************************************** Sequences to vector using k-mers *****************************************************/ short alphabet[256]; // standard alphabet for (i = 65; i < 91; ++i) if ((i==66) || (i==74) || (i==79) || (i==88) || (i==90)) alphabet[i] = 0; else alphabet[i] = ++j; j=-1; for (i = 97; i < 123; ++i) if ((i==98) || (i==106) || (i==111) || (i==120) || (i==122)) alphabet[i] = 0; else alphabet[i] = ++j; // shrinked alphabet // for (i = 0; i < 256; ++i) // alphabet[i] = 0; // char *groups[]={"LlVvIiMmCcAaGgSsTtPpFfYyWw","EeDdNnQqKkRrHh"}; // char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"}; // size_t n_groups = 4; // size_t len; // char *group; // for (i=0; in_vecs, vec_set->dim); // print_vecs(vec_set, &vec_file[0]); // read_vecs(vec_set, "matrix_59"); // exit(1); // normalize(vec_set); KM_node *root = hierarchical_kmeans(vec_set, k, k_leaf, init, 0.001); // KM_node *root = simple_clust(vec_set, k); char templatee[400]; sprintf(templatee, "%s/km_coffee_tmp_XXXXXX", use_as_temp); char tmp_str[FILENAME_MAX]; km_cwd = getcwd(tmp_str, FILENAME_MAX); km_tmp_dir = my_make_temp_dir(templatee, "main"); chdir(km_tmp_dir); char out_f[500]; if (aln_f[0] != '/') sprintf(out_f, "%s/%s", km_cwd, aln_f); else sprintf(out_f, "%s", aln_f); size_t n_vecs = seq_set->n_seqs; int *assignment = (int*)malloc(n_vecs*sizeof(int)); size_t l; for (l = 0; l< n_vecs; ++l) assignment[l]=vec_set->vecs[l]->id; // printf("TRAVERSE\n"); delVecSet(vec_set); traverse_km_tree(root, assignment, seq_set, out_f, n_cores, gapopen, gapext, method); free( assignment); del_tree(root); delSeqSet(seq_set); free(km_tmp_dir); return EXIT_SUCCESS; }