/* # Copyright 2007, 2008 # Niko Beerenwinkel, # Nicholas Eriksson, # Lukas Geyrhofer, # Osvaldo Zagordi, # ETH Zurich # This file is part of ShoRAH. # ShoRAH is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # ShoRAH is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with ShoRAH. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include "data_structures.h" #include "dpm_sampler.h" #define PROPHISTSIZE 100 int main(int argc, char** argv){ unsigned int i, j, k, ll, K1, mesh, tot_untouch, new_proposed=0; int dk1,hapbases; int* p; cnode* tn; //rnode* rn; rnode* tr; double quality; ssret* samp_stat; double dt; double_threshold = - (double)DBL_DIG * gsl_sf_log(10.0); printf("# dna_code:\t"); putchar(i2dna_code[0]); putchar(i2dna_code[1]); putchar(i2dna_code[2]); putchar(i2dna_code[3]); putchar(i2dna_code[4]); putchar(i2dna_code[5]); printf("\n"); if( (assign = fopen("assignment.tmp", "w")) == NULL ){ printf("Impossible to write file assignment.tmp\n"); exit(EXIT_FAILURE); } parsecommandline(argc, argv); printf("# randseed = %ld\n",randseed); // random number generator via gsl gsl_rng_env_setup(); T = gsl_rng_default; rg = gsl_rng_alloc(T); gsl_rng_set(rg, randseed); dist = calloc(2, sizeof(int)); res_dist = calloc(2, sizeof(int)); res = (ssret*)malloc(sizeof(ssret)); cbase = (int*) malloc(B * sizeof(int)); // count base pbase = (double*) malloc(B * sizeof(double)); log_pbase = (double*) malloc(B * sizeof(double)); mesh = 1;//iter/100; read_data(filein); build_assignment(); printf("# +++++++++++++++++++ BEFORE THE SAMPLING +++++++++++++++++++\n"); print_stats(mxt, J); printf("#\titeration\tcomponents\tuntouched\ttheta\tgamma\n"); /********************* sampling procedure *********************/ if(write_clusterfile) { fp_clustersize = fopen(clusterfile_output,"w"); tn = mxt; while(tn != NULL) { fprintf(fp_clustersize," %d %d",tn->ci,tn->size); tn=tn->next; } fprintf(fp_clustersize,"\n"); } for(k=0; k<=iter; k++){ #ifdef DEBUG printf("-----------------------------------------------\n"); printf("-----------> sampling the %ith time <-----------\n", k); printf("-----------------------------------------------\n"); #endif // fprintf(stderr, "\x1B[1A\x1B[2K iteration %i\n", k); tot_untouch = 0; // sample classes for all reads for(i=0; iuntouched; new_proposed += samp_stat->proposed; } // sample haplotypes from reads tn = mxt; K1 = 0; dt = 0.0; dk1 = 0; hapbases = 0; while(tn != NULL){ sample_hap(tn); if(write_clusterfile) { fprintf(fp_clustersize," %d %d",tn->ci,tn->size); } tr = tn->rlist; while (tr != NULL){ p = seq_distance(tn->h, r[tr->ri], J); dt += p[1]; tr = tr->next; } // compute distance of all reads to their haplotype for (ll=0; llh, r[ll], J); tn->rd0[ll] = p[0]; tn->rd1[ll] = p[1]; } // compute distance of haplotypes to reference p = seq_distance(tn->h, h, J); dk1 += p[1]; hapbases += p[1] + p[0]; K1++; if(k == iter - HISTORY + 1){// starts recording if(record == 0){ create_history(k); record = 1; #ifdef DEBUG printf("creating history...\n"); #endif } } if(record){ record_conf(tn, k); } tn = tn->next; } if(write_clusterfile) { fprintf(fp_clustersize,"\n"); } theta = dt/totbases + gsl_ran_gaussian(rg, g_noise); gam = (double)dk1/hapbases; // HACK!!! theta=1 gives undesired behaviour; not elegant, but effective if(theta >= 1.0) theta = 0.9999; //sample_ref(); // sampling the reference every step gives strange behaviour... #ifdef DEBUG fprintf(stderr,"dt, theta, gamma are %d %f %f\n", dt, theta, gam); #endif printf("iteration\t%i\t%i\t%i\t%f\t%f\n", k+1, count_classes(mxt), tot_untouch, theta, gam); } if(write_clusterfile) { fclose(fp_clustersize); } /********************* sampling ends *********************/ write_assignment(k, new_proposed, mxt); printf("\n\n# +++++++++++++++++++ AFTER THE SAMPLING ++++++++++++++++++++\n"); print_stats(mxt, J); printf("\n# reference genome is:\n# "); for(j=0; j'){ n++; seq = 0; } if (c == '\n') seq = 1; if (isdna(c) && seq) { totsites++; if (d2i(c) != B) totbases++; } if (c == EOF) break; } J = totsites/n; // assuming fixed length reads fclose(data); if(n == 1) { printf("Nothing to do, have only one read... (n == 1)\n"); exit(0); } printf("# totbases = %d, totsites = %d\n",totbases,totsites); //allocate memory for r[i][j] r = calloc(n, sizeof(unsigned short int*)); id = calloc(n, sizeof(char*)); for (i = 0; i < n; i++){ r[i] = calloc(J, sizeof(unsigned short int)); id[i] = calloc(100, sizeof(char)); } if( (data = fopen(filename, "r")) == NULL){ printf("Impossible to REopen file %s\n", filename); exit(EXIT_FAILURE); } i = -1; for (;;) {// second sweep to read c = fgetc(data); if (c == '>'){ i++; seq = 0; k = 0; } if (seq == 0){ id[i][k] = c; k++; } if (seq == 0 && c == '\n'){ seq = 1; j=0; } if (isdna(c) && seq){ r[i][j] = d2i(c); j++; } if (c == EOF) break; } fclose(data); } void build_assignment(){ unsigned int i, m, ll; unsigned int ci; cnode* cn; rnode* rn; int* p; double* p_k; gsl_ran_discrete_t* g; h = calloc(J, sizeof(unsigned int)); c_ptr = calloc(n, sizeof(cnode*)); MAX_K = n + 1; P = (double*) calloc((MAX_K), sizeof(double)); log_P = (double*) calloc((MAX_K), sizeof(double)); cl_ptr = (cnode**) calloc((MAX_K), sizeof(cnode*)); ass_hist = (hnode***)calloc(n,sizeof(hnode**)); for(i=0; i0), calculate K if(avgNK > 0.0) { K = (int)(1.*n/avgNK); } // otherwise K is already set as commandline option // K == 0 can happen with very few reads... // no empty clusters at the beginning if ((n < K) || (K == 0)) K = n; lowest_free = K; p_k = (double*)malloc(K*sizeof(double)); // make the class list of K initial components mxt = NULL; for(i=0; ici) { add_read(&cn->rlist, m); cn->size++; break; } cn=cn->next; } } free(p_k); // there is at least the first cluster... cn = mxt; #ifdef DEBUG printf("cluster %d has size %d\n",cn->ci,cn->size); #endif cn->h = (unsigned short int *) malloc(J * sizeof(unsigned short int)); cn->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); cn->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); while(cn->size == 0) { // or maybe not ... remove_comp(&mxt); cn = mxt; cn->h = (unsigned short int *) malloc(J * sizeof(unsigned short int)); cn->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); cn->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); } sample_hap(cn); for (ll=0; ll < n; ll++){ p = seq_distance(cn->h, r[ll], J); cn->rd0[ll] = p[0]; cn->rd1[ll] = p[1]; } // have to go through the list with ->next, because dont want to loose connection between the elements of the list // needed for remove_comp, which returns the ->next element of the deleted cluster at the position of the given argument while(cn->next != NULL){ // have to allocate mem anyway, otherwise would get error, when freeing memory, which happens when removing cluster cn->next->h = (unsigned short int *) malloc(J * sizeof(unsigned short int)); cn->next->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); cn->next->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); #ifdef DEBUG printf("cluster %d has size %d\n",cn->next->ci,cn->next->size); #endif if(cn->next->size == 0) { #ifdef DEBUG printf("removing some node... p=%p\n",cn->next); #endif remove_comp(&cn->next); }else{ sample_hap(cn->next); for (ll=0; ll < n; ll++){ p = seq_distance(cn->next->h, r[ll], J); cn->rd0[ll] = p[0]; cn->rd1[ll] = p[1]; } cn = cn->next; } } // define the predecessor of each read cn = mxt; rn = cn->rlist; while (rn != NULL){ i = rn->ri; c_ptr[i] = NULL; rn = rn->next; } while (cn->next != NULL){ rn = cn->next->rlist; while (rn != NULL){ i = rn->ri; c_ptr[i] = cn; rn = rn->next; } cn = cn->next; } /* for(i=0; i 0) { bmax = cbase[0]; bi = 0; for(rr=1; rr bmax) { bmax = cbase[rr]; bi = rr; } } h[i] = bi; } else{ h[i] = B; } } return; } double sample_ref() { cnode* cn; unsigned int i,j; double b1,b2; unsigned int K1 = 0; gsl_ran_discrete_t* g; double max_log_pbase; int max_cbase; unsigned int base_id, dk1=0; unsigned int countbases = 0; for(j=0;jsize > 0) { if(cn->h[j] < B) { cbase[cn->h[j]]++; K1++; } } cn=cn->next; } countbases += K1; if(gam < .2)gam = 0.2; b1 = gam; b2 = (1.0-gam)/((double)B - 1.0); if(b1 == 0.0) { printf("# There is something wrong with GAMMA ( == 0.0), sample ref\n"); print_stats(mxt,J); printf("# reference:\n# "); for(i=0;i 0) { if(b1 != 1.0) { for(i=0; i max_log_pbase) { max_log_pbase = log_pbase[i]; base_id = i; } } for(i=0; i= max_cbase) { max_cbase = cbase[i]; } } for(i=0; i< B; i++) { if(cbase[i] == max_cbase) { pbase[i] = 1.0; } else { pbase[i] = 0.0; } } g = gsl_ran_discrete_preproc(B, pbase); h[j] = gsl_ran_discrete(rg, g); gsl_ran_discrete_free(g); } } else{ // K1 == 0, that is all N's h[j] = B; } dk1 += cbase[h[j]]; } return (double)dk1/(double)countbases; } void sample_hap(cnode* cn){ rnode* tr; unsigned int i, j, tot_reads; gsl_ran_discrete_t* g; double b1, b2; double max_log_pbase; int max_cbase; unsigned int base_id; #ifdef DEBUG printf("Haplotype %i is\n", cn->ci); #endif for (j=0; jrlist; while (tr != NULL){ // correct for missing data if (r[tr->ri][j] < B) { cbase[r[tr->ri][j]]++; tot_reads++; } tr = tr->next; } b1 = theta; b2 = (1. - theta)/((double)B-1); if(b1 == 0.0) { printf("# There is something wrong with THETA ( == 0.0)\n"); exit(EXIT_FAILURE); } if(tot_reads > 0) { if(b1 != 1.0) { for(i=0; i max_log_pbase) { max_log_pbase = log_pbase[i]; base_id = i; } } for(i=0; ih[j] = gsl_ran_discrete(rg, g); gsl_ran_discrete_free(g); }else{ // theta == 1.0 max_cbase = cbase[0]; base_id = 0; for(i=1; i= max_cbase) { max_cbase = cbase[i]; base_id = i; } } cn->h[j] = base_id; } }else{ // tot_reads == 0 cn->h[j] = B; } #ifdef DEBUG printf("%i ", cn->h[j]); #endif } #ifdef DEBUG printf("\n"); #endif return; } void check_size(const cnode* cst, unsigned int n){ unsigned int this_n = 0; while(cst != NULL){ this_n += cst->size; cst = cst->next; } if(this_n != n){ print_stats(mxt, J); printf("!!!! STOP !!!!\n"); printf("size is now %i\n", this_n); exit(EXIT_FAILURE); } return; } int count_classes(const cnode* cst){ unsigned int ck = 0; while(cst != NULL){ ck++; cst = cst->next; } return ck; } int isdna (char c){ int ans = (c == 'A' || c == 'C' || c == 'G' || c == 'T'|| c == 'a' || c == 'c' || c == 'g' || c == 't' || c == 'N' || c == 'n' || c == '-'); return ans; } unsigned int d2i (char c){ if(c == 'A' || c == 'a') return 0; if(c == 'C' || c == 'c') return 1; if(c == 'G' || c == 'g') return 2; if(c == 'T' || c == 't') return 3; if(c == '-') return 4; if(c == 'N' || c == 'n') // this stands for missing return B; printf("%c not a DNA base", c); exit(EXIT_FAILURE); return 0; } int* seq_distance(unsigned short int* a, unsigned short int* b, int seq_length){ int i, ns=0; //int d=sizeof(unsigned short int); dist[0] = 0; dist[1] = 0; for (i=0; i sampling class for %ith read <----------\n", i); printf("---------------------------------------------------\n"); printf("------------ c_ptr = %p with size = %d\n", c_ptr[i], mxt->size); printf("------------- NOW STATS----------\n"); print_stats(mxt, J); #endif // if the class is populated only by i, remove it // if it's in the head if(c_ptr[i] == NULL && mxt->size == 1){ rn = mxt->next->rlist; while(rn != NULL){ c_ptr[rn->ri] = NULL; rn = rn->next; } remove_comp(&mxt); removed = 1; #ifdef DEBUG printf("----------- REMOVED SIZE 1 NODE ----------\n"); #endif } // if it's not in the head if(c_ptr[i] != NULL && c_ptr[i]->next->size == 1){ // if i not in the last node update the following predecessor if(c_ptr[i]->next->next != NULL){ rn = c_ptr[i]->next->next->rlist; while(rn != NULL){ c_ptr[rn->ri] = c_ptr[i]; rn = rn->next; } } remove_comp(&(c_ptr[i]->next)); removed = 1; #ifdef DEBUG printf("----------- REMOVED SIZE 1 NODE ----------\n"); #endif } /********************************************************** run through the populated classes to assign a probability ***********************************************************/ cn = mxt; if( cn == NULL){ printf("sampling classes, no classes found\n"); exit(EXIT_FAILURE); } st = 0; b1 = theta; b2 = (1. - theta)/((double)B - 1.); if(theta == 0.0) { printf("# There is something wrong wih THETA! ( == 0.0 )\n"); exit(EXIT_FAILURE); } if(gam == 0.0) { printf("# There is something wrong with GAMMA! ( == 0.0 )\n"); exit(EXIT_FAILURE); } while (cn != NULL){ if(cn->size > 0) { sz = cn->size; read_came_from_current = 0; if(removed == 0){ if(c_ptr[i] != NULL && cn == c_ptr[i]->next) { sz--; read_came_from_current = 1; } if(c_ptr[i] == NULL && cn == mxt) { sz--; read_came_from_current = 1; } } if(sz == 0){ printf("THIS IS ZERO!!!\n"); exit(EXIT_FAILURE); } cl_ptr[st] = cn; if(b1 != 1.0) { dist = cn->rd0[i];//seq_distance(cn->h, r[i], J); nodist = cn->rd1[i]; log_P[st] = gsl_sf_log((double)sz) + nodist * gsl_sf_log(b1) + dist * gsl_sf_log(b2); P[st] = 1.0; // all probabilities, which should change afterwards, set to 1 } else{ // theta == 1.0, not needed if(gam == 1.0) { log_P[st] = gsl_sf_log((double)sz); P[st] = 1.0; // same as above, P != 0 later } else{ if(cn->rd0[i] == 0) { // current class is from_class log_P[st] = gsl_sf_log((double)sz); P[st] = 1.0; // same as above, P != 0 later } else{ P[st] = 0.0; } } } } else{ fprintf(stderr, "------********* CN->SIZE = 0 **********----------\n"); P[st] = 0.0; // all prob, which shouldn't change, set to 0 } st++; cn = cn->next; } // plus the newly instantiated one p = seq_distance(h, r[i], J); dist = p[0]; nodist = p[1]; b1 = (theta * gam) + (1. - gam)*(1. - theta)/((double)B - 1.); b2 = (theta + gam + B*(1. - gam * theta) - 2.)/(gsl_sf_pow_int((double)B - 1., 2)); if((theta == 1.0) && (gam == 1.0)) { log_P[st] = gsl_sf_log(alpha); P[st] = 1.0; } else { log_P[st] = gsl_sf_log(alpha) + nodist * gsl_sf_log(b1) + dist * gsl_sf_log(b2); P[st] = 1.0; } cl_ptr[st] = NULL; #ifdef DEBUG for (j=0; j<=st; j++) printf("with P[%i] = %e to class %p\n", j, P[j], cl_ptr[j]); #endif max_log_P = log_P[st]; // set the max_log_P to the log_P of the new class, the only one where we definitly know an id, when using the logscale class_id = st; for(ll=0; ll=.99)) { // allow small errors in double, should be 1 nevertheless max_log_P = log_P[ll]; class_id = ll; } } for(ll=0; ll<=st; ll++) { if(P[ll] >= .99) { log_P[ll] -= max_log_P; if(log_P[ll] < double_threshold) { P[ll] = 0.0; }else{ P[ll] = gsl_sf_exp(log_P[ll] - max_log_P); } }// else P[i] = 0, from above } g = gsl_ran_discrete_preproc(st+1, P); this_class = gsl_ran_discrete(rg, g); gsl_ran_discrete_free(g); #ifdef DEBUG printf ("extracted class is = %lu\n", this_class); #endif from_class = (c_ptr[i] != NULL) ? c_ptr[i]->next : mxt; to_class = cl_ptr[this_class]; /*************************************** move the read to the extracted class ****************************************/ if( removed == 0 ){ if ( from_class == to_class ){ #ifdef DEBUG printf ("from %p to itself\n", from_class); #endif // free(P); // free(cl_ptr); res->untouched = 1; res->proposed = 0; res->to_class = to_class; return res; } else if ( to_class != NULL){ #ifdef DEBUG printf("moving the read from %p to %p\n", from_class, to_class); #endif c_ptr[i] = c_ptr[ to_class->rlist->ri ]; // predecessor is taken from the already present read move_read(i, &from_class, &to_class); (from_class->size)--; (to_class->size)++; // free(P); // free(cl_ptr); res->untouched = 0; res->proposed = 0; res->to_class = to_class; return res; } else if (to_class == NULL){ #ifdef DEBUG printf("moving %i to a new class from %p\n", index, from_class); #endif remove_read( search_read(&from_class->rlist, i) ); (from_class->size)--; add_comp(&mxt, &hst, lowest_free, 1, NULL, NULL, NULL, NULL, J, step, record); lowest_free++; add_read(&mxt->rlist, i); mxt->h = (unsigned short int*) malloc(J * sizeof(unsigned short int)); sample_hap(mxt); mxt->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); mxt->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); for (ll=0; ll < n; ll++){ p = seq_distance(mxt->h, r[ll], J); mxt->rd0[ll] = p[0]; mxt->rd1[ll] = p[1]; } c_ptr[i] = NULL; rn = mxt->next->rlist; if(rn == NULL){ printf("STOP!!! There should be something\n"); exit(21); } while (rn != NULL){ c_ptr[rn->ri] = mxt; rn = rn->next; } // free(P); // free(cl_ptr); res->untouched = 0; res->proposed = 1; res->to_class = mxt; return res; } } else if (removed == 1){ #ifdef DEBUG printf("moving having removed\n"); #endif if (to_class != NULL){ add_read(&to_class->rlist, i); (to_class->size)++; c_ptr[i] = c_ptr[ to_class->rlist->next->ri ]; res->untouched = 0; res->proposed = 0; res->to_class = to_class; } else if (to_class == NULL){ add_comp(&mxt, &hst, lowest_free, 1, NULL, NULL, NULL, NULL, J, step, record); lowest_free++; res->to_class = mxt; c_ptr[i] = NULL; cn = mxt->next; rn = cn->rlist; while (rn != NULL){ c_ptr[rn->ri] = mxt; rn = rn->next; } add_read(&mxt->rlist, i); mxt->h = (unsigned short int*) malloc(J * sizeof(unsigned short int)); sample_hap(mxt); mxt->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); mxt->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int)); for (ll=0; ll < n; ll++){ p = seq_distance(mxt->h, r[ll], J); mxt->rd0[ll] = p[0]; mxt->rd1[ll] = p[1]; } res->untouched = 0; res->proposed = 1; } // free(P); // free(cl_ptr); return res; } fprintf(stderr, "WARNING!!! I DIDN'T SAMPLE READ %d\n", i); return res; } void write_assignment (unsigned int it, unsigned int new_proposed, const cnode* tn) { rnode* rn; unsigned int j = 0, k; fprintf(assign, "# %d iterations\n", it-1); fprintf(assign, "\n#gamma = %f\n", gam); fprintf(assign, "\n#made %i new clusters\n\n", new_proposed); while(tn != NULL){ rn = tn->rlist; while(rn != NULL){ k = 0; while(id[rn->ri][k] != '\n'){ fputc(id[rn->ri][k], assign); k++; } fprintf(assign, " -> %i\n", j); rn = rn->next; } j++; tn = tn->next; } // fprintf(assign, "\n"); fprintf(assign, "\n#final number of components = %i\n\n", j); } void create_history(unsigned int step){ // copy haplotypes from mxt to create the history cnode* tn; tn = mxt; while(tn != NULL){ add_hap(&hst, NULL, tn->ci, J, step); tn->hh = hst; memcpy(hst->h, tn->h, J*sizeof(unsigned short int)); tn = tn->next; } return; } void record_conf(cnode* tn, unsigned int step){ int *p; rnode* rn; if(tn->step < step) { p = seq_distance(tn->h,tn->hh->h,J); if(p[0] != 0) { add_hap(&hst,tn->hh,tn->ci,J,step); memcpy(hst->h,tn->h,J*sizeof(unsigned short int)); tn->hh = hst; } }else{ memcpy(tn->hh->h,tn->h,J*sizeof(unsigned short int)); } rn = tn->rlist; while(rn != NULL){ ass_hist[rn->ri][step - iter + HISTORY - 1] = tn->hh; rn = rn->next; } return; } void cleanup() { free(P); free(log_P); free(pbase); free(cbase); free(log_pbase); free(h); // should maybe cleanup all nodes ... } int compare(const void* a, const void* b) { unsigned int i=0; unsigned short int *h1, *h2; h1 = *(unsigned short int**)a; h2 = *(unsigned short int**)b; for(i=0; ih, J*sizeof(unsigned short int)); } qsort(haplotypes, HISTORY, sizeof(unsigned short int*), compare); // qsort returns haplotype (pointers to the ascending sorted haplotypes) hap = 0; ch[0] = 1; for(k = 1; k < HISTORY; k++) { p = seq_distance(haplotypes[k-1], haplotypes[k], J); if (p[0] == 0 ) { ch[hap] += 1; }else{ hap = k; ch[hap] = 1; } } int max = 0; for(k=0; k= max){ max = ch[k]; hap = k; } } quality = ch[hap]/(double)HISTORY; for(j=0;jh = (unsigned short int*)malloc(J*sizeof(unsigned short int)); all_haplo[i*HISTORY+j]->hi = ass_hist[i][j]->hi; all_haplo[i*HISTORY+j]->step = ass_hist[i][j]->step; all_haplo[i*HISTORY+j]->count = 1; memcpy(all_haplo[i*HISTORY+j]->h,ass_hist[i][j]->h,J*sizeof(unsigned short int)); } } qsort(all_haplo,n*HISTORY,sizeof(hnode_single*),compare_hnss_seq); hap = 0; for(i=1; ih,all_haplo[i-1]->h,J); if(p[0] == 0) { all_haplo[hap]->count++; all_haplo[i]->count = 0; }else{ hap = i; } } if(hcount > 0) { qsort(all_haplo,n*HISTORY,sizeof(hnode_single**),compare_hnss_count); }else{ hcount = n*HISTORY; } hap=0; fp = fopen(filename,"w"); for(i=0; icount > 0) { fprintf(fp,">haplotype_%04d|%.9lf\n",hap,(double)(1.0*all_haplo[i]->count)/(1.0 * n * HISTORY)); for(j=0; jh[j]]); } fprintf(fp,"\n"); hap++; if(hap >= hcount) break; } } fclose(fp); for(i=0; ih); } free(all_haplo); } int parsecommandline(int argc, char** argv){ // get command line parameters char c; while((c = getopt(argc, argv,"i:j:a:t:o:m:K:k:R:c:Hh")) != -1){ switch (c){ case 'i': filein = optarg; break; case 'j': iter = atoi(optarg); break; case 'a': alpha = atof(optarg); break; case 't': HISTORY = atoi(optarg); break; case 'o': haplotype_output = optarg; ho_flag = 1; break; case 'm': ho_count = atoi(optarg); break; case 'K': if(avgNK == 0.0) { K = atoi(optarg); }else{ printf("can't use -k and -K at same time.\n"); exit(1); } break; case 'k': if(K == 0) { avgNK = atof(optarg); }else{ printf("can't use -k and -K at same time.\n"); exit(1); } break; case 'R': randseed = atoi(optarg); break; case 'c': write_clusterfile = 1; clusterfile_output = optarg; break; default : fprintf(stdout, "%s [options]\n" "\n" " files\n" "\t-i \n" " parameters\n" "\t-j \n" "\t-a \n" "\t-K not compat. with -k\n" "\t-k not compat. with -K\n" "\t-t \n" "\t-o (for debugging purposes)\n" "\t-m \n" "\t-c \n" "\t-R \n" "-----------------------------------------------------\n" "\t-h\t\t this help!\n", argv[0]); exit(-1); } } if(HISTORY > iter)HISTORY=iter; if(randseed == 0) randseed = time(NULL); if( (K==0) && (avgNK <= 0.0))avgNK = default_avgNK; return 0; } // ends parsecommandline