/*
# Copyright 2007, 2008
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Lukas Geyrhofer,
# Osvaldo Zagordi,
# ETH Zurich
# This file is part of ShoRAH.
# ShoRAH is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ShoRAH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ShoRAH. If not, see .
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "data_structures.h"
#include "dpm_sampler.h"
#define PROPHISTSIZE 100
int main(int argc, char** argv){
unsigned int i, j, k, ll, K1, mesh, tot_untouch, new_proposed=0;
int dk1,hapbases;
int* p;
cnode* tn;
//rnode* rn;
rnode* tr;
double quality;
ssret* samp_stat;
double dt;
double_threshold = - (double)DBL_DIG * gsl_sf_log(10.0);
printf("# dna_code:\t");
putchar(i2dna_code[0]);
putchar(i2dna_code[1]);
putchar(i2dna_code[2]);
putchar(i2dna_code[3]);
putchar(i2dna_code[4]);
putchar(i2dna_code[5]);
printf("\n");
if( (assign = fopen("assignment.tmp", "w")) == NULL ){
printf("Impossible to write file assignment.tmp\n");
exit(EXIT_FAILURE);
}
parsecommandline(argc, argv);
printf("# randseed = %ld\n",randseed);
// random number generator via gsl
gsl_rng_env_setup();
T = gsl_rng_default;
rg = gsl_rng_alloc(T);
gsl_rng_set(rg, randseed);
dist = calloc(2, sizeof(int));
res_dist = calloc(2, sizeof(int));
res = (ssret*)malloc(sizeof(ssret));
cbase = (int*) malloc(B * sizeof(int)); // count base
pbase = (double*) malloc(B * sizeof(double));
log_pbase = (double*) malloc(B * sizeof(double));
mesh = 1;//iter/100;
read_data(filein);
build_assignment();
printf("# +++++++++++++++++++ BEFORE THE SAMPLING +++++++++++++++++++\n");
print_stats(mxt, J);
printf("#\titeration\tcomponents\tuntouched\ttheta\tgamma\n");
/*********************
sampling procedure
*********************/
if(write_clusterfile) {
fp_clustersize = fopen(clusterfile_output,"w");
tn = mxt;
while(tn != NULL) {
fprintf(fp_clustersize," %d %d",tn->ci,tn->size);
tn=tn->next;
}
fprintf(fp_clustersize,"\n");
}
for(k=0; k<=iter; k++){
#ifdef DEBUG
printf("-----------------------------------------------\n");
printf("-----------> sampling the %ith time <-----------\n", k);
printf("-----------------------------------------------\n");
#endif
// fprintf(stderr, "\x1B[1A\x1B[2K iteration %i\n", k);
tot_untouch = 0;
// sample classes for all reads
for(i=0; iuntouched;
new_proposed += samp_stat->proposed;
}
// sample haplotypes from reads
tn = mxt;
K1 = 0;
dt = 0.0;
dk1 = 0;
hapbases = 0;
while(tn != NULL){
sample_hap(tn);
if(write_clusterfile) {
fprintf(fp_clustersize," %d %d",tn->ci,tn->size);
}
tr = tn->rlist;
while (tr != NULL){
p = seq_distance(tn->h, r[tr->ri], J);
dt += p[1];
tr = tr->next;
}
// compute distance of all reads to their haplotype
for (ll=0; llh, r[ll], J);
tn->rd0[ll] = p[0];
tn->rd1[ll] = p[1];
}
// compute distance of haplotypes to reference
p = seq_distance(tn->h, h, J);
dk1 += p[1];
hapbases += p[1] + p[0];
K1++;
if(k == iter - HISTORY + 1){// starts recording
if(record == 0){
create_history(k);
record = 1;
#ifdef DEBUG
printf("creating history...\n");
#endif
}
}
if(record){
record_conf(tn, k);
}
tn = tn->next;
}
if(write_clusterfile) {
fprintf(fp_clustersize,"\n");
}
theta = dt/totbases + gsl_ran_gaussian(rg, g_noise);
gam = (double)dk1/hapbases;
// HACK!!! theta=1 gives undesired behaviour; not elegant, but effective
if(theta >= 1.0)
theta = 0.9999;
//sample_ref(); // sampling the reference every step gives strange behaviour...
#ifdef DEBUG
fprintf(stderr,"dt, theta, gamma are %d %f %f\n", dt, theta, gam);
#endif
printf("iteration\t%i\t%i\t%i\t%f\t%f\n", k+1, count_classes(mxt), tot_untouch, theta, gam);
}
if(write_clusterfile) {
fclose(fp_clustersize);
}
/*********************
sampling ends
*********************/
write_assignment(k, new_proposed, mxt);
printf("\n\n# +++++++++++++++++++ AFTER THE SAMPLING ++++++++++++++++++++\n");
print_stats(mxt, J);
printf("\n# reference genome is:\n# ");
for(j=0; j'){
n++;
seq = 0;
}
if (c == '\n')
seq = 1;
if (isdna(c) && seq) {
totsites++;
if (d2i(c) != B)
totbases++;
}
if (c == EOF)
break;
}
J = totsites/n; // assuming fixed length reads
fclose(data);
if(n == 1) {
printf("Nothing to do, have only one read... (n == 1)\n");
exit(0);
}
printf("# totbases = %d, totsites = %d\n",totbases,totsites);
//allocate memory for r[i][j]
r = calloc(n, sizeof(unsigned short int*));
id = calloc(n, sizeof(char*));
for (i = 0; i < n; i++){
r[i] = calloc(J, sizeof(unsigned short int));
id[i] = calloc(100, sizeof(char));
}
if( (data = fopen(filename, "r")) == NULL){
printf("Impossible to REopen file %s\n", filename);
exit(EXIT_FAILURE);
}
i = -1;
for (;;) {// second sweep to read
c = fgetc(data);
if (c == '>'){
i++;
seq = 0;
k = 0;
}
if (seq == 0){
id[i][k] = c;
k++;
}
if (seq == 0 && c == '\n'){
seq = 1;
j=0;
}
if (isdna(c) && seq){
r[i][j] = d2i(c);
j++;
}
if (c == EOF)
break;
}
fclose(data);
}
void build_assignment(){
unsigned int i, m, ll;
unsigned int ci;
cnode* cn;
rnode* rn;
int* p;
double* p_k;
gsl_ran_discrete_t* g;
h = calloc(J, sizeof(unsigned int));
c_ptr = calloc(n, sizeof(cnode*));
MAX_K = n + 1;
P = (double*) calloc((MAX_K), sizeof(double));
log_P = (double*) calloc((MAX_K), sizeof(double));
cl_ptr = (cnode**) calloc((MAX_K), sizeof(cnode*));
ass_hist = (hnode***)calloc(n,sizeof(hnode**));
for(i=0; i0), calculate K
if(avgNK > 0.0) {
K = (int)(1.*n/avgNK);
}
// otherwise K is already set as commandline option
// K == 0 can happen with very few reads...
// no empty clusters at the beginning
if ((n < K) || (K == 0))
K = n;
lowest_free = K;
p_k = (double*)malloc(K*sizeof(double));
// make the class list of K initial components
mxt = NULL;
for(i=0; ici) {
add_read(&cn->rlist, m);
cn->size++;
break;
}
cn=cn->next;
}
}
free(p_k);
// there is at least the first cluster...
cn = mxt;
#ifdef DEBUG
printf("cluster %d has size %d\n",cn->ci,cn->size);
#endif
cn->h = (unsigned short int *) malloc(J * sizeof(unsigned short int));
cn->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
cn->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
while(cn->size == 0) {
// or maybe not ...
remove_comp(&mxt);
cn = mxt;
cn->h = (unsigned short int *) malloc(J * sizeof(unsigned short int));
cn->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
cn->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
}
sample_hap(cn);
for (ll=0; ll < n; ll++){
p = seq_distance(cn->h, r[ll], J);
cn->rd0[ll] = p[0];
cn->rd1[ll] = p[1];
}
// have to go through the list with ->next, because dont want to loose connection between the elements of the list
// needed for remove_comp, which returns the ->next element of the deleted cluster at the position of the given argument
while(cn->next != NULL){
// have to allocate mem anyway, otherwise would get error, when freeing memory, which happens when removing cluster
cn->next->h = (unsigned short int *) malloc(J * sizeof(unsigned short int));
cn->next->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
cn->next->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
#ifdef DEBUG
printf("cluster %d has size %d\n",cn->next->ci,cn->next->size);
#endif
if(cn->next->size == 0) {
#ifdef DEBUG
printf("removing some node... p=%p\n",cn->next);
#endif
remove_comp(&cn->next);
}else{
sample_hap(cn->next);
for (ll=0; ll < n; ll++){
p = seq_distance(cn->next->h, r[ll], J);
cn->rd0[ll] = p[0];
cn->rd1[ll] = p[1];
}
cn = cn->next;
}
}
// define the predecessor of each read
cn = mxt;
rn = cn->rlist;
while (rn != NULL){
i = rn->ri;
c_ptr[i] = NULL;
rn = rn->next;
}
while (cn->next != NULL){
rn = cn->next->rlist;
while (rn != NULL){
i = rn->ri;
c_ptr[i] = cn;
rn = rn->next;
}
cn = cn->next;
}
/*
for(i=0; i 0) {
bmax = cbase[0];
bi = 0;
for(rr=1; rr bmax) {
bmax = cbase[rr];
bi = rr;
}
}
h[i] = bi;
}
else{
h[i] = B;
}
}
return;
}
double sample_ref() {
cnode* cn;
unsigned int i,j;
double b1,b2;
unsigned int K1 = 0;
gsl_ran_discrete_t* g;
double max_log_pbase;
int max_cbase;
unsigned int base_id, dk1=0;
unsigned int countbases = 0;
for(j=0;jsize > 0) {
if(cn->h[j] < B) {
cbase[cn->h[j]]++;
K1++;
}
}
cn=cn->next;
}
countbases += K1;
if(gam < .2)gam = 0.2;
b1 = gam;
b2 = (1.0-gam)/((double)B - 1.0);
if(b1 == 0.0) {
printf("# There is something wrong with GAMMA ( == 0.0), sample ref\n");
print_stats(mxt,J);
printf("# reference:\n# ");
for(i=0;i 0) {
if(b1 != 1.0) {
for(i=0; i max_log_pbase) {
max_log_pbase = log_pbase[i];
base_id = i;
}
}
for(i=0; i= max_cbase) {
max_cbase = cbase[i];
}
}
for(i=0; i< B; i++) {
if(cbase[i] == max_cbase) {
pbase[i] = 1.0;
}
else {
pbase[i] = 0.0;
}
}
g = gsl_ran_discrete_preproc(B, pbase);
h[j] = gsl_ran_discrete(rg, g);
gsl_ran_discrete_free(g);
}
}
else{ // K1 == 0, that is all N's
h[j] = B;
}
dk1 += cbase[h[j]];
}
return (double)dk1/(double)countbases;
}
void sample_hap(cnode* cn){
rnode* tr;
unsigned int i, j, tot_reads;
gsl_ran_discrete_t* g;
double b1, b2;
double max_log_pbase;
int max_cbase;
unsigned int base_id;
#ifdef DEBUG
printf("Haplotype %i is\n", cn->ci);
#endif
for (j=0; jrlist;
while (tr != NULL){ // correct for missing data
if (r[tr->ri][j] < B) {
cbase[r[tr->ri][j]]++;
tot_reads++;
}
tr = tr->next;
}
b1 = theta;
b2 = (1. - theta)/((double)B-1);
if(b1 == 0.0) {
printf("# There is something wrong with THETA ( == 0.0)\n");
exit(EXIT_FAILURE);
}
if(tot_reads > 0) {
if(b1 != 1.0) {
for(i=0; i max_log_pbase) {
max_log_pbase = log_pbase[i];
base_id = i;
}
}
for(i=0; ih[j] = gsl_ran_discrete(rg, g);
gsl_ran_discrete_free(g);
}else{ // theta == 1.0
max_cbase = cbase[0];
base_id = 0;
for(i=1; i= max_cbase) {
max_cbase = cbase[i];
base_id = i;
}
}
cn->h[j] = base_id;
}
}else{ // tot_reads == 0
cn->h[j] = B;
}
#ifdef DEBUG
printf("%i ", cn->h[j]);
#endif
}
#ifdef DEBUG
printf("\n");
#endif
return;
}
void check_size(const cnode* cst, unsigned int n){
unsigned int this_n = 0;
while(cst != NULL){
this_n += cst->size;
cst = cst->next;
}
if(this_n != n){
print_stats(mxt, J);
printf("!!!! STOP !!!!\n");
printf("size is now %i\n", this_n);
exit(EXIT_FAILURE);
}
return;
}
int count_classes(const cnode* cst){
unsigned int ck = 0;
while(cst != NULL){
ck++;
cst = cst->next;
}
return ck;
}
int isdna (char c){
int ans = (c == 'A' || c == 'C' || c == 'G' || c == 'T'||
c == 'a' || c == 'c' || c == 'g' || c == 't' ||
c == 'N' || c == 'n' || c == '-');
return ans;
}
unsigned int d2i (char c){
if(c == 'A' || c == 'a')
return 0;
if(c == 'C' || c == 'c')
return 1;
if(c == 'G' || c == 'g')
return 2;
if(c == 'T' || c == 't')
return 3;
if(c == '-')
return 4;
if(c == 'N' || c == 'n') // this stands for missing
return B;
printf("%c not a DNA base", c);
exit(EXIT_FAILURE);
return 0;
}
int* seq_distance(unsigned short int* a, unsigned short int* b, int seq_length){
int i, ns=0;
//int d=sizeof(unsigned short int);
dist[0] = 0;
dist[1] = 0;
for (i=0; i sampling class for %ith read <----------\n", i);
printf("---------------------------------------------------\n");
printf("------------ c_ptr = %p with size = %d\n", c_ptr[i], mxt->size);
printf("------------- NOW STATS----------\n");
print_stats(mxt, J);
#endif
// if the class is populated only by i, remove it
// if it's in the head
if(c_ptr[i] == NULL && mxt->size == 1){
rn = mxt->next->rlist;
while(rn != NULL){
c_ptr[rn->ri] = NULL;
rn = rn->next;
}
remove_comp(&mxt);
removed = 1;
#ifdef DEBUG
printf("----------- REMOVED SIZE 1 NODE ----------\n");
#endif
}
// if it's not in the head
if(c_ptr[i] != NULL && c_ptr[i]->next->size == 1){
// if i not in the last node update the following predecessor
if(c_ptr[i]->next->next != NULL){
rn = c_ptr[i]->next->next->rlist;
while(rn != NULL){
c_ptr[rn->ri] = c_ptr[i];
rn = rn->next;
}
}
remove_comp(&(c_ptr[i]->next));
removed = 1;
#ifdef DEBUG
printf("----------- REMOVED SIZE 1 NODE ----------\n");
#endif
}
/**********************************************************
run through the populated classes to assign a probability
***********************************************************/
cn = mxt;
if( cn == NULL){
printf("sampling classes, no classes found\n");
exit(EXIT_FAILURE);
}
st = 0;
b1 = theta;
b2 = (1. - theta)/((double)B - 1.);
if(theta == 0.0) {
printf("# There is something wrong wih THETA! ( == 0.0 )\n");
exit(EXIT_FAILURE);
}
if(gam == 0.0) {
printf("# There is something wrong with GAMMA! ( == 0.0 )\n");
exit(EXIT_FAILURE);
}
while (cn != NULL){
if(cn->size > 0) {
sz = cn->size;
read_came_from_current = 0;
if(removed == 0){
if(c_ptr[i] != NULL && cn == c_ptr[i]->next) {
sz--;
read_came_from_current = 1;
}
if(c_ptr[i] == NULL && cn == mxt) {
sz--;
read_came_from_current = 1;
}
}
if(sz == 0){
printf("THIS IS ZERO!!!\n");
exit(EXIT_FAILURE);
}
cl_ptr[st] = cn;
if(b1 != 1.0) {
dist = cn->rd0[i];//seq_distance(cn->h, r[i], J);
nodist = cn->rd1[i];
log_P[st] = gsl_sf_log((double)sz) + nodist * gsl_sf_log(b1) + dist * gsl_sf_log(b2);
P[st] = 1.0; // all probabilities, which should change afterwards, set to 1
}
else{ // theta == 1.0, not needed
if(gam == 1.0) {
log_P[st] = gsl_sf_log((double)sz);
P[st] = 1.0; // same as above, P != 0 later
}
else{
if(cn->rd0[i] == 0) { // current class is from_class
log_P[st] = gsl_sf_log((double)sz);
P[st] = 1.0; // same as above, P != 0 later
}
else{
P[st] = 0.0;
}
}
}
}
else{
fprintf(stderr, "------********* CN->SIZE = 0 **********----------\n");
P[st] = 0.0; // all prob, which shouldn't change, set to 0
}
st++;
cn = cn->next;
}
// plus the newly instantiated one
p = seq_distance(h, r[i], J);
dist = p[0];
nodist = p[1];
b1 = (theta * gam) + (1. - gam)*(1. - theta)/((double)B - 1.);
b2 = (theta + gam + B*(1. - gam * theta) - 2.)/(gsl_sf_pow_int((double)B - 1., 2));
if((theta == 1.0) && (gam == 1.0)) {
log_P[st] = gsl_sf_log(alpha);
P[st] = 1.0;
}
else {
log_P[st] = gsl_sf_log(alpha) + nodist * gsl_sf_log(b1) + dist * gsl_sf_log(b2);
P[st] = 1.0;
}
cl_ptr[st] = NULL;
#ifdef DEBUG
for (j=0; j<=st; j++)
printf("with P[%i] = %e to class %p\n", j, P[j], cl_ptr[j]);
#endif
max_log_P = log_P[st]; // set the max_log_P to the log_P of the new class, the only one where we definitly know an id, when using the logscale
class_id = st;
for(ll=0; ll=.99)) { // allow small errors in double, should be 1 nevertheless
max_log_P = log_P[ll];
class_id = ll;
}
}
for(ll=0; ll<=st; ll++) {
if(P[ll] >= .99) {
log_P[ll] -= max_log_P;
if(log_P[ll] < double_threshold) {
P[ll] = 0.0;
}else{
P[ll] = gsl_sf_exp(log_P[ll] - max_log_P);
}
}// else P[i] = 0, from above
}
g = gsl_ran_discrete_preproc(st+1, P);
this_class = gsl_ran_discrete(rg, g);
gsl_ran_discrete_free(g);
#ifdef DEBUG
printf ("extracted class is = %lu\n", this_class);
#endif
from_class = (c_ptr[i] != NULL) ? c_ptr[i]->next : mxt;
to_class = cl_ptr[this_class];
/***************************************
move the read to the extracted class
****************************************/
if( removed == 0 ){
if ( from_class == to_class ){
#ifdef DEBUG
printf ("from %p to itself\n", from_class);
#endif
// free(P);
// free(cl_ptr);
res->untouched = 1;
res->proposed = 0;
res->to_class = to_class;
return res;
}
else if ( to_class != NULL){
#ifdef DEBUG
printf("moving the read from %p to %p\n", from_class, to_class);
#endif
c_ptr[i] = c_ptr[ to_class->rlist->ri ]; // predecessor is taken from the already present read
move_read(i, &from_class, &to_class);
(from_class->size)--;
(to_class->size)++;
// free(P);
// free(cl_ptr);
res->untouched = 0;
res->proposed = 0;
res->to_class = to_class;
return res;
}
else if (to_class == NULL){
#ifdef DEBUG
printf("moving %i to a new class from %p\n", index, from_class);
#endif
remove_read( search_read(&from_class->rlist, i) );
(from_class->size)--;
add_comp(&mxt, &hst, lowest_free, 1, NULL, NULL, NULL, NULL, J, step, record);
lowest_free++;
add_read(&mxt->rlist, i);
mxt->h = (unsigned short int*) malloc(J * sizeof(unsigned short int));
sample_hap(mxt);
mxt->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
mxt->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
for (ll=0; ll < n; ll++){
p = seq_distance(mxt->h, r[ll], J);
mxt->rd0[ll] = p[0];
mxt->rd1[ll] = p[1];
}
c_ptr[i] = NULL;
rn = mxt->next->rlist;
if(rn == NULL){
printf("STOP!!! There should be something\n");
exit(21);
}
while (rn != NULL){
c_ptr[rn->ri] = mxt;
rn = rn->next;
}
// free(P);
// free(cl_ptr);
res->untouched = 0;
res->proposed = 1;
res->to_class = mxt;
return res;
}
}
else if (removed == 1){
#ifdef DEBUG
printf("moving having removed\n");
#endif
if (to_class != NULL){
add_read(&to_class->rlist, i);
(to_class->size)++;
c_ptr[i] = c_ptr[ to_class->rlist->next->ri ];
res->untouched = 0;
res->proposed = 0;
res->to_class = to_class;
}
else if (to_class == NULL){
add_comp(&mxt, &hst, lowest_free, 1, NULL, NULL, NULL, NULL, J, step, record);
lowest_free++;
res->to_class = mxt;
c_ptr[i] = NULL;
cn = mxt->next;
rn = cn->rlist;
while (rn != NULL){
c_ptr[rn->ri] = mxt;
rn = rn->next;
}
add_read(&mxt->rlist, i);
mxt->h = (unsigned short int*) malloc(J * sizeof(unsigned short int));
sample_hap(mxt);
mxt->rd0 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
mxt->rd1 = (unsigned short int*) malloc(n * sizeof(unsigned short int));
for (ll=0; ll < n; ll++){
p = seq_distance(mxt->h, r[ll], J);
mxt->rd0[ll] = p[0];
mxt->rd1[ll] = p[1];
}
res->untouched = 0;
res->proposed = 1;
}
// free(P);
// free(cl_ptr);
return res;
}
fprintf(stderr, "WARNING!!! I DIDN'T SAMPLE READ %d\n", i);
return res;
}
void write_assignment (unsigned int it, unsigned int new_proposed, const cnode* tn) {
rnode* rn;
unsigned int j = 0, k;
fprintf(assign, "# %d iterations\n", it-1);
fprintf(assign, "\n#gamma = %f\n", gam);
fprintf(assign, "\n#made %i new clusters\n\n", new_proposed);
while(tn != NULL){
rn = tn->rlist;
while(rn != NULL){
k = 0;
while(id[rn->ri][k] != '\n'){
fputc(id[rn->ri][k], assign);
k++;
}
fprintf(assign, " -> %i\n", j);
rn = rn->next;
}
j++;
tn = tn->next;
}
// fprintf(assign, "\n");
fprintf(assign, "\n#final number of components = %i\n\n", j);
}
void create_history(unsigned int step){
// copy haplotypes from mxt to create the history
cnode* tn;
tn = mxt;
while(tn != NULL){
add_hap(&hst, NULL, tn->ci, J, step);
tn->hh = hst;
memcpy(hst->h, tn->h, J*sizeof(unsigned short int));
tn = tn->next;
}
return;
}
void record_conf(cnode* tn, unsigned int step){
int *p;
rnode* rn;
if(tn->step < step) {
p = seq_distance(tn->h,tn->hh->h,J);
if(p[0] != 0) {
add_hap(&hst,tn->hh,tn->ci,J,step);
memcpy(hst->h,tn->h,J*sizeof(unsigned short int));
tn->hh = hst;
}
}else{
memcpy(tn->hh->h,tn->h,J*sizeof(unsigned short int));
}
rn = tn->rlist;
while(rn != NULL){
ass_hist[rn->ri][step - iter + HISTORY - 1] = tn->hh;
rn = rn->next;
}
return;
}
void cleanup() {
free(P);
free(log_P);
free(pbase);
free(cbase);
free(log_pbase);
free(h);
// should maybe cleanup all nodes ...
}
int compare(const void* a, const void* b) {
unsigned int i=0;
unsigned short int *h1, *h2;
h1 = *(unsigned short int**)a;
h2 = *(unsigned short int**)b;
for(i=0; ih, J*sizeof(unsigned short int));
}
qsort(haplotypes, HISTORY, sizeof(unsigned short int*), compare);
// qsort returns haplotype (pointers to the ascending sorted haplotypes)
hap = 0;
ch[0] = 1;
for(k = 1; k < HISTORY; k++) {
p = seq_distance(haplotypes[k-1], haplotypes[k], J);
if (p[0] == 0 ) {
ch[hap] += 1;
}else{
hap = k;
ch[hap] = 1;
}
}
int max = 0;
for(k=0; k= max){
max = ch[k];
hap = k;
}
}
quality = ch[hap]/(double)HISTORY;
for(j=0;jh = (unsigned short int*)malloc(J*sizeof(unsigned short int));
all_haplo[i*HISTORY+j]->hi = ass_hist[i][j]->hi;
all_haplo[i*HISTORY+j]->step = ass_hist[i][j]->step;
all_haplo[i*HISTORY+j]->count = 1;
memcpy(all_haplo[i*HISTORY+j]->h,ass_hist[i][j]->h,J*sizeof(unsigned short int));
}
}
qsort(all_haplo,n*HISTORY,sizeof(hnode_single*),compare_hnss_seq);
hap = 0;
for(i=1; ih,all_haplo[i-1]->h,J);
if(p[0] == 0) {
all_haplo[hap]->count++;
all_haplo[i]->count = 0;
}else{
hap = i;
}
}
if(hcount > 0) {
qsort(all_haplo,n*HISTORY,sizeof(hnode_single**),compare_hnss_count);
}else{
hcount = n*HISTORY;
}
hap=0;
fp = fopen(filename,"w");
for(i=0; icount > 0) {
fprintf(fp,">haplotype_%04d|%.9lf\n",hap,(double)(1.0*all_haplo[i]->count)/(1.0 * n * HISTORY));
for(j=0; jh[j]]);
}
fprintf(fp,"\n");
hap++;
if(hap >= hcount)
break;
}
}
fclose(fp);
for(i=0; ih);
}
free(all_haplo);
}
int parsecommandline(int argc, char** argv){
// get command line parameters
char c;
while((c = getopt(argc, argv,"i:j:a:t:o:m:K:k:R:c:Hh")) != -1){
switch (c){
case 'i':
filein = optarg;
break;
case 'j':
iter = atoi(optarg);
break;
case 'a':
alpha = atof(optarg);
break;
case 't':
HISTORY = atoi(optarg);
break;
case 'o':
haplotype_output = optarg;
ho_flag = 1;
break;
case 'm':
ho_count = atoi(optarg);
break;
case 'K':
if(avgNK == 0.0) {
K = atoi(optarg);
}else{
printf("can't use -k and -K at same time.\n");
exit(1);
}
break;
case 'k':
if(K == 0) {
avgNK = atof(optarg);
}else{
printf("can't use -k and -K at same time.\n");
exit(1);
}
break;
case 'R':
randseed = atoi(optarg);
break;
case 'c':
write_clusterfile = 1;
clusterfile_output = optarg;
break;
default :
fprintf(stdout, "%s [options]\n"
"\n"
" files\n"
"\t-i \n"
" parameters\n"
"\t-j \n"
"\t-a \n"
"\t-K not compat. with -k\n"
"\t-k not compat. with -K\n"
"\t-t \n"
"\t-o (for debugging purposes)\n"
"\t-m \n"
"\t-c \n"
"\t-R \n"
"-----------------------------------------------------\n"
"\t-h\t\t this help!\n", argv[0]);
exit(-1);
}
}
if(HISTORY > iter)HISTORY=iter;
if(randseed == 0) randseed = time(NULL);
if( (K==0) && (avgNK <= 0.0))avgNK = default_avgNK;
return 0;
} // ends parsecommandline