/****************************************************************************** #### ##### # # #### # # # ## ## # # # # # ## # # # # # # ### # # # # # # ### # # #### # # # ### #### ******************************************************************************/ /* This file is part of MAPMAKER 3.0b, Copyright 1987-1992, Whitehead Institute for Biomedical Research. All rights reserved. See READ.ME for license. */ /************** Hiden Markov Model based Converge to Map for F2 **************/ #define INC_LIB #define INC_SHELL #include "mapm.h" /**** statics all setup by setup_hmm() - arrays are freed by free_hmm() ****/ int n_indivs, n_loci, n_intervals; int max_observations, max_states; int **n_poss_states, ***poss_state; /* [indiv][locus],[indiv][locus][#] */ real ***trans_prob, ***exp_transitions; /* [interval][from_state][to_state] */ real ***implied_recs, ***implied_norecs; /* [interval][from_state][to_state] */ real ***obs_prob, ***exp_observations; /* [locus][observation/MD][state] */ bool *obs_prob_fixed; /* [locus] - if all are fixed then exp_obs==NULL */ int **observation; /* [indiv][locus] => OBS_ code */ int *the_observation; /* [indiv] a temp used only in f2_setup_hmm */ real **apriori_prob; /* [indiv][state] - for leftmost locus */ int no_data; /* index [indiv] into apriori_prob for right locus */ real *e_step_temp; /* [state]- used in LCP and RCP calculations */ real ***exp_genotype; /* [indiv][locus][state] for ERROR-KLUDGE */ int *observations; /* [indiv] externed for use as a temp */ void hmm_converge_to_map(); void (*hmm_set_probs)(); bool error_kludge; /* TRUE for ERROR-KLUDGE, e.g. error_rate>0 */ bool separate_recs; /* TRUE if distinct implied_rec/norec entries for each interval */ typedef struct { real recs; /* for now */ real norecs; } SUFF_STATS; SUFF_STATS **sufficient_stats; /* [locus] => ptr to struct */ /* Internals for HMM E-Step (obs stuff moved to toplevel,h) */ real **left_prob, **right_prob; /* [locus#][state#] */ real ***indiv_transitions; /* [interval][from_state][to_state] */ real ***indiv_observations; /* [locus][observation][state], maybe NULL */ int obligate_recs[6][6]= { /* externed in toplevel.h */ /* - A H B C D */ { 0, 0, 0, 0, 0, 0 }, /* - */ { 0, 0, 1, 2, 1, 0 }, /* A */ { 0, 1, 0, 1, 0, 0 }, /* H */ { 0, 2, 1, 0, 0, 1 }, /* B */ { 0, 1, 0, 0, 0, 0 }, /* C */ { 0, 0, 1, 0, 0, 0 } /* D */ }; #define HMM_MAX_THETA 0.4995 #define HMM_MIN_THETA 0.0004 /* F2 backcross/intercross, RI states */ #define STATE_A 0 #define STATE_B 1 #define STATE_H 2 /* intercross only */ int ri_type; /* RI_SELF, RI_SIB, or RI_NOT */ /**** F3 specific stuff Follows: ****/ #define A 0 /* F2 States... */ #define H 1 #define J 2 #define B 3 /* States for one F3 chromosome (grand- maternal or paternal origin) */ #define M 0 #define P 1 /* There are 4 F2 states and 16 hidden F2/F3 states: */ int A_AMAM, A_AMAP, A_APAM, A_APAP, H_AMAM, H_AMBP, H_BPAM, H_BPBP; int J_BMBM, J_BMAP, J_APBM, J_APAP, B_BMBM, B_BMBP, B_BPBM, B_BPBP; /* These are temps for setup_f3_self_hmm(), although f2_recs is also used in hmm_f3_self_set_probs() */ int f3_states, mat_chrom[16], pat_chrom[16], f2_state[16]; real f2_recs[4][4]; int f3_state(); /**** The internal routines ****/ real hmm_e_step(); void hmm_count_stats(), hmm_make_new_map(); void setup_hmm(); void setup_bc_like_hmm(), hmm_bc_like_set_probs(); void setup_f2_hmm(), hmm_f2_set_probs(); void setup_f3_self_hmm(), hmm_f3_self_set_probs(); void test_dump(), test_dump0(); real null_like; bool twopt_kludge; MAP *map2; void count_error_lods(); void hmm_fake_converge_to_map(); /**** External Routines, mostly ****/ void allocate_hmm_temps(total_loci,num_indivs,cross_type) int total_loci, num_indivs, cross_type; { int i, num_states, num_observations, num_loci, num_intervals; bool special_recs_hack; switch (cross_type) { case F2_BACKCROSS: num_states=2; num_observations=3; break; case F2_INTERCROSS: num_states=3; num_observations=6; break; case RI_SELF: num_states=2; num_observations=3; break; case RI_SIB: num_states=2; num_observations=3; break; case F3_SELF: num_states=16; num_observations=6; break; default: send(CRASH); } num_loci=MAX_MAP_LOCI; num_intervals=num_loci-1; special_recs_hack= (cross_type=F2_INTERCROSS ? TRUE:FALSE); parray(sufficient_stats,num_loci,SUFF_STATS); matrix(left_prob,num_loci,num_states,real); matrix(right_prob,num_loci,num_states,real); indiv_observations=NULL; array(trans_prob,num_intervals,real**); array(exp_transitions,num_intervals,real**); array(indiv_transitions,num_intervals,real**); array(implied_recs,num_intervals,real**); array(implied_norecs,num_intervals,real**); for (i=0; inum_loci<2 || raw.data_type!=F2) send(CRASH); if (map->num_loci>MAX_MAP_LOCI) { strcpy(ps,"too many loci in one map"); error(ps); } setup_hmm(map); /* set statics above, and set the initial parameters */ if (fake_maps) hmm_fake_converge_to_map(map); else hmm_converge_to_map(map); /* else old_converge_to_map(map); */ } void f2_genotype(locus,haplo,observation) int locus; bool haplo; /* merge haplotypes */ int *observation; /* array of raw.num_indivs observations */ { int j; if (raw.data_type!=F2) send(CRASH); if (haplo && haplotyped(locus)) locus=haplo_first[locus]; /* go to the head of the list */ for (j=0; j=3 ? 1:0); /* just debugging vars */ for (iter=0; TRUE; ++iter) { (*hmm_set_probs)(map,trans_prob,obs_prob,implied_recs); old_like= new_like; new_like= hmm_e_step(map,trans_prob,obs_prob,exp_transitions, exp_observations); if (n_loci==2 && iter==0) null_like=new_like; /* sf(ps,"iter: %2d like=%lf thetas: %lf %lf\n",iter,new_like, map->rec_frac[0][0],map->rec_frac[one][0]); pr(); */ if (testing) { /********** DEBUGGING CODE **********/ test_dump(obs_prob,max_observations,"obs_prob"); test_dump(implied_recs,max_states,"implied_recs"); test_dump(trans_prob,max_states,"trans_prob"); test_dump(exp_transitions,max_states,"from->to exp_trans: "); } /********** END OF DEBUGGING CODE **********/ if (fabs(new_like - old_like) < tolerance) break; /* converged! */ hmm_count_stats(exp_transitions,exp_observations,implied_recs, sufficient_stats); if (testing) { /********** MORE DEBUGGING CODE **********/ #define ss sufficient_stats sf(ps,"ss: R/NR %lf N %lf | %lf %lf\n",ss[0]->recs,ss[0]->norecs, ss[one]->recs,ss[one]->norecs); pr(); } /********** END OF DEBUGGING CODE **********/ hmm_make_new_map(sufficient_stats,new_like,map); } /* iterate */ if (error_kludge) count_error_lods(map->error_rate,obs_prob,exp_genotype, map->error_lod); /* if (n_loci==2) { map->rec_frac[0][0]= 0.499999; (*hmm_set_probs)(map,trans_prob,obs_prob,implied_recs); null_like= hmm_e_step(map,trans_prob,obs_prob,exp_transitions, exp_observations); } */ } void setup_hmm(map) MAP *map; { int i, type; n_loci=map->num_loci; n_intervals=n_loci-1; n_indivs=raw.data.f2.num_indivs; no_data=raw.data.f2.num_indivs; /* extra index into apriori_prob */ error_kludge= twopt_kludge= FALSE; ri_type=RI_NOT; type=raw.data.f2.cross_type; if (type==F2_INTERCROSS) setup_f2_hmm(map->locus,map->error_rate); else if (type==F3_SELF) setup_f3_self_hmm(map->locus); else if (type==F2_BACKCROSS || type==RI_SELF || type==RI_SIB) setup_bc_like_hmm(map->locus,map->error_rate,type); else send(CRASH); } void setup_bc_like_hmm(locus,error_rate,cross_type) int *locus; /* The locus numbers in order */ double *error_rate; int cross_type; { int i, j, k, obs; real total, **recs, **norecs, error_prob; max_states=2; /* STATE_A or _B */ max_observations=3; /* OBS_A, _H, or _MISSING (yup, this is odd for BC) */ separate_recs=FALSE; hmm_set_probs=hmm_bc_like_set_probs; if (error_rate!=NULL) error_kludge=TRUE; if (cross_type==F2_BACKCROSS) ri_type=RI_NOT; else if (cross_type==RI_SELF) ri_type=RI_SELF; else if (cross_type==RI_SIB) ri_type=RI_SIB; else send(CRASH); error_prob=0.0; for (i=0; i0.0) { poss_state[j][i][k++]=STATE_A; poss_state[j][i][k++]=STATE_B; } else if (obs==OBS_A) { poss_state[j][i][k++]=STATE_A; } else if (obs==OBS_H) { poss_state[j][i][k++]=STATE_B; } else send(CRASH); n_poss_states[j][i]=k; } } for (i=0; i0.0) { poss_state[j][i][k++]= STATE_A; poss_state[j][i][k++]= STATE_B; poss_state[j][i][k++]= STATE_H; } else if (obs==OBS_NOT_A) { poss_state[j][i][k++]= STATE_B; poss_state[j][i][k++]= STATE_H; } else if (obs==OBS_NOT_B) { poss_state[j][i][k++]= STATE_A; poss_state[j][i][k++]= STATE_H; } else if (obs==OBS_A) { poss_state[j][i][k++]= STATE_A; } else if (obs==OBS_B) { poss_state[j][i][k++]= STATE_B; } else if (obs==OBS_H) { poss_state[j][i][k++]= STATE_H; } else send(CRASH); n_poss_states[j][i]=k; } } for (i=0; irec_frac[i][0]; if (ri_type==RI_NOT) rec=theta; else if (ri_type==RI_SELF) rec=(2.0*theta)/(1.0+2.0*theta); else if (ri_type==RI_SIB) rec=(4.0*theta)/(1.0+6.0*theta); else send(CRASH); /* From Haldane and Wadington (inverse of y(x)) */ prob=trans_prob[i]; norec=1.0-rec; prob[STATE_A][STATE_A]=norec; prob[STATE_A][STATE_B]=rec; prob[STATE_B][STATE_A]=rec; prob[STATE_B][STATE_B]=norec; } /* For BC/RI we do not need to set any implied_recs/norecs - all entries are constant and are initialized by setup_bc1_hmm(). For RI this means that we are counting observed genotypic classes (as rec/norec) not really the implied_recs. See make_new_map. */ /* Set obs_probs based on penetrance, etc. - not implemented yet exp_observations!=NULL means that obs_probs are not all fixed */ if (exp_observations!=NULL) send(CRASH); } void hmm_f2_set_probs(map,trans_prob,obs_prob,implied_recs) MAP *map; real ***trans_prob; /* [interval][from_state][to_state] */ real ***obs_prob; /* [locus][observation/MD][state] side-effected */ real ***implied_recs; /* [interval][from_state][to_state] side-effected */ /* for now, assume obs_probs are fixed and initialized by setup_hmm() */ { int i, k, j; real theta, one_minus_theta, no_recs, one_rec, two_recs; real **prob, recs, total; /* set the trans_probs (right geno, conditional on left) given theta */ for (i=0; irec_frac[i][0]; one_minus_theta= 1.0-theta; no_recs= one_minus_theta * one_minus_theta; one_rec= theta * one_minus_theta; two_recs= theta * theta; prob= trans_prob[i]; prob[STATE_A][STATE_A]= no_recs; prob[STATE_A][STATE_B]= two_recs; prob[STATE_A][STATE_H]= 2.0 * one_rec; prob[STATE_B][STATE_A]= two_recs; prob[STATE_B][STATE_B]= no_recs; prob[STATE_B][STATE_H]= 2.0 * one_rec; prob[STATE_H][STATE_A]= one_rec; prob[STATE_H][STATE_B]= one_rec; prob[STATE_H][STATE_H]= no_recs + two_recs; /* as an optimization we only set implied_recs[H][H] - the other entries are constant and are initialized by setup_f2_hmm() */ recs= (2.0 * two_recs)/(two_recs + no_recs); implied_recs[i][STATE_H][STATE_H]= recs; implied_norecs[i][STATE_H][STATE_H]= 2.0 - recs; } /* Set obs_probs based on penetrance, etc. - not implemented yet exp_observations!=NULL means that obs_probs are not all fixed */ if (exp_observations!=NULL) send(CRASH); } void hmm_f3_self_set_probs(map,trans_prob,obs_prob,implied_recs) MAP *map; real ***trans_prob; /* [interval][from_state][to_state] */ real ***obs_prob; /* [locus][observation/MD][state] side-effected */ real ***implied_recs; /* [interval][from_state][to_state] side-effected */ /* for now, assume obs_probs are fixed and initialized by setup_hmm() */ { int i, k, j; real rec, norec, r0, r1, r2; real **prob; /* set the trans_probs (right geno, conditional on left) given theta */ for (i=0; irec_frac[i][0]; norec= 1.0 - rec; r0= norec * norec; r1= rec * norec; r2= rec * rec; /* f2_recs[][] is really the trans_prob for the F2 generation. */ f2_recs[A][A]=r0; f2_recs[H][A]=r1; f2_recs[J][A]=r1; f2_recs[B][A]=r2; f2_recs[A][H]=r1; f2_recs[H][H]=r0; f2_recs[J][H]=r2; f2_recs[B][H]=r1; f2_recs[A][J]=r1; f2_recs[H][J]=r2; f2_recs[J][J]=r0; f2_recs[B][J]=r1; f2_recs[A][B]=r2; f2_recs[H][B]=r1; f2_recs[J][B]=r1; f2_recs[B][B]=r0; prob= trans_prob[i]; for (j=0; j ptr to struct (recs,norecs)... */ { int i, k, j; int state, left, right; real **transitions_exp, **recs_implied, **norecs_implied; real recs, norecs; if (!separate_recs) { recs_implied= implied_recs[0]; norecs_implied= implied_norecs[0]; } for (i=0; irecs= recs; sufficient_stats[i]->norecs= norecs; } } void hmm_make_new_map(sufficient_stats,new_like,map) SUFF_STATS **sufficient_stats; /* [locus] => ptr to struct (recs,norecs)... */ real new_like; MAP *map; /* for now, assume no penetrance stuff */ { int i; real recs, norecs, sum, theta, ratio; for (i=0; ifix_interval[i]) continue; sum= recs= sufficient_stats[i]->recs; sum+= norecs= sufficient_stats[i]->norecs; if (sum>=0.00001) ratio=recs/sum; else ratio=0.0; /* WHAT TO DO? */ if (ri_type==RI_NOT) theta=ratio; else if (ri_type==RI_SELF) theta=ratio/(2.0*(1.0-ratio)); else if (ri_type==RI_SIB) theta=ratio/(2.0*(2.0-3.0*ratio)); else send(CRASH); /* From Haldane and Wadington (inverse of y(x)) */ if (theta>HMM_MAX_THETA) theta=HMM_MAX_THETA; else if (thetarec_frac[i][0]=theta; } map->log_like= new_like; } real hmm_e_step(map,trans_prob,obs_prob,exp_transitions,exp_observations) MAP *map; real ***trans_prob, ***exp_transitions; /* [interval][from_state][to_state] */ real ***obs_prob, ***exp_observations; /* [locus][observation][state] */ /* for now, assume exp_observations==NULL, meaning "don't bother" */ /* return likelihood */ { int indiv, i, the_observation, lstate, rstate, state, leftk, rightk; int locus, prev, num_left_states, num_right_states, num_inf; int *the_left_state, *the_right_state; real sum, total, val, *pheno_given_geno, *the_prob, *prev_prob; real indiv_like, total_like; real **the_transitions, **my_transitions, *temp, **the_trans_prob; /* debugging */ int x, y; real r, n; /* Sometime unroll array refs in here for a little speed */ /* For the moment, we don't ned all of the indiv_transitions matrix - so we KLUDGE and use entry [0] as BOTH my_transitions, and as temp. I will fix this someday - I THINK I JUST DID IT */ temp= e_step_temp; total_like= 0.0; for (i=0; i=0; prev=locus, locus--) { num_left_states= n_poss_states[indiv][locus]; num_right_states= n_poss_states[indiv][prev]; the_left_state= poss_state[indiv][locus]; the_right_state= poss_state[indiv][prev]; the_prob= right_prob[locus]; prev_prob= right_prob[prev]; the_trans_prob= trans_prob[locus]; the_observation= observation[indiv][prev]; /* was [locus] */ pheno_given_geno= obs_prob[prev][the_observation]; /* ditto */ for (state=0; state 1 ? 2:1; for (k=0; k 1 ? 3:2; printf("=== %s ===\n ",name); for (j=0; jVERY_SMALL) error_lod[i][j]= log10(like); else error_lod[i][j]= 0.0; } } } void quick_two_pt(locus0,locus1,two_pt,sex) int locus0, locus1; TWO_PT_DATA *two_pt; bool sex; /* do both with and without sex spec */ { if (raw.data_type==F2 && (raw.data.f2.cross_type==F2_INTERCROSS || raw.data.f2.cross_type==F2_BACKCROSS)) { f2_quick_two_pt(locus0,locus1,two_pt,FALSE); #ifdef HAVE_CEPH } else if (raw.data_type==CEPH) { ceph_quick_two_pt(locus0,locus1,two_pt,FALSE); #endif } else { /* bail out - use HMM */ map2->num_loci= 2; map2->locus[0]= locus0; map2->locus[1]= locus1; map2->rec_frac[0][0]= 0.499; map2->fix_interval[0]= FALSE; /* map2->error_rate=NULL */ setup_hmm(map2); twopt_kludge=TRUE; hmm_converge_to_map(map2); two_pt->theta[NOSEX]= map2->rec_frac[0][0]; two_pt->lodscore[NOSEX]= map2->log_like - null_like; if (two_pt->lodscore[NOSEX]<0.0) two_pt->lodscore[NOSEX]=0.0; } } void hmm_fake_converge_to_map(map) MAP *map; { int i, j; for (i=0; inum_loci-1; i++) map->rec_frac[i][MALE]= randnum()/2.0; map->log_like= -100.0 - 20.0*randnum(); if (map->error_lod!=NULL) for (i=1; inum_loci-1; i++) { if (map->error_rate[i]>0.0) for (j=0; jerror_lod[i][j]= 1.0+ 4.1*randnum(); else map->error_lod[i][j]= 10.0*randnum(); } } }