/****************************************************************************** ##### ###### ## ##### ###### ##### #### # # # # # # # # # # # # # # ##### # # # # ##### # # # ##### # ###### # # # ##### ### # # # # # # # # # # # ### # # # # ###### # # ##### ###### # # ### #### ******************************************************************************/ /* This file is part of MAPMAKER 3.0b, Copyright 1987-1992, Whitehead Institute for Biomedical Research. All rights reserved. See READ.ME for license. */ #define INC_LIB #define INC_SHELL #define INC_EQN #include "mapm.h" RAW_DATA raw; int original_markers; #define end_of_text(fp) feof(fp) int data_line; /* Remember to reset when you open the data file! */ /* for the BADDATA message */ int BADDATA_line_num; char *BADDATA_text; char *BADDATA_reason; char *badstr; int read_data_file_header(); int read_raw_file_header(); FILE *start_save_to_file(); void finish_save_to_file(); bool read_magic_number(); void write_magic_number(); int new_magic_number(); void new_read_f2_data(); void allocate_f2_data(); void free_f2_data(); void free_traits(); void new_read_f2_locus(); void write_f2_data(); void new_read_f2_raw(); void read_raw_f2_locus(); void read_raw_trait(); void write_traits(); bool uppercase_genotypes; /* set by read_raw_header() for read_raw_f2_locus()*/ real **trait=NULL; char **trait_name=NULL; int num_traits=0; #define MISSING_PHENO -100000.0 #define F2_ONLY \ "only experimental data may be loaded into this version of MAPMAKER" #define LOADING_FROM "loading %s data from file '%s'... " #define SAVING_TO "saving %s data in file '%s'... " #define NOT_SAVING_TO "can't write file '%s'... " #define NOT_LOADING_FROM "%s data in file '%s' is old... not loading\n" #define OLD_FILENUM "\ This file was prepared using an older version of MAPMAKER.\n\ You will need to re-prepare it using the 'prepare data' command." void data_init() { strcpy(raw.filename,""); raw.data_type= NO_DATA; setmsg(BADDATA,"error in data file",sender,strmsg_default); array(BADDATA_text,MAXLINE+1,char); array(BADDATA_reason,MAXLINE+1,char); array(badstr,LINE+1,char); } void getdataln(fp) /* get next nonblank,noncomment data file line */ FILE *fp; { do { fgetln(fp); data_line++; } while (nullstr(ln)||ln[0]=='#'); } void baddata(reason) /* send data reading error message */ char *reason; /* NOTE: should be a constant or a global */ { nstrcpy(BADDATA_reason,reason,MAXLINE); BADDATA_line_num=data_line; nstrcpy(BADDATA_text,ln,MAXLINE); send(BADDATA); } int data_loaded() { return (raw.data_type!=NO_DATA); } char *data_info(add_nums) bool add_nums; { char *str1= get_temp_string(), *str2= get_temp_string(); #ifdef HAVE_CEPH if (raw.data_type==CEPH) { if (add_nums) sf(str,"CEPH data (%d families, %d loci)", raw.data.ceph.num_families,raw.num_markers); else strcpy(str,"CEPH data"); return(str); } #endif /* else */ switch (raw.data.f2.cross_type) { case F2_INTERCROSS: strcpy(str1,"F2 intercross data "); break; case F2_BACKCROSS: strcpy(str1,"F2 backcross data "); break; case RI_SIB: strcpy(str1,"RI (sib-mating) data "); break; case RI_SELF: strcpy(str1,"RI (selfing) data "); break; case F3_SELF: strcpy(str1,"F3 intercross (selfing) data"); break; default: send(CRASH); } if (!add_nums) return(str1); else { sprintf(str2,"%s (%d individuals, %d loci)",str1, raw.data.f2.num_indivs,raw.num_markers); return(str2); } } FILE *start_save_to_file(name,ext,type,exists) char *name, *ext, *type; bool *exists; { FILE *fp; char tmpname[PATH_LENGTH+1]; make_filename(name,FORCE_EXTENSION,ext); strcpy(tmpname,name); make_filename(tmpname,FORCE_EXTENSION,TEMP_EXT); fp=NULL; *exists=FALSE; run { fp=open_file(name,READ); *exists=TRUE; close_file(fp); } trapping(CANTOPEN) {} fp=NULL; run { fp=open_file(tmpname,WRITE); sf(ps,SAVING_TO,type,name); pr(); flush(); } except_when(CANTOPEN) { sf(ps,NOT_SAVING_TO,name); pr(); nl(); abort_command(); } return(fp); } void finish_save_to_file(name,oldext,exists) char *name, *oldext; bool exists; { char tmpname[PATH_LENGTH+1], oldname[PATH_LENGTH+1]; strcpy(tmpname,name); make_filename(tmpname,FORCE_EXTENSION,TEMP_EXT); strcpy(oldname,name); make_filename(oldname,FORCE_EXTENSION,oldext); if (exists) { rename_file(name,oldname); } rename_file(tmpname,name); print("ok\n"); } /**** Do Real Work Now ****/ void do_load_data(fp,filename,israw) FILE *fp; char *filename; bool israw; { char name[PATH_LENGTH+1], symbols[7], type[TOKLEN+1]; FILE *fp2; int ntype; if (data_loaded()) send(CRASH); num_traits=0; trait=NULL; trait_name=NULL; sf(ps,"%sing data from file '%s'... ",(israw ? "prepar":"load"),filename); pr(); flush(); if (!israw) ntype=read_data_file_header(fp,filename); /* sets raw.stuff */ else ntype=read_raw_file_header(fp,filename,symbols); /* ditto */ if (ntype==F2) { /* read_f2_frob allocates the matrix first */ if (!israw) new_read_f2_data(fp); else new_read_f2_raw(fp,symbols); } #ifndef HAVE_CEPH else baddata(F2_ONLY); #else else read_ceph_data(fp); allocate_sex_choose(); #endif print("ok\n "); print(data_info(TRUE)); /*print(".");*/ flush(); strcpy(raw.filename,filename); raw.data_type=ntype; /* got it */ strcpy(name,filename); reset_state(); allocate_order_data(raw.num_markers); /* print("."); flush(); */ allocate_seq_stuff(raw.num_markers); print("."); flush(); allocate_mapping_data(raw.num_markers); print("."); flush(); allocate_two_pt(raw.num_markers); /* print("."); flush(); */ allocate_three_pt(raw.num_markers); print("."); flush(); allocate_hmm_temps(raw.num_markers,raw.data.f2.num_indivs, raw.data.f2.cross_type); print(" ok\n"); make_filename(name,FORCE_EXTENSION,MAPS_EXT); strcpy(type,"map"); run { fp2=NULL; fp2=open_file(name,READ); if (read_magic_number(fp2,type)) { sf(ps,LOADING_FROM,type,name); pr(); flush(); read_order_data(fp2); read_mapping_data(fp2); read_status(fp2); print("ok\n"); } else { sf(ps,NOT_LOADING_FROM,type,name); pr(); } close_file(fp2); } except_when(CANTOPEN) { } /* need a better handler */ make_filename(name,FORCE_EXTENSION,TWO_EXT); strcpy(type,"two-point"); run { fp2=NULL; fp2=open_file(name,READ); if (read_magic_number(fp2,type)) { sf(ps,LOADING_FROM,type,name); pr(); flush(); read_two_pt(fp2); print("ok\n"); } else { sf(ps,NOT_LOADING_FROM,type,name); pr(); } close_file(fp2); } except_when(CANTOPEN) { } /* need a better handler */ make_filename(name,FORCE_EXTENSION,THREE_EXT); strcpy(type,"three-point"); run { fp2=NULL; fp2=open_file(name,READ); if (read_magic_number(fp2,type)) { sf(ps,LOADING_FROM,type,name); pr(); flush(); read_three_pt(fp2); print("ok\n"); } else { sf(ps,NOT_LOADING_FROM,type,name); pr(); } close_file(fp2); } except_when(CANTOPEN) { } /* need a better handler */ } void do_unload_data() { if (raw.data_type==F2) free_f2_data(raw.num_markers,raw.data.f2.num_indivs); #ifdef HAVE_CEPH else free_ceph_data(raw.num_markers,raw.data.ceph,num_families); free_sex_choose(); #endif strcpy(raw.filename,""); raw.data_type=NO_DATA; /* use cross_type below */ undo_state(); free_order_data(raw.num_markers); free_mapping_data(raw.num_markers); free_seq_stuff(raw.num_markers); free_two_pt(raw.num_markers); free_three_pt(raw.num_markers); free_hmm_temps(raw.num_markers,raw.data.f2.num_indivs, raw.data.f2.cross_type); } void do_save_data(base_name,save_genos_too) char *base_name; bool save_genos_too; { FILE *fp=NULL; char name[PATH_LENGTH+1]; bool exists; run { strcpy(name,base_name); if (save_genos_too) { fp= start_save_to_file(name,DATA_EXT,"genotype",&exists); write_f2_data(fp); /* deals with header and magic number */ close_file(fp); fp=NULL; finish_save_to_file(name,DATA_OLD,exists); } fp= start_save_to_file(name,MAPS_EXT,"map",&exists); write_magic_number(fp,"map"); write_order_data(fp); write_mapping_data(fp); write_status(fp); close_file(fp); fp=NULL; finish_save_to_file(name,MAPS_OLD,exists); if (two_pt_touched) { fp= start_save_to_file(name,TWO_EXT,"two-point",&exists); write_magic_number(fp,"two-point"); write_two_pt(fp); close_file(fp); fp=NULL; finish_save_to_file(name,TWO_OLD,exists); } if (three_pt_touched) { fp= start_save_to_file(name,THREE_EXT,"three-point",&exists); write_magic_number(fp,"three-point"); write_three_pt(fp); close_file(fp); fp=NULL; finish_save_to_file(name,THREE_OLD,exists); } if (num_traits>0) { fp= start_save_to_file(name,TRAIT_EXT,"traits",&exists); write_magic_number(fp,"trait"); write_traits(fp); close_file(fp); fp=NULL; finish_save_to_file(name,TRAIT_OLD,exists); free_traits(); } two_pt_touched= FALSE; three_pt_touched= FALSE; } on_error { if (fp!=NULL && msg!=CANTCLOSE) close_file(fp); relay; } } /**** Lower level stuff ****/ int read_data_file_header(fp,filename) FILE *fp; char *filename; { int type=NO_DATA; if (data_loaded() || fp==NULL) send(CRASH); getdataln(fp); if (streq(ln,"prepared data f2 intercross")) { type=F2; raw.data.f2.cross_type=F2_INTERCROSS; } else if (streq(ln,"prepared data f2 backcross")) { type=F2; raw.data.f2.cross_type=F2_BACKCROSS; } else if (streq(ln,"prepared data f2 ri-sib")) { type=F2; raw.data.f2.cross_type=RI_SIB; } else if (streq(ln,"prepared data f2 ri-self")) { type=F2; raw.data.f2.cross_type=RI_SELF; } else if (streq(ln,"prepared data f3")) { type=F2; raw.data.f2.cross_type=F3_SELF; #ifdef HAVE_CEPH } else if (streq(ln,"data type ceph")) { type=CEPH; #endif } else baddata("can't recognize the data type"); if (type!=F2) send(CRASH); fscanf(fp,"%d %d %d",&raw.filenumber,&raw.data.f2.num_indivs, &raw.num_markers); if (raw.filenumber%10 != F2VERSION) baddata(OLD_FILENUM); return(type); } #define OLD_CHROM_WARN \ "warning: chromosome definitions in the raw file are ignored by this version\n\ of MAPMAKER. Type 'help prepare data' for instructions on how to accomplish\n\ the same thing using a '.prep' file." #define SYM_SYMBOLS "found 'symbols' without genotype symbols" #define SYM_FORMAT \ "the format of the genotype symbol definitions is wrong\n\ Type 'help data formats' for instructions." #define SYM_REPEAT "a genotype symbol is defined more than once" #define OK_SYMBOLS \ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890+-" int read_raw_file_header(fp,filename,symbol) FILE *fp; char *filename, *symbol; { int type, n_indivs, n_loci, n_chroms, n_traits, i, n, first; char str[TOKLEN+1], *dflt, user[2]; if (data_loaded() || fp==NULL) send(CRASH); getdataln(fp); while(nullstr(ln)); crunch(ln); if (streq(ln,"data type f2 intercross")) { type=F2; raw.data.f2.cross_type=F2_INTERCROSS; dflt=ptr_to("-AHBCD"); } else if (streq(ln,"data type f3 intercross")) { type=F2; raw.data.f2.cross_type=F3_SELF; dflt=ptr_to("-AHBCD"); } else if (streq(ln,"data type f2 backcross")) { type=F2; raw.data.f2.cross_type=F2_BACKCROSS; dflt=ptr_to("-AH"); } else if (streq(ln,"data type ri sib")) { type=F2; raw.data.f2.cross_type=RI_SIB; dflt=ptr_to("-AB"); } else if (streq(ln,"data type ri self")) { type=F2; raw.data.f2.cross_type=RI_SELF; dflt=ptr_to("-AB"); #ifdef HAVE_CEPH } else if (streq(ln,"data type ceph")) { raw.data_type=CEPH; #endif } else baddata("bad header first line - can't recognize data type"); if (type!=F2) send(CRASH); do getdataln(fp); while(nullstr(ln)); /* NOT crunched */ if (!(itoken(&ln,iREQUIRED,&n_indivs) && n_indivs > 0 && itoken(&ln,iREQUIRED,&n_loci) && n_loci > 0 && itoken(&ln,iREQUIRED,&n_traits) && n_traits >= 0)) error("bad #individuals, #loci or #traits"); uppercase_genotypes=TRUE; first=TRUE; user[1]='\0'; if (len(dflt)==3) strcpy(symbol," "); else strcpy(symbol," "); if (!nullstr(ln)) { /* symbol definitions, old chrom#, case thing */ if (itoken(&ln,iREQUIRED,&n_chroms)) if (n_chroms>0) { print(OLD_CHROM_WARN); } /* BIG KLUDGE BECAUSE SELF_DELIMITING INCLUDES '=' */ while (!nullstr(ln) && sscanf(ln,"%s",str)==1) { while (!white(ln[0]) && ln[0]!='\0') ln++; while (white(ln[0]) && ln[0]!='\0') ln++; if (first && xstreq(str,"case")) { uppercase_genotypes=FALSE; continue; } if (first && xstreq(str,"symbols") || xstreq(str,"symbols:")) { if (nullstr(ln)) baddata(SYM_SYMBOLS); else continue; } first=FALSE; if (len(str)!=3 || str[1]!='=' || !strin(OK_SYMBOLS,str[0]) || !strin(dflt,str[2])) baddata(SYM_FORMAT); switch (str[2]) { case '-': n=0; break; case 'A': case 'a': n=1; break; case 'H': case 'h': n=2; break; case 'B': case 'b': n=3; break; case 'C': case 'c': n=4; break; case 'D': case 'd': n=5; break; default: baddata(SYM_FORMAT); } user[0]=str[0]; if (uppercase_genotypes) uppercase(user); /* trick for RIs, want to use -AB as default but convert it to -AH surreptitiously */ if (n==3 && (raw.data.f2.cross_type==RI_SIB || raw.data.f2.cross_type==RI_SELF)) n=2; if (strin(symbol,*user) || symbol[n]!=' ') baddata(SYM_REPEAT); symbol[n]=user[0]; } } for (i=0; symbol[i]!='\0'; i++) if (symbol[i]!=' ') continue; else if (strin(symbol,dflt[i])) baddata(SYM_REPEAT); else symbol[i]=dflt[i]; raw.num_markers= n_loci; raw.data.f2.num_indivs= n_indivs; raw.filenumber= new_magic_number(); num_traits= n_traits; return(type); } bool read_magic_number(fp,type) FILE *fp; char *type; /* no spaces allowed, must parse with %s */ { int magic_number; char id[TOKLEN+1]; getdataln(fp); if (sscanf(ln,"%d %*s %s %*s",&magic_number,id)!=2) return(FALSE); else if (magic_number!=raw.filenumber) return(FALSE); else if (!streq(id,type)) return(FALSE); else return(TRUE); } void write_magic_number(fp,type) FILE *fp; char *type; { fprintf(fp,"%d mapmaker %s data\n",raw.filenumber,type); } int new_magic_number() { return(((int)(randnum()*3275.0))*10 + F2VERSION); } /**** Actually Get the Genotypes ****/ void allocate_f2_data(n_loci,n_indivs) int n_loci, n_indivs; { matrix(raw.locus_name,n_loci,NAME_LEN+1,char); matrix(raw.data.f2.allele,n_loci,n_indivs,char); matrix(raw.data.f2.allelic_distribution,n_loci,4,real); if (num_traits>0) { matrix(trait,num_traits, raw.data.f2.num_indivs,real); matrix(trait_name,num_traits,NAME_LEN+1,char); } } void free_f2_data(n_loci,n_indivs) int n_loci, n_indivs; { unmatrix(raw.locus_name, n_loci, char); unmatrix(raw.data.f2.allele, n_loci, char); unmatrix(raw.data.f2.allelic_distribution, n_loci, real); raw.data_type=NO_DATA; raw.filename[0]='\0'; /* Just for sanity, although if this happens we get a big memory leak */ num_traits=0; } void free_traits() { if (num_traits>0) { unmatrix(trait,num_traits,real); unmatrix(trait_name,num_traits,char); num_traits=0; } } void new_read_f2_data(fp) FILE *fp; /* sends BADDATA, or MALLOC, INTERRUPT etc. if an error */ { int j; run { allocate_f2_data(raw.num_markers,raw.data.f2.num_indivs); for (j=0; j 0 ? locus_num-1 :0]); baddata(badstr); } if (len(name)<2) { sf(badstr,"expected locus name after '*', following locus %s\n", raw.locus_name[locus_num > 0 ? locus_num-1 : 0]); baddata(badstr); } if (!strin(NAME_FIRST_CHARS,name[1])) { sf(badstr,"illegal first character in locus name %s\n",name+1); baddata(badstr); } for (i=2; name[i]!='\0'; i++) { if (name[i]=='-') name[i]='_'; else if (!strin(NAME_CHARS,name[i])) { sf(badstr,"illegal character '%c' in locus name %s", name[i],name+1); baddata(badstr); } } /* make sure marker name is not a duplicate */ for (j=0; j 0 ? trait_num-1 : 0]); baddata(name); } if ((templen=len(tempstr)) < 2) { sf(badstr,"expected trait name after '*', following trait %s\n", trait_name[trait_num > 0 ? trait_num-1 : 0]); baddata(badstr); } if (!strin(NAME_FIRST_CHARS,tempstr[1])) { sf(badstr,"illegal first character in trait name %s\n",tempstr); baddata(badstr); } for (i=2; i