#include #include #include #include #include "build_lmer_table.h" #include "cmd_line_opts.h" int default_l(int); /* build_lmer_table.c -- build table of l-mers */ // RMH: From version.c which in turn is created by the Makefile extern const char *Version; #define PADLENGTH 11000 /* important -- must match build_repeat_families.c */ #define HASH_SIZE 16000057 /* prime */ #define BLOCKLENGTH 1000000000 #define IUPAC(c) c == 'R' || c=='r' || c=='Y' || c=='y' || c=='M' || c=='m' || c=='K' || c=='k' || c=='W' || c=='w' || c=='S' || c=='s' || c=='B' || c=='b' || c=='D' || c=='d' || c=='H' || c=='h' || c=='V' || c=='v' int length; /* length of genome sequence */ int l; /* length of l-mers to look at. Odd or even is Ok. */ char* LMER_TABLE_FILE; /* Where to put the results... */ char* SEQUENCE_FILE; /* Where to get the sequence from... */ int VERBOSE = 0; /* How chatty should I be? Really chatty? Really really chatty? */ int TANDEMDIST = 500; /* Distance between countable instances of an l-mer. */ int SORT = 0; int MAXLENGTH; /* Length of sequence to allow. */ int MINTHRESH = 3; int START = -1; int STOP = -1; char *sequence; void usage() { fprintf(stderr, "build_lmer_table Version %s\n\n"\ "Usage:\n"\ " build_lmer_table -l -sequence -freq [opts]\n"\ " -tandem --- tandem distance window (def: 500)\n"\ " -min <#> --- smallest number of required lmers (def: 3)\n"\ " -v --- output a small amount of debugging information.\n", Version ); exit(1); } int main(int argc, char* argv[]) { time_t start; struct llist **headptr; int *sortedocc, *sortedfreq, ngoodlmers; long *sortedindex; FILE* fp; start = time(0); /* Gather command-line options... */ if( 0 == 1* co_get_string(argc, argv, "-sequence", &SEQUENCE_FILE) * co_get_string(argc, argv, "-freq", &LMER_TABLE_FILE) ) { usage(); } co_get_int(argc, argv, "-tandem", &TANDEMDIST) || (TANDEMDIST = 500); co_get_int(argc, argv, "-min", &MINTHRESH) || (MINTHRESH = 3); co_get_bool(argc, argv, "-sort", &SORT); co_get_bool(argc, argv, "-v", &VERBOSE); co_get_int(argc, argv, "-start", &START); co_get_int(argc, argv, "-stop", &STOP); fp = fopen( SEQUENCE_FILE, "ro" ); if( NULL == fp ) { fprintf(stderr, "Could not open sequence file %s\n", SEQUENCE_FILE); exit(1); } fseek(fp, 0, SEEK_END); MAXLENGTH = ftell(fp); /* This is an approximation, but an overestimate, so we're ok! */ fclose(fp); co_get_int(argc, argv, "-l", &l) || (l = default_l(MAXLENGTH)); sequence = (char *)malloc( (MAXLENGTH + PADLENGTH) * sizeof(char) ); if( NULL == sequence ) { fprintf(stderr, "Unable to allocate %d bytes for the sequence\n", MAXLENGTH + PADLENGTH ); exit(1); } /* build sequence */ length = build_sequence(sequence,SEQUENCE_FILE); /* build headptr */ if((headptr = (struct llist **) malloc(HASH_SIZE*sizeof(*headptr))) == NULL) { fprintf(stderr,"Out of memory\n"); exit(1); } if(VERBOSE) fprintf(stderr," Done allocating headptr\n"); build_headptr(headptr); if(VERBOSE) fprintf(stderr," Done building headptr\n"); /* sort headptr */ ngoodlmers = count_headptr(headptr); if(VERBOSE) fprintf(stderr," There are %d l-mers\n",ngoodlmers); if((sortedfreq = (int *) malloc(ngoodlmers*sizeof(*sortedfreq))) == NULL) { fprintf(stderr,"Out of memory\n"); exit(1); } if((sortedocc = (int *) malloc(ngoodlmers*sizeof(*sortedocc))) == NULL) { fprintf(stderr,"Out of memory\n"); exit(1); } if((sortedindex = (long *) malloc(ngoodlmers*sizeof(*sortedindex))) == NULL) { fprintf(stderr,"Out of memory\n"); exit(1); } sort_headptr(headptr, sortedfreq, sortedocc, sortedindex, ngoodlmers); if(VERBOSE) fprintf(stderr," Done sorting headptr\n"); if(ngoodlmers == 0) { fprintf(stderr,"OOPS no good lmers\n"); exit(1); } print_lmers(headptr, sortedfreq, sortedocc, sortedindex, ngoodlmers); exit(0); } /* ************************************************************** */ int build_sequence(char *sequence, char *filename) { int i, j; char c; FILE *fp; if( (fp = fopen(filename, "r")) == NULL) { fprintf(stderr,"Could not open input file %s\n", filename); exit(1); } for(i=0; i') { for(j=0; (getc(fp) != '\n') && !feof(fp); j++) ; } else { if(c > 64) { sequence[i+PADLENGTH] = char_to_num(c); i++; } for(j=0; ((c = getc(fp)) != '\n') && !feof(fp) && (i 64) { sequence[i+PADLENGTH] = char_to_num(c); i++; } } } } fclose(fp); return (i+PADLENGTH); /* length of genome */ } void build_headptr(struct llist **headptr) { int h, i; int beg = START; int end = STOP; struct llist *tmp; if( START == -1 ) { beg = 0; } if( STOP == -1 ) { end = length - l + 1; } /* set each head pointer *headptr to NULL */ for(h=0; h0)) trim_headptr(headptr); if( !(i%100000) ) { fprintf(stderr, "%d \r", i); } /* processing the l-mer sequence[i], ..., sequence[i+l-1] */ h = hash_function(sequence+i); if(h<0) continue; tmp = headptr[h]; while(tmp != NULL) { if(lmermatch(sequence+(tmp->lastplusocc),sequence+i) == 1) /* forward match */ { if(i - tmp->lastplusocc >= TANDEMDIST) { tmp->freq += 1; } tmp->lastplusocc = i; break; } else if(lmermatchrc(sequence+(tmp->lastplusocc),sequence+i) == 1) /* rc match */ { if(i - tmp->lastminusocc >= TANDEMDIST) { tmp->freq += 1; } tmp->lastminusocc = i; break; } tmp = tmp->next; } /* add new guys */ if(tmp != NULL) continue; /* found l-mer, don't need to add in */ if((tmp = (struct llist *) malloc(sizeof(*tmp))) == NULL) { fprintf(stderr,"Out of memory at i=%d\n",i); exit(1); } tmp->lastplusocc = i; tmp->lastminusocc = -1000000; tmp->freq = 1; tmp->next = headptr[h]; /* either NULL, or some other l-mer with hash h */ headptr[h] = tmp; } trim_headptr(headptr); } void trim_headptr(struct llist **headptr) { int h; struct llist *tmp, *prevtmp, *nexttmp; /* remove l-mers with freq < MINTHRESH */ for(h=0; hfreq >= MINTHRESH) { /* don't remove tmp */ prevtmp = tmp; tmp = tmp->next; continue; } /* remove tmp */ nexttmp = tmp->next; free(tmp); tmp = nexttmp; if(prevtmp == NULL) /* first guy in linked list */ headptr[h] = tmp; else prevtmp->next = tmp; } } } int count_headptr(struct llist **headptr) { int n, h; struct llist *tmp; n = 0; for(h=0; hnext; } } return n; } void sort_headptr(struct llist **headptr, int *sortedfreq, int *sortedocc, long *sortedindex, int ngoodlmers) { int n, h; int *order; struct llist *tmp; void q_sort(int *sortedfreq, int *sortedocc, long *sortedindex, int *order, int left, int right); long compute_index (char *this); char num_to_char(char z); if((order = (int *) malloc(ngoodlmers*sizeof(*order))) == NULL) { fprintf(stderr,"Out of memory\n"); exit(1); } n = 0; for(h=0; hlastplusocc; sortedfreq[n] = tmp->freq; sortedindex[n] = compute_index(sequence+tmp->lastplusocc); n++; tmp = tmp->next; } } for(n=0; n= origfreq) && (left < right)) left++; if (left != right) { sortedfreq[right] = sortedfreq[left]; order[right] = order[left]; sortedocc[right] = sortedocc[left]; sortedindex[right] = sortedindex[left]; right--; } } sortedfreq[left] = origfreq; order[left] = origorder; sortedocc[left] = origocc; sortedindex[left] = origindex; pivot= left; left = l_hold; right = r_hold; if (left < pivot) q_sort(sortedfreq, sortedocc, sortedindex, order, left, pivot-1); if (right > pivot) q_sort(sortedfreq, sortedocc, sortedindex, order, pivot+1, right); } /* ************************************************************************ */ int hash_function(char *lmer) /* IS symmetric wrt reverse complements */ { int x, ans, ans2; for(x=0; x ans) ans = ans2; return ans; } int lmermatch(char *lmer1, char *lmer2) /* forward match */ { int x; for(x=0; x rcthis[x]) { for(x=0; x%c\n",c,c); exit(1); } char num_to_char(char z) { if(z == 0) return (char) 'A'; if(z == 1) return (char) 'C'; if(z == 2) return (char) 'G'; if(z == 3) return (char) 'T'; return (char) 'N'; } /* * Set l = ceil( 1 + log(\sum(g.segments[*].length)/log(4) ); */ int default_l(int len) { return ceil( 1 + log(len) / log(4) ); }