/* sam_header.c -- basic SAM/BAM header API. Copyright (C) 2009-2013 Genome Research Ltd. Author: Petr Danecek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include "sam_header.h" #include #include #include #include #include #include "htslib/khash.h" KHASH_MAP_INIT_STR(str, const char *) struct _HeaderList { struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. struct _HeaderList *next; void *data; }; typedef struct _HeaderList list_t; typedef list_t HeaderDict; typedef struct { char key[2]; char *value; } HeaderTag; typedef struct { char type[2]; list_t *tags; } HeaderLine; const char *o_hd_tags[] = {"SO","GO",NULL}; const char *r_hd_tags[] = {"VN",NULL}; const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; const char *o_pg_tags[] = {"VN","CL",NULL}; const char *r_pg_tags[] = {"ID",NULL}; const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; static void debug(const char *format, ...) { va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); } #if 0 // Replaced by list_append_to_end static list_t *list_prepend(list_t *root, void *data) { list_t *l = malloc(sizeof(list_t)); l->next = root; l->data = data; return l; } #endif // Relies on the root->last being correct. Do not use with the other list_* // routines unless they are fixed to modify root->last as well. static list_t *list_append_to_end(list_t *root, void *data) { list_t *l = malloc(sizeof(list_t)); l->last = l; l->next = NULL; l->data = data; if ( !root ) return l; root->last->next = l; root->last = l; return root; } static list_t *list_append(list_t *root, void *data) { list_t *l = root; while (l && l->next) l = l->next; if ( l ) { l->next = malloc(sizeof(list_t)); l = l->next; } else { l = malloc(sizeof(list_t)); root = l; } l->data = data; l->next = NULL; return root; } static void list_free(list_t *root) { list_t *l = root; while (root) { l = root; root = root->next; free(l); } } // Look for a tag "XY" in a predefined const char *[] array. static int tag_exists(const char *tag, const char **tags) { int itag=0; if ( !tags ) return -1; while ( tags[itag] ) { if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; itag++; } return -1; } // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text // or NULL if everything has been read. The lineptr should be freed by the caller. The // newline character is stripped. static const char *nextline(char **lineptr, size_t *n, const char *text) { int len; const char *to = text; if ( !*to ) return NULL; while ( *to && *to!='\n' && *to!='\r' ) to++; len = to - text + 1; if ( *to ) { // Advance the pointer for the next call if ( *to=='\n' ) to++; else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; } if ( !len ) return to; if ( !*lineptr ) { *lineptr = malloc(len); *n = len; } else if ( *nkey[0] = name[0]; tag->key[1] = name[1]; tag->value = malloc(len+1); memcpy(tag->value,value_from,len+1); tag->value[len] = 0; return tag; } static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) { list_t *tags = hline->tags; while (tags) { HeaderTag *tag = tags->data; if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; tags = tags->next; } return NULL; } // Return codes: // 0 .. different types or unique tags differ or conflicting tags, cannot be merged // 1 .. all tags identical -> no need to merge, drop one // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) { HeaderTag *t1, *t2; if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) return 0; int itype = tag_exists(hline1->type,types); if ( itype==-1 ) { debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code } if ( unique_tags[itype] ) { t1 = header_line_has_tag(hline1,unique_tags[itype][0]); t2 = header_line_has_tag(hline2,unique_tags[itype][0]); if ( !t1 || !t2 ) // this should never happen, the unique tags are required return 2; if ( strcmp(t1->value,t2->value) ) return 0; // the unique tags differ, cannot be merged } if ( !required_tags[itype] && !optional_tags[itype] ) { t1 = hline1->tags->data; t2 = hline2->tags->data; if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments return 0; } int missing=0, itag=0; while ( required_tags[itype] && required_tags[itype][itag] ) { t1 = header_line_has_tag(hline1,required_tags[itype][itag]); t2 = header_line_has_tag(hline2,required_tags[itype][itag]); if ( !t1 && !t2 ) return 2; // this should never happen else if ( !t1 || !t2 ) missing = 1; // there is some tag missing in one of the hlines else if ( strcmp(t1->value,t2->value) ) { if ( unique_tags[itype] ) return 2; // the lines have a matching unique tag but have a conflicting tag return 0; // the lines contain conflicting tags, cannot be merged } itag++; } itag = 0; while ( optional_tags[itype] && optional_tags[itype][itag] ) { t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); if ( !t1 && !t2 ) { itag++; continue; } if ( !t1 || !t2 ) missing = 1; // there is some tag missing in one of the hlines else if ( strcmp(t1->value,t2->value) ) { if ( unique_tags[itype] ) return 2; // the lines have a matching unique tag but have a conflicting tag return 0; // the lines contain conflicting tags, cannot be merged } itag++; } if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged return 1; } static HeaderLine *sam_header_line_clone(const HeaderLine *hline) { list_t *tags; HeaderLine *out = malloc(sizeof(HeaderLine)); out->type[0] = hline->type[0]; out->type[1] = hline->type[1]; out->tags = NULL; tags = hline->tags; while (tags) { HeaderTag *old = tags->data; HeaderTag *new = malloc(sizeof(HeaderTag)); new->key[0] = old->key[0]; new->key[1] = old->key[1]; new->value = strdup(old->value); out->tags = list_append(out->tags, new); tags = tags->next; } return out; } static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) { list_t *tmpl_tags; if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) return 0; tmpl_tags = tmpl_hline->tags; while (tmpl_tags) { HeaderTag *tmpl_tag = tmpl_tags->data; HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); if ( !out_tag ) { HeaderTag *tag = malloc(sizeof(HeaderTag)); tag->key[0] = tmpl_tag->key[0]; tag->key[1] = tmpl_tag->key[1]; tag->value = strdup(tmpl_tag->value); out_hline->tags = list_append(out_hline->tags,tag); } tmpl_tags = tmpl_tags->next; } return 1; } static HeaderLine *sam_header_line_parse(const char *headerLine) { HeaderLine *hline; HeaderTag *tag; const char *from, *to; from = headerLine; if ( *from != '@' ) { debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); return 0; } to = ++from; while (*to && *to!='\t') to++; if ( to-from != 2 ) { debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); return 0; } hline = malloc(sizeof(HeaderLine)); hline->type[0] = from[0]; hline->type[1] = from[1]; hline->tags = NULL; int itype = tag_exists(hline->type, types); from = to; while (*to && *to=='\t') to++; if ( to-from != 1 ) { debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); free(hline); return 0; } from = to; while (*from) { while (*to && *to!='\t') to++; if ( !required_tags[itype] && !optional_tags[itype] ) { // CO is a special case, it can contain anything, including tabs if ( *to ) { to++; continue; } tag = new_tag(" ",from,to-1); } else tag = new_tag(from,from+3,to-1); if ( header_line_has_tag(hline,tag->key) ) debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); hline->tags = list_append(hline->tags, tag); from = to; while (*to && *to=='\t') to++; if ( *to && to-from != 1 ) { debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); return 0; } from = to; } return hline; } // Must be of an existing type, all tags must be recognised and all required tags must be present static int sam_header_line_validate(HeaderLine *hline) { list_t *tags; HeaderTag *tag; int itype, itag; // Is the type correct? itype = tag_exists(hline->type, types); if ( itype==-1 ) { debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); return 0; } // Has all required tags? itag = 0; while ( required_tags[itype] && required_tags[itype][itag] ) { if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) { debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], hline->type[0],hline->type[1]); return 0; } itag++; } // Are all tags recognised? tags = hline->tags; while ( tags ) { tag = tags->data; if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) { // Lower case tags are user-defined values. if( !(islower(tag->key[0]) || islower(tag->key[1])) ) { // Neither is lower case, but tag was not recognized. debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes } // else - allow user defined tag } tags = tags->next; } return 1; } static void print_header_line(FILE *fp, HeaderLine *hline) { list_t *tags = hline->tags; HeaderTag *tag; fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); while (tags) { tag = tags->data; fprintf(fp, "\t"); if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); fprintf(fp, "%s", tag->value); tags = tags->next; } fprintf(fp,"\n"); } static void sam_header_line_free(HeaderLine *hline) { list_t *tags = hline->tags; while (tags) { HeaderTag *tag = tags->data; free(tag->value); free(tag); tags = tags->next; } list_free(hline->tags); free(hline); } void sam_header_free(void *_header) { HeaderDict *header = (HeaderDict*)_header; list_t *hlines = header; while (hlines) { sam_header_line_free(hlines->data); hlines = hlines->next; } list_free(header); } HeaderDict *sam_header_clone(const HeaderDict *dict) { HeaderDict *out = NULL; while (dict) { HeaderLine *hline = dict->data; out = list_append(out, sam_header_line_clone(hline)); dict = dict->next; } return out; } // Returns a newly allocated string char *sam_header_write(const void *_header) { const HeaderDict *header = (const HeaderDict*)_header; char *out = NULL; int len=0, nout=0; const list_t *hlines; // Calculate the length of the string to allocate hlines = header; while (hlines) { len += 4; // @XY and \n HeaderLine *hline = hlines->data; list_t *tags = hline->tags; while (tags) { HeaderTag *tag = tags->data; len += strlen(tag->value) + 1; // \t if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) len += strlen(tag->value) + 3; // XY: tags = tags->next; } hlines = hlines->next; } nout = 0; out = malloc(len+1); hlines = header; while (hlines) { HeaderLine *hline = hlines->data; nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); list_t *tags = hline->tags; while (tags) { HeaderTag *tag = tags->data; nout += sprintf(out+nout,"\t"); if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); nout += sprintf(out+nout,"%s", tag->value); tags = tags->next; } hlines = hlines->next; nout += sprintf(out+nout,"\n"); } out[len] = 0; return out; } void *sam_header_parse2(const char *headerText) { list_t *hlines = NULL; HeaderLine *hline; const char *text; char *buf=NULL; size_t nbuf = 0; int tovalidate = 0; if ( !headerText ) return 0; text = headerText; while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) // With too many (~250,000) reference sequences the header parsing was too slow with list_append. hlines = list_append_to_end(hlines, hline); else { if (hline) sam_header_line_free(hline); sam_header_free(hlines); if ( buf ) free(buf); return NULL; } } if ( buf ) free(buf); return hlines; } void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) { const HeaderDict *dict = (const HeaderDict*)_dict; const list_t *l = dict; khash_t(str) *tbl = kh_init(str); khiter_t k; int ret; if (_dict == 0) return tbl; // return an empty (not null) hash table while (l) { HeaderLine *hline = l->data; if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) { l = l->next; continue; } HeaderTag *key, *value; key = header_line_has_tag(hline,key_tag); value = header_line_has_tag(hline,value_tag); if ( !key || !value ) { l = l->next; continue; } k = kh_get(str, tbl, key->value); if ( k != kh_end(tbl) ) debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); k = kh_put(str, tbl, key->value, &ret); kh_value(tbl, k) = value->value; l = l->next; } return tbl; } char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) { const HeaderDict *dict = (const HeaderDict*)_dict; const list_t *l = dict; int max, n; char **ret; ret = 0; *_n = max = n = 0; while (l) { HeaderLine *hline = l->data; if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) { l = l->next; continue; } HeaderTag *key; key = header_line_has_tag(hline,key_tag); if ( !key ) { l = l->next; continue; } if (n == max) { max = max? max<<1 : 4; ret = realloc(ret, max * sizeof(char*)); } ret[n++] = key->value; l = l->next; } *_n = n; return ret; } void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) { list_t *l = iter; if ( !l ) return NULL; while (l) { HeaderLine *hline = l->data; if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) { l = l->next; continue; } HeaderTag *key, *value; key = header_line_has_tag(hline,key_tag); value = header_line_has_tag(hline,value_tag); if ( !key || !value ) { l = l->next; continue; } *_key = key->value; *_value = value->value; return l->next; } return l; } const char *sam_tbl_get(void *h, const char *key) { khash_t(str) *tbl = (khash_t(str)*)h; khint_t k; k = kh_get(str, tbl, key); return k == kh_end(tbl)? 0 : kh_val(tbl, k); } int sam_tbl_size(void *h) { khash_t(str) *tbl = (khash_t(str)*)h; return h? kh_size(tbl) : 0; } void sam_tbl_destroy(void *h) { khash_t(str) *tbl = (khash_t(str)*)h; kh_destroy(str, tbl); } void *sam_header_merge(int n, const void **_dicts) { const HeaderDict **dicts = (const HeaderDict**)_dicts; HeaderDict *out_dict; int idict, status; if ( n<2 ) return NULL; out_dict = sam_header_clone(dicts[0]); for (idict=1; idictdata, out_hlines->data); if ( status==0 ) { out_hlines = out_hlines->next; continue; } if ( status==2 ) { print_header_line(stderr,tmpl_hlines->data); print_header_line(stderr,out_hlines->data); debug("Conflicting lines, cannot merge the headers.\n"); return 0; } if ( status==3 ) sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); inserted = 1; break; } if ( !inserted ) out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); tmpl_hlines = tmpl_hlines->next; } } return out_dict; } char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) { int nout = 0; char **out = NULL; *n = 0; list_t *l = (list_t *)dict; if ( !l ) return NULL; int i, ntags = 0; while ( tags[ntags] ) ntags++; while (l) { HeaderLine *hline = l->data; if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) { l = l->next; continue; } out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); for (i=0; ivalue; } nout++; l = l->next; } *n = nout; return out; }