/* hts_expr.c -- filter expression parsing and processing. Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notices and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // TODO: // - add maths functions. pow, sqrt, log, ? // - ?: operator for conditionals? #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include #include #include #include #include #include #include #include #include "htslib/hts_expr.h" #include "textutils_internal.h" // Could also cache hts_expr_val_t stack here for kstring reuse? #define MAX_REGEX 10 struct hts_filter_t { char *str; int parsed; int curr_regex, max_regex; regex_t preg[MAX_REGEX]; }; /* * This is designed to be mostly C like with mostly same the precedence rules, * with the exception of bit operators (widely considered as a mistake in C). * It's not full C (eg no bit-shifting), but good enough for our purposes. * * Supported syntax, in order of precedence: * * Grouping: (, ), eg "(1+2)*3" * Values: integers, floats, strings or variables * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) * Math ops: *, /, % [TODO: add // for floor division?] * Math ops: +, - * Bit-wise: &, ^, | [NB as 3 precedence levels, in that order] * Conditionals: >, >=, <, <=, * Equality: ==, !=, =~, !~ * Boolean: &&, || */ // Skip to start of term static char *ws(char *str) { while (*str && (*str == ' ' || *str == '\t')) str++; return str; } static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res); /* * Simple functions operating on strings only. * length, min, max, avg. * * All return 0 on success, * -1 on failure */ static int expr_func_length(hts_expr_val_t *res) { if (!res->is_str) return -1; res->is_str = 0; res->d = res->s.l; return 0; } static int expr_func_min(hts_expr_val_t *res) { if (!res->is_str) return -1; size_t l = res->s.l; int v = INT_MAX; const uint8_t *x = (uint8_t *)res->s.s; for (l = 0; l < res->s.l; l++) if (v > x[l]) v = x[l]; res->is_str = 0; res->d = v == INT_MAX ? NAN : v; return 0; } static int expr_func_max(hts_expr_val_t *res) { if (!res->is_str) return -1; size_t l = res->s.l; int v = INT_MIN; const uint8_t *x = (uint8_t *)res->s.s; for (l = 0; l < res->s.l; l++) if (v < x[l]) v = x[l]; res->is_str = 0; res->d = v == INT_MIN ? NAN : v; return 0; } static int expr_func_avg(hts_expr_val_t *res) { if (!res->is_str) return -1; size_t l = res->s.l; double v = 0; const uint8_t *x = (uint8_t *)res->s.s; for (l = 0; l < res->s.l; l++) v += x[l]; if (l) v /= l; res->is_str = 0; res->d = v; return 0; } /* * functions: FUNC(expr). * Note for simplicity of parsing, the "(" must immediately follow FUNC, * so "FUNC (x)" is invalid. */ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { int func_ok = -1; switch (*str) { case 'a': if (strncmp(str, "avg(", 4) == 0) { if (expression(filt, data, fn, str+4, end, res)) return -1; func_ok = expr_func_avg(res); } break; case 'l': if (strncmp(str, "length(", 7) == 0) { if (expression(filt, data, fn, str+7, end, res)) return -1; func_ok = expr_func_length(res); } break; case 'm': if (strncmp(str, "min(", 4) == 0) { if (expression(filt, data, fn, str+4, end, res)) return -1; func_ok = expr_func_min(res); } else if (strncmp(str, "max(", 4) == 0) { if (expression(filt, data, fn, str+4, end, res)) return -1; func_ok = expr_func_max(res); } break; } if (func_ok < 0) return -1; str = ws(*end); if (*str != ')') { fprintf(stderr, "Missing ')'\n"); return -1; } *end = str+1; return 0; } /* * simple_expr * : identifier * | constant * | string * | func_expr * | '(' expression ')' */ static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { // Main recursion step str = ws(str); if (*str == '(') { if (expression(filt, data, fn, str+1, end, res)) return -1; str = ws(*end); if (*str != ')') { fprintf(stderr, "Missing ')'\n"); return -1; } *end = str+1; return 0; } // Otherwise a basic element. int fail = 0; double d = hts_str2dbl(str, end, &fail); if (str != *end) { res->is_str = 0; res->d = d; } else { // Not valid floating point syntax. // TODO: add function call names in here; len(), sqrt(), pow(), etc if (*str == '"') { res->is_str = 1; char *e = str+1; int backslash = 0; while (*e && *e != '"') { if (*e == '\\') backslash=1, e+=1+(e[1]!='\0'); else e++; } kputsn(str+1, e-(str+1), ks_clear(&res->s)); if (backslash) { size_t i, j; for (i = j = 0; i < res->s.l; i++) { res->s.s[j++] = res->s.s[i]; if (res->s.s[i] == '\\') { switch (res->s.s[++i]) { case '"': res->s.s[j-1] = '"'; break; case '\\':res->s.s[j-1] = '\\'; break; case 't': res->s.s[j-1] = '\t'; break; case 'n': res->s.s[j-1] = '\n'; break; case 'r': res->s.s[j-1] = '\r'; break; default: res->s.s[j++] = res->s.s[i]; } } } res->s.s[j] = 0; res->s.l = j; } if (*e != '"') return -1; *end = e+1; } else if (fn) { // Try lookup as variable, if not as function if (fn(data, str, end, res) == 0) return 0; else return func_expr(filt, data, fn, str, end, res); } else { return -1; } } return 0; } /* * unary_expr * : simple_expr * | '+' simple_expr * | '-' simple_expr * | '!' unary_expr // higher precedence * | '~' unary_expr // higher precedence */ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { int err; str = ws(str); if (*str == '+') { err = simple_expr(filt, data, fn, str+1, end, res); err |= res->is_str; res->is_true = res->d != 0; } else if (*str == '-') { err = simple_expr(filt, data, fn, str+1, end, res); err |= res->is_str; res->d = -res->d; res->is_true = res->d != 0; } else if (*str == '!') { err = unary_expr(filt, data, fn, str+1, end, res); if (res->is_str) { res->is_str = 0; res->d = 0; res->is_true = !res->is_true; } else { res->d = !(int64_t)res->d; res->is_true = res->d != 0; } } else if (*str == '~') { err = unary_expr(filt, data, fn, str+1, end, res); err |= res->is_str; res->d = ~(int64_t)res->d; res->is_true = res->d != 0; } else { err = simple_expr(filt, data, fn, str, end, res); } return err ? -1 : 0; } /* * mul_expr * : unary_expr ( * '*' unary_expr * | '/' unary_expr * | '%' unary_expr * )* */ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (unary_expr(filt, data, fn, str, end, res)) return -1; str = *end; hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '*' || *str == '/' || *str == '%') { if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } } if (*str == '*') res->d *= val.d; else if (*str == '/') res->d /= val.d; else if (*str == '%') res->d = (int64_t)res->d % (int64_t)val.d; else break; str = *end; } hts_expr_val_free(&val); return 0; } /* * add_expr * : mul_expr ( * '+' mul_expr * | '-' mul_expr * )* */ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (mul_expr(filt, data, fn, str, end, res)) return -1; str = *end; hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '+' || *str == '-') { if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } } if (*str == '+') res->d += val.d; else if (*str == '-') res->d -= val.d; else break; str = *end; } hts_expr_val_free(&val); return 0; } /* * bitand_expr * : add_expr * | bitand_expr '&' add_expr */ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (add_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '&' && str[1] != '&') { if (add_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; } else { break; } } hts_expr_val_free(&val); return 0; } /* * bitxor_expr * : bitand_expr * | bitxor_expr '^' bitand_expr */ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (bitand_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '^') { if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; } else { break; } } hts_expr_val_free(&val); return 0; } /* * bitor_expr * : bitxor_expr * | bitor_expr '|' bitxor_expr */ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (bitxor_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '|' && str[1] != '|') { if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; } else { break; } } hts_expr_val_free(&val); return 0; } /* * cmp_expr * : bitor_expr * | cmp_expr '<=' bitor_expr * | cmp_expr '<' bitor_expr * | cmp_expr '>=' bitor_expr * | cmp_expr '>' bitor_expr */ static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (bitor_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); hts_expr_val_t val = HTS_EXPR_VAL_INIT; int err = 0; if (*str == '>' && str[1] == '=') { err = cmp_expr(filt, data, fn, str+2, end, &val); res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s ? strcmp(res->s.s, val.s.s) >= 0 : !res->is_str && !val.is_str && res->d >= val.d; res->is_str = 0; } else if (*str == '>') { err = cmp_expr(filt, data, fn, str+1, end, &val); res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s ? strcmp(res->s.s, val.s.s) > 0 : !res->is_str && !val.is_str && res->d > val.d; res->is_str = 0; } else if (*str == '<' && str[1] == '=') { err = cmp_expr(filt, data, fn, str+2, end, &val); res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s ? strcmp(res->s.s, val.s.s) <= 0 : !res->is_str && !val.is_str && res->d <= val.d; res->is_str = 0; } else if (*str == '<') { err = cmp_expr(filt, data, fn, str+1, end, &val); res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s ? strcmp(res->s.s, val.s.s) < 0 : !res->is_str && !val.is_str && res->d < val.d; res->is_str = 0; } hts_expr_val_free(&val); return err ? -1 : 0; } /* * eq_expr * : cmp_expr * | eq_expr '==' cmp_expr * | eq_expr '!=' cmp_expr * | eq_expr '=~' cmp_expr * | eq_expr '!~' cmp_expr */ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (cmp_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); int err = 0; hts_expr_val_t val = HTS_EXPR_VAL_INIT; // numeric vs numeric comparison is as expected // string vs string comparison is as expected // numeric vs string is false if (str[0] == '=' && str[1] == '=') { if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { res->is_true = res->d = res->is_str ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0) : !res->is_str && !val.is_str && res->d == val.d; } res->is_str = 0; } else if (str[0] == '!' && str[1] == '=') { if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { res->is_true = res->d = res->is_str ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s) != 0 : 1) : res->is_str != val.is_str || res->d != val.d; } res->is_str = 0; } else if ((str[0] == '=' && str[1] == '~') || (str[0] == '!' && str[1] == '~')) { err = eq_expr(filt, data, fn, str+2, end, &val); if (!val.is_str || !res->is_str) { hts_expr_val_free(&val); return -1; } if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { regex_t preg_, *preg; if (filt->curr_regex >= filt->max_regex) { // Compile regex if not seen before if (filt->curr_regex >= MAX_REGEX) { preg = &preg_; } else { preg = &filt->preg[filt->curr_regex]; filt->max_regex++; } int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB); if (ec != 0) { char errbuf[1024]; regerror(ec, preg, errbuf, 1024); fprintf(stderr, "Failed regex: %.1024s\n", errbuf); hts_expr_val_free(&val); return -1; } } else { preg = &filt->preg[filt->curr_regex]; } res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0 ? *str == '=' // matcn : *str == '!'; // no-match if (preg == &preg_) regfree(preg); filt->curr_regex++; } else { // nul regexp or input is considered false res->is_true = 0; } res->is_str = 0; } hts_expr_val_free(&val); return err ? -1 : 0; } /* * and_expr * : eq_expr * | and_expr 'and' eq_expr * | and_expr 'or' eq_expr */ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (eq_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (str[0] == '&' && str[1] == '&') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; res->is_true = res->d = (res->is_true || (res->is_str && res->s.s) || res->d) && (val.is_true || (val.is_str && val.s.s) || val.d); res->is_str = 0; } else if (str[0] == '|' && str[1] == '|') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; res->is_true = res->d = res->is_true || (res->is_str && res->s.s) || res->d || val.is_true || (val.is_str && val.s.s ) || val.d; res->is_str = 0; } else { break; } } hts_expr_val_free(&val); return 0; } static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { return and_expr(filt, data, fn, str, end, res); } hts_filter_t *hts_filter_init(const char *str) { hts_filter_t *f = calloc(1, sizeof(*f)); if (!f) return NULL; // Oversize to permit faster comparisons with memcmp over strcmp size_t len = strlen(str)+100; if (!(f->str = malloc(len))) { free(f); return NULL; } strcpy(f->str, str); return f; } void hts_filter_free(hts_filter_t *filt) { if (!filt) return; int i; for (i = 0; i < filt->max_regex; i++) regfree(&filt->preg[i]); free(filt->str); free(filt); } int hts_filter_eval(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_t *res) { char *end = NULL; memset(res, 0, sizeof(*res)); filt->curr_regex = 0; if (expression(filt, data, fn, filt->str, &end, res)) return -1; if (end && *ws(end)) { fprintf(stderr, "Unable to parse expression at %s\n", filt->str); return -1; } // Strings evaluate to true. An empty string is also true, but an // absent (null) string is false. An empty string has kstring length // of zero, but a pointer as it's nul-terminated. if (res->is_str) res->is_true = res->d = res->s.s != NULL; else res->is_true |= res->d != 0; return 0; }