/* * Copyright (c) 2004-2013 Sergey Lyubka * Copyright (c) 2013 Cesanta Software Limited * All rights reserved * * This library is dual-licensed: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. For the terms of this * license, see . * * You are free to use this library under the terms of the GNU General * Public License, but WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. * * Alternatively, you can license this library under a commercial * license, as set out in . */ #include #include #include #include #include "slre.h" static const char *static_error_no_match = "No match"; static const char *static_error_unexpected_quantifier = "Unexpected quantifier"; static const char *static_error_unbalanced_brackets = "Unbalanced brackets"; static const char *static_error_internal = "Internal error"; static const char *static_error_invalid_metacharacter = "Invalid metacharacter"; #define MAX_BRANCHES 100 #define MAX_BRACKETS 100 #define ARRAY_SIZE(ar) (int) (sizeof(ar) / sizeof((ar)[0])) #define FAIL_IF(cond,msg) do { if (cond) \ {info->error_msg = msg; return 0; }} while (0) #ifdef SLRE_DEBUG #define DBG(x) printf x #else #define DBG(x) #endif struct regex_info { /* * Describes all bracket pairs in the regular expression. * First entry is always present, and grabs the whole regex. */ struct bracket_pair { const char *ptr; /* Points to the first char after '(' in regex */ int len; /* Length of the text between '(' and ')' */ int branches; /* Index in the branches array for this pair */ int num_branches; /* Number of '|' in this bracket pair */ } brackets[MAX_BRACKETS]; int num_brackets; /* * Describes alternations ('|' operators) in the regular expression. * Each branch falls into a specific branch pair. */ struct branch { int bracket_index; /* index into 'brackets' array defined above */ const char *schlong; /* points to the '|' character in the regex */ } branches[MAX_BRANCHES]; int num_branches; /* Error message to be returned to the user */ const char *error_msg; /* E.g. IGNORE_CASE. See enum below */ int flags; }; enum { IGNORE_CASE = 1 }; static int get_op_len(const char *re) { return re[0] == '\\' ? 2 : 1; } static int is_quantifier(const char *re) { return re[0] == '*' || re[0] == '+' || re[0] == '?'; } static int doh(const char *s, int s_len, struct slre_cap *caps, struct regex_info *info, int bi); static int bar(const char *re, int re_len, const char *s, int s_len, struct slre_cap *caps, struct regex_info *info, int bi) { /* i is offset in re, j is offset in s, bi is brackets index */ int i, j, n, step; (void) caps; DBG(("%s [%.*s] [%.*s]\n", __func__, re_len, re, s_len, s)); for (i = j = 0; i < re_len && j < s_len; i += step) { step = get_op_len(re + i); DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__, re_len - i, re + i, s_len - j, s + j, re_len, step, i, j)); FAIL_IF(is_quantifier(&re[i]), static_error_unexpected_quantifier); FAIL_IF(step <= 0, static_error_internal); /* Handle quantifiers. Look ahead. */ if (i + step < re_len && is_quantifier(re + i + step)) { if (re[i + step] == '?') { j += bar(re + i, step, s + j, s_len - j, caps, info, bi); i++; continue; } else if (re[i + step] == '+' || re[i + step] == '*') { int j2 = j, nj = 0, n1, n2, ni, next_step, non_greedy = 0; /* Points to the regexp code after the quantifier */ next_step = get_op_len(re + i + step); if (i + step + 1 < re_len && re[i + step + 1] == '?') { non_greedy = 1; next_step++; } ni = i + step + next_step; while ((n1 = bar(re + i, step, s + j2, s_len - j2, caps, info, bi)) > 0) { if (ni >= re_len) { /* After quantifier, there is nothing */ nj = j2 + n1; } else if ((n2 = bar(re + ni, re_len - ni, s + j2 + n1, s_len - (j2 + n1), caps, info, bi)) > 0) { nj = j2 + n1 + n2; } if (nj > 0 && non_greedy) break; j2 += n1; } FAIL_IF(re[i + step] == '+' && nj == 0, static_error_no_match); return nj; } } switch (re[i]) { case '\\': /* Metacharacters */ switch (re[i + 1]) { case 'S': FAIL_IF(isspace(((unsigned char *) s)[j]), static_error_no_match); j++; break; case 's': FAIL_IF(!isspace(((unsigned char *) s)[j]), static_error_no_match); j++; break; case 'd': FAIL_IF(!isdigit(((unsigned char *) s)[j]), static_error_no_match); j++; break; case '+': case '?': case '*': case '\\': case '(': case ')': case '^': case '$': case '.': case '[': case ']': FAIL_IF(re[i + 1] != s[j], static_error_no_match); j++; break; default: FAIL_IF(1, static_error_invalid_metacharacter); break; } break; case '(': bi++; FAIL_IF(bi >= info->num_brackets, static_error_internal); DBG(("CAPTURING [%.*s] [%.*s]\n", info->brackets[bi].len + 2, re + i, s_len - j, s + j)); n = doh(s + j, s_len - j, caps, info, bi); DBG(("CAPTURED [%.*s] [%.*s]:%d\n", info->brackets[bi].len + 2, re + i, s_len - j, s + j, n)); FAIL_IF(n <= 0, static_error_no_match); if (caps != NULL) { caps[bi - 1].ptr = s + j; caps[bi - 1].len = n; } j += n; i += info->brackets[bi].len + 1; break; case '^': FAIL_IF(j != 0, static_error_no_match); break; case '|': FAIL_IF(1, static_error_internal); break; case '$': /* $ anchor handling is at the end of this function */ FAIL_IF(1, static_error_no_match); break; case '.': j++; break; default: FAIL_IF(re[i] != s[j], static_error_no_match); j++; break; } } /* * Process $ anchor here. If we've reached the end of the string, * but did not exhaust regexp yet, this is no match. */ FAIL_IF(i < re_len && !(re[i] == '$' && i + 1 == re_len), static_error_no_match); return j; } /* Process branch points */ static int doh(const char *s, int s_len, struct slre_cap *caps, struct regex_info *info, int bi) { const struct bracket_pair *b = &info->brackets[bi]; int i = 0, len, result; const char *p; do { p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1; len = b->num_branches == 0 ? b->len : i == b->num_branches ? b->ptr + b->len - p : info->branches[b->branches + i].schlong - p; DBG(("%s %d %d [%.*s]\n", __func__, bi, i, len, p)); result = bar(p, len, s, s_len, caps, info, bi); } while (i++ < b->num_branches); /* At least 1 iteration */ return result; } static void setup_branch_points(struct regex_info *info) { int i, j; struct branch tmp; /* First, sort branches. Must be stable, no qsort. Use bubble algo. */ for (i = 0; i < info->num_branches; i++) { for (j = i + 1; j < info->num_branches; j++) { if (info->branches[i].bracket_index > info->branches[j].bracket_index) { tmp = info->branches[i]; info->branches[i] = info->branches[j]; info->branches[j] = tmp; } } } /* * For each bracket, set their branch points. This way, for every bracket * (i.e. every chunk of regex) we know all branch points before matching. */ for (i = j = 0; i < info->num_brackets; i++) { info->brackets[i].num_branches = 0; info->brackets[i].branches = j; while (j < info->num_branches && info->branches[j].bracket_index == i) { info->brackets[i].num_branches++; j++; } } } static int foo(const char *re, int re_len, const char *s, int s_len, struct slre_cap *caps, struct regex_info *info) { int result, i, step, depth = 0; const char *stack[ARRAY_SIZE(info->brackets)]; stack[0] = re; /* First bracket captures everything */ info->brackets[0].ptr = re; info->brackets[0].len = re_len; info->num_brackets = 1; /* Make a single pass over regex string, memorize brackets and branches */ for (i = 0; i < re_len; i += step) { step = get_op_len(&re[i]); if (re[i] == '|') { FAIL_IF(info->num_branches >= ARRAY_SIZE(info->branches), "Too many |. Increase MAX_BRANCHES"); info->branches[info->num_branches].bracket_index = info->brackets[info->num_brackets - 1].len == -1 ? info->num_brackets - 1 : depth; info->branches[info->num_branches].schlong = &re[i]; info->num_branches++; } else if (re[i] == '(') { FAIL_IF(info->num_brackets >= ARRAY_SIZE(info->brackets), "Too many (. Increase MAX_BRACKETS"); depth++; /* Order is important here. Depth increments first. */ stack[depth] = &re[i]; info->brackets[info->num_brackets].ptr = re + i + 1; info->brackets[info->num_brackets].len = -1; info->num_brackets++; } else if (re[i] == ')') { int ind = info->brackets[info->num_brackets - 1].len == -1 ? info->num_brackets - 1 : depth; info->brackets[ind].len = &re[i] - info->brackets[ind].ptr; DBG(("SETTING BRACKET %d [%.*s]\n", ind, info->brackets[ind].len, info->brackets[ind].ptr)); depth--; FAIL_IF(depth < 0, static_error_unbalanced_brackets); FAIL_IF(i > 0 && re[i - 1] == '(', static_error_no_match); } } FAIL_IF(depth != 0, static_error_unbalanced_brackets); setup_branch_points(info); /* Scan the string from left to right, applying the regex. Stop on match. */ result = 0; for (i = 0; i < s_len; i++) { result = doh(s + i, s_len - i, caps, info, 0); DBG((" (iter) -> %d [%.*s] [%.*s] [%s]\n", result, re_len, re, s_len - i, s + i, info->error_msg)); if (result > 0 || re[0] == '^') { result += i; break; } } return result; } int slre_match(const char *regexp, const char *s, int s_len, struct slre_cap *caps, const char **error_msg) { struct regex_info info; int result; /* Initialize info structure */ info.flags = info.num_brackets = info.num_branches = 0; info.error_msg = static_error_no_match; DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s)); result = foo(regexp, strlen(regexp), s, s_len, caps, &info); if (error_msg != NULL) { *error_msg = info.error_msg; } return result; } /*****************************************************************************/ /********************************** UNIT TEST ********************************/ /*****************************************************************************/ #ifdef SLRE_UNIT_TEST static int static_total_tests = 0; static int static_failed_tests = 0; #define FAIL(str, line) do { \ printf("Fail on line %d: [%s]\n", line, str); \ static_failed_tests++; \ } while (0) #define ASSERT(expr) do { \ static_total_tests++; \ if (!(expr)) FAIL(#expr, __LINE__); \ } while (0) int main(void) { const char *msg = ""; struct slre_cap caps[10]; #if 0 #endif ASSERT(slre_match("fo", "foo", 3, NULL, &msg) == 2); ASSERT(slre_match(".+", "foo", 3, NULL, &msg) == 3); ASSERT(slre_match(".+k", "fooklmn", 7, NULL, &msg) == 4); ASSERT(slre_match(".+k.", "fooklmn", 7, NULL, &msg) == 5); ASSERT(slre_match("p+", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("ok", "fooklmn", 7, NULL, &msg) == 4); ASSERT(slre_match("lmno", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("mn.", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("o", "fooklmn", 7, NULL, &msg) == 2); ASSERT(slre_match("^o", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("^", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("n$", "fooklmn", 7, NULL, &msg) == 7); ASSERT(slre_match("n$k", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("l$", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match(".$", "fooklmn", 7, NULL, &msg) == 7); ASSERT(slre_match("a?", "fooklmn", 7, NULL, &msg) == 0); ASSERT(slre_match("\\_", "fooklmn", 7, NULL, &msg) == 0); ASSERT(strcmp(msg, static_error_invalid_metacharacter) == 0); ASSERT(slre_match("+", "fooklmn", 7, NULL, &msg) == 0); ASSERT(strcmp(msg, static_error_unexpected_quantifier) == 0); ASSERT(slre_match("()+", "fooklmn", 7, NULL, &msg) == 0); ASSERT(strcmp(msg, static_error_no_match) == 0); /* Balancing brackets */ ASSERT(slre_match("(x))", "fooklmn", 7, NULL, &msg) == 0); ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0); ASSERT(slre_match("(", "fooklmn", 7, NULL, &msg) == 0); ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0); ASSERT(slre_match("klz?mn", "fooklmn", 7, NULL, &msg) == 7); ASSERT(slre_match("fa?b", "fooklmn", 7, NULL, &msg) == 0); /* Brackets & capturing */ ASSERT(slre_match("^(te)", "tenacity subdues all", 20, caps, &msg) == 2); ASSERT(slre_match("(bc)", "abcdef", 6, caps, &msg) == 3); ASSERT(slre_match(".(d.)", "abcdef", 6, caps, &msg) == 5); ASSERT(slre_match(".(d.)\\)?", "abcdef", 6, caps, &msg) == 5); ASSERT(caps[0].len == 2); ASSERT(memcmp(caps[0].ptr, "de", 2) == 0); ASSERT(slre_match("(.+)", "123", 3, caps, &msg) == 3); ASSERT(slre_match("(2.+)", "123", 3, caps, &msg) == 3); ASSERT(caps[0].len == 2); ASSERT(memcmp(caps[0].ptr, "23", 2) == 0); ASSERT(slre_match("(.+2)", "123", 3, caps, &msg) == 2); ASSERT(caps[0].len == 2); ASSERT(memcmp(caps[0].ptr, "12", 2) == 0); ASSERT(slre_match("(.*(2.))", "123", 3, caps, &msg) == 3); ASSERT(slre_match("(.)(.)", "123", 3, caps, &msg) == 2); ASSERT(slre_match("(\\d+)\\s+(\\S+)", "12 hi", 5, caps, &msg) == 5); /* Greedy vs non-greedy */ ASSERT(slre_match(".+c", "abcabc", 6, NULL, &msg) == 6); ASSERT(slre_match(".+?c", "abcabc", 6, NULL, &msg) == 3); ASSERT(slre_match(".*?c", "abcabc", 6, NULL, &msg) == 3); ASSERT(slre_match(".*c", "abcabc", 6, NULL, &msg) == 6); ASSERT(slre_match("bc.d?k?b+", "abcabc", 6, NULL, &msg) == 5); /* Branching */ ASSERT(slre_match("|", "abc", 3, NULL, &msg) == 0); ASSERT(slre_match("|.", "abc", 3, NULL, &msg) == 1); ASSERT(slre_match("x|y|b", "abc", 3, NULL, &msg) == 2); ASSERT(slre_match("k(xx|yy)|ca", "abcabc", 6, NULL, &msg) == 4); ASSERT(slre_match("k(xx|yy)|ca|bc", "abcabc", 6, NULL, &msg) == 3); ASSERT(slre_match("(|.c)", "abc", 3, caps, &msg) == 3); ASSERT(caps[0].len == 2); ASSERT(memcmp(caps[0].ptr, "bc", 2) == 0); /* Example: HTTP request */ { const char *error_msg, *request = " GET /index.html HTTP/1.0\r\n\r\n"; struct slre_cap caps[4]; if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)", request, strlen(request), caps, &error_msg)) { printf("Method: [%.*s], URI: [%.*s]\n", caps[0].len, caps[0].ptr, caps[1].len, caps[1].ptr); } else { printf("Error parsing [%s]: [%s]\n", request, error_msg); } } printf("Unit test %s (total test: %d, failed tests: %d)\n", static_failed_tests > 0 ? "FAILED" : "PASSED", static_total_tests, static_failed_tests); return EXIT_SUCCESS; } #endif /* SLRE_UNIT_TEST */