/* Mirror of https://github.com/cesanta/slre.git, synced 2024-11-17 08:55:30 +00:00 (469 lines, 16 KiB, C). */
/*
 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 * Copyright (c) 2013 Cesanta Software Limited
 * All rights reserved
 *
 * This library is dual-licensed: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation. For the terms of this
 * license, see <http://www.gnu.org/licenses/>.
 *
 * You are free to use this library under the terms of the GNU General
 * Public License, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * Alternatively, you can license this library under a commercial
 * license, as set out in <http://cesanta.com/products.html>.
 */
|
|
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "slre.h"
|
|
|
|
/*
 * Diagnostic strings reported to the caller through the error_msg output
 * of slre_match(). Both the text and the pointers are const, so a message
 * can be neither edited nor retargeted by accident.
 */
static const char *const static_error_no_match = "No match";
static const char *const static_error_unexpected_quantifier =
  "Unexpected quantifier";
static const char *const static_error_unbalanced_brackets =
  "Unbalanced brackets";
static const char *const static_error_internal = "Internal error";
static const char *const static_error_invalid_metacharacter =
  "Invalid metacharacter";
|
|
|
|
/* Capacity of the branches[] and brackets[] tables in struct regex_info */
#define MAX_BRANCHES 100
#define MAX_BRACKETS 100

/*
 * Element count of a true array (not a pointer), as an int. The whole
 * expansion is parenthesized so it composes safely with any operator.
 */
#define ARRAY_SIZE(ar) ((int) (sizeof(ar) / sizeof((ar)[0])))

/*
 * If cond holds, store msg as the current error and return 0 from the
 * enclosing function. Expects a pointer named 'info' with an error_msg
 * member to be in scope at the point of use.
 */
#define FAIL_IF(cond, msg) do { if (cond) \
  { info->error_msg = (msg); return 0; } } while (0)

/*
 * DBG((fmt, ...)) forwards its parenthesized argument list to printf when
 * SLRE_DEBUG is defined and expands to nothing otherwise.
 */
#ifdef SLRE_DEBUG
#define DBG(x) printf x
#else
#define DBG(x)
#endif
|
|
|
|
struct regex_info {
|
|
/*
|
|
* Describes all bracket pairs in the regular expression.
|
|
* First entry is always present, and grabs the whole regex.
|
|
*/
|
|
struct bracket_pair {
|
|
const char *ptr; /* Points to the first char after '(' in regex */
|
|
int len; /* Length of the text between '(' and ')' */
|
|
int branches; /* Index in the branches array for this pair */
|
|
int num_branches; /* Number of '|' in this bracket pair */
|
|
} brackets[MAX_BRACKETS];
|
|
int num_brackets;
|
|
|
|
/*
|
|
* Describes alternations ('|' operators) in the regular expression.
|
|
* Each branch falls into a specific branch pair.
|
|
*/
|
|
struct branch {
|
|
int bracket_index; /* index into 'brackets' array defined above */
|
|
const char *schlong; /* points to the '|' character in the regex */
|
|
} branches[MAX_BRANCHES];
|
|
int num_branches;
|
|
|
|
/* Error message to be returned to the user */
|
|
const char *error_msg;
|
|
|
|
/* E.g. IGNORE_CASE. See enum below */
|
|
int flags;
|
|
};
|
|
enum { IGNORE_CASE = 1 };
|
|
|
|
/*
 * Width, in characters, of the regex operator that starts at re: two for
 * a backslash escape, one for anything else.
 */
static int get_op_len(const char *re) {
  if (re[0] == '\\') {
    return 2;
  }
  return 1;
}
|
|
|
|
/* Returns 1 when the character at re is '*', '+' or '?', and 0 otherwise. */
static int is_quantifier(const char *re) {
  switch (re[0]) {
    case '*':
    case '+':
    case '?':
      return 1;
    default:
      return 0;
  }
}
|
|
|
|
/*
 * Forward declaration, needed because bar() and doh() call each other.
 * doh() matches the alternatives of bracket pair bi against s.
 */
static int doh(const char *s, int s_len, struct slre_cap *caps,
               struct regex_info *info, int bi);
|
|
|
|
static int bar(const char *re, int re_len, const char *s, int s_len,
|
|
struct slre_cap *caps, struct regex_info *info, int bi) {
|
|
/* i is offset in re, j is offset in s, bi is brackets index */
|
|
int i, j, n, step;
|
|
|
|
(void) caps;
|
|
|
|
DBG(("%s [%.*s] [%.*s]\n", __func__, re_len, re, s_len, s));
|
|
|
|
for (i = j = 0; i < re_len && j < s_len; i += step) {
|
|
step = get_op_len(re + i);
|
|
|
|
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n",
|
|
__func__, re_len - i, re + i,
|
|
s_len - j, s + j, re_len, step, i, j));
|
|
|
|
FAIL_IF(is_quantifier(&re[i]), static_error_unexpected_quantifier);
|
|
FAIL_IF(step <= 0, static_error_internal);
|
|
|
|
/* Handle quantifiers. Look ahead. */
|
|
if (i + step < re_len && is_quantifier(re + i + step)) {
|
|
if (re[i + step] == '?') {
|
|
j += bar(re + i, step, s + j, s_len - j, caps, info, bi);
|
|
i++;
|
|
continue;
|
|
} else if (re[i + step] == '+' || re[i + step] == '*') {
|
|
int j2 = j, nj = 0, n1, n2, ni, next_step, non_greedy = 0;
|
|
|
|
/* Points to the regexp code after the quantifier */
|
|
next_step = get_op_len(re + i + step);
|
|
if (i + step + 1 < re_len && re[i + step + 1] == '?') {
|
|
non_greedy = 1;
|
|
next_step++;
|
|
}
|
|
ni = i + step + next_step;
|
|
|
|
while ((n1 = bar(re + i, step, s + j2, s_len - j2,
|
|
caps, info, bi)) > 0) {
|
|
if (ni >= re_len) {
|
|
/* After quantifier, there is nothing */
|
|
nj = j2 + n1;
|
|
} else if ((n2 = bar(re + ni, re_len - ni, s + j2 + n1,
|
|
s_len - (j2 + n1), caps, info, bi)) > 0) {
|
|
nj = j2 + n1 + n2;
|
|
}
|
|
if (nj > 0 && non_greedy) break;
|
|
j2 += n1;
|
|
}
|
|
FAIL_IF(re[i + step] == '+' && nj == 0, static_error_no_match);
|
|
return nj;
|
|
}
|
|
}
|
|
|
|
switch (re[i]) {
|
|
case '\\':
|
|
/* Metacharacters */
|
|
switch (re[i + 1]) {
|
|
case 'S':
|
|
FAIL_IF(isspace(((unsigned char *) s)[j]), static_error_no_match);
|
|
j++;
|
|
break;
|
|
|
|
case 's':
|
|
FAIL_IF(!isspace(((unsigned char *) s)[j]), static_error_no_match);
|
|
j++;
|
|
break;
|
|
|
|
case 'd':
|
|
FAIL_IF(!isdigit(((unsigned char *) s)[j]), static_error_no_match);
|
|
j++;
|
|
break;
|
|
|
|
case '+': case '?': case '*': case '\\': case '(': case ')':
|
|
case '^': case '$': case '.': case '[': case ']':
|
|
FAIL_IF(re[i + 1] != s[j], static_error_no_match);
|
|
j++;
|
|
break;
|
|
|
|
default:
|
|
FAIL_IF(1, static_error_invalid_metacharacter);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case '(':
|
|
bi++;
|
|
FAIL_IF(bi >= info->num_brackets, static_error_internal);
|
|
DBG(("CAPTURING [%.*s] [%.*s]\n", info->brackets[bi].len + 2,
|
|
re + i, s_len - j, s + j));
|
|
n = doh(s + j, s_len - j, caps, info, bi);
|
|
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", info->brackets[bi].len + 2,
|
|
re + i, s_len - j, s + j, n));
|
|
FAIL_IF(n <= 0, static_error_no_match);
|
|
if (caps != NULL) {
|
|
caps[bi - 1].ptr = s + j;
|
|
caps[bi - 1].len = n;
|
|
}
|
|
j += n;
|
|
i += info->brackets[bi].len + 1;
|
|
break;
|
|
|
|
case '^':
|
|
FAIL_IF(j != 0, static_error_no_match);
|
|
break;
|
|
|
|
case '|':
|
|
FAIL_IF(1, static_error_internal);
|
|
break;
|
|
|
|
case '$':
|
|
/* $ anchor handling is at the end of this function */
|
|
FAIL_IF(1, static_error_no_match);
|
|
break;
|
|
|
|
case '.':
|
|
j++;
|
|
break;
|
|
|
|
default:
|
|
FAIL_IF(re[i] != s[j], static_error_no_match);
|
|
j++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Process $ anchor here. If we've reached the end of the string,
|
|
* but did not exhaust regexp yet, this is no match.
|
|
*/
|
|
FAIL_IF(i < re_len && !(re[i] == '$' && i + 1 == re_len),
|
|
static_error_no_match);
|
|
|
|
return j;
|
|
}
|
|
|
|
/* Process branch points */
|
|
static int doh(const char *s, int s_len,
|
|
struct slre_cap *caps, struct regex_info *info, int bi) {
|
|
const struct bracket_pair *b = &info->brackets[bi];
|
|
int i = 0, len, result;
|
|
const char *p;
|
|
|
|
do {
|
|
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
|
|
len = b->num_branches == 0 ? b->len :
|
|
i == b->num_branches ? b->ptr + b->len - p :
|
|
info->branches[b->branches + i].schlong - p;
|
|
DBG(("%s %d %d [%.*s]\n", __func__, bi, i, len, p));
|
|
result = bar(p, len, s, s_len, caps, info, bi);
|
|
} while (i++ < b->num_branches); /* At least 1 iteration */
|
|
|
|
|
|
return result;
|
|
}
|
|
|
|
static void setup_branch_points(struct regex_info *info) {
|
|
int i, j;
|
|
struct branch tmp;
|
|
|
|
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
|
|
for (i = 0; i < info->num_branches; i++) {
|
|
for (j = i + 1; j < info->num_branches; j++) {
|
|
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
|
|
tmp = info->branches[i];
|
|
info->branches[i] = info->branches[j];
|
|
info->branches[j] = tmp;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* For each bracket, set their branch points. This way, for every bracket
|
|
* (i.e. every chunk of regex) we know all branch points before matching.
|
|
*/
|
|
for (i = j = 0; i < info->num_brackets; i++) {
|
|
info->brackets[i].num_branches = 0;
|
|
info->brackets[i].branches = j;
|
|
while (j < info->num_branches && info->branches[j].bracket_index == i) {
|
|
info->brackets[i].num_branches++;
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
|
|
static int foo(const char *re, int re_len, const char *s, int s_len,
|
|
struct slre_cap *caps, struct regex_info *info) {
|
|
int result, i, step, depth = 0;
|
|
const char *stack[ARRAY_SIZE(info->brackets)];
|
|
|
|
stack[0] = re;
|
|
|
|
/* First bracket captures everything */
|
|
info->brackets[0].ptr = re;
|
|
info->brackets[0].len = re_len;
|
|
info->num_brackets = 1;
|
|
|
|
/* Make a single pass over regex string, memorize brackets and branches */
|
|
for (i = 0; i < re_len; i += step) {
|
|
step = get_op_len(&re[i]);
|
|
|
|
if (re[i] == '|') {
|
|
FAIL_IF(info->num_branches >= ARRAY_SIZE(info->branches),
|
|
"Too many |. Increase MAX_BRANCHES");
|
|
info->branches[info->num_branches].bracket_index =
|
|
info->brackets[info->num_brackets - 1].len == -1 ?
|
|
info->num_brackets - 1 : depth;
|
|
info->branches[info->num_branches].schlong = &re[i];
|
|
info->num_branches++;
|
|
} else if (re[i] == '(') {
|
|
FAIL_IF(info->num_brackets >= ARRAY_SIZE(info->brackets),
|
|
"Too many (. Increase MAX_BRACKETS");
|
|
depth++; /* Order is important here. Depth increments first. */
|
|
stack[depth] = &re[i];
|
|
info->brackets[info->num_brackets].ptr = re + i + 1;
|
|
info->brackets[info->num_brackets].len = -1;
|
|
info->num_brackets++;
|
|
} else if (re[i] == ')') {
|
|
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
|
|
info->num_brackets - 1 : depth;
|
|
info->brackets[ind].len = &re[i] - info->brackets[ind].ptr;
|
|
DBG(("SETTING BRACKET %d [%.*s]\n",
|
|
ind, info->brackets[ind].len, info->brackets[ind].ptr));
|
|
depth--;
|
|
FAIL_IF(depth < 0, static_error_unbalanced_brackets);
|
|
FAIL_IF(i > 0 && re[i - 1] == '(', static_error_no_match);
|
|
}
|
|
}
|
|
|
|
FAIL_IF(depth != 0, static_error_unbalanced_brackets);
|
|
|
|
setup_branch_points(info);
|
|
|
|
/* Scan the string from left to right, applying the regex. Stop on match. */
|
|
result = 0;
|
|
for (i = 0; i < s_len; i++) {
|
|
result = doh(s + i, s_len - i, caps, info, 0);
|
|
DBG((" (iter) -> %d [%.*s] [%.*s] [%s]\n", result, re_len, re,
|
|
s_len - i, s + i, info->error_msg));
|
|
if (result > 0 || re[0] == '^') {
|
|
result += i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
int slre_match(const char *regexp, const char *s, int s_len,
|
|
struct slre_cap *caps, const char **error_msg) {
|
|
struct regex_info info;
|
|
int result;
|
|
|
|
/* Initialize info structure */
|
|
info.flags = info.num_brackets = info.num_branches = 0;
|
|
info.error_msg = static_error_no_match;
|
|
|
|
DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
|
|
result = foo(regexp, strlen(regexp), s, s_len, caps, &info);
|
|
|
|
if (error_msg != NULL) {
|
|
*error_msg = info.error_msg;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
/********************************** UNIT TEST ********************************/
|
|
/*****************************************************************************/
|
|
#ifdef SLRE_UNIT_TEST
|
|
/* Tallies kept by ASSERT() and FAIL() and reported at the end of main() */
static int static_total_tests = 0;
static int static_failed_tests = 0;

/* Print the text and source line of a failed check and count the failure */
#define FAIL(str, line) do {                      \
  printf("Fail on line %d: [%s]\n", line, str);   \
  static_failed_tests++;                          \
} while (0)

/* Count one check; when expr is false, report it through FAIL() */
#define ASSERT(expr) do {                         \
  static_total_tests++;                           \
  if (!(expr)) {                                  \
    FAIL(#expr, __LINE__);                        \
  }                                               \
} while (0)
|
|
|
|
int main(void) {
|
|
const char *msg = "";
|
|
struct slre_cap caps[10];
|
|
|
|
#if 0
|
|
#endif
|
|
ASSERT(slre_match("fo", "foo", 3, NULL, &msg) == 2);
|
|
ASSERT(slre_match(".+", "foo", 3, NULL, &msg) == 3);
|
|
|
|
ASSERT(slre_match(".+k", "fooklmn", 7, NULL, &msg) == 4);
|
|
ASSERT(slre_match(".+k.", "fooklmn", 7, NULL, &msg) == 5);
|
|
ASSERT(slre_match("p+", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("ok", "fooklmn", 7, NULL, &msg) == 4);
|
|
ASSERT(slre_match("lmno", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("mn.", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("o", "fooklmn", 7, NULL, &msg) == 2);
|
|
ASSERT(slre_match("^o", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("^", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("n$", "fooklmn", 7, NULL, &msg) == 7);
|
|
ASSERT(slre_match("n$k", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match("l$", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(slre_match(".$", "fooklmn", 7, NULL, &msg) == 7);
|
|
ASSERT(slre_match("a?", "fooklmn", 7, NULL, &msg) == 0);
|
|
|
|
ASSERT(slre_match("\\_", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(strcmp(msg, static_error_invalid_metacharacter) == 0);
|
|
ASSERT(slre_match("+", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(strcmp(msg, static_error_unexpected_quantifier) == 0);
|
|
ASSERT(slre_match("()+", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(strcmp(msg, static_error_no_match) == 0);
|
|
|
|
/* Balancing brackets */
|
|
ASSERT(slre_match("(x))", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
|
|
ASSERT(slre_match("(", "fooklmn", 7, NULL, &msg) == 0);
|
|
ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
|
|
|
|
ASSERT(slre_match("klz?mn", "fooklmn", 7, NULL, &msg) == 7);
|
|
ASSERT(slre_match("fa?b", "fooklmn", 7, NULL, &msg) == 0);
|
|
|
|
/* Brackets & capturing */
|
|
ASSERT(slre_match("^(te)", "tenacity subdues all", 20, caps, &msg) == 2);
|
|
ASSERT(slre_match("(bc)", "abcdef", 6, caps, &msg) == 3);
|
|
ASSERT(slre_match(".(d.)", "abcdef", 6, caps, &msg) == 5);
|
|
ASSERT(slre_match(".(d.)\\)?", "abcdef", 6, caps, &msg) == 5);
|
|
ASSERT(caps[0].len == 2);
|
|
ASSERT(memcmp(caps[0].ptr, "de", 2) == 0);
|
|
ASSERT(slre_match("(.+)", "123", 3, caps, &msg) == 3);
|
|
ASSERT(slre_match("(2.+)", "123", 3, caps, &msg) == 3);
|
|
ASSERT(caps[0].len == 2);
|
|
ASSERT(memcmp(caps[0].ptr, "23", 2) == 0);
|
|
ASSERT(slre_match("(.+2)", "123", 3, caps, &msg) == 2);
|
|
ASSERT(caps[0].len == 2);
|
|
ASSERT(memcmp(caps[0].ptr, "12", 2) == 0);
|
|
ASSERT(slre_match("(.*(2.))", "123", 3, caps, &msg) == 3);
|
|
ASSERT(slre_match("(.)(.)", "123", 3, caps, &msg) == 2);
|
|
ASSERT(slre_match("(\\d+)\\s+(\\S+)", "12 hi", 5, caps, &msg) == 5);
|
|
|
|
/* Greedy vs non-greedy */
|
|
ASSERT(slre_match(".+c", "abcabc", 6, NULL, &msg) == 6);
|
|
ASSERT(slre_match(".+?c", "abcabc", 6, NULL, &msg) == 3);
|
|
ASSERT(slre_match(".*?c", "abcabc", 6, NULL, &msg) == 3);
|
|
ASSERT(slre_match(".*c", "abcabc", 6, NULL, &msg) == 6);
|
|
ASSERT(slre_match("bc.d?k?b+", "abcabc", 6, NULL, &msg) == 5);
|
|
|
|
/* Branching */
|
|
ASSERT(slre_match("|", "abc", 3, NULL, &msg) == 0);
|
|
ASSERT(slre_match("|.", "abc", 3, NULL, &msg) == 1);
|
|
ASSERT(slre_match("x|y|b", "abc", 3, NULL, &msg) == 2);
|
|
ASSERT(slre_match("k(xx|yy)|ca", "abcabc", 6, NULL, &msg) == 4);
|
|
ASSERT(slre_match("k(xx|yy)|ca|bc", "abcabc", 6, NULL, &msg) == 3);
|
|
ASSERT(slre_match("(|.c)", "abc", 3, caps, &msg) == 3);
|
|
ASSERT(caps[0].len == 2);
|
|
ASSERT(memcmp(caps[0].ptr, "bc", 2) == 0);
|
|
|
|
|
|
/* Example: HTTP request */
|
|
{
|
|
const char *error_msg, *request = " GET /index.html HTTP/1.0\r\n\r\n";
|
|
struct slre_cap caps[4];
|
|
|
|
if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
|
|
request, strlen(request), caps, &error_msg)) {
|
|
printf("Method: [%.*s], URI: [%.*s]\n",
|
|
caps[0].len, caps[0].ptr,
|
|
caps[1].len, caps[1].ptr);
|
|
} else {
|
|
printf("Error parsing [%s]: [%s]\n", request, error_msg);
|
|
}
|
|
}
|
|
|
|
printf("Unit test %s (total test: %d, failed tests: %d)\n",
|
|
static_failed_tests > 0 ? "FAILED" : "PASSED",
|
|
static_total_tests, static_failed_tests);
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
#endif /* SLRE_UNIT_TEST */
|