mirror of
https://github.com/cesanta/slre.git
synced 2024-11-24 11:25:30 +00:00
Initial import
This commit is contained in:
parent
210b08dbbc
commit
c6d2484b85
16
LICENSE
16
LICENSE
@ -0,0 +1,16 @@
|
||||
Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
Copyright (c) 2013 Cesanta Limited
|
||||
All rights reserved
|
||||
|
||||
This code is dual-licensed: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License version 2 as
|
||||
published by the Free Software Foundation. For the terms of this
|
||||
license, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
You are free to use this code under the terms of the GNU General
|
||||
Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
|
||||
Alternatively, you can license this code under a commercial
|
||||
license, as set out in <http://cesanta.com/products.html>.
|
274
slre.c
Normal file
274
slre.c
Normal file
@ -0,0 +1,274 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "slre.h"
|
||||
|
||||
/*
 * Error messages handed back to the caller through slre_match()'s error_msg.
 * Both the text and the pointers are constant so they cannot be re-pointed
 * by accident.
 */
static const char *const static_error_no_match = "No match";
static const char *const static_error_unexpected_quantifier =
  "Unexpected quantifier";
static const char *const static_error_unbalanced_brackets =
  "Unbalanced brackets";
static const char *const static_error_internal = "Internal error";
static const char *const static_error_invalid_metacharacter =
  "Invalid metacharacter";
|
||||
|
||||
/*
 * Capacities of the fixed-size tables in struct regex_info
 * (MAX_BRACKETS sizes 'brackets', MAX_BRANCHES sizes 'branches').
 */
#define MAX_BRANCHES 100
#define MAX_BRACKETS 100
#define MAX_QUANTIFIERS 100

/* Element count of a true array (not a pointer), as an int */
#define ARRAY_SIZE(ar) ((int) (sizeof(ar) / sizeof((ar)[0])))

/*
 * If 'cond' holds, store 'msg' as the error and return 0 from the enclosing
 * function. Expects a pointer named 'info' with an 'error_msg' member to be
 * in scope at the point of use.
 */
#define FAIL_IF(cond, msg) do { if (cond) \
  {info->error_msg = (msg); return 0; }} while (0)

/* DBG((fmt, ...)) prints via printf only when SLRE_DEBUG is defined */
#ifdef SLRE_DEBUG
#define DBG(x) printf x
#else
#define DBG(x)
#endif
|
||||
|
||||
struct regex_info {
|
||||
/*
|
||||
* Describes all bracket pairs in the regular expression.
|
||||
* First entry is always present, and grabs the whole regex.
|
||||
*/
|
||||
struct bracket_pair {
|
||||
const char *opening_bracket;
|
||||
const char *closing_bracket;
|
||||
int nesting_depth;
|
||||
} brackets[MAX_BRACKETS];
|
||||
int num_bracket_pairs;
|
||||
|
||||
/*
|
||||
* Describes alternations ('|' operators) in the regular expression.
|
||||
* Each branch falls into a specific branch pair.
|
||||
*/
|
||||
struct branch {
|
||||
int bracket_pair_index; /* index into 'brackets' array defined above */
|
||||
const char *schlong; /* points to the '|' character in the regex */
|
||||
} branches[MAX_BRANCHES];
|
||||
int num_branches;
|
||||
|
||||
/* Error message to be returned to the user */
|
||||
const char *error_msg;
|
||||
|
||||
/* E.g. IGNORE_CASE. See enum below */
|
||||
int flags;
|
||||
enum { IGNORE_CASE = 1 };
|
||||
};
|
||||
|
||||
/*
 * Length, in bytes, of the operator that starts at 're':
 * 2 for a backslash escape and for the "*?" / "+?" pairs, otherwise 1.
 */
static int get_op_len(const char *re) {
  if (re[0] == '\\') {
    return 2;
  }
  if ((re[0] == '*' || re[0] == '+') && re[1] == '?') {
    return 2;
  }
  return 1;
}
|
||||
|
||||
/* Returns 1 when the first character at 're' is '*', '+' or '?', else 0 */
static int is_quantifier(const char *re) {
  switch (re[0]) {
    case '*':
    case '+':
    case '?':
      return 1;
    default:
      return 0;
  }
}
|
||||
|
||||
static int get_brackets_length(const char *p, const struct regex_info *info) {
|
||||
int i;
|
||||
for (i = 0; i < info->num_bracket_pairs; i++) {
|
||||
if (info->brackets[i].opening_bracket == p) {
|
||||
return info->brackets[i].closing_bracket -
|
||||
info->brackets[i].opening_bracket;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int m1(const char *re, int re_len, const char *s, int s_len,
|
||||
struct slre_cap *caps, struct regex_info *info) {
|
||||
/* i is offset in re, j is offset in s */
|
||||
int i, j, step;
|
||||
|
||||
(void) caps;
|
||||
|
||||
DBG(("%s [%.*s] [%.*s]\n", __func__, re_len, re, s_len, s));
|
||||
|
||||
for (i = j = 0; i < re_len && j < s_len; i += step) {
|
||||
step = re[i] == '(' ?
|
||||
get_brackets_length(re + i, info) : get_op_len(re + i);
|
||||
|
||||
#if 1
|
||||
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n",
|
||||
__func__, re_len - i, re + i,
|
||||
s_len - j, s + j, re_len, step, i, j));
|
||||
#endif
|
||||
|
||||
FAIL_IF(is_quantifier(&re[i]), static_error_unexpected_quantifier);
|
||||
FAIL_IF(step <= 0, static_error_internal);
|
||||
|
||||
/* Handle quantifiers. Look ahead. */
|
||||
if (i + step < re_len && is_quantifier(re + i + step)) {
|
||||
if (re[i + step] == '?') {
|
||||
j += m1(re + i, step, s + j, s_len - j, caps, info);
|
||||
i++;
|
||||
continue;
|
||||
} else if (re[i + step] == '+') {
|
||||
int j2 = j, nj = 0, n1, n2, ni, next_step;
|
||||
|
||||
/* Points to the regexp code after the quantifier */
|
||||
next_step = get_op_len(re + i + step);
|
||||
ni = i + step + next_step;
|
||||
|
||||
while ((n1 = m1(re + i, step, s + j2, s_len - j2, caps, info)) > 0) {
|
||||
if (ni >= re_len) {
|
||||
/* After quantifier, there is nothing */
|
||||
nj = j2 + n1;
|
||||
} else if ((n2 = m1(re + ni, re_len - ni, s + j2 + n1,
|
||||
s_len - (j2 + n1), caps, info)) > 0) {
|
||||
nj = j2 + n1 + n2;
|
||||
}
|
||||
j2 += n1;
|
||||
}
|
||||
FAIL_IF(nj == 0, static_error_no_match);
|
||||
return nj;
|
||||
}
|
||||
}
|
||||
|
||||
switch (re[i]) {
|
||||
case '\\':
|
||||
/* Metacharacters */
|
||||
switch (re[i + 1]) {
|
||||
case 'S':
|
||||
FAIL_IF(isspace(((unsigned char *) s)[j]), static_error_no_match);
|
||||
j++;
|
||||
break;
|
||||
|
||||
case '+':
|
||||
case '?':
|
||||
case '*':
|
||||
case '\\':
|
||||
case '(':
|
||||
case ')':
|
||||
case '^':
|
||||
case '$':
|
||||
FAIL_IF(re[i + 1] != s[j], static_error_no_match);
|
||||
j++;
|
||||
break;
|
||||
|
||||
default:
|
||||
FAIL_IF(1, static_error_invalid_metacharacter);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '^':
|
||||
FAIL_IF(j != 0, static_error_no_match);
|
||||
break;
|
||||
|
||||
case '$':
|
||||
/* $ anchor handling is at the end of this function */
|
||||
FAIL_IF(1, static_error_no_match);
|
||||
break;
|
||||
|
||||
case '.':
|
||||
j++;
|
||||
break;
|
||||
|
||||
default:
|
||||
FAIL_IF(re[i] != s[j], static_error_no_match);
|
||||
j++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process $ anchor here. If we've reached the end of the string,
|
||||
* but did not exhaust regexp yet, this is no match.
|
||||
*/
|
||||
FAIL_IF(i < re_len && !(re[i] == '$' && i + 1 == re_len),
|
||||
static_error_no_match);
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
/* Step 1. Process brackets and branches. */
|
||||
static int m(const char *re, int re_len, const char *s, int s_len,
|
||||
struct slre_cap *caps, struct regex_info *info) {
|
||||
int result, i, step, depth = 0;
|
||||
const char *stack[ARRAY_SIZE(info->brackets)];
|
||||
|
||||
stack[0] = re;
|
||||
|
||||
info->brackets[0].opening_bracket = re;
|
||||
info->brackets[0].closing_bracket = re + re_len;
|
||||
info->brackets[0].nesting_depth = 0;
|
||||
info->num_bracket_pairs = 1;
|
||||
|
||||
for (i = 0; i < re_len; i += step) {
|
||||
step = get_op_len(&re[i]);
|
||||
|
||||
if (re[i] == '|') {
|
||||
FAIL_IF(info->num_branches >= ARRAY_SIZE(info->branches),
|
||||
"Too many |. Increase MAX_BRANCHES");
|
||||
info->branches[info->num_branches].bracket_pair_index =
|
||||
info->num_bracket_pairs - 1;
|
||||
info->branches[info->num_branches].schlong = &re[i];
|
||||
info->num_branches++;
|
||||
} else if (re[i] == '(') {
|
||||
FAIL_IF(info->num_bracket_pairs >= ARRAY_SIZE(info->brackets),
|
||||
"Too many (. Increase MAX_BRACKETS");
|
||||
depth++; /* Order is important here. Depth increments first. */
|
||||
stack[depth] = &re[i];
|
||||
info->brackets[info->num_bracket_pairs].opening_bracket = &re[i];
|
||||
info->brackets[info->num_bracket_pairs].nesting_depth = depth;
|
||||
info->num_bracket_pairs++;
|
||||
} else if (re[i] == ')') {
|
||||
info->brackets[info->num_bracket_pairs].closing_bracket = &re[i];
|
||||
depth--;
|
||||
FAIL_IF(depth < 0, static_error_unbalanced_brackets);
|
||||
}
|
||||
}
|
||||
|
||||
FAIL_IF(depth != 0, static_error_unbalanced_brackets);
|
||||
|
||||
/* Scan the string from left to right, applying the regex. Stop on match. */
|
||||
result = 0;
|
||||
for (i = 0; i < s_len; i++) {
|
||||
result = m1(re, re_len, s + i, s_len - i, caps, info);
|
||||
DBG((" m1 -> %d [%.*s] [%.*s] [%s]\n", result, re_len, re,
|
||||
s_len - i, s + i, info->error_msg));
|
||||
if (result > 0 || re[0] == '^') {
|
||||
result += i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int slre_match(const char *regexp, const char *s, int s_len,
|
||||
struct slre_cap *caps, const char **error_msg) {
|
||||
struct regex_info info;
|
||||
int result;
|
||||
|
||||
memset(&info, 0, sizeof(info));
|
||||
info.error_msg = static_error_no_match;
|
||||
|
||||
DBG(("---------------- [%s] [%.*s]\n", regexp, s_len, s));
|
||||
result = m(regexp, strlen(regexp), s, s_len, caps, &info);
|
||||
|
||||
if (error_msg != NULL) {
|
||||
*error_msg = info.error_msg;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
85
slre.h
Normal file
85
slre.h
Normal file
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
#ifndef SLRE_HEADER_DEFINED
|
||||
#define SLRE_HEADER_DEFINED
|
||||
|
||||
/*
|
||||
* This is a regular expression library that implements a subset of Perl RE.
|
||||
* Please refer to http://cesanta.com/docs/slre for detailed reference.
|
||||
*/
|
||||
|
||||
/* A "capture": one matched fragment of the input buffer */
struct slre_cap {
  const char *ptr;  /* Start of the matched fragment */
  int len;          /* Length of the matched fragment, in bytes */
};
|
||||
|
||||
/*
 * Match string buffer "buf" of length "buf_len" against "regexp", which should
 * conform to the syntax outlined below. If regular expression
 * "regexp" contains brackets, slre_match() will capture the respective
 * substring. Array of captures, "caps", must have at least as many elements
 * as the number of opening parentheses in the regexp.
 *
 * "error_msg" may be NULL, in which case no message is reported.
 *
 * Return:
 *   0, if there is no match. error_msg will contain the error message
 *   >0, offset in the buffer just past the end of the match, i.e. the
 *       number of bytes from the start of "buf" up to the end of the match
 */
int slre_match(const char *regexp, const char *buf, int buf_len,
               struct slre_cap *caps, const char **error_msg);
|
||||
|
||||
/*
|
||||
* Supported syntax:
|
||||
* ^ Match beginning of a buffer
|
||||
* $ Match end of a buffer
|
||||
* () Grouping and substring capturing
|
||||
* [...] Match any character from set
|
||||
* [^...] Match any character but ones from set
|
||||
* \s Match whitespace
|
||||
* \S Match non-whitespace
|
||||
* \d Match decimal digit
|
||||
* \r Match carriage return
|
||||
* \n Match newline
|
||||
* + Match one or more times (greedy)
|
||||
* +? Match one or more times (non-greedy)
|
||||
* * Match zero or more times (greedy)
|
||||
* *? Match zero or more times (non-greedy)
|
||||
* ? Match zero or once
|
||||
* \xDD Match byte with hex value 0xDD
|
||||
* \meta Match one of the meta characters: ^$().[*+\?
|
||||
* x|y Match x or y (alternation operator)
|
||||
|
||||
* Usage example: parsing HTTP request line.
|
||||
*
|
||||
*   const char *request = "GET /index.html HTTP/1.0\r\n\r\n";
*   struct slre_cap caps[4];
*   const char *error_msg;
*
*   if (slre_match("^\\s*(GET|POST)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
*                  request, (int) strlen(request), caps, &error_msg) == 0) {
*     printf("Error parsing HTTP request: %s\n", error_msg);
*   } else {
*     printf("Requested URI: [%.*s]\n", caps[1].len, caps[1].ptr);
*   }
|
||||
*/
|
||||
|
||||
#endif
|
84
unit_test.c
Normal file
84
unit_test.c
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* To unit-test SLRE, do
|
||||
* cc -W -Wall -O2 -ansi -pedantic -pipe unit_test.c -o /tmp/t && /tmp/t
|
||||
*/
|
||||
|
||||
#include "slre.c"
|
||||
|
||||
/* Running totals, updated by ASSERT and FAIL */
static int static_total_tests = 0;
static int static_failed_tests = 0;

/* Prints one failed expression with its line number and counts it */
static void report_failure(const char *str, int line) {
  printf("Fail on line %d: [%s]\n", line, str);
  static_failed_tests++;
}

#define FAIL(str, line) report_failure((str), (line))

/* Counts one test; reports a failure when 'expr' evaluates to zero */
#define ASSERT(expr) do {                   \
  static_total_tests++;                     \
  if (!(expr)) FAIL(#expr, __LINE__);       \
} while (0)
|
||||
|
||||
int main(void) {
|
||||
const char *msg = "";
|
||||
|
||||
ASSERT(slre_match("fo", "foo", 3, NULL, &msg) == 2);
|
||||
ASSERT(slre_match(".+", "foo", 3, NULL, &msg) == 3);
|
||||
|
||||
ASSERT(slre_match(".+k", "fooklmn", 7, NULL, &msg) == 4);
|
||||
ASSERT(slre_match(".+k.", "fooklmn", 7, NULL, &msg) == 5);
|
||||
ASSERT(slre_match("p+", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("ok", "fooklmn", 7, NULL, &msg) == 4);
|
||||
ASSERT(slre_match("lmno", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("mn.", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("o", "fooklmn", 7, NULL, &msg) == 2);
|
||||
ASSERT(slre_match("^o", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("^", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("n$", "fooklmn", 7, NULL, &msg) == 7);
|
||||
ASSERT(slre_match("n$k", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match("l$", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(slre_match(".$", "fooklmn", 7, NULL, &msg) == 7);
|
||||
ASSERT(slre_match("a?", "fooklmn", 7, NULL, &msg) == 0);
|
||||
|
||||
ASSERT(slre_match("\\_", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(strcmp(msg, static_error_invalid_metacharacter) == 0);
|
||||
ASSERT(slre_match("+", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(strcmp(msg, static_error_unexpected_quantifier) == 0);
|
||||
ASSERT(slre_match("()+", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(strcmp(msg, static_error_no_match) == 0);
|
||||
|
||||
ASSERT(slre_match("(x))", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
|
||||
ASSERT(slre_match("(", "fooklmn", 7, NULL, &msg) == 0);
|
||||
ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
|
||||
|
||||
ASSERT(slre_match("klz?mn", "fooklmn", 7, NULL, &msg) == 7);
|
||||
ASSERT(slre_match("fa?b", "fooklmn", 7, NULL, &msg) == 0);
|
||||
#if 0
|
||||
#endif
|
||||
|
||||
printf("Unit test %s (total test: %d, failed tests: %d)\n",
|
||||
static_failed_tests > 0 ? "FAILED" : "PASSED",
|
||||
static_total_tests, static_failed_tests);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
Loading…
Reference in New Issue
Block a user