diff --git a/README.md b/README.md index f69e3c9..4e1f590 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,67 @@ -slre -==== +SLRE: Super Light Regular Expression library +============================================ -Super Light Regular Expression library +SLRE is an ISO C library that implements a subset of Perl regular +expression syntax. The main focus of SLRE is small size, [simple +API](https://github.com/cesanta/slre/blob/master/slre.h), clarity of code +and extensibility. This makes it perfect for tasks like parsing network +requests, configuration files, user input, etc., when libraries like +[PCRE](http://pcre.org) are too heavyweight for the given task. Embedded +developers would benefit most. + +Extensibility is another great aspect of SLRE. For example, if one wants to +introduce a new metacharacter, '\i', meaning 'IPv4 address', it is easy to do +so with SLRE. + +## Supported Syntax + + ^ Match beginning of a buffer + $ Match end of a buffer + () Grouping and substring capturing + [...] Match any character from set + [^...] Match any character but ones from set + \s Match whitespace + \S Match non-whitespace + \d Match decimal digit + + Match one or more times (greedy) + +? Match one or more times (non-greedy) + * Match zero or more times (greedy) + *? Match zero or more times (non-greedy) + ? Match zero or once + \xDD Match byte with hex value 0xDD + \meta Match one of the meta characters: ^$().[*+\? + x|y Match x or y (alternation operator) + +## API + + int slre_match(const char *regexp, const char *buf, int buf_len, + struct slre_cap *caps, const char **error_msg); + + +`slre_match()` matches string buffer `buf` of length `buf_len` against +regular expression `regexp`, which should conform to the syntax outlined +above. If regular expression `regexp` contains brackets, `slre_match()` +will capture the respective substrings. The array of captures, `caps`, +must have at least as many elements as the number of bracket pairs in `regexp`. 
+ +`slre_match()` returns 0 if there is no match found. Otherwise, it returns +the number of scanned bytes from the beginning of the string. This way, +it is easy to do repetitive matches. Hint: if it is required to know +the exact matched substring, enclose `regexp` in brackets and specify `caps`, +which should be an array of the following structures: + + struct slre_cap { + const char *ptr; /* Points to the matched fragment */ + int len; /* Length of the matched fragment */ + }; + +## Example: parsing HTTP request + + const char *error_msg, *request = " GET /index.html HTTP/1.0\r\n\r\n"; + struct slre_cap caps[4]; + + if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)", + request, strlen(request), caps, &error_msg)) { + } else { + printf("Error parsing [%s]: [%s]\n", request, error_msg); + } diff --git a/slre.c b/slre.c index 1a7bce3..7b75fd6 100644 --- a/slre.c +++ b/slre.c @@ -157,7 +157,7 @@ static int bar(const char *re, int re_len, const char *s, int s_len, break; case '+': case '?': case '*': case '\\': case '(': case ')': - case '^': case '$': + case '^': case '$': case '.': case '[': case ']': FAIL_IF(re[i + 1] != s[j], static_error_no_match); j++; break; @@ -169,19 +169,20 @@ static int bar(const char *re, int re_len, const char *s, int s_len, break; case '(': - FAIL_IF(bi + 1 >= info->num_brackets, static_error_internal); - DBG(("CAPTURING [%.*s] [%.*s]\n", info->brackets[bi + 1].len + 2, + bi++; + FAIL_IF(bi >= info->num_brackets, static_error_internal); + DBG(("CAPTURING [%.*s] [%.*s]\n", info->brackets[bi].len + 2, re + i, s_len - j, s + j)); - n = doh(s + j, s_len - j, caps, info, bi + 1); - DBG(("CAPTURED [%.*s] [%.*s]:%d\n", info->brackets[bi + 1].len + 2, + n = doh(s + j, s_len - j, caps, info, bi); + DBG(("CAPTURED [%.*s] [%.*s]:%d\n", info->brackets[bi].len + 2, re + i, s_len - j, s + j, n)); FAIL_IF(n <= 0, static_error_no_match); if (caps != NULL) { - caps[bi].ptr = s + j; - caps[bi].len = n; + caps[bi - 1].ptr = s + j; + caps[bi - 1].len 
= n; } j += n; - i += info->brackets[bi + 1].len + 1; + i += info->brackets[bi].len + 1; break; case '^': diff --git a/slre.h b/slre.h index 5d7a261..292a2a0 100644 --- a/slre.h +++ b/slre.h @@ -26,7 +26,7 @@ extern "C" { /* * This is a regular expression library that implements a subset of Perl RE. - * Please refer to http://cesanta.com/docs/slre for detailed reference. + * Please refer to https://github.com/cesanta/slre for detailed reference. */ /* This structure describes a matched fragment, a "capture" */ @@ -49,43 +49,6 @@ struct slre_cap { int slre_match(const char *regexp, const char *buf, int buf_len, struct slre_cap *caps, const char **error_msg); -/* - * Supported syntax: - * ^ Match beginning of a buffer - * $ Match end of a buffer - * () Grouping and substring capturing - * [...] Match any character from set - * [^...] Match any character but ones from set - * \s Match whitespace - * \S Match non-whitespace - * \d Match decimal digit - * \r Match carriage return - * \n Match newline - * + Match one or more times (greedy) - * +? Match one or more times (non-greedy) - * * Match zero or more times (greedy) - * *? Match zero or more times (non-greedy) - * ? Match zero or once - * \xDD Match byte with hex value 0xDD - * \meta Match one of the meta character: ^$().[*+\? - * x|y Match x or y (alternation operator) - - * Usage example: parsing HTTP request line. 
- * - * const char *request = "GET /index.html HTTP/1.0\r\n\r\n"; - * struct slre_capture method, uri, version_min, version_maj; - * - * error = slre_match("^\\s*(GET|POST)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)", - * request, strlen(request), - * &method, &uri, &version_min, &version_maj); - * - * if (error != NULL) { - * printf("Error parsing HTTP request: %s\n", error); - * } else { - * printf("Requested URI: [%.*s]\n", uri.len, uri.ptr); - * } - */ - #ifdef __cplusplus } #endif diff --git a/unit_test.c b/unit_test.c index 7c6f8cf..230c2ef 100644 --- a/unit_test.c +++ b/unit_test.c @@ -94,6 +94,7 @@ int main(void) { ASSERT(memcmp(caps[0].ptr, "12", 2) == 0); ASSERT(slre_match("(.*(2.))", "123", 3, caps, &msg) == 3); ASSERT(slre_match("(.)(.)", "123", 3, caps, &msg) == 2); + ASSERT(slre_match("(\\d+)\\s+(\\S+)", "12 hi", 5, caps, &msg) == 5); /* Greedy vs non-greedy */ ASSERT(slre_match(".+c", "abcabc", 6, NULL, &msg) == 6); @@ -113,18 +114,31 @@ int main(void) { ASSERT(memcmp(caps[0].ptr, "bc", 2) == 0); - ASSERT(slre_match("(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)", "POST /x HTTP/1.1", 16, - caps, &msg) == 16); -#if 0 /* HTTP request */ + ASSERT(slre_match("(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)", + "POST /x HTTP/1.1", 16, caps, &msg) == 14); + { static const char *req = "POST /x HTTP/1.0\r\n\r\nPOST DATA"; int len = strlen(req); ASSERT(slre_match("((\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)\r\n\r\n(.*))", req, len, caps, &msg) == len); } -#endif + /* Examples */ + { + const char *error_msg, *request = " GET /index.html HTTP/1.0\r\n\r\n"; + struct slre_cap caps[4]; + + if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)", + request, strlen(request), caps, &error_msg)) { + printf("Method: [%.*s], URI: [%.*s]\n", + caps[0].len, caps[0].ptr, + caps[1].len, caps[1].ptr); + } else { + printf("Error parsing [%s]: [%s]\n", request, error_msg); + } + } printf("Unit test %s (total test: %d, failed tests: %d)\n", static_failed_tests > 0 ? "FAILED" : "PASSED",