Initial import

2025-03-12 16:55:30 +00:00 · 2013-07-29 10:04:59 +01:00 · 2013-07-29 10:04:59 +01:00 · c6d2484b85
commit c6d2484b85
parent 210b08dbbc
4 changed files with 459 additions and 0 deletions
--- a/16
+++ b/16
@ -0,0 +1,16 @@
+Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
+Copyright (c) 2013 Cesanta Limited
+All rights reserved
+
+This code is dual-licensed: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 2 as
+published by the Free Software Foundation. For the terms of this
+license, see <http://www.gnu.org/licenses/>.
+
+You are free to use this code under the terms of the GNU General
+Public License, but WITHOUT ANY WARRANTY; without even the implied
+warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+Alternatively, you can license this code under a commercial
+license, as set out in <http://cesanta.com/products.html>.
--- a/slre.c
+++ b/slre.c
@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
+ * Copyright (c) 2013 Cesanta Limited
+ * All rights reserved
+ *
+ * This library is dual-licensed: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. For the terms of this
+ * license, see <http://www.gnu.org/licenses/>.
+ *
+ * You are free to use this library under the terms of the GNU General
+ * Public License, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * Alternatively, you can license this library under a commercial
+ * license, as set out in <http://cesanta.com/products.html>.
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "slre.h"
+
+static const char *static_error_no_match = "No match";  /* also the initial error_msg set by slre_match() */
+static const char *static_error_unexpected_quantifier = "Unexpected quantifier";  /* quantifier found where an atom was expected */
+static const char *static_error_unbalanced_brackets = "Unbalanced brackets";  /* '(' and ')' do not pair up */
+static const char *static_error_internal = "Internal error";  /* a computed token length was <= 0 */
+static const char *static_error_invalid_metacharacter = "Invalid metacharacter";  /* unsupported character after a backslash */
+
+#define MAX_BRANCHES 100  /* capacity of regex_info.branches */
+#define MAX_BRACKETS 100  /* capacity of regex_info.brackets */
+#define MAX_QUANTIFIERS 100  /* not referenced by any code in slre.c */
+#define ARRAY_SIZE(ar) (int) (sizeof(ar) / sizeof((ar)[0]))  /* element count of a real array (not a pointer), as int */
+#define FAIL_IF(cond,msg) do { if (cond) \
+  {info->error_msg = msg; return 0; }} while (0)  /* needs a local 'info' pointer; makes the enclosing function return 0 */
+
+#ifdef SLRE_DEBUG
+#define DBG(x) printf x  /* x is a parenthesized printf argument list, hence DBG((...)) at call sites */
+#else
+#define DBG(x)  /* expands to nothing unless SLRE_DEBUG is defined */
+#endif
+
+struct regex_info {  /* per-match state; slre_match() zeroes it before every call */
+  /*
+   * Describes all bracket pairs in the regular expression.
+   * First entry is always present, and grabs the whole regex.
+   */
+  struct bracket_pair {
+    const char *opening_bracket;  /* entry 0: start of the regex; other entries: address of '(' */
+    const char *closing_bracket;  /* entry 0: one past the last regex byte */
+    int nesting_depth;            /* 0 for entry 0, 1 for a top-level '(', and so on */
+  } brackets[MAX_BRACKETS];
+  int num_bracket_pairs;  /* used entries in 'brackets', entry 0 included */
+
+  /*
+   * Describes alternations ('|' operators) in the regular expression.
+   * Each branch falls into a specific bracket pair.
+   */
+  struct branch {
+    int bracket_pair_index;   /* index into 'brackets' array defined above */
+    const char *schlong;      /* points to the '|' character in the regex */
+  } branches[MAX_BRANCHES];
+  int num_branches;  /* used entries in 'branches' */
+
+  /* Error message to be returned to the user */
+  const char *error_msg;
+
+  /* E.g. IGNORE_CASE. See enum below. Not read by any code in slre.c. */
+  int flags;
+  enum { IGNORE_CASE = 1 };
+};
+
+static int get_op_len(const char *re) {  /* length in bytes of the regex token that starts at re */
+  return re[0] == '\\' ? 2 :  /* backslash escape: the backslash plus the escaped byte */
+    (re[0] == '*' || re[0] == '+') && re[1] == '?' ? 2 : 1;  /* "*?" and "+?" are one 2-byte token; everything else is 1 byte */
+}
+
+static int is_quantifier(const char *re) {  /* non-zero when re[0] is one of '*', '+', '?' */
+  return re[0] == '*' || re[0] == '+' || re[0] == '?';
+}
+
+static int get_brackets_length(const char *p, const struct regex_info *info) {  /*
+   * Return the closing-minus-opening pointer distance of the first entry in
+   * info->brackets whose opening_bracket equals p, or 0 when no entry has it.
+   */
+  int i;
+  for (i = 0; i < info->num_bracket_pairs; i++) {  /* entry 0 (the whole regex) is checked first */
+    if (info->brackets[i].opening_bracket == p) {
+      return info->brackets[i].closing_bracket -  /* pointer difference, narrowed to int; */
+        info->brackets[i].opening_bracket;        /* the closing byte itself is not counted */
+    }
+  }
+  return 0;  /* p is not a recorded opening bracket; m1() turns a step <= 0 into "Internal error" */
+}
+
+static int m1(const char *re, int re_len, const char *s, int s_len,
+              struct slre_cap *caps, struct regex_info *info) {
+  /*
+   * Match the regex fragment re[0..re_len) against the beginning of
+   * s[0..s_len). No scanning happens here: the match has to start at s[0].
+   *
+   * Returns the number of bytes of s consumed. On failure FAIL_IF stores a
+   * message in info->error_msg and returns 0, so a successful match that
+   * consumes nothing looks the same as a failure to the caller.
+   *
+   * i is offset in re, j is offset in s
+   */
+  int i, j, step;
+
+  (void) caps;  /* captures are not filled in by this function */
+
+  DBG(("%s [%.*s] [%.*s]\n", __func__, re_len, re, s_len, s));
+
+  for (i = j = 0; i < re_len && j < s_len; i += step) {  /* stops as soon as either input is exhausted */
+    step = re[i] == '(' ?
+      get_brackets_length(re + i, info) : get_op_len(re + i);  /* size of the atom at re[i] */
+
+#if 1
+    DBG(("%s    [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n",
+              __func__, re_len - i, re + i,
+              s_len - j, s + j, re_len, step, i, j));
+#endif
+
+    FAIL_IF(is_quantifier(&re[i]), static_error_unexpected_quantifier);  /* a quantifier here has no atom in front of it */
+    FAIL_IF(step <= 0, static_error_internal);  /* also guarantees the loop makes progress */
+
+    /*
+     * Handle quantifiers. Look ahead.
+     * NOTE(review): '*' satisfies is_quantifier() but has no branch below.
+     * The atom is then matched once by the switch, and the '*' itself trips
+     * the "Unexpected quantifier" check on the next iteration. Confirm
+     * whether '*' support is still to be written.
+     */
+    if (i + step < re_len && is_quantifier(re + i + step)) {
+      if (re[i + step] == '?') {
+        j += m1(re + i, step, s + j, s_len - j, caps, info);  /* adds 0 when the optional atom is absent */
+        i++;  /* skip the '?'; the loop increment then skips the atom */
+        continue;
+      } else if (re[i + step] == '+') {
+        int j2 = j, nj = 0, n1, n2, ni, next_step;
+
+        /*
+         * Points to the regexp code after the quantifier.
+         * get_op_len() returns 2 for "+?", so the lazy form is skipped as
+         * one token; the loop below treats it exactly like '+'.
+         */
+        next_step = get_op_len(re + i + step);
+        ni = i + step + next_step;
+
+        while ((n1 = m1(re + i, step, s + j2, s_len - j2, caps, info)) > 0) {  /* one more repetition of the atom */
+          if (ni >= re_len) {
+            /* After quantifier, there is nothing */
+            nj = j2 + n1;
+          } else if ((n2 = m1(re + ni, re_len - ni, s + j2 + n1,
+                              s_len - (j2 + n1), caps, info)) > 0) {
+            nj = j2 + n1 + n2;  /* remember the longest repetition after which the rest still matches */
+          }
+          j2 += n1;
+        }
+        FAIL_IF(nj == 0, static_error_no_match);
+        return nj;  /* the rest of the regex was matched by the nested call above */
+      }
+    }
+
+    switch (re[i]) {  /* NOTE(review): there is no '(' case, so '(' is compared as a literal by the default branch */
+      case '\\':
+        /* Metacharacters: only \S and the escaped literals listed below */
+        switch (re[i + 1]) {
+          case 'S':
+            FAIL_IF(isspace(((unsigned char *) s)[j]), static_error_no_match);  /* \S: any non-whitespace byte */
+            j++;
+            break;
+
+          case '+':
+          case '?':
+          case '*':
+          case '\\':
+          case '(':
+          case ')':
+          case '^':
+          case '$':
+            FAIL_IF(re[i + 1] != s[j], static_error_no_match);  /* escaped character must match literally */
+            j++;
+            break;
+
+          default:
+            FAIL_IF(1, static_error_invalid_metacharacter);
+            break;
+        }
+        break;
+
+      case '^':
+        FAIL_IF(j != 0, static_error_no_match);  /* only matches at offset 0 of the s given to this call */
+        break;
+
+      case '$':
+        /*
+         * $ anchor handling is at the end of this function.
+         * Inside the loop j < s_len still holds, so this is not the end.
+         */
+        FAIL_IF(1, static_error_no_match);
+        break;
+
+      case '.':
+        j++;  /* any single byte */
+        break;
+
+      default:
+        FAIL_IF(re[i] != s[j], static_error_no_match);  /* ordinary character: exact byte comparison */
+        j++;
+        break;
+    }
+  }
+
+  /*
+   * Process $ anchor here. If we've reached the end of the string,
+   * but did not exhaust regexp yet, this is no match.
+   * The only leftover pattern that is accepted is a single trailing '$'.
+   */
+  FAIL_IF(i < re_len && !(re[i] == '$' && i + 1 == re_len),
+          static_error_no_match);
+
+  return j;
+}
+
+/*
+ * Step 1. Process brackets and branches.
+ *
+ * Scans the regex once, recording every bracket pair and every '|' in
+ * 'info', then tries m1() at successive offsets of 's' until one matches.
+ *
+ *   re, re_len   regular expression and its length in bytes
+ *   s, s_len     subject buffer and its length in bytes
+ *   caps         passed through to m1() untouched
+ *   info         matcher state; the bracket and branch tables and error_msg
+ *                are written here
+ *
+ * Returns the offset in 's' just past the end of the match, or 0 when
+ * nothing matched or the regex is malformed (info->error_msg then holds
+ * the reason).
+ */
+static int m(const char *re, int re_len, const char *s, int s_len,
+             struct slre_cap *caps, struct regex_info *info) {
+  int result, i, step, depth = 0;
+  /* open_pair[d] is the index in info->brackets of the '(' open at depth d */
+  int open_pair[ARRAY_SIZE(info->brackets)];
+
+  open_pair[0] = 0;
+
+  /* Entry 0 always describes the whole regex */
+  info->brackets[0].opening_bracket = re;
+  info->brackets[0].closing_bracket = re + re_len;
+  info->brackets[0].nesting_depth = 0;
+  info->num_bracket_pairs = 1;
+
+  for (i = 0; i < re_len; i += step) {
+    /* A backslash escape is one 2-byte token, so escaped brackets are skipped */
+    step = get_op_len(&re[i]);
+
+    if (re[i] == '|') {
+      FAIL_IF(info->num_branches >= ARRAY_SIZE(info->branches),
+              "Too many |. Increase MAX_BRANCHES");
+      info->branches[info->num_branches].bracket_pair_index =
+        info->num_bracket_pairs - 1;
+      info->branches[info->num_branches].schlong = &re[i];
+      info->num_branches++;
+    } else if (re[i] == '(') {
+      FAIL_IF(info->num_bracket_pairs >= ARRAY_SIZE(info->brackets),
+              "Too many (. Increase MAX_BRACKETS");
+      depth++;  /* Order is important here. Depth increments first. */
+      open_pair[depth] = info->num_bracket_pairs;
+      info->brackets[info->num_bracket_pairs].opening_bracket = &re[i];
+      info->brackets[info->num_bracket_pairs].nesting_depth = depth;
+      info->num_bracket_pairs++;
+    } else if (re[i] == ')') {
+      /*
+       * Check the balance before touching the table, so that
+       * open_pair[depth] always names a pair that really has been opened.
+       * The closing pointer goes to that pair, not to the next free slot,
+       * which may lie past the end of the array when the table is full.
+       */
+      FAIL_IF(depth <= 0, static_error_unbalanced_brackets);
+      info->brackets[open_pair[depth]].closing_bracket = &re[i];
+      depth--;
+    }
+  }
+
+  FAIL_IF(depth != 0, static_error_unbalanced_brackets);
+
+  /*
+   * Scan the string from left to right, applying the regex. Stop on match.
+   * A regex that starts with '^' is tried at offset 0 only.
+   */
+  result = 0;
+  for (i = 0; i < s_len; i++) {
+    result = m1(re, re_len, s + i, s_len - i, caps, info);
+    DBG(("  m1 -> %d [%.*s] [%.*s] [%s]\n", result, re_len, re,
+         s_len - i, s + i, info->error_msg));
+    if (result > 0 || re[0] == '^') {
+      result += i;  /* convert the match length into an offset in s */
+      break;
+    }
+  }
+
+  return result;
+}
+
+int slre_match(const char *regexp, const char *s, int s_len,
+               struct slre_cap *caps, const char **error_msg) {  /* public entry point declared in slre.h */
+  struct regex_info info;
+  int result;
+
+  memset(&info, 0, sizeof(info));  /* empty bracket and branch tables, flags = 0 */
+  info.error_msg = static_error_no_match;  /* reported unless a more specific message is recorded */
+
+  DBG(("---------------- [%s] [%.*s]\n", regexp, s_len, s));
+  result = m(regexp, strlen(regexp), s, s_len, caps, &info);  /* regexp must be NUL-terminated; its size_t length is passed as int */
+
+  if (error_msg != NULL) {  /* the caller may pass NULL to ignore the message */
+    *error_msg = info.error_msg;  /* stored on every call, whatever the result */
+  }
+
+  return result;  /* whatever m() returned: 0 for no match or a bad regex */
+}
--- a/slre.h
+++ b/slre.h
@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
+ * Copyright (c) 2013 Cesanta Limited
+ * All rights reserved
+ *
+ * This library is dual-licensed: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. For the terms of this
+ * license, see <http://www.gnu.org/licenses/>.
+ *
+ * You are free to use this library under the terms of the GNU General
+ * Public License, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * Alternatively, you can license this library under a commercial
+ * license, as set out in <http://cesanta.com/products.html>.
+ */
+
+#ifndef SLRE_HEADER_DEFINED
+#define SLRE_HEADER_DEFINED
+
+/*
+ * This is a regular expression library that implements a subset of Perl RE.
+ * Please refer to http://cesanta.com/docs/slre for detailed reference.
+ */
+
+/*
+ * This structure describes a matched fragment, a "capture".
+ * A fragment is a pointer plus a length; presumably it is not
+ * NUL-terminated (the usage example in this header prints it with %.*s),
+ * so always use "len" - TODO confirm.
+ */
+struct slre_cap {
+  const char *ptr;  /* Points to the matched fragment */
+  int len;          /* Length of the matched fragment */
+};
+
+/*
+ * Match string buffer "buf" of length "buf_len" against "regexp", which should
+ * conform to the syntax outlined below. "regexp" must be NUL-terminated.
+ * If regular expression "regexp" contains brackets, slre_match() will
+ * capture the respective substring. Array of captures, "caps", must have at
+ * least as many elements as the number of opening parentheses in the regexp.
+ *
+ * "error_msg" may be NULL. Otherwise *error_msg is set on every call and
+ * points to a constant string that must not be freed.
+ *
+ * Return:
+ *   0, if there is no match. error_msg will contain the error message
+ *   >0, offset in the buffer just past the end of the match. The buffer is
+ *       scanned for the first match, so matching "ok" against "fooklmn"
+ *       returns 4.
+ */
+int slre_match(const char *regexp, const char *buf, int buf_len,
+               struct slre_cap *caps, const char **error_msg);
+
+/*
+ * Supported syntax:
+ *    ^        Match beginning of a buffer
+ *    $        Match end of a buffer
+ *    ()       Grouping and substring capturing
+ *    [...]    Match any character from set
+ *    [^...]   Match any character but ones from set
+ *    \s       Match whitespace
+ *    \S       Match non-whitespace
+ *    \d       Match decimal digit
+ *    \r       Match carriage return
+ *    \n       Match newline
+ *    +        Match one or more times (greedy)
+ *    +?       Match one or more times (non-greedy)
+ *    *        Match zero or more times (greedy)
+ *    *?       Match zero or more times (non-greedy)
+ *    ?        Match zero or once
+ *    \xDD     Match byte with hex value 0xDD
+ *    \meta    Match one of the meta characters: ^$().[*+\?
+ *    x|y      Match x or y (alternation operator)
+ *
+ * Usage example: parsing HTTP request line.
+ * caps[0] to caps[3] receive the method, the URI, and the major and minor
+ * version digits, in the order of the opening parentheses.
+ *
+ *  const char *request = "GET /index.html HTTP/1.0\r\n\r\n";
+ *  struct slre_cap caps[4];
+ *  const char *error = NULL;
+ *
+ *  if (slre_match("^\\s*(GET|POST)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
+ *                 request, strlen(request), caps, &error) > 0) {
+ *    printf("Requested URI: [%.*s]\n", caps[1].len, caps[1].ptr);
+ *  } else {
+ *    printf("Error parsing HTTP request: %s\n", error);
+ *  }
+ */
+
+#endif
--- a/unit_test.c
+++ b/unit_test.c
@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
+ * Copyright (c) 2013 Cesanta Limited
+ * All rights reserved
+ *
+ * This library is dual-licensed: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. For the terms of this
+ * license, see <http://www.gnu.org/licenses/>.
+ *
+ * You are free to use this library under the terms of the GNU General
+ * Public License, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * Alternatively, you can license this library under a commercial
+ * license, as set out in <http://cesanta.com/products.html>.
+ */
+
+
+/*
+ * To unit-test SLRE, do
+ * cc -W -Wall -O2 -ansi -pedantic -pipe unit_test.c -o /tmp/t && /tmp/t
+ */
+
+#include "slre.c"
+
+static int static_total_tests = 0;   /* incremented once per ASSERT */
+static int static_failed_tests = 0;  /* incremented once per FAIL */
+
+#define FAIL(str, line) do {                      \
+  printf("Fail on line %d: [%s]\n", line, str);   \
+  static_failed_tests++;                          \
+} while (0)  /* report one failed check and count it */
+
+#define ASSERT(expr) do {               \
+  static_total_tests++;                 \
+  if (!(expr)) FAIL(#expr, __LINE__);   \
+} while (0)  /* count the check; on failure print its source text and line, then keep going */
+
+/*
+ * Runs every check, prints a one-line summary, and reports the outcome
+ * through the process exit status: EXIT_FAILURE when at least one check
+ * failed, EXIT_SUCCESS otherwise, so shells and build scripts can detect
+ * a failing run.
+ */
+int main(void) {
+  const char *msg = "";
+
+  /* Literals, '.', '+' and scanning for the first match */
+  ASSERT(slre_match("fo", "foo", 3, NULL, &msg) == 2);
+  ASSERT(slre_match(".+", "foo", 3, NULL, &msg) == 3);
+
+  ASSERT(slre_match(".+k", "fooklmn", 7, NULL, &msg) == 4);
+  ASSERT(slre_match(".+k.", "fooklmn", 7, NULL, &msg) == 5);
+  ASSERT(slre_match("p+", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("ok", "fooklmn", 7, NULL, &msg) == 4);
+  ASSERT(slre_match("lmno", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("mn.", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("o", "fooklmn", 7, NULL, &msg) == 2);
+
+  /* Anchors */
+  ASSERT(slre_match("^o", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("^", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("n$", "fooklmn", 7, NULL, &msg) == 7);
+  ASSERT(slre_match("n$k", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match("l$", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(slre_match(".$", "fooklmn", 7, NULL, &msg) == 7);
+  ASSERT(slre_match("a?", "fooklmn", 7, NULL, &msg) == 0);
+
+  /* Error reporting */
+  ASSERT(slre_match("\\_", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(strcmp(msg, static_error_invalid_metacharacter) == 0);
+  ASSERT(slre_match("+", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(strcmp(msg, static_error_unexpected_quantifier) == 0);
+  ASSERT(slre_match("()+", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(strcmp(msg, static_error_no_match) == 0);
+
+  ASSERT(slre_match("(x))", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
+  ASSERT(slre_match("(", "fooklmn", 7, NULL, &msg) == 0);
+  ASSERT(strcmp(msg, static_error_unbalanced_brackets) == 0);
+
+  /* Optional atom */
+  ASSERT(slre_match("klz?mn", "fooklmn", 7, NULL, &msg) == 7);
+  ASSERT(slre_match("fa?b", "fooklmn", 7, NULL, &msg) == 0);
+
+  printf("Unit test %s (total test: %d, failed tests: %d)\n",
+         static_failed_tests > 0 ? "FAILED" : "PASSED",
+         static_total_tests, static_failed_tests);
+
+  /* A failing run must be visible to the caller, not only in the output */
+  return static_failed_tests > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}