Added %word rule

2025-01-22 13:25:30 +00:00 · 2018-07-13 17:26:57 -04:00 · 2018-07-13 17:26:57 -04:00 · a77edadfa7
commit a77edadfa7
parent ca950520ef
3 changed files with 73 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -15,8 +15,8 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau
  * `\x20` (Hex number char)
  * `$<` ... `>` (Capture operator)
  * `$name<` ... `>` (Named capture operator)
-
-This library also supports the linear-time parsing known as the [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing.
+  * `%whitespace` (Automatic whitespace skipping)
+  * `%word` (Word expression)

 If you need a Go language version, please see [*go-peg*](https://github.com/yhirose/go-peg).

@ -246,6 +246,20 @@ PHRASE       <- < '"' (!'"' .)* '"' >
 %whitespace  <-  [ \t\r\n]*
 ```

+Word expression
+---------------
+
+```cpp
+peg::parser parser(R"(
+    ROOT         <-  'hello' 'world'
+    %whitespace  <-  [ \t\r\n]*
+    %word        <-  [a-z]+
+)");
+
+parser.parse("hello world"); // OK
+parser.parse("helloworld");  // NG: 'hello' is immediately followed by more %word characters
+```
+
 AST generation
 --------------

--- a/peglib.h
+++ b/peglib.h
@ -510,6 +510,8 @@ public:
    std::shared_ptr<Ope>                         whitespaceOpe;
    bool                                         in_whitespace;

+    std::shared_ptr<Ope>                         wordOpe;
+
    const size_t                                 def_count;
    const bool                                   enablePackratParsing;
    std::vector<bool>                            cache_registered;
@ -525,6 +527,7 @@ public:
        size_t               a_l,
        size_t               a_def_count,
        std::shared_ptr<Ope> a_whitespaceOpe,
+        std::shared_ptr<Ope> a_wordOpe,
        bool                 a_enablePackratParsing,
        Tracer               a_tracer)
        : path(a_path)
@ -537,6 +540,7 @@ public:
        , in_token(false)
        , whitespaceOpe(a_whitespaceOpe)
        , in_whitespace(false)
+        , wordOpe(a_wordOpe)
        , def_count(a_def_count)
        , enablePackratParsing(a_enablePackratParsing)
        , cache_registered(enablePackratParsing ? def_count * (l + 1) : 0)
@ -895,13 +899,19 @@ public:
 class LiteralString : public Ope
 {
 public:
-    LiteralString(const std::string& s) : lit_(s) {}
+    LiteralString(const std::string& s)
+        : lit_(s)
+        , init_is_word_(false) // is_word_ has not been evaluated yet
+        , is_word_(false)
+        {}

    size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override;

    void accept(Visitor& v) override;

    std::string lit_;
+	mutable bool init_is_word_; // set by parse() once is_word_ has been evaluated; mutable because parse() is const
+	mutable bool is_word_; // true when the context's %word operator successfully parses lit_
 };

 class CharacterClass : public Ope
@ -1202,6 +1212,7 @@ struct IsToken : public Ope::Visitor
 };

 static const char* WHITESPACE_DEFINITION_NAME = "%whitespace";
+static const char* WORD_DEFINITION_NAME = "%word"; // name of the grammar rule whose operator becomes the start rule's wordOpe

 /*
 * Definition
@ -1239,6 +1250,7 @@ public:
        : name(std::move(rhs.name))
        , ignoreSemanticValue(rhs.ignoreSemanticValue)
        , whitespaceOpe(rhs.whitespaceOpe)
+        , wordOpe(rhs.wordOpe)
        , enablePackratParsing(rhs.enablePackratParsing)
        , is_token(rhs.is_token)
        , has_token_boundary(rhs.has_token_boundary)
@ -1358,6 +1370,7 @@ public:
    std::function<std::string ()>  error_message;
    bool                           ignoreSemanticValue;
    std::shared_ptr<Ope>           whitespaceOpe;
+    std::shared_ptr<Ope>           wordOpe;
    bool                           enablePackratParsing;
    bool                           is_token;
    bool                           has_token_boundary;
@ -1378,7 +1391,7 @@ private:
            ope = std::make_shared<Sequence>(whitespaceOpe, ope);
        }

-        Context cxt(path, s, n, assignId.ids.size(), whitespaceOpe, enablePackratParsing, tracer);
+        Context cxt(path, s, n, assignId.ids.size(), whitespaceOpe, wordOpe, enablePackratParsing, tracer);
        auto len = ope->parse(s, n, sv, cxt, dt);
        return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message };
    }
@ -1401,6 +1414,28 @@ inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv,
        }
    }

+	// Word check: when this literal matches %word, the input right after it must not match %word
+    static Context dummy_c(nullptr, lit_.data(), lit_.size(), 0, nullptr, nullptr, false, nullptr);
+    static SemanticValues dummy_sv;
+    static any dummy_dt;
+
+    if (!init_is_word_) { // TODO: Protect with mutex
+		if (c.wordOpe) {
+			auto len = c.wordOpe->parse(lit_.data(), lit_.size(), dummy_sv, dummy_c, dummy_dt);
+			is_word_ = success(len);
+		}
+        init_is_word_ = true;
+    }
+
+	if (is_word_) {
+        auto ope = std::make_shared<NotPredicate>(c.wordOpe);
+		auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt);
+		if (fail(len)) {
+            return static_cast<size_t>(-1);
+		}
+        i += len;
+	}
+
    // Skip whitespace
    if (!c.in_token) {
        if (c.whitespaceOpe) {
@ -2068,6 +2103,12 @@ private:
            rule.whitespaceOpe = wsp((*data.grammar)[WHITESPACE_DEFINITION_NAME].get_core_operator());
        }

+        // Word expression
+        if (grammar.count(WORD_DEFINITION_NAME)) {
+            auto& rule = (*data.grammar)[start];
+            rule.wordOpe = (*data.grammar)[WORD_DEFINITION_NAME].get_core_operator();
+        }
+
        return data.grammar;
    }

--- a/test/test.cc
+++ b/test/test.cc
@ -255,6 +255,20 @@ TEST_CASE("WHITESPACE test2", "[general]")
    REQUIRE(items[2] == "three");
 }

+TEST_CASE("Word expression test", "[general]") { // %word: a word-like literal must not be directly followed by more word text
+    peg::parser parser(R"(
+        ROOT         <-  'hello' ','? 'world'
+        %whitespace  <-  [ \t\r\n]*
+        %word        <-  [a-z]+
+    )"); // both literals, 'hello' and 'world', are matched by %word ([a-z]+)
+
+	REQUIRE(parser.parse("helloworld") == false); // 'hello' is directly followed by text that %word matches
+	REQUIRE(parser.parse("hello world") == true); // the space after 'hello' is not matched by %word
+	REQUIRE(parser.parse("hello,world") == true); // ',' is not matched by %word
+	REQUIRE(parser.parse("hello, world") == true); // optional ',' followed by skipped whitespace
+	REQUIRE(parser.parse("hello , world") == true); // whitespace on both sides of the optional ','
+}
+
 TEST_CASE("Skip token test", "[general]")
 {
    peg::parser parser(