Changed the capture operator and made the anchor operator.

2025-07-12 02:02:08 +00:00 · 2015-02-17 22:35:07 -05:00 · 2015-02-17 22:35:07 -05:00 · 56daf08d5b
commit 56daf08d5b
parent f0351a9909
4 changed files with 130 additions and 87 deletions
--- a/README.md
+++ b/README.md
@ -7,7 +7,8 @@ C++11 header-only [PEG](http://en.wikipedia.org/wiki/Parsing_expression_grammar)
 The PEG syntax is well described on page 2 in the [document](http://pdos.csail.mit.edu/papers/parsing:popl04.pdf). *cpp-peglib* also supports the following additional syntax for now:
-  * `<` and `>` (Capture operators)
+  * `<` ... `>` (Anchor operators)
  * `$<` ... `>` (Capture operators)
 How to use
 ----------
@ -88,7 +89,7 @@ Here is a complete list of available actions:
 `any& c` is a context data which can be used by the user for whatever purposes.
-The following example uses `<` and ` >` operators. They are the *capture* operators. Each capture operator creates a semantic value that contains `const char*` of the position. It could be useful to eliminate unnecessary characters.
+The following example uses `<` and ` >` operators. They are the *anchor* operators. Each anchor operator creates a semantic value that contains `const char*` of the position. It could be useful to eliminate unnecessary characters.
 ```c++
 auto syntax = R"(
@ -100,9 +101,8 @@ auto syntax = R"(
 peg pg(syntax);
 pg["TOKEN"] = [](const char* s, size_t l, const vector<any>& v) {
-    auto b = v[0].get<const char*>(); // '<'
+    // 'token' doesn't include trailing whitespaces
-    auto e = v[1].get<const char*>(); // '>'
+    auto token = string(s, l);
    auto token = string(b, e - b);    // 'token' doesn't include trailing whitespaces
 };
 auto ret = pg.parse(" token1, token2 ");
@ -113,13 +113,13 @@ Simple interface
 *cpp-peglib* provides std::regex-like simple interface for trivial tasks.
-`peglib::peg_match` tries to capture strings in the `< ... >` operator and store them into `peglib::match` object.
+`peglib::peg_match` tries to capture strings in the `$< ... >` operator and store them into `peglib::match` object.
 ```c++
 peglib::match m;
 auto ret = peglib::peg_match(
    R"(
-        ROOT      <-  _ ('[' < TAG_NAME > ']' _)*
+        ROOT      <-  _ ('[' $< TAG_NAME > ']' _)*
        TAG_NAME  <-  (!']' .)+
        _         <-  [ \t]*
    )",
@ -139,7 +139,7 @@ There are some ways to *search* a peg pattern in a document.
 using namespace peglib;
 auto syntax = R"(
-ROOT <- '[' < [a-z0-9]+ > ']'
+ROOT <- '[' $< [a-z0-9]+ > ']'
 )";
 auto s = " [tag1] [tag2] [tag3] ";
@ -206,6 +206,8 @@ The following are available operators:
 | cls      | Character class    |
 | chr      | Character          |
 | dot      | Any character      |
 | anc      | Anchor character   |
 | cap      | Capture character  |
 Sample codes
 ------------
--- a/lint/peglint.cc
+++ b/lint/peglint.cc
@ -27,11 +27,11 @@ int main(int argc, const char** argv)
        return -1;
    }
-    peglib::peg parser(syntax.data(), syntax.size(), [&](size_t ln, size_t col, const string& msg) {
+    peglib::peg peg(syntax.data(), syntax.size(), [&](size_t ln, size_t col, const string& msg) {
        cerr << syntax_path << ":" << ln << ":" << col << ": " << msg << endl;
    });
-    if (!parser) {
+    if (!peg) {
        return -1;
    }
@ -48,10 +48,14 @@ int main(int argc, const char** argv)
        return -1;
    }
-    auto ret = parser.lint(source.data(), source.size(), true, [&](size_t ln, size_t col, const string& msg) {
+    auto ret = peg.lint(source.data(), source.size(), true, [&](size_t ln, size_t col, const string& msg) {
        cerr << source_path << ":" << ln << ":" << col << ": " << msg << endl;
    });
    if (ret) {
        peg.parse(source.data(), source.size());
    }
    return ret ? 0 : -1;
 }
--- a/peglib.h
+++ b/peglib.h
@ -151,7 +151,15 @@ private:
 /*
 * Semantic values
 */
-typedef std::vector<any> Values;
+struct SemanticValues
 {
 	std::vector<any>         values;
   //std::vector<std::string> names;
   const char*              s;
 	size_t                   l;
   SemanticValues() : s(nullptr), l(0) {}
 };
 /*
 * Semantic action
@ -366,7 +374,7 @@ class Ope
 {
 public:
    virtual ~Ope() {};
-    virtual Result parse(const char* s, size_t l, Values& v, any& c) const = 0;
+    virtual Result parse(const char* s, size_t l, SemanticValues& v, any& c) const = 0;
 };
 class Sequence : public Ope
@ -393,7 +401,7 @@ public:
    Sequence(const std::vector<std::shared_ptr<Ope>>& opes) : opes_(opes) {}
    Sequence(std::vector<std::shared_ptr<Ope>>&& opes) : opes_(std::move(opes)) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        size_t i = 0;
        for (const auto& ope : opes_) {
            const auto& rule = *ope;
@ -436,18 +444,20 @@ public:
    PrioritizedChoice(const std::vector<std::shared_ptr<Ope>>& opes) : opes_(opes) {}
    PrioritizedChoice(std::vector<std::shared_ptr<Ope>>&& opes) : opes_(std::move(opes)) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        size_t id = 0;
        for (const auto& ope : opes_) {
            const auto& rule = *ope;
-            Values chldsv;
+            SemanticValues chldsv;
            auto r = rule.parse(s, l, chldsv, c);
            if (r.ret) {
-                if (!chldsv.empty()) {
+                //assert(chldsv.values.size() == chldsv.names.size());
-                    for (const auto& x: chldsv) {
+                if (!chldsv.values.empty()) {
-                        v.push_back(x);
+                    v.values.insert(v.values.end(), chldsv.values.begin(), chldsv.values.end());
-                    }
+                    //v.names.insert(v.names.end(), chldsv.names.begin(), chldsv.names.end());
                }
                v.s = chldsv.s;
                v.l = chldsv.l;
                return success(r.len, id);
            }
            id++;
@ -466,7 +476,7 @@ class ZeroOrMore : public Ope
 public:
    ZeroOrMore(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        auto i = 0;
        while (l - i > 0) {
            const auto& rule = *ope_;
@ -488,7 +498,7 @@ class OneOrMore : public Ope
 public:
    OneOrMore(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
        if (!r.ret) {
@ -519,7 +529,7 @@ class Option : public Ope
 public:
    Option(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
        return success(r.ret ? r.len : 0);
@ -534,7 +544,7 @@ class AndPredicate : public Ope
 public:
    AndPredicate(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
        if (r.ret) {
@ -553,7 +563,7 @@ class NotPredicate : public Ope
 public:
    NotPredicate(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
        if (r.ret) {
@ -572,7 +582,7 @@ class LiteralString : public Ope
 public:
    LiteralString(const std::string& s) : lit_(s) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        auto i = 0u;
        for (; i < lit_.size(); i++) {
            if (i >= l || s[i] != lit_[i]) {
@ -591,7 +601,7 @@ class CharacterClass : public Ope
 public:
    CharacterClass(const std::string& chars) : chars_(chars) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        // TODO: UTF8 support
        if (l < 1) {
            return fail(s);
@ -623,7 +633,7 @@ class Character : public Ope
 public:
    Character(char ch) : ch_(ch) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        // TODO: UTF8 support
        if (l < 1 || s[0] != ch_) {
            return fail(s);
@ -638,7 +648,7 @@ private:
 class AnyCharacter : public Ope
 {
 public:
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        // TODO: UTF8 support
        if (l < 1) {
            return fail(s);
@ -651,11 +661,10 @@ public:
 class Capture : public Ope
 {
 public:
    Capture(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
    Capture(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t ci)
        : ope_(ope), match_action_(ma), capture_id(ci) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        assert(ope_);
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
@ -674,10 +683,21 @@ private:
 class Anchor : public Ope
 {
 public:
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Anchor(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
-        return success(0);
+
    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        assert(ope_);
        const auto& rule = *ope_;
        auto r = rule.parse(s, l, v, c);
        if (r.ret) {
            v.s = s;
            v.l = r.len;
        }
        return r;
    }
 private:
    std::shared_ptr<Ope> ope_;
 };
 class WeakHolder : public Ope
@ -685,7 +705,7 @@ class WeakHolder : public Ope
 public:
    WeakHolder(const std::shared_ptr<Ope>& ope) : weak_(ope) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        auto ope = weak_.lock();
        assert(ope);
        const auto& rule = *ope;
@ -738,17 +758,17 @@ public:
        return *this;
    }
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        return holder_->parse(s, l, v, c);
    }
    template <typename T>
    Result parse(const char* s, size_t l, T& val) const {
-        Values v;
+        SemanticValues v;
        any c;
        auto r = holder_->parse(s, l, v, c);
-        if (r.ret && !v.empty() && !v.front().is_undefined()) {
+        if (r.ret && !v.values.empty() && !v.values.front().is_undefined()) {
-            val = v[0].get<T>();
+            val = v.values[0].get<T>();
        }
        return r;
    }
@ -761,7 +781,7 @@ public:
    Result parse(const char* s) const {
        auto l = strlen(s);
-        Values v;
+        SemanticValues v;
        any c;
        return holder_->parse(s, l, v, c);
    }
@ -795,13 +815,13 @@ private:
        Holder(Definition* outer)
           : outer_(outer) {}
-        Result parse(const char* s, size_t l, Values& v, any& c) const {
+        Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
            if (!ope_) {
                throw std::logic_error("Uninitialized definition ope was used...");
            }
            const auto& rule = *ope_;
-            Values chldsv;
+            SemanticValues chldsv;
            auto r = rule.parse(s, l, chldsv, c);
            if (r.ret) {
                assert(!outer_->actions.empty());
@ -811,7 +831,12 @@ private:
                    ? outer_->actions[id]
                    : outer_->actions[0];
-                v.push_back(reduce(s, r.len, chldsv, c, ac));
+                auto ts = chldsv.s ? chldsv.s : s;
                auto tl = chldsv.s ? chldsv.l : r.len;
                auto sv = reduce(ts, tl, chldsv, c, ac);
                v.values.push_back(sv);
                //v.names.push_back(outer_->name);
            }
            return r;
        }
@ -819,13 +844,13 @@ private:
    private:
        friend class Definition;
-        any reduce(const char* s, size_t l, const Values& v, any& c, const Action& action) const {
+        any reduce(const char* s, size_t l, const SemanticValues& v, any& c, const Action& action) const {
            if (action) {
-                return action(s, l, v, c);
+                return action(s, l, v.values, c);
-            } else if (v.empty()) {
+            } else if (v.values.empty()) {
                return any();
            } else {
-                return v.front();
+                return v.values.front();
            }
        }
@ -847,7 +872,7 @@ public:
        : grammar_(grammar)
        , name_(name) {}
-    Result parse(const char* s, size_t l, Values& v, any& c) const {
+    Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
        const auto& rule = *grammar_.at(name_).holder_;
        return rule.parse(s, l, v, c);
    }
@ -916,8 +941,8 @@ inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma)
    return std::make_shared<Capture>(ope, ma, (size_t)-1);
 }
-inline std::shared_ptr<Ope> anc() {
+inline std::shared_ptr<Ope> anc(const std::shared_ptr<Ope>& ope) {
-    return std::make_shared<Anchor>();
+    return std::make_shared<Anchor>(ope);
 }
 inline std::shared_ptr<Ope> ref(const std::map<std::string, Definition>& grammar, const std::string& name) {
@ -994,7 +1019,8 @@ private:
        g["Suffix"]     <= seq(g["Primary"], opt(cho(g["QUESTION"], g["STAR"], g["PLUS"])));
        g["Primary"]    <= cho(seq(g["Identifier"], npd(g["LEFTARROW"])),
                               seq(g["OPEN"], g["Expression"], g["CLOSE"]),
-                               seq(g["CAPTUREOPEN"], g["Expression"], g["CAPTURECLOSE"]),
+                               seq(g["Begin"], g["Expression"], g["End"]),
                               seq(g["BeginCap"], g["Expression"], g["EndCap"]),
                               g["Literal"], g["Class"], g["DOT"]);
        g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
@ -1002,13 +1028,10 @@ private:
        g["IdentStart"] <= cls("a-zA-Z_");
        g["IdentRest"]  <= cho(g["IdentStart"], cls("0-9"));
-        g["Literal"]    <= cho(seq(cls("'"), g["SQCont"], cls("'"), g["Spacing"]),
+        g["Literal"]    <= cho(seq(cls("'"), anc(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
-                               seq(cls("\""), g["DQCont"], cls("\""), g["Spacing"]));
+                               seq(cls("\""), anc(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
        g["SQCont"]     <= zom(seq(npd(cls("'")), g["Char"]));
        g["DQCont"]     <= zom(seq(npd(cls("\"")), g["Char"]));
-        g["Class"]      <= seq(chr('['), g["ClassCont"], chr(']'), g["Spacing"]);
+        g["Class"] <= seq(chr('['), anc(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
        g["ClassCont"]  <= zom(seq(npd(chr(']')), g["Range"]));
        g["Range"]      <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
        g["Char"]       <= cho(seq(chr('\\'), cls("nrt'\"[]\\")),
@ -1033,8 +1056,11 @@ private:
        g["EndOfLine"]  <= cho(lit("\r\n"), chr('\n'), chr('\r'));
        g["EndOfFile"]  <= npd(dot());
-        g["CAPTUREOPEN"]  <= seq(chr('<'), g["Spacing"]);
+        g["Begin"]      <= seq(chr('<'), g["Spacing"]);
-        g["CAPTURECLOSE"] <= seq(chr('>'), g["Spacing"]);
+        g["End"]        <= seq(chr('>'), g["Spacing"]);
        g["BeginCap"]   <= seq(lit("$<"), g["Spacing"]);
        g["EndCap"]     <= seq(lit(">"), g["Spacing"]);
        // Set definition names
        for (auto& x: g) {
@ -1130,36 +1156,27 @@ private:
            [&](const std::vector<any>& v) {
                return v[1];
            },
            // Anchor
            [&](const std::vector<any>& v) {
                auto ope = v[1].get<std::shared_ptr<Ope>>();
                return anc(ope);
            },
            // Capture
-            [&](const char* s, size_t l, const std::vector<any>& v, any& c) {
+            [&](const std::vector<any>& v, any& c) {
                Context& cxt = *c.get<Context*>();
                auto ope = v[1].get<std::shared_ptr<Ope>>();
-                return seq(
+                return cap(ope, cxt.match_action, ++cxt.capture_count);
                    ref(*cxt.grammar, "%ANCHOR%"),
                    cap(ope, cxt.match_action, ++cxt.capture_count),
                    ref(*cxt.grammar, "%ANCHOR%"));
            }
        };
        g["IdentCont"] = [](const char* s, size_t l) {
            return std::string(s, l);
        };
-
+        g["Literal"] = [this](const char* s, size_t l) {
-        g["Literal"] = [](const std::vector<any>& v) {
+            return lit(resolve_escape_sequence(s, l));
            return lit(v[0].get<std::string>());
        };
-        g["SQCont"] = [this](const char* s, size_t l) {
+        g["Class"] = [this](const char* s, size_t l) {
-            return resolve_escape_sequence(s, l);
+            return cls(resolve_escape_sequence(s, l));
        };
        g["DQCont"] = [this](const char* s, size_t l) {
            return resolve_escape_sequence(s, l);
        };
        g["Class"] = [](const std::vector<any>& v) {
            return cls(v[0].get<std::string>());
        };
        g["ClassCont"] = [this](const char* s, size_t l) {
            return resolve_escape_sequence(s, l);
        };
        g["AND"]      = [](const char* s, size_t l) { return *s; };
@ -1168,16 +1185,14 @@ private:
        g["STAR"]     = [](const char* s, size_t l) { return *s; };
        g["PLUS"]     = [](const char* s, size_t l) { return *s; };
-        g["DOT"] = []() {
+        g["DOT"] = []() { return dot(); };
            return dot();
        };
    }
    std::shared_ptr<Grammar> perform_core(const char* s, size_t l, std::string& start, MatchAction ma, Log log) {
        Context cxt;
        cxt.match_action = ma;
-        Values v;
+        SemanticValues v;
        any c = &cxt;
        auto r = g["Grammar"].parse(s, l, v, c);
@ -1205,9 +1220,6 @@ private:
        start = cxt.start;
        grammar["%ANCHOR%"] <= anc();
        grammar["%ANCHOR%"] = [](const char* s, size_t l) { return s; };
        return cxt.grammar;
    }
@ -1345,7 +1357,7 @@ public:
                }
            } else if (exact && r.len != l) {
                auto line = line_info(s, s + r.len);
-                log(line.first, line.second, "garbage string at the end");
+                log(line.first, line.second, "syntax error");
            }
            return r.ret && (!exact || r.len == l);
        }
--- a/test/test.cc
+++ b/test/test.cc
@ -39,7 +39,7 @@ TEST_CASE("String capture test with match", "[general]")
 {
    peglib::match m;
    auto ret = peglib::peg_match(
-        "  ROOT      <-  _ ('[' < TAG_NAME > ']' _)*  "
+        "  ROOT      <-  _ ('[' $< TAG_NAME > ']' _)*  "
        "  TAG_NAME  <-  (!']' .)+                "
        "  _         <-  [ \t]*                   ",
        " [tag1] [tag:2] [tag-3] ",
@ -74,6 +74,31 @@ TEST_CASE("String capture test2", "[general]")
    REQUIRE(tags[2] == "tag-3");
 }
 TEST_CASE("String capture test3", "[general]")
 {
   auto syntax = 
       " ROOT  <- _ TOKEN*                "
       " TOKEN <- '[' < (!']' .)+ > ']' _ "
       " _     <- [ \t\r\n]*              "
       ;
   peg pg(syntax);
   std::vector<std::string> tags;
   pg["TOKEN"] = [&](const char* s, size_t l, const vector<any>& v) {
      tags.push_back(std::string(s, l));
   };
   auto ret = pg.parse(" [tag1] [tag:2] [tag-3] ");
   REQUIRE(ret == true);
   REQUIRE(tags.size() == 3);
   REQUIRE(tags[0] == "tag1");
   REQUIRE(tags[1] == "tag:2");
   REQUIRE(tags[2] == "tag-3");
 }
 TEST_CASE("String capture test with embedded match action", "[general]")
 {
    rule ROOT, TAG, TAG_NAME, WS;