From f0d2c529ba7ec75f89f86ec3fa9d3c4becc63672 Mon Sep 17 00:00:00 2001 From: yhirose Date: Thu, 12 Jul 2018 19:06:48 +0200 Subject: [PATCH] Back reference support --- README.md | 5 ++- peglib.h | 120 ++++++++++++++++++++++++++++++++------------------- test/test.cc | 108 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 170 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 0c0786f..8a4233e 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,13 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau * `<` ... `>` (Token boundary operator) * `~` (Ignore operator) * `\x20` (Hex number char) - * `$<` ... `>` (Capture operator) * `$name<` ... `>` (Named capture operator) + * `$name` (Backreference operator) * `%whitespace` (Automatic whitespace skipping) * `%word` (Word expression) +This library also supports the linear-time parsing known as the [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing. + If you need a Go language version, please see [*go-peg*](https://github.com/yhirose/go-peg). How to use @@ -322,6 +324,7 @@ The following are available operators: | tok | Token boundary | | ign | Ignore semantic value | | cap | Capture character | +| bkr | Back reference | Unicode support --------------- diff --git a/peglib.h b/peglib.h index 47c37cb..b386679 100644 --- a/peglib.h +++ b/peglib.h @@ -464,11 +464,6 @@ private: std::string s_; }; -/* - * Match action - */ -typedef std::function MatchAction; - /* * Result */ @@ -483,8 +478,8 @@ inline bool fail(size_t len) { /* * Context */ -class Ope; class Context; +class Ope; class Definition; typedef std::function Tracer; @@ -512,6 +507,8 @@ public: std::shared_ptr wordOpe; + std::unordered_map captures; + const size_t def_count; const bool enablePackratParsing; std::vector cache_registered; @@ -989,14 +986,16 @@ public: class Capture : public Ope { public: - Capture(const std::shared_ptr& ope, MatchAction ma, size_t id, const std::string& name) - : ope_(ope), match_action_(ma), id_(id), name_(name) {} + typedef std::function MatchAction; + + Capture(const std::shared_ptr& ope, MatchAction ma) + : ope_(ope), match_action_(ma) {} size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override { const auto& rule = *ope_; auto len = rule.parse(s, n, sv, c, dt); if (success(len) && match_action_) { - match_action_(s, len, id_, name_); + match_action_(s, len, c); } return len; } @@ -1006,9 +1005,7 @@ public: std::shared_ptr ope_; private: - MatchAction match_action_; - size_t id_; - std::string name_; + MatchAction match_action_; }; class TokenBoundary : public Ope @@ -1123,6 +1120,18 @@ public: std::shared_ptr ope_; }; +class BackReference : public Ope +{ +public: + BackReference(const std::string& name) : name_(name) {} + + size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override; + + void accept(Visitor& v) override; + + std::string name_; +}; + /* * Visitor */ @@ -1147,6 +1156,7 @@ struct Ope::Visitor virtual void visit(Holder& /*ope*/) {} virtual void visit(DefinitionReference& /*ope*/) {} virtual void visit(Whitespace& /*ope*/) {} + virtual void visit(BackReference& /*ope*/) {} }; struct AssignIDToDefinition : public Ope::Visitor @@ -1403,31 +1413,31 @@ private: * Implementations */ -inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { - c.trace("LiteralString", s, n, sv, dt); - +inline size_t parse_literal(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt, + const std::string& lit, bool& init_is_word, bool& is_word) +{ size_t i = 0; - for (; i < lit_.size(); i++) { - if (i >= n || s[i] != lit_[i]) { + for (; i < lit.size(); i++) { + if (i >= n || s[i] != lit[i]) { c.set_error_pos(s); return static_cast(-1); } } // Word check - static Context dummy_c(nullptr, lit_.data(), lit_.size(), 0, nullptr, nullptr, false, nullptr); + static Context dummy_c(nullptr, lit.data(), lit.size(), 0, nullptr, nullptr, false, nullptr); static SemanticValues dummy_sv; static any dummy_dt; - if (!init_is_word_) { // TODO: Protect with mutex + if (!init_is_word) { // TODO: Protect with mutex if (c.wordOpe) { - auto len = c.wordOpe->parse(lit_.data(), lit_.size(), dummy_sv, dummy_c, dummy_dt); - is_word_ = success(len); + auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_sv, dummy_c, dummy_dt); + is_word = success(len); } - init_is_word_ = true; + init_is_word = true; } - if (is_word_) { + if (is_word) { auto ope = std::make_shared(c.wordOpe); auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt); if (fail(len)) { @@ -1450,6 +1460,11 @@ inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, return i; } +inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { + c.trace("LiteralString", s, n, sv, dt); + return parse_literal(s, n, sv, c, dt, lit_, init_is_word_, is_word_); +} + inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { c.in_token = true; auto se = make_scope_exit([&]() { c.in_token = false; }); @@ -1560,6 +1575,17 @@ inline std::shared_ptr DefinitionReference::get_rule() const { return rule_; } +inline size_t BackReference::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { + c.trace("BackReference", s, n, sv, dt); + if (c.captures.find(name_) == c.captures.end()) { + throw std::runtime_error("Invalid back reference..."); + } + const auto& lit = c.captures[name_]; + bool init_is_word = false; + bool is_word = false; + return parse_literal(s, n, sv, c, dt, lit, init_is_word, is_word); +} + inline void Sequence::accept(Visitor& v) { v.visit(*this); } inline void PrioritizedChoice::accept(Visitor& v) { v.visit(*this); } inline void ZeroOrMore::accept(Visitor& v) { v.visit(*this); } @@ -1578,6 +1604,7 @@ inline void WeakHolder::accept(Visitor& v) { v.visit(*this); } inline void Holder::accept(Visitor& v) { v.visit(*this); } inline void DefinitionReference::accept(Visitor& v) { v.visit(*this); } inline void Whitespace::accept(Visitor& v) { v.visit(*this); } +inline void BackReference::accept(Visitor& v) { v.visit(*this); } inline void AssignIDToDefinition::visit(Holder& ope) { auto p = static_cast(ope.outer_); @@ -1639,8 +1666,8 @@ inline std::shared_ptr dot() { return std::make_shared(); } -inline std::shared_ptr cap(const std::shared_ptr& ope, MatchAction ma, size_t n, const std::string& s) { - return std::make_shared(ope, ma, n, s); +inline std::shared_ptr cap(const std::shared_ptr& ope, Capture::MatchAction ma) { + return std::make_shared(ope, ma); } inline std::shared_ptr tok(const std::shared_ptr& ope) { @@ -1659,6 +1686,10 @@ inline std::shared_ptr wsp(const std::shared_ptr& ope) { return std::make_shared(std::make_shared(ope)); } +inline std::shared_ptr bkr(const std::string& name) { + return std::make_shared(name); +} + /*----------------------------------------------------------------------------- * PEG parser generator *---------------------------------------------------------------------------*/ @@ -1673,10 +1704,9 @@ public: const char* s, size_t n, std::string& start, - MatchAction ma, Log log) { - return get_instance().perform_core(s, n, start, ma, log); + return get_instance().perform_core(s, n, start, log); } // For debuging purpose @@ -1698,15 +1728,10 @@ private: struct Data { std::shared_ptr grammar; std::string start; - MatchAction match_action; std::vector> duplicates; std::unordered_map references; - size_t capture_count; - Data() - : grammar(std::make_shared()) - , capture_count(0) - {} + Data(): grammar(std::make_shared()) {} }; struct DetectLeftRecursion : public Ope::Visitor { @@ -1793,6 +1818,9 @@ private: } done_ = true; } + void visit(BackReference& /*ope*/) override { + done_ = true; + } const char* s_; @@ -1815,7 +1843,7 @@ private: seq(g["OPEN"], g["Expression"], g["CLOSE"]), seq(g["BeginTok"], g["Expression"], g["EndTok"]), seq(g["BeginCap"], g["Expression"], g["EndCap"]), - g["Literal"], g["Class"], g["DOT"]); + g["BackRef"], g["Literal"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); @@ -1858,8 +1886,10 @@ private: g["BeginTok"] <= seq(chr('<'), g["Spacing"]); g["EndTok"] <= seq(chr('>'), g["Spacing"]); - g["BeginCap"] <= seq(chr('$'), tok(opt(g["Identifier"])), chr('<'), g["Spacing"]); - g["EndCap"] <= seq(lit(">"), g["Spacing"]); + g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]); + g["EndCap"] <= seq(chr('>'), g["Spacing"]); + + g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]); g["IGNORE"] <= chr('~'); @@ -1954,7 +1984,7 @@ private: } }; - g["Primary"] = [&](const SemanticValues& sv, any& dt) -> std::shared_ptr { + g["Primary"] = [&](const SemanticValues& sv, any& dt) { Data& data = *dt.get(); switch (sv.choice()) { @@ -1983,7 +2013,9 @@ private: case 3: { // Capture const auto& name = sv[0].get(); auto ope = sv[1].get>(); - return cap(ope, data.match_action, ++data.capture_count, name); + return cap(ope, [name](const char* a_s, size_t a_n, Context& c) { + c.captures[name] = std::string(a_s, a_n); + }); } default: { return sv[0].get>(); @@ -2013,18 +2045,19 @@ private: g["DOT"] = [](const SemanticValues& /*sv*/) { return dot(); }; g["BeginCap"] = [](const SemanticValues& sv) { return sv.token(); }; + + g["BackRef"] = [&](const SemanticValues& sv) { + return bkr(sv.token()); + }; } std::shared_ptr perform_core( const char* s, size_t n, std::string& start, - MatchAction ma, Log log) { Data data; - data.match_action = ma; - any dt = &data; auto r = g["Grammar"].parse(s, n, dt); @@ -2373,7 +2406,7 @@ public: } bool load_grammar(const char* s, size_t n) { - grammar_ = ParserGenerator::parse(s, n, start_, match_action, log); + grammar_ = ParserGenerator::parse(s, n, start_, log); return grammar_ != nullptr; } @@ -2516,8 +2549,7 @@ public: } } - MatchAction match_action; - Log log; + Log log; private: void output_log(const char* s, size_t n, const Definition::Result& r) const { diff --git a/test/test.cc b/test/test.cc index af39258..d8bc2ed 100644 --- a/test/test.cc +++ b/test/test.cc @@ -83,27 +83,27 @@ TEST_CASE("String capture test2", "[general]") TEST_CASE("String capture test3", "[general]") { - auto syntax = - " ROOT <- _ TOKEN* " - " TOKEN <- '[' < (!']' .)+ > ']' _ " - " _ <- [ \t\r\n]* " - ; + auto syntax = + " ROOT <- _ TOKEN* " + " TOKEN <- '[' < (!']' .)+ > ']' _ " + " _ <- [ \t\r\n]* " + ; - parser pg(syntax); + parser pg(syntax); - std::vector tags; + std::vector tags; - pg["TOKEN"] = [&](const SemanticValues& sv) { - tags.push_back(sv.token()); - }; + pg["TOKEN"] = [&](const SemanticValues& sv) { + tags.push_back(sv.token()); + }; - auto ret = pg.parse(" [tag1] [tag:2] [tag-3] "); + auto ret = pg.parse(" [tag1] [tag:2] [tag-3] "); - REQUIRE(ret == true); - REQUIRE(tags.size() == 3); - REQUIRE(tags[0] == "tag1"); - REQUIRE(tags[1] == "tag:2"); - REQUIRE(tags[2] == "tag-3"); + REQUIRE(ret == true); + REQUIRE(tags.size() == 3); + REQUIRE(tags[0] == "tag1"); + REQUIRE(tags[1] == "tag:2"); + REQUIRE(tags[2] == "tag-3"); } TEST_CASE("Cyclic grammer test", "[general]") @@ -455,7 +455,7 @@ TEST_CASE("Calculator test2", "[general]") ; string start; - auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr, nullptr); + auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr); auto& g = *grammar; // Setup actions @@ -649,7 +649,7 @@ TEST_CASE("Literal token on AST test1", "[general]") TEST_CASE("Literal token on AST test2", "[general]") { parser parser(R"( - STRING_LITERAL <- '"' (ESC / CHAR)* '"' + STRING_LITERAL <- '"' (ESC / CHAR)* '"' ESC <- ('\\"' / '\\t' / '\\n') CHAR <- (!["] .) )"); @@ -701,6 +701,78 @@ TEST_CASE("Definition duplicates test", "[general]") REQUIRE(!parser); } +TEST_CASE("Back reference test", "[back reference]") +{ + parser parser(R"( + START <- _ LQUOTE < (!RQUOTE .)* > RQUOTE _ + LQUOTE <- 'R"' $delm< [a-zA-Z]* > '(' + RQUOTE <- ')' $delm '"' + ~_ <- [ \t\r\n]* + )"); + + std::string token; + parser["START"] = [&](const SemanticValues& sv) { + token = sv.token(); + }; + + { + token.clear(); + auto ret = parser.parse(R"delm( + R"("hello world")" + )delm"); + + REQUIRE(ret == true); + REQUIRE(token == "\"hello world\""); + } + + { + token.clear(); + auto ret = parser.parse(R"delm( + R"foo("(hello world)")foo" + )delm"); + + REQUIRE(ret == true); + REQUIRE(token == "\"(hello world)\""); + } + + { + token.clear(); + auto ret = parser.parse(R"delm( + R"foo("(hello world)foo")foo" + )delm"); + + REQUIRE(ret == false); + REQUIRE(token == "\"(hello world"); + } + + { + token.clear(); + auto ret = parser.parse(R"delm( + R"foo("(hello world)")bar" + )delm"); + + REQUIRE(ret == false); + REQUIRE(token.empty()); + } +} + +TEST_CASE("Invalid back reference test", "[back reference]") +{ + parser parser(R"( + START <- _ LQUOTE (!RQUOTE .)* RQUOTE _ + LQUOTE <- 'R"' $delm< [a-zA-Z]* > '(' + RQUOTE <- ')' $delm2 '"' + ~_ <- [ \t\r\n]* + )"); + + REQUIRE_THROWS_AS( + parser.parse(R"delm( + R"foo("(hello world)")foo" + )delm"), + std::runtime_error); +} + + TEST_CASE("Left recursive test", "[left recursive]") { parser parser(