diff --git a/example/calc3.cc b/example/calc3.cc index a8b0758..1775de2 100644 --- a/example/calc3.cc +++ b/example/calc3.cc @@ -40,13 +40,15 @@ int main(int argc, const char** argv) }; parser parser(R"( - EXPRESSION <- _ TERM (TERM_OPERATOR TERM)* + EXPRESSION <- TERM (TERM_OPERATOR TERM)* TERM <- FACTOR (FACTOR_OPERATOR FACTOR)* - FACTOR <- NUMBER / '(' _ EXPRESSION ')' _ - TERM_OPERATOR <- < [-+] > _ - FACTOR_OPERATOR <- < [/*] > _ - NUMBER <- < [0-9]+ > _ - ~_ <- [ \t\r\n]* + FACTOR <- NUMBER / '(' EXPRESSION ')' + + TERM_OPERATOR <- [-+] + FACTOR_OPERATOR <- [/*] + NUMBER <- [0-9]+ + + %whitespace <- [ \t\r\n]* )"); parser.enable_ast(); diff --git a/peglib.h b/peglib.h index 6ba7fc7..a27d126 100644 --- a/peglib.h +++ b/peglib.h @@ -374,7 +374,7 @@ private: } template - Fty make_adaptor(F fn, R(*mf)(const SemanticValues& sv)) { + Fty make_adaptor(F fn, R (*mf)(const SemanticValues& sv)) { return TypeAdaptor(fn); } @@ -427,6 +427,9 @@ inline bool fail(size_t len) { /* * Context */ +class Definition; +class Ope; + struct Context { const char* path; @@ -437,27 +440,38 @@ struct Context const char* message_pos; std::string message; // TODO: should be `int`. + std::vector definition_stack; + + std::vector> value_stack; + size_t value_stack_size; + + std::shared_ptr whiteSpaceOpe; + const size_t def_count; const bool enablePackratParsing; std::vector cache_register; std::vector cache_success; - std::vector> stack; - size_t stack_size; - std::map, std::tuple> cache_result; - Context(const char* path, const char* s, size_t l, size_t def_count, bool enablePackratParsing) + Context( + const char* path, + const char* s, + size_t l, + size_t def_count, + std::shared_ptr whiteSpaceOpe, + bool enablePackratParsing) : path(path) , s(s) , l(l) , error_pos(nullptr) , message_pos(nullptr) + , whiteSpaceOpe(whiteSpaceOpe) + , value_stack_size(0) , def_count(def_count) , enablePackratParsing(enablePackratParsing) , cache_register(enablePackratParsing ? def_count * (l + 1) : 0) , cache_success(enablePackratParsing ? def_count * (l + 1) : 0) - , stack_size(0) { } @@ -493,11 +507,11 @@ struct Context } inline SemanticValues& push() { - assert(stack_size <= stack.size()); - if (stack_size == stack.size()) { - stack.emplace_back(std::make_shared()); + assert(value_stack_size <= value_stack.size()); + if (value_stack_size == value_stack.size()) { + value_stack.emplace_back(std::make_shared()); } - auto& sv = *stack[stack_size++]; + auto& sv = *value_stack[value_stack_size++]; if (!sv.empty()) { sv.clear(); } @@ -509,7 +523,7 @@ struct Context } void pop() { - stack_size--; + value_stack_size--; } void set_error_pos(const char* s) { @@ -754,16 +768,7 @@ class LiteralString : public Ope public: LiteralString(const std::string& s) : lit_(s) {} - size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override { - auto i = 0u; - for (; i < lit_.size(); i++) { - if (i >= n || s[i] != lit_[i]) { - c.set_error_pos(s); - return -1; - } - } - return i; - } + size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override; void accept(Visitor& v) override; @@ -936,8 +941,6 @@ public: std::weak_ptr weak_; }; -class Definition; - class Holder : public Ope { public: @@ -1063,6 +1066,8 @@ struct IsToken : public Ope::Visitor bool has_rule; }; +const char* WHITESPACE_DEFINITION_NAME = "%whitespace"; + /* * Definition */ @@ -1096,6 +1101,7 @@ public: Definition(Definition&& rhs) : name(std::move(rhs.name)) , ignoreSemanticValue(rhs.ignoreSemanticValue) + , whiteSpaceOpe(rhs.whiteSpaceOpe) , enablePackratParsing(rhs.enablePackratParsing) , is_token(rhs.is_token) , holder_(std::move(rhs.holder_)) @@ -1200,6 +1206,10 @@ public: holder_->accept(v); } + std::shared_ptr get_core_operator() { + return holder_->ope_; + } + std::string name; size_t id; Action action; @@ -1207,6 +1217,7 @@ public: std::function after; std::function error_message; bool ignoreSemanticValue; + std::shared_ptr whiteSpaceOpe; bool enablePackratParsing; bool is_token; @@ -1220,7 +1231,7 @@ private: AssignIDToDefinition assignId; holder_->accept(assignId); - Context cxt(path, s, n, assignId.ids.size(), enablePackratParsing); + Context cxt(path, s, n, assignId.ids.size(), whiteSpaceOpe, enablePackratParsing); auto len = holder_->parse(s, n, sv, cxt, dt); return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message }; } @@ -1232,6 +1243,28 @@ private: * Implementations */ +inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { + auto i = 0u; + for (; i < lit_.size(); i++) { + if (i >= n || s[i] != lit_[i]) { + c.set_error_pos(s); + return -1; + } + } + + // Skip whiltespace + const auto d = c.definition_stack.back(); + if (!d->is_token && c.whiteSpaceOpe) { + auto len = c.whiteSpaceOpe->parse(s + i, n - i, sv, c, dt); + if (fail(len)) { + return -1; + } + i += len; + } + + return i; +} + inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { if (!ope_) { throw std::logic_error("Uninitialized definition ope was used..."); @@ -1243,13 +1276,23 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context size_t token_boundary_n = n; c.packrat(s, outer_->id, len, val, [&](any& val) { + c.definition_stack.push_back(outer_); + auto& chldsv = c.push(); if (outer_->before) { outer_->before(dt); } - const auto& rule = *ope_; + auto ope = ope_; + + if (outer_->whiteSpaceOpe) { + ope = std::make_shared(outer_->whiteSpaceOpe, ope_); + } else if (outer_->is_token && c.whiteSpaceOpe) { + ope = std::make_shared(std::make_shared(ope_), c.whiteSpaceOpe); + } + + const auto& rule = *ope; len = rule.parse(s, n, chldsv, c, dt); token_boundary_n = len; @@ -1282,6 +1325,7 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context } c.pop(); + c.definition_stack.pop_back(); }); if (success(len)) { @@ -1622,7 +1666,7 @@ private: g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); - g["IdentStart"] <= cls("a-zA-Z_\x80-\xff"); + g["IdentStart"] <= cls("a-zA-Z_\x80-\xff%"); g["IdentRest"] <= cho(g["IdentStart"], cls("0-9")); g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]), @@ -1912,6 +1956,12 @@ private: // Set root definition start = data.start; + // Automatic whitespace skipping + if (grammar.count(WHITESPACE_DEFINITION_NAME)) { + auto& rule = (*data.grammar)[start]; + rule.whiteSpaceOpe = (*data.grammar)[WHITESPACE_DEFINITION_NAME].get_core_operator(); + } + return data.grammar; } @@ -2317,7 +2367,7 @@ public: private: void output_log(const char* s, size_t n, Log log, const Definition::Result& r) const { if (log) { - if (!r.ret) { + if (!r.ret) { if (r.message_pos) { auto line = line_info(s, r.message_pos); log(line.first, line.second, r.message); diff --git a/test/test.cc b/test/test.cc index a8b0124..6669662 100644 --- a/test/test.cc +++ b/test/test.cc @@ -255,6 +255,25 @@ TEST_CASE("before/after handlers test", "[general]") parser.parse("hello=world", dt); } +TEST_CASE("WHITESPACE test", "[general]") +{ + peg::parser parser(R"( + # Rules + ROOT <- ITEM (',' ITEM)* + ITEM <- WORD / PHRASE + + # Tokens + WORD <- [a-zA-Z0-9_]+ + PHRASE <- '"' (!'"' .)* '"' + + %whitespace <- [ \t\r\n]* + )"); + + auto ret = parser.parse(R"( one, "two, three", four )"); + + REQUIRE(ret == true); +} + TEST_CASE("Skip token test", "[general]") { peg::parser parser( @@ -272,6 +291,23 @@ TEST_CASE("Skip token test", "[general]") REQUIRE(ret == true); } +TEST_CASE("Skip token test2", "[general]") +{ + peg::parser parser(R"( + ROOT <- ITEM (',' ITEM)* + ITEM <- ([a-z0-9])+ + %whitespace <- [ \t]* + )"); + + parser["ROOT"] = [&](const SemanticValues& sv) { + REQUIRE(sv.size() == 2); + }; + + auto ret = parser.parse(" item1, item2 "); + + REQUIRE(ret == true); +} + TEST_CASE("Backtracking test", "[general]") { parser parser(