Cut operator support

This commit is contained in:
yhirose 2021-01-25 22:46:28 -05:00
parent 4895bea99b
commit d2a2eb3cf6
5 changed files with 110 additions and 34 deletions

View File

@ -25,10 +25,11 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau
* `$name<` ... `>` (Named capture operator) * `$name<` ... `>` (Named capture operator)
* `$name` (Backreference operator) * `$name` (Backreference operator)
* `|` (Dictionary operator) * `|` (Dictionary operator)
* `↑` (Cut operator)
* `MACRO_NAME(` ... `)` (Parameterized rule or Macro) * `MACRO_NAME(` ... `)` (Parameterized rule or Macro)
* `{ precedence L - + L / * }` (Parsing infix expression) * `{ precedence L - + L / * }` (Parsing infix expression)
* `%recovery(` ... `)` (Error recovery operator) * `%recovery(` ... `)` (Error recovery operator)
* `exp^label` (Syntax sugar for `(exp / %recover(label))`) * `explabel` (Syntax sugar for `(exp / %recover(label))`)
* `label { message "..." }` (Error message instruction) * `label { message "..." }` (Error message instruction)
This library supports the linear-time parsing known as the [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing. This library supports the linear-time parsing known as the [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing.
@ -324,6 +325,18 @@ START <- 'This month is ' MONTH '.'
MONTH <- 'Jan' | 'January' | 'Feb' | 'February' | '...' MONTH <- 'Jan' | 'January' | 'Feb' | 'February' | '...'
``` ```
Cut operator
------------
`↑` operator could mitigate backtrack performance problem, but has a risk to change the meaning of grammar.
```peg
S <- '(' P ')' / '"' P '"' / P
P <- 'a' / 'b' / 'c'
```
When we parse `(z` with the above grammar, we don't have to backtrack in `S` after `(` is matched because a cut operator is inserted there.
Parameterized Rule or Macro Parameterized Rule or Macro
--------------------------- ---------------------------

File diff suppressed because one or more lines are too long

Binary file not shown.

123
peglib.h
View File

@ -767,6 +767,8 @@ public:
std::vector<std::map<std::string_view, std::string>> capture_scope_stack; std::vector<std::map<std::string_view, std::string>> capture_scope_stack;
size_t capture_scope_stack_size = 0; size_t capture_scope_stack_size = 0;
std::vector<bool> cut_stack;
const size_t def_count; const size_t def_count;
const bool enablePackratParsing; const bool enablePackratParsing;
std::vector<bool> cache_registered; std::vector<bool> cache_registered;
@ -971,24 +973,33 @@ public:
class PrioritizedChoice : public Ope { class PrioritizedChoice : public Ope {
public: public:
template <typename... Args> template <typename... Args>
PrioritizedChoice(const Args &... args) PrioritizedChoice(bool for_label, const Args &... args)
: opes_{static_cast<std::shared_ptr<Ope>>(args)...} {} : opes_{static_cast<std::shared_ptr<Ope>>(args)...},
for_label_(for_label) {}
PrioritizedChoice(const std::vector<std::shared_ptr<Ope>> &opes) PrioritizedChoice(const std::vector<std::shared_ptr<Ope>> &opes)
: opes_(opes) {} : opes_(opes) {}
PrioritizedChoice(std::vector<std::shared_ptr<Ope>> &&opes) : opes_(opes) {} PrioritizedChoice(std::vector<std::shared_ptr<Ope>> &&opes) : opes_(opes) {}
size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c,
std::any &dt) const override { std::any &dt) const override {
size_t len = static_cast<size_t>(-1);
if (!for_label_) { c.cut_stack.push_back(false); }
size_t id = 0; size_t id = 0;
for (const auto &ope : opes_) { for (const auto &ope : opes_) {
if (!c.cut_stack.empty()) { c.cut_stack.back() = false; }
auto &chldsv = c.push(); auto &chldsv = c.push();
c.push_capture_scope(); c.push_capture_scope();
auto se = scope_exit([&]() { auto se = scope_exit([&]() {
c.pop(); c.pop();
c.pop_capture_scope(); c.pop_capture_scope();
}); });
auto len = ope->parse(s, n, chldsv, c, dt); len = ope->parse(s, n, chldsv, c, dt);
if (success(len)) { if (success(len)) {
if (!chldsv.empty()) { if (!chldsv.empty()) {
for (size_t i = 0; i < chldsv.size(); i++) { for (size_t i = 0; i < chldsv.size(); i++) {
@ -1008,14 +1019,18 @@ public:
vs.tokens.emplace_back(std::move(chldsv.tokens[i])); vs.tokens.emplace_back(std::move(chldsv.tokens[i]));
} }
} }
c.shift_capture_values(); c.shift_capture_values();
return len; break;
} else if (!c.cut_stack.empty() && c.cut_stack.back()) {
break;
} }
id++; id++;
} }
return static_cast<size_t>(-1);
if (!for_label_) { c.cut_stack.pop_back(); }
return len;
} }
void accept(Visitor &v) override; void accept(Visitor &v) override;
@ -1023,6 +1038,7 @@ public:
size_t size() const { return opes_.size(); } size_t size() const { return opes_.size(); }
std::vector<std::shared_ptr<Ope>> opes_; std::vector<std::shared_ptr<Ope>> opes_;
bool for_label_ = false;
}; };
class Repetition : public Ope { class Repetition : public Ope {
@ -1501,6 +1517,17 @@ public:
std::shared_ptr<Ope> ope_; std::shared_ptr<Ope> ope_;
}; };
class Cut : public Ope, public std::enable_shared_from_this<Cut> {
public:
size_t parse_core(const char * /*s*/, size_t /*n*/, SemanticValues & /*vs*/,
Context &c, std::any & /*dt*/) const override {
c.cut_stack.back() = true;
return 0;
}
void accept(Visitor &v) override;
};
/* /*
* Factories * Factories
*/ */
@ -1510,7 +1537,12 @@ template <typename... Args> std::shared_ptr<Ope> seq(Args &&... args) {
template <typename... Args> std::shared_ptr<Ope> cho(Args &&... args) { template <typename... Args> std::shared_ptr<Ope> cho(Args &&... args) {
return std::make_shared<PrioritizedChoice>( return std::make_shared<PrioritizedChoice>(
static_cast<std::shared_ptr<Ope>>(args)...); false, static_cast<std::shared_ptr<Ope>>(args)...);
}
template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&... args) {
return std::make_shared<PrioritizedChoice>(
true, static_cast<std::shared_ptr<Ope>>(args)...);
} }
inline std::shared_ptr<Ope> zom(const std::shared_ptr<Ope> &ope) { inline std::shared_ptr<Ope> zom(const std::shared_ptr<Ope> &ope) {
@ -1623,6 +1655,8 @@ inline std::shared_ptr<Ope> rec(const std::shared_ptr<Ope> &ope) {
return std::make_shared<Recovery>(ope); return std::make_shared<Recovery>(ope);
} }
inline std::shared_ptr<Ope> cut() { return std::make_shared<Cut>(); }
/* /*
* Visitor * Visitor
*/ */
@ -1650,6 +1684,7 @@ struct Ope::Visitor {
virtual void visit(BackReference &) {} virtual void visit(BackReference &) {}
virtual void visit(PrecedenceClimbing &) {} virtual void visit(PrecedenceClimbing &) {}
virtual void visit(Recovery &) {} virtual void visit(Recovery &) {}
virtual void visit(Cut &) {}
}; };
struct IsReference : public Ope::Visitor { struct IsReference : public Ope::Visitor {
@ -1688,6 +1723,7 @@ struct TraceOpeName : public Ope::Visitor {
void visit(BackReference &) override { name_ = "BackReference"; } void visit(BackReference &) override { name_ = "BackReference"; }
void visit(PrecedenceClimbing &) override { name_ = "PrecedenceClimbing"; } void visit(PrecedenceClimbing &) override { name_ = "PrecedenceClimbing"; }
void visit(Recovery &) override { name_ = "Recovery"; } void visit(Recovery &) override { name_ = "Recovery"; }
void visit(Cut &) override { name_ = "Cut"; }
static std::string get(Ope &ope) { static std::string get(Ope &ope) {
TraceOpeName vis; TraceOpeName vis;
@ -1853,6 +1889,7 @@ struct DetectLeftRecursion : public Ope::Visitor {
void visit(BackReference &) override { done_ = true; } void visit(BackReference &) override { done_ = true; }
void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); }
void visit(Recovery &ope) override { ope.ope_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); }
void visit(Cut &) override { done_ = true; }
const char *error_s = nullptr; const char *error_s = nullptr;
@ -2119,6 +2156,7 @@ struct FindReference : public Ope::Visitor {
ope.ope_->accept(*this); ope.ope_->accept(*this);
found_ope = rec(found_ope); found_ope = rec(found_ope);
} }
void visit(Cut &ope) override { found_ope = ope.shared_from_this(); }
std::shared_ptr<Ope> found_ope; std::shared_ptr<Ope> found_ope;
@ -2516,10 +2554,12 @@ inline size_t Holder::parse_core(const char *s, size_t n, SemanticValues &vs,
try { try {
a_val = reduce(chldsv, dt); a_val = reduce(chldsv, dt);
} catch (const parse_error &e) { } catch (const parse_error &e) {
if (e.what()) { if (c.log) {
if (c.error_info.message_pos < s) { if (e.what()) {
c.error_info.message_pos = s; if (c.error_info.message_pos < s) {
c.error_info.message = e.what(); c.error_info.message_pos = s;
c.error_info.message = e.what();
}
} }
} }
len = static_cast<size_t>(-1); len = static_cast<size_t>(-1);
@ -2696,31 +2736,49 @@ inline size_t PrecedenceClimbing::parse_expression(const char *s, size_t n,
inline size_t Recovery::parse_core(const char *s, size_t n, inline size_t Recovery::parse_core(const char *s, size_t n,
SemanticValues & /*vs*/, Context &c, SemanticValues & /*vs*/, Context &c,
std::any & /*dt*/) const { std::any & /*dt*/) const {
auto save_log = c.log;
c.log = nullptr;
const auto &rule = dynamic_cast<Reference &>(*ope_); const auto &rule = dynamic_cast<Reference &>(*ope_);
SemanticValues dummy_vs; // Custom error message
std::any dummy_dt; if (c.log) {
auto len = rule.parse(s, n, dummy_vs, c, dummy_dt); auto label = dynamic_cast<Reference *>(rule.args_[0].get());
if (label) {
if (!label->rule_->error_message.empty()) {
c.error_info.message_pos = c.error_info.error_pos;
c.error_info.message = label->rule_->error_message;
}
}
}
c.log = save_log; // Recovery
size_t len = static_cast<size_t>(-1);
{
auto save_log = c.log;
c.log = nullptr;
auto se = scope_exit([&]() { c.log = save_log; });
SemanticValues dummy_vs;
std::any dummy_dt;
len = rule.parse(s, n, dummy_vs, c, dummy_dt);
}
if (success(len)) { if (success(len)) {
c.recovered = true; c.recovered = true;
if (c.log) { if (c.log) {
auto label = dynamic_cast<Reference *>(rule.args_[0].get());
if (label) {
if (!label->rule_->error_message.empty()) {
c.error_info.message_pos = c.error_info.error_pos;
c.error_info.message = label->rule_->error_message;
}
}
c.error_info.output_log(c.log, c.s, c.l); c.error_info.output_log(c.log, c.s, c.l);
c.error_info.clear();
}
}
// Cut
if (!c.cut_stack.empty()) {
c.cut_stack.back() = true;
if (c.cut_stack.size() == 1) {
// TODO: Remove unneeded entries in packrat memoise table
} }
} }
c.error_info.clear();
return len; return len;
} }
@ -2747,6 +2805,7 @@ inline void Whitespace::accept(Visitor &v) { v.visit(*this); }
inline void BackReference::accept(Visitor &v) { v.visit(*this); } inline void BackReference::accept(Visitor &v) { v.visit(*this); }
inline void PrecedenceClimbing::accept(Visitor &v) { v.visit(*this); } inline void PrecedenceClimbing::accept(Visitor &v) { v.visit(*this); }
inline void Recovery::accept(Visitor &v) { v.visit(*this); } inline void Recovery::accept(Visitor &v) { v.visit(*this); }
inline void Cut::accept(Visitor &v) { v.visit(*this); }
inline void AssignIDToDefinition::visit(Holder &ope) { inline void AssignIDToDefinition::visit(Holder &ope) {
auto p = static_cast<void *>(ope.outer_); auto p = static_cast<void *>(ope.outer_);
@ -2947,10 +3006,10 @@ private:
seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["Expression"], seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["Expression"],
opt(g["Instruction"]))); opt(g["Instruction"])));
g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"]))); g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"])));
g["Sequence"] <= zom(g["Prefix"]); g["Sequence"] <= zom(cho(g["CUT"], g["Prefix"]));
g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]); g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]);
g["SuffixWithLabel"] <= g["SuffixWithLabel"] <=
seq(g["Suffix"], opt(seq(g["HAT"], g["Identifier"]))); seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"])));
g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"]));
g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]);
g["Primary"] <= g["Primary"] <=
@ -3023,7 +3082,6 @@ private:
~g["PIPE"] <= seq(chr('|'), g["Spacing"]); ~g["PIPE"] <= seq(chr('|'), g["Spacing"]);
g["AND"] <= seq(chr('&'), g["Spacing"]); g["AND"] <= seq(chr('&'), g["Spacing"]);
g["NOT"] <= seq(chr('!'), g["Spacing"]); g["NOT"] <= seq(chr('!'), g["Spacing"]);
~g["HAT"] <= seq(chr('^'), g["Spacing"]);
g["QUESTION"] <= seq(chr('?'), g["Spacing"]); g["QUESTION"] <= seq(chr('?'), g["Spacing"]);
g["STAR"] <= seq(chr('*'), g["Spacing"]); g["STAR"] <= seq(chr('*'), g["Spacing"]);
g["PLUS"] <= seq(chr('+'), g["Spacing"]); g["PLUS"] <= seq(chr('+'), g["Spacing"]);
@ -3031,6 +3089,9 @@ private:
~g["CLOSE"] <= seq(chr(')'), g["Spacing"]); ~g["CLOSE"] <= seq(chr(')'), g["Spacing"]);
g["DOT"] <= seq(chr('.'), g["Spacing"]); g["DOT"] <= seq(chr('.'), g["Spacing"]);
g["CUT"] <= seq(lit(u8""), g["Spacing"]);
~g["LABEL"] <= seq(cho(chr('^'), lit(u8"")), g["Spacing"]);
~g["Spacing"] <= zom(cho(g["Space"], g["Comment"])); ~g["Spacing"] <= zom(cho(g["Space"], g["Comment"]));
g["Comment"] <= g["Comment"] <=
seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]); seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]);
@ -3193,7 +3254,7 @@ private:
auto label = ref(*data.grammar, ident, vs.sv().data(), false, {}); auto label = ref(*data.grammar, ident, vs.sv().data(), false, {});
auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME, auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME,
vs.sv().data(), true, {label})); vs.sv().data(), true, {label}));
return cho(ope, recovery); return cho4label_(ope, recovery);
} }
}; };
@ -3363,6 +3424,8 @@ private:
g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); }; g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); };
g["CUT"] = [](const SemanticValues & /*vs*/) { return cut(); };
g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); }; g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); };
g["BackRef"] = [&](const SemanticValues &vs) { g["BackRef"] = [&](const SemanticValues &vs) {

View File

@ -1239,7 +1239,7 @@ R"(+ START
TEST_CASE("Error recovery 2", "[error]") { TEST_CASE("Error recovery 2", "[error]") {
parser pg(R"( parser pg(R"(
START <- ENTRY ((',' ENTRY) / %recover((!(',' / Space) .)+))* (_ / %recover((!'!.' .)+)) START <- ENTRY ((',' ENTRY) / %recover((!(',' / Space) .)+))* (_ / %recover(.*))
ENTRY <- '[' ITEM (',' ITEM)* ']' ENTRY <- '[' ITEM (',' ITEM)* ']'
ITEM <- WORD / NUM / %recover((!(',' / ']') .)+) ITEM <- WORD / NUM / %recover((!(',' / ']') .)+)
NUM <- [0-9]+ ![a-z] NUM <- [0-9]+ ![a-z]
@ -1259,7 +1259,7 @@ TEST_CASE("Error recovery 2", "[error]") {
R"(1:38: syntax error, unexpected 'ddd', expecting <NUM>.)", R"(1:38: syntax error, unexpected 'ddd', expecting <NUM>.)",
R"(1:55: syntax error, unexpected ']', expecting <WORD>.)", R"(1:55: syntax error, unexpected ']', expecting <WORD>.)",
R"(1:58: syntax error, unexpected '\n', expecting <NUM>.)", R"(1:58: syntax error, unexpected '\n', expecting <NUM>.)",
R"(1:56: syntax error, unexpected ',', expecting <Space>.)", R"(2:3: syntax error.)",
}; };
size_t i = 0; size_t i = 0;