Fixed problems with backreference

This commit is contained in:
yhirose 2018-07-20 22:09:54 -04:00
parent e2da595899
commit 9980eea1cd
3 changed files with 224 additions and 24 deletions

View File

@ -17,6 +17,9 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau
* `$name` (Backreference operator)
* `%whitespace` (Automatic whitespace skipping)
* `%word` (Word expression)
* `$name(` ... `)` (Create capture scope)
* `$name<` ... `>` (Named capture operator)
* `$name` (Backreference operator)
This library also supports linear-time parsing, known as [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing.
@ -323,7 +326,8 @@ The following are available operators:
| dot | Any character |
| tok | Token boundary |
| ign | Ignore semantic value |
| cap | Capture character |
| ncs | New capture scope |
| cap | Capture |
| bkr | Back reference |
Unicode support

133
peglib.h
View File

@ -507,7 +507,7 @@ public:
std::shared_ptr<Ope> wordOpe;
std::unordered_map<std::string, std::string> captures;
std::vector<std::unordered_map<std::string, std::string>> capture_scope_stack;
const size_t def_count;
const bool enablePackratParsing;
@ -544,6 +544,7 @@ public:
, cache_success(enablePackratParsing ? def_count * (l + 1) : 0)
, tracer(a_tracer)
{
capture_scope_stack.resize(1);
}
template <typename T>
@ -598,6 +599,23 @@ public:
value_stack_size--;
}
// Opens a new, empty capture scope on top of the scope stack.
void push_capture_scope() {
    capture_scope_stack.emplace_back();
}
// Discards the innermost capture scope together with every capture stored in it.
// Precondition: at least one scope is on the stack.
void pop_capture_scope() {
    // `size() - 1` on an empty stack would wrap around in unsigned arithmetic,
    // so state the precondition explicitly (same style as shift_capture_values).
    assert(!capture_scope_stack.empty());
    capture_scope_stack.pop_back();
}
// Copies every capture of the innermost scope into the scope directly beneath it,
// overwriting entries that share a name. The innermost scope itself is left untouched.
// Precondition: the stack holds at least two scopes.
void shift_capture_values() {
    assert(capture_scope_stack.size() >= 2);
    const auto top = capture_scope_stack.size() - 1;
    const auto& inner = capture_scope_stack[top];
    auto& outer = capture_scope_stack[top - 1];
    for (auto entry = inner.begin(); entry != inner.end(); ++entry) {
        outer[entry->first] = entry->second;
    }
}
// Remembers `a_s` as the error position when it lies beyond the one recorded so far;
// an earlier or equal position leaves the record unchanged.
void set_error_pos(const char* a_s) {
    if (!(error_pos < a_s)) {
        return;
    }
    error_pos = a_s;
}
@ -693,9 +711,11 @@ public:
for (const auto& ope : opes_) {
c.nest_level++;
auto& chldsv = c.push();
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop();
c.pop_capture_scope();
});
const auto& rule = *ope;
auto len = rule.parse(s, n, chldsv, c, dt);
@ -707,6 +727,8 @@ public:
sv.n_ = chldsv.length();
sv.choice_ = id;
sv.tokens.insert(sv.tokens.end(), chldsv.tokens.begin(), chldsv.tokens.end());
c.shift_capture_values();
return len;
}
id++;
@ -732,12 +754,18 @@ public:
size_t i = 0;
while (n - i > 0) {
c.nest_level++;
auto se = make_scope_exit([&]() { c.nest_level--; });
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop_capture_scope();
});
auto save_sv_size = sv.size();
auto save_tok_size = sv.tokens.size();
const auto& rule = *ope_;
auto len = rule.parse(s + i, n - i, sv, c, dt);
if (fail(len)) {
if (success(len)) {
c.shift_capture_values();
} else {
if (sv.size() != save_sv_size) {
sv.erase(sv.begin() + static_cast<std::ptrdiff_t>(save_sv_size));
}
@ -767,10 +795,16 @@ public:
size_t len = 0;
{
c.nest_level++;
auto se = make_scope_exit([&]() { c.nest_level--; });
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop_capture_scope();
});
const auto& rule = *ope_;
len = rule.parse(s, n, sv, c, dt);
if (fail(len)) {
if (success(len)) {
c.shift_capture_values();
} else {
return static_cast<size_t>(-1);
}
}
@ -778,12 +812,18 @@ public:
auto i = len;
while (n - i > 0) {
c.nest_level++;
auto se = make_scope_exit([&]() { c.nest_level--; });
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop_capture_scope();
});
auto save_sv_size = sv.size();
auto save_tok_size = sv.tokens.size();
const auto& rule = *ope_;
len = rule.parse(s + i, n - i, sv, c, dt);
if (fail(len)) {
if (success(len)) {
c.shift_capture_values();
} else {
if (sv.size() != save_sv_size) {
sv.erase(sv.begin() + static_cast<std::ptrdiff_t>(save_sv_size));
}
@ -814,10 +854,15 @@ public:
c.nest_level++;
auto save_sv_size = sv.size();
auto save_tok_size = sv.tokens.size();
auto se = make_scope_exit([&]() { c.nest_level--; });
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop_capture_scope();
});
const auto& rule = *ope_;
auto len = rule.parse(s, n, sv, c, dt);
if (success(len)) {
c.shift_capture_values();
return len;
} else {
if (sv.size() != save_sv_size) {
@ -845,9 +890,11 @@ public:
c.trace("AndPredicate", s, n, sv, dt);
c.nest_level++;
auto& chldsv = c.push();
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop();
c.pop_capture_scope();
});
const auto& rule = *ope_;
auto len = rule.parse(s, n, chldsv, c, dt);
@ -873,9 +920,11 @@ public:
auto save_error_pos = c.error_pos;
c.nest_level++;
auto& chldsv = c.push();
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.nest_level--;
c.pop();
c.pop_capture_scope();
});
const auto& rule = *ope_;
auto len = rule.parse(s, n, chldsv, c, dt);
@ -983,6 +1032,27 @@ public:
void accept(Visitor& v) override;
};
class NewCaptureScope : public Ope
{
public:
NewCaptureScope(const std::shared_ptr<Ope>& ope)
: ope_(ope) {}
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
c.push_capture_scope();
auto se = make_scope_exit([&]() {
c.pop_capture_scope();
});
const auto& rule = *ope_;
auto len = rule.parse(s, n, sv, c, dt);
return len;
}
void accept(Visitor& v) override;
std::shared_ptr<Ope> ope_;
};
class Capture : public Ope
{
public:
@ -1149,6 +1219,7 @@ struct Ope::Visitor
// Default handlers are empty: a concrete visitor overrides only the node types it needs.
virtual void visit(CharacterClass& /*ope*/) {}
virtual void visit(Character& /*ope*/) {}
virtual void visit(AnyCharacter& /*ope*/) {}
virtual void visit(NewCaptureScope& /*ope*/) {}
virtual void visit(Capture& /*ope*/) {}
virtual void visit(TokenBoundary& /*ope*/) {}
virtual void visit(Ignore& /*ope*/) {}
@ -1178,6 +1249,7 @@ struct AssignIDToDefinition : public Ope::Visitor
// Single-child operators: forward this visitor to the wrapped operator.
void visit(Option& ope) override { ope.ope_->accept(*this); }
void visit(AndPredicate& ope) override { ope.ope_->accept(*this); }
void visit(NotPredicate& ope) override { ope.ope_->accept(*this); }
void visit(NewCaptureScope& ope) override { ope.ope_->accept(*this); }
void visit(Capture& ope) override { ope.ope_->accept(*this); }
void visit(TokenBoundary& ope) override { ope.ope_->accept(*this); }
void visit(Ignore& ope) override { ope.ope_->accept(*this); }
@ -1207,6 +1279,7 @@ struct IsToken : public Ope::Visitor
// Single-child operators forward this visitor to the wrapped operator;
// TokenBoundary instead records that a token boundary was seen and does not descend.
void visit(ZeroOrMore& ope) override { ope.ope_->accept(*this); }
void visit(OneOrMore& ope) override { ope.ope_->accept(*this); }
void visit(Option& ope) override { ope.ope_->accept(*this); }
void visit(NewCaptureScope& ope) override { ope.ope_->accept(*this); }
void visit(Capture& ope) override { ope.ope_->accept(*this); }
void visit(TokenBoundary& /*ope*/) override { has_token_boundary = true; }
void visit(Ignore& ope) override { ope.ope_->accept(*this); }
@ -1491,7 +1564,9 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context
c.trace(outer_->name.c_str(), s, n, sv, dt);
c.nest_level++;
auto se = make_scope_exit([&]() { c.nest_level--; });
auto se = make_scope_exit([&]() {
c.nest_level--;
});
size_t len;
any val;
@ -1577,14 +1652,19 @@ inline std::shared_ptr<Ope> DefinitionReference::get_rule() const {
// Matches the input against the text captured under `name_`.
//
// Capture scopes are searched from the innermost (top of the stack) outward and
// the first scope that holds `name_` supplies the literal to match. The result
// is whatever parse_literal returns for that literal.
//
// Throws std::runtime_error when no scope on the stack holds a capture named `name_`.
inline size_t BackReference::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
    c.trace("BackReference", s, n, sv, dt);

    // Only the scope stack is consulted: captures are stored per scope, so a
    // lookup in a flat capture map would reject names that are in scope.
    for (auto scope = c.capture_scope_stack.rbegin(); scope != c.capture_scope_stack.rend(); ++scope) {
        const auto found = scope->find(name_);  // single lookup instead of find() + at()
        if (found != scope->end()) {
            auto init_is_word = false;
            auto is_word = false;
            return parse_literal(s, n, sv, c, dt, found->second, init_is_word, is_word);
        }
    }

    throw std::runtime_error("Invalid back reference...");
}
// accept() passes the concrete node to the visitor, selecting the matching visit() overload.
inline void Sequence::accept(Visitor& v) { v.visit(*this); }
inline void PrioritizedChoice::accept(Visitor& v) { v.visit(*this); }
@ -1597,6 +1677,7 @@ inline void LiteralString::accept(Visitor& v) { v.visit(*this); }
// accept() passes the concrete node to the visitor, selecting the matching visit() overload.
inline void CharacterClass::accept(Visitor& v) { v.visit(*this); }
inline void Character::accept(Visitor& v) { v.visit(*this); }
inline void AnyCharacter::accept(Visitor& v) { v.visit(*this); }
inline void NewCaptureScope::accept(Visitor& v) { v.visit(*this); }
inline void Capture::accept(Visitor& v) { v.visit(*this); }
inline void TokenBoundary::accept(Visitor& v) { v.visit(*this); }
inline void Ignore::accept(Visitor& v) { v.visit(*this); }
@ -1666,6 +1747,10 @@ inline std::shared_ptr<Ope> dot() {
return std::make_shared<AnyCharacter>();
}
inline std::shared_ptr<Ope> ncs(const std::shared_ptr<Ope>& ope) {
return std::make_shared<NewCaptureScope>(ope);
}
inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, Capture::MatchAction ma) {
return std::make_shared<Capture>(ope, ma);
}
@ -1792,6 +1877,9 @@ private:
// Reaching an AnyCharacter node marks this traversal as done.
void visit(AnyCharacter& /*ope*/) override { done_ = true; }
// A capture scope has no effect on this traversal: continue into the wrapped operator.
void visit(NewCaptureScope& ope) override {
    auto& wrapped = *ope.ope_;
    wrapped.accept(*this);
}
// A capture has no effect on this traversal: continue into the wrapped operator.
void visit(Capture& ope) override {
    auto& wrapped = *ope.ope_;
    wrapped.accept(*this);
}
@ -1842,6 +1930,7 @@ private:
g["Primary"] <= cho(seq(opt(g["IGNORE"]), g["Identifier"], npd(g["LEFTARROW"])),
seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["BackRef"], g["Literal"], g["Class"], g["DOT"]);
@ -1886,6 +1975,9 @@ private:
g["BeginTok"] <= seq(chr('<'), g["Spacing"]);
g["EndTok"] <= seq(chr('>'), g["Spacing"]);
g["BeginCapScope"] <= seq(chr('$'), chr('('), g["Spacing"]);
g["EndCapScope"] <= seq(chr(')'), g["Spacing"]);
g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]);
g["EndCap"] <= seq(chr('>'), g["Spacing"]);
@ -1903,8 +1995,8 @@ private:
g["Definition"] = [&](const SemanticValues& sv, any& dt) {
Data& data = *dt.get<Data*>();
auto ignore = (sv.size() == 4);
auto baseId = ignore ? 1u : 0u;
auto baseId = sv.size() - 3;
auto ignore = baseId > 0;
const auto& name = sv[baseId].get<std::string>();
auto ope = sv[baseId + 2].get<std::shared_ptr<Ope>>();
@ -2010,11 +2102,14 @@ private:
case 2: { // TokenBoundary
return tok(sv[1].get<std::shared_ptr<Ope>>());
}
case 3: { // Capture
case 3: { // NewCaptureScope
return ncs(sv[1].get<std::shared_ptr<Ope>>());
}
case 4: { // Capture
const auto& name = sv[0].get<std::string>();
auto ope = sv[1].get<std::shared_ptr<Ope>>();
return cap(ope, [name](const char* a_s, size_t a_n, Context& c) {
c.captures[name] = std::string(a_s, a_n);
c.capture_scope_stack.back()[name] = std::string(a_s, a_n);
});
}
default: {

View File

@ -773,6 +773,107 @@ TEST_CASE("Invalid back reference test", "[back reference]")
}
// Checks that `$( ... )` gives every ELEMENT its own capture scope, so the `$tag`
// back reference in a closing tag resolves to the name captured by that element's
// own opening tag even when elements are nested.
TEST_CASE("Nested capture test", "[backreference]")
{
parser parser(R"(
ROOT <- CONTENT
CONTENT <- (ELEMENT / TEXT)*
ELEMENT <- $(STAG CONTENT ETAG)
STAG <- '<' $tag< TAG_NAME > '>'
ETAG <- '</' $tag '>'
TAG_NAME <- 'b' / 'u'
TEXT <- TEXT_DATA
TEXT_DATA <- ![<] .
)");
// Properly nested tags: each closing tag matches its own opening tag.
REQUIRE(parser.parse("This is <b>a <u>test</u> text</b>."));
// Crossed closing tags must be rejected.
REQUIRE(!parser.parse("This is <b>a <u>test</b> text</u>."));
// An element that is never closed must be rejected.
REQUIRE(!parser.parse("This is <b>a <u>test text</b>."));
}
// Checks that a capture made inside a failed choice alternative does not leak:
// WRONG_BRANCH captures `ref` and then fails on 'wrong', so when CORRECT_BRANCH
// is tried, `$ref` has nothing to resolve and the back reference throws.
TEST_CASE("Backreference with Prioritized Choice test", "[backreference]")
{
parser parser(R"(
TREE <- WRONG_BRANCH / CORRECT_BRANCH
WRONG_BRANCH <- BRANCH THAT IS_capture WRONG
CORRECT_BRANCH <- BRANCH THAT IS_backref CORRECT
BRANCH <- 'branch'
THAT <- 'that'
IS_capture <- $ref<..>
IS_backref <- $ref
WRONG <- 'wrong'
CORRECT <- 'correct'
)");
REQUIRE_THROWS_AS(parser.parse("branchthatiscorrect"), std::runtime_error);
}
// Checks capture propagation through `*`: a capture from a successful repetition
// is visible afterwards (the latest one wins), while a capture from a failed
// repetition attempt is discarded.
TEST_CASE("Backreference with Zero or More test", "[backreference]")
{
parser parser(R"(
TREE <- WRONG_BRANCH* CORRECT_BRANCH
WRONG_BRANCH <- BRANCH THAT IS_capture WRONG
CORRECT_BRANCH <- BRANCH THAT IS_backref CORRECT
BRANCH <- 'branch'
THAT <- 'that'
IS_capture <- $ref<..>
IS_backref <- $ref
WRONG <- 'wrong'
CORRECT <- 'correct'
)");
// One repetition captures "is"; the back reference then matches "is".
REQUIRE(parser.parse("branchthatiswrongbranchthatiscorrect"));
// Captured "is" does not match "Is".
REQUIRE(!parser.parse("branchthatiswrongbranchthatIscorrect"));
// Two repetitions: the later capture "Is" replaces "is", so "is" no longer matches.
REQUIRE(!parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect"));
// Two repetitions: the later capture "Is" is the one that must match.
REQUIRE(parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect"));
// No repetition succeeds, so `ref` is never captured and the back reference throws.
REQUIRE_THROWS_AS(parser.parse("branchthatiscorrect"), std::runtime_error);
REQUIRE_THROWS_AS(parser.parse("branchthatiswron_branchthatiscorrect"), std::runtime_error);
}
// Same scenario as the zero-or-more test, but with `+`: at least one WRONG_BRANCH
// must succeed, so inputs without one fail to parse before the back reference is
// ever reached (a plain parse failure instead of an exception).
TEST_CASE("Backreference with One or More test", "[backreference]")
{
parser parser(R"(
TREE <- WRONG_BRANCH+ CORRECT_BRANCH
WRONG_BRANCH <- BRANCH THAT IS_capture WRONG
CORRECT_BRANCH <- BRANCH THAT IS_backref CORRECT
BRANCH <- 'branch'
THAT <- 'that'
IS_capture <- $ref<..>
IS_backref <- $ref
WRONG <- 'wrong'
CORRECT <- 'correct'
)");
// One repetition captures "is"; the back reference then matches "is".
REQUIRE(parser.parse("branchthatiswrongbranchthatiscorrect"));
// Captured "is" does not match "Is".
REQUIRE(!parser.parse("branchthatiswrongbranchthatIscorrect"));
// Two repetitions: the later capture "Is" replaces "is", so "is" no longer matches.
REQUIRE(!parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect"));
// Two repetitions: the later capture "Is" is the one that must match.
REQUIRE(parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect"));
// The mandatory first WRONG_BRANCH fails, so the parse fails without throwing.
REQUIRE(!parser.parse("branchthatiscorrect"));
REQUIRE(!parser.parse("branchthatiswron_branchthatiscorrect"));
}
// Checks capture propagation through `?`: a capture from a successful optional
// match is visible afterwards, while a capture from a failed optional attempt is
// discarded. At most one WRONG_BRANCH can be consumed.
TEST_CASE("Backreference with Option test", "[backreference]")
{
parser parser(R"(
TREE <- WRONG_BRANCH? CORRECT_BRANCH
WRONG_BRANCH <- BRANCH THAT IS_capture WRONG
CORRECT_BRANCH <- BRANCH THAT IS_backref CORRECT
BRANCH <- 'branch'
THAT <- 'that'
IS_capture <- $ref<..>
IS_backref <- $ref
WRONG <- 'wrong'
CORRECT <- 'correct'
)");
// The optional branch captures "is"; the back reference then matches "is".
REQUIRE(parser.parse("branchthatiswrongbranchthatiscorrect"));
// Captured "is" does not match "Is".
REQUIRE(!parser.parse("branchthatiswrongbranchthatIscorrect"));
// Only one WRONG_BRANCH is consumed; the second one cannot parse as CORRECT_BRANCH.
REQUIRE(!parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect"));
REQUIRE(!parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect"));
// The optional branch fails, so `ref` is never captured and the back reference throws.
REQUIRE_THROWS_AS(parser.parse("branchthatiscorrect"), std::runtime_error);
REQUIRE_THROWS_AS(parser.parse("branchthatiswron_branchthatiscorrect"), std::runtime_error);
}
TEST_CASE("Left recursive test", "[left recursive]")
{
parser parser(