Merge pull request #38 from yhirose/backref

Back reference support
This commit is contained in:
yhirose 2018-07-14 00:01:01 +02:00 committed by GitHub
commit e2da595899
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 170 additions and 63 deletions

View File

@ -13,11 +13,13 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau
* `<` ... `>` (Token boundary operator) * `<` ... `>` (Token boundary operator)
* `~` (Ignore operator) * `~` (Ignore operator)
* `\x20` (Hex number char) * `\x20` (Hex number char)
* `$<` ... `>` (Capture operator)
* `$name<` ... `>` (Named capture operator) * `$name<` ... `>` (Named capture operator)
* `$name` (Backreference operator)
* `%whitespace` (Automatic whitespace skipping) * `%whitespace` (Automatic whitespace skipping)
* `%word` (Word expression) * `%word` (Word expression)
This library also supports linear-time parsing, known as [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing.
If you need a Go language version, please see [*go-peg*](https://github.com/yhirose/go-peg). If you need a Go language version, please see [*go-peg*](https://github.com/yhirose/go-peg).
How to use How to use
@ -322,6 +324,7 @@ The following are available operators:
| tok | Token boundary | | tok | Token boundary |
| ign | Ignore semantic value | | ign | Ignore semantic value |
| cap | Capture character | | cap | Capture character |
| bkr | Back reference |
Unicode support Unicode support
--------------- ---------------

120
peglib.h
View File

@ -464,11 +464,6 @@ private:
std::string s_; std::string s_;
}; };
/*
* Match action
*/
typedef std::function<void (const char* s, size_t n, size_t id, const std::string& name)> MatchAction;
/* /*
* Result * Result
*/ */
@ -483,8 +478,8 @@ inline bool fail(size_t len) {
/* /*
* Context * Context
*/ */
class Ope;
class Context; class Context;
class Ope;
class Definition; class Definition;
typedef std::function<void (const char* name, const char* s, size_t n, const SemanticValues& sv, const Context& c, const any& dt)> Tracer; typedef std::function<void (const char* name, const char* s, size_t n, const SemanticValues& sv, const Context& c, const any& dt)> Tracer;
@ -512,6 +507,8 @@ public:
std::shared_ptr<Ope> wordOpe; std::shared_ptr<Ope> wordOpe;
std::unordered_map<std::string, std::string> captures;
const size_t def_count; const size_t def_count;
const bool enablePackratParsing; const bool enablePackratParsing;
std::vector<bool> cache_registered; std::vector<bool> cache_registered;
@ -989,14 +986,16 @@ public:
class Capture : public Ope class Capture : public Ope
{ {
public: public:
Capture(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t id, const std::string& name) typedef std::function<void (const char* s, size_t n, Context& c)> MatchAction;
: ope_(ope), match_action_(ma), id_(id), name_(name) {}
Capture(const std::shared_ptr<Ope>& ope, MatchAction ma)
: ope_(ope), match_action_(ma) {}
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override { size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
const auto& rule = *ope_; const auto& rule = *ope_;
auto len = rule.parse(s, n, sv, c, dt); auto len = rule.parse(s, n, sv, c, dt);
if (success(len) && match_action_) { if (success(len) && match_action_) {
match_action_(s, len, id_, name_); match_action_(s, len, c);
} }
return len; return len;
} }
@ -1006,9 +1005,7 @@ public:
std::shared_ptr<Ope> ope_; std::shared_ptr<Ope> ope_;
private: private:
MatchAction match_action_; MatchAction match_action_;
size_t id_;
std::string name_;
}; };
class TokenBoundary : public Ope class TokenBoundary : public Ope
@ -1123,6 +1120,18 @@ public:
std::shared_ptr<Ope> ope_; std::shared_ptr<Ope> ope_;
}; };
/*
 * BackReference
 *
 * Operator for the `$name` back reference syntax. It holds the name of a
 * capture; `parse` is declared here and defined out of line, where it looks
 * that name up in `Context::captures` and matches the stored text against
 * the input.
 */
class BackReference : public Ope
{
public:
    // `explicit`: a plain string must never silently convert into an operator.
    explicit BackReference(const std::string& name) : name_(name) {}

    size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override;

    void accept(Visitor& v) override;

    // Name of the capture this operator refers to.
    std::string name_;
};
/* /*
* Visitor * Visitor
*/ */
@ -1147,6 +1156,7 @@ struct Ope::Visitor
virtual void visit(Holder& /*ope*/) {} virtual void visit(Holder& /*ope*/) {}
virtual void visit(DefinitionReference& /*ope*/) {} virtual void visit(DefinitionReference& /*ope*/) {}
virtual void visit(Whitespace& /*ope*/) {} virtual void visit(Whitespace& /*ope*/) {}
virtual void visit(BackReference& /*ope*/) {}
}; };
struct AssignIDToDefinition : public Ope::Visitor struct AssignIDToDefinition : public Ope::Visitor
@ -1403,31 +1413,31 @@ private:
* Implementations * Implementations
*/ */
inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { inline size_t parse_literal(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt,
c.trace("LiteralString", s, n, sv, dt); const std::string& lit, bool& init_is_word, bool& is_word)
{
size_t i = 0; size_t i = 0;
for (; i < lit_.size(); i++) { for (; i < lit.size(); i++) {
if (i >= n || s[i] != lit_[i]) { if (i >= n || s[i] != lit[i]) {
c.set_error_pos(s); c.set_error_pos(s);
return static_cast<size_t>(-1); return static_cast<size_t>(-1);
} }
} }
// Word check // Word check
static Context dummy_c(nullptr, lit_.data(), lit_.size(), 0, nullptr, nullptr, false, nullptr); static Context dummy_c(nullptr, lit.data(), lit.size(), 0, nullptr, nullptr, false, nullptr);
static SemanticValues dummy_sv; static SemanticValues dummy_sv;
static any dummy_dt; static any dummy_dt;
if (!init_is_word_) { // TODO: Protect with mutex if (!init_is_word) { // TODO: Protect with mutex
if (c.wordOpe) { if (c.wordOpe) {
auto len = c.wordOpe->parse(lit_.data(), lit_.size(), dummy_sv, dummy_c, dummy_dt); auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_sv, dummy_c, dummy_dt);
is_word_ = success(len); is_word = success(len);
} }
init_is_word_ = true; init_is_word = true;
} }
if (is_word_) { if (is_word) {
auto ope = std::make_shared<NotPredicate>(c.wordOpe); auto ope = std::make_shared<NotPredicate>(c.wordOpe);
auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt); auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt);
if (fail(len)) { if (fail(len)) {
@ -1450,6 +1460,11 @@ inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv,
return i; return i;
} }
// Emits a "LiteralString" trace event, then hands the actual matching to
// parse_literal() together with this node's literal and its
// `init_is_word_` / `is_word_` flags (taken by reference by the helper).
inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
    c.trace("LiteralString", s, n, sv, dt);
    const auto matched = parse_literal(s, n, sv, c, dt, lit_, init_is_word_, is_word_);
    return matched;
}
inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const { inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
c.in_token = true; c.in_token = true;
auto se = make_scope_exit([&]() { c.in_token = false; }); auto se = make_scope_exit([&]() { c.in_token = false; });
@ -1560,6 +1575,17 @@ inline std::shared_ptr<Ope> DefinitionReference::get_rule() const {
return rule_; return rule_;
} }
// Matches the input against the text previously stored in `c.captures`
// under `name_`.
//
// Returns whatever parse_literal() returns for the captured text.
// Throws std::runtime_error when no capture named `name_` has been recorded.
inline size_t BackReference::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
    c.trace("BackReference", s, n, sv, dt);

    // A single find() serves both the existence check and the value access;
    // it avoids hashing the key twice and never goes through the inserting
    // operator[].
    const auto it = c.captures.find(name_);
    if (it == c.captures.end()) {
        throw std::runtime_error("Invalid back reference...");
    }
    const auto& lit = it->second;

    // The captured text differs from parse to parse, so the word-check flags
    // are per-call locals rather than cached members.
    bool init_is_word = false;
    bool is_word = false;
    return parse_literal(s, n, sv, c, dt, lit, init_is_word, is_word);
}
inline void Sequence::accept(Visitor& v) { v.visit(*this); } inline void Sequence::accept(Visitor& v) { v.visit(*this); }
inline void PrioritizedChoice::accept(Visitor& v) { v.visit(*this); } inline void PrioritizedChoice::accept(Visitor& v) { v.visit(*this); }
inline void ZeroOrMore::accept(Visitor& v) { v.visit(*this); } inline void ZeroOrMore::accept(Visitor& v) { v.visit(*this); }
@ -1578,6 +1604,7 @@ inline void WeakHolder::accept(Visitor& v) { v.visit(*this); }
inline void Holder::accept(Visitor& v) { v.visit(*this); } inline void Holder::accept(Visitor& v) { v.visit(*this); }
inline void DefinitionReference::accept(Visitor& v) { v.visit(*this); } inline void DefinitionReference::accept(Visitor& v) { v.visit(*this); }
inline void Whitespace::accept(Visitor& v) { v.visit(*this); } inline void Whitespace::accept(Visitor& v) { v.visit(*this); }
inline void BackReference::accept(Visitor& v) { v.visit(*this); }
inline void AssignIDToDefinition::visit(Holder& ope) { inline void AssignIDToDefinition::visit(Holder& ope) {
auto p = static_cast<void*>(ope.outer_); auto p = static_cast<void*>(ope.outer_);
@ -1639,8 +1666,8 @@ inline std::shared_ptr<Ope> dot() {
return std::make_shared<AnyCharacter>(); return std::make_shared<AnyCharacter>();
} }
inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t n, const std::string& s) { inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, Capture::MatchAction ma) {
return std::make_shared<Capture>(ope, ma, n, s); return std::make_shared<Capture>(ope, ma);
} }
inline std::shared_ptr<Ope> tok(const std::shared_ptr<Ope>& ope) { inline std::shared_ptr<Ope> tok(const std::shared_ptr<Ope>& ope) {
@ -1659,6 +1686,10 @@ inline std::shared_ptr<Ope> wsp(const std::shared_ptr<Ope>& ope) {
return std::make_shared<Whitespace>(std::make_shared<Ignore>(ope)); return std::make_shared<Whitespace>(std::make_shared<Ignore>(ope));
} }
// Builds a BackReference operator that refers to the capture called `name`
// and returns it through the common `Ope` handle type.
inline std::shared_ptr<Ope> bkr(const std::string& name) {
    std::shared_ptr<Ope> ope = std::make_shared<BackReference>(name);
    return ope;
}
/*----------------------------------------------------------------------------- /*-----------------------------------------------------------------------------
* PEG parser generator * PEG parser generator
*---------------------------------------------------------------------------*/ *---------------------------------------------------------------------------*/
@ -1673,10 +1704,9 @@ public:
const char* s, const char* s,
size_t n, size_t n,
std::string& start, std::string& start,
MatchAction ma,
Log log) Log log)
{ {
return get_instance().perform_core(s, n, start, ma, log); return get_instance().perform_core(s, n, start, log);
} }
// For debugging purposes // For debugging purposes
@ -1698,15 +1728,10 @@ private:
struct Data { struct Data {
std::shared_ptr<Grammar> grammar; std::shared_ptr<Grammar> grammar;
std::string start; std::string start;
MatchAction match_action;
std::vector<std::pair<std::string, const char*>> duplicates; std::vector<std::pair<std::string, const char*>> duplicates;
std::unordered_map<std::string, const char*> references; std::unordered_map<std::string, const char*> references;
size_t capture_count;
Data() Data(): grammar(std::make_shared<Grammar>()) {}
: grammar(std::make_shared<Grammar>())
, capture_count(0)
{}
}; };
struct DetectLeftRecursion : public Ope::Visitor { struct DetectLeftRecursion : public Ope::Visitor {
@ -1793,6 +1818,9 @@ private:
} }
done_ = true; done_ = true;
} }
// Visiting a back reference only sets the `done_` flag; the operator itself
// is not inspected.
// NOTE(review): presumably `done_` ends the left-recursion search along this
// path, since a back reference is not a rule reference — confirm against the
// other visit() overloads.
void visit(BackReference& /*ope*/) override {
done_ = true;
}
const char* s_; const char* s_;
@ -1815,7 +1843,7 @@ private:
seq(g["OPEN"], g["Expression"], g["CLOSE"]), seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["BeginTok"], g["Expression"], g["EndTok"]), seq(g["BeginTok"], g["Expression"], g["EndTok"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]), seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["Literal"], g["Class"], g["DOT"]); g["BackRef"], g["Literal"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@ -1858,8 +1886,10 @@ private:
g["BeginTok"] <= seq(chr('<'), g["Spacing"]); g["BeginTok"] <= seq(chr('<'), g["Spacing"]);
g["EndTok"] <= seq(chr('>'), g["Spacing"]); g["EndTok"] <= seq(chr('>'), g["Spacing"]);
g["BeginCap"] <= seq(chr('$'), tok(opt(g["Identifier"])), chr('<'), g["Spacing"]); g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]);
g["EndCap"] <= seq(lit(">"), g["Spacing"]); g["EndCap"] <= seq(chr('>'), g["Spacing"]);
g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]);
g["IGNORE"] <= chr('~'); g["IGNORE"] <= chr('~');
@ -1954,7 +1984,7 @@ private:
} }
}; };
g["Primary"] = [&](const SemanticValues& sv, any& dt) -> std::shared_ptr<Ope> { g["Primary"] = [&](const SemanticValues& sv, any& dt) {
Data& data = *dt.get<Data*>(); Data& data = *dt.get<Data*>();
switch (sv.choice()) { switch (sv.choice()) {
@ -1983,7 +2013,9 @@ private:
case 3: { // Capture case 3: { // Capture
const auto& name = sv[0].get<std::string>(); const auto& name = sv[0].get<std::string>();
auto ope = sv[1].get<std::shared_ptr<Ope>>(); auto ope = sv[1].get<std::shared_ptr<Ope>>();
return cap(ope, data.match_action, ++data.capture_count, name); return cap(ope, [name](const char* a_s, size_t a_n, Context& c) {
c.captures[name] = std::string(a_s, a_n);
});
} }
default: { default: {
return sv[0].get<std::shared_ptr<Ope>>(); return sv[0].get<std::shared_ptr<Ope>>();
@ -2013,18 +2045,19 @@ private:
g["DOT"] = [](const SemanticValues& /*sv*/) { return dot(); }; g["DOT"] = [](const SemanticValues& /*sv*/) { return dot(); };
g["BeginCap"] = [](const SemanticValues& sv) { return sv.token(); }; g["BeginCap"] = [](const SemanticValues& sv) { return sv.token(); };
g["BackRef"] = [&](const SemanticValues& sv) {
return bkr(sv.token());
};
} }
std::shared_ptr<Grammar> perform_core( std::shared_ptr<Grammar> perform_core(
const char* s, const char* s,
size_t n, size_t n,
std::string& start, std::string& start,
MatchAction ma,
Log log) Log log)
{ {
Data data; Data data;
data.match_action = ma;
any dt = &data; any dt = &data;
auto r = g["Grammar"].parse(s, n, dt); auto r = g["Grammar"].parse(s, n, dt);
@ -2373,7 +2406,7 @@ public:
} }
bool load_grammar(const char* s, size_t n) { bool load_grammar(const char* s, size_t n) {
grammar_ = ParserGenerator::parse(s, n, start_, match_action, log); grammar_ = ParserGenerator::parse(s, n, start_, log);
return grammar_ != nullptr; return grammar_ != nullptr;
} }
@ -2516,8 +2549,7 @@ public:
} }
} }
MatchAction match_action; Log log;
Log log;
private: private:
void output_log(const char* s, size_t n, const Definition::Result& r) const { void output_log(const char* s, size_t n, const Definition::Result& r) const {

View File

@ -83,27 +83,27 @@ TEST_CASE("String capture test2", "[general]")
TEST_CASE("String capture test3", "[general]") TEST_CASE("String capture test3", "[general]")
{ {
auto syntax = auto syntax =
" ROOT <- _ TOKEN* " " ROOT <- _ TOKEN* "
" TOKEN <- '[' < (!']' .)+ > ']' _ " " TOKEN <- '[' < (!']' .)+ > ']' _ "
" _ <- [ \t\r\n]* " " _ <- [ \t\r\n]* "
; ;
parser pg(syntax); parser pg(syntax);
std::vector<std::string> tags; std::vector<std::string> tags;
pg["TOKEN"] = [&](const SemanticValues& sv) { pg["TOKEN"] = [&](const SemanticValues& sv) {
tags.push_back(sv.token()); tags.push_back(sv.token());
}; };
auto ret = pg.parse(" [tag1] [tag:2] [tag-3] "); auto ret = pg.parse(" [tag1] [tag:2] [tag-3] ");
REQUIRE(ret == true); REQUIRE(ret == true);
REQUIRE(tags.size() == 3); REQUIRE(tags.size() == 3);
REQUIRE(tags[0] == "tag1"); REQUIRE(tags[0] == "tag1");
REQUIRE(tags[1] == "tag:2"); REQUIRE(tags[1] == "tag:2");
REQUIRE(tags[2] == "tag-3"); REQUIRE(tags[2] == "tag-3");
} }
TEST_CASE("Cyclic grammer test", "[general]") TEST_CASE("Cyclic grammer test", "[general]")
@ -455,7 +455,7 @@ TEST_CASE("Calculator test2", "[general]")
; ;
string start; string start;
auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr, nullptr); auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr);
auto& g = *grammar; auto& g = *grammar;
// Setup actions // Setup actions
@ -649,7 +649,7 @@ TEST_CASE("Literal token on AST test1", "[general]")
TEST_CASE("Literal token on AST test2", "[general]") TEST_CASE("Literal token on AST test2", "[general]")
{ {
parser parser(R"( parser parser(R"(
STRING_LITERAL <- '"' (ESC / CHAR)* '"' STRING_LITERAL <- '"' (ESC / CHAR)* '"'
ESC <- ('\\"' / '\\t' / '\\n') ESC <- ('\\"' / '\\t' / '\\n')
CHAR <- (!["] .) CHAR <- (!["] .)
)"); )");
@ -701,6 +701,78 @@ TEST_CASE("Definition duplicates test", "[general]")
REQUIRE(!parser); REQUIRE(!parser);
} }
// Exercises the named capture `$delm< ... >` together with the back
// reference `$delm`, using a grammar shaped like a C++ raw string literal:
// LQUOTE captures the letters between `R"` and `(`, and RQUOTE requires the
// same text between `)` and the closing `"`.
TEST_CASE("Back reference test", "[back reference]")
{
parser parser(R"(
START <- _ LQUOTE < (!RQUOTE .)* > RQUOTE _
LQUOTE <- 'R"' $delm< [a-zA-Z]* > '('
RQUOTE <- ')' $delm '"'
~_ <- [ \t\r\n]*
)");
// Receives the token of START each time its action runs.
std::string token;
parser["START"] = [&](const SemanticValues& sv) {
token = sv.token();
};
// Case 1: no delimiter letters at all (`R"(` ... `)"`).
{
token.clear();
auto ret = parser.parse(R"delm(
R"("hello world")"
)delm");
REQUIRE(ret == true);
REQUIRE(token == "\"hello world\"");
}
// Case 2: delimiter `foo` on both ends; a bare `)` inside the body does not
// terminate the literal.
{
token.clear();
auto ret = parser.parse(R"delm(
R"foo("(hello world)")foo"
)delm");
REQUIRE(ret == true);
REQUIRE(token == "\"(hello world)\"");
}
// Case 3: `)foo"` occurs inside the body. The overall parse is expected to
// fail while the START action has still recorded the text up to that point.
// NOTE(review): presumably the first `)foo"` closes the literal and the
// leftover input is what makes parse() return false — confirm.
{
token.clear();
auto ret = parser.parse(R"delm(
R"foo("(hello world)foo")foo"
)delm");
REQUIRE(ret == false);
REQUIRE(token == "\"(hello world");
}
// Case 4: closing delimiter `bar` differs from the captured `foo`; the parse
// fails and the START action never runs, so `token` stays empty.
{
token.clear();
auto ret = parser.parse(R"delm(
R"foo("(hello world)")bar"
)delm");
REQUIRE(ret == false);
REQUIRE(token.empty());
}
}
// RQUOTE refers to `$delm2`, but the grammar only ever captures `$delm`.
// Parsing input that reaches the back reference is expected to throw
// std::runtime_error.
TEST_CASE("Invalid back reference test", "[back reference]")
{
parser parser(R"(
START <- _ LQUOTE (!RQUOTE .)* RQUOTE _
LQUOTE <- 'R"' $delm< [a-zA-Z]* > '('
RQUOTE <- ')' $delm2 '"'
~_ <- [ \t\r\n]*
)");
REQUIRE_THROWS_AS(
parser.parse(R"delm(
R"foo("(hello world)")foo"
)delm"),
std::runtime_error);
}
TEST_CASE("Left recursive test", "[left recursive]") TEST_CASE("Left recursive test", "[left recursive]")
{ {
parser parser( parser parser(