mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2024-12-22 20:05:31 +00:00
Back reference support
This commit is contained in:
parent
a77edadfa7
commit
f0d2c529ba
@ -13,11 +13,13 @@ The PEG syntax is well described on page 2 in the [document](http://www.brynosau
|
||||
* `<` ... `>` (Token boundary operator)
|
||||
* `~` (Ignore operator)
|
||||
* `\x20` (Hex number char)
|
||||
* `$<` ... `>` (Capture operator)
|
||||
* `$name<` ... `>` (Named capture operator)
|
||||
* `$name` (Backreference operator)
|
||||
* `%whitespace` (Automatic whitespace skipping)
|
||||
* `%word` (Word expression)
|
||||
|
||||
This library also supports linear-time parsing, known as [*Packrat*](http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf) parsing.
|
||||
|
||||
If you need a Go language version, please see [*go-peg*](https://github.com/yhirose/go-peg).
|
||||
|
||||
How to use
|
||||
@ -322,6 +324,7 @@ The following are available operators:
|
||||
| tok | Token boundary |
|
||||
| ign | Ignore semantic value |
|
||||
| cap | Capture character |
|
||||
| bkr | Back reference |
|
||||
|
||||
Unicode support
|
||||
---------------
|
||||
|
120
peglib.h
120
peglib.h
@ -464,11 +464,6 @@ private:
|
||||
std::string s_;
|
||||
};
|
||||
|
||||
/*
|
||||
* Match action
|
||||
*/
|
||||
typedef std::function<void (const char* s, size_t n, size_t id, const std::string& name)> MatchAction;
|
||||
|
||||
/*
|
||||
* Result
|
||||
*/
|
||||
@ -483,8 +478,8 @@ inline bool fail(size_t len) {
|
||||
/*
|
||||
* Context
|
||||
*/
|
||||
class Ope;
|
||||
class Context;
|
||||
class Ope;
|
||||
class Definition;
|
||||
|
||||
typedef std::function<void (const char* name, const char* s, size_t n, const SemanticValues& sv, const Context& c, const any& dt)> Tracer;
|
||||
@ -512,6 +507,8 @@ public:
|
||||
|
||||
std::shared_ptr<Ope> wordOpe;
|
||||
|
||||
std::unordered_map<std::string, std::string> captures;
|
||||
|
||||
const size_t def_count;
|
||||
const bool enablePackratParsing;
|
||||
std::vector<bool> cache_registered;
|
||||
@ -989,14 +986,16 @@ public:
|
||||
class Capture : public Ope
|
||||
{
|
||||
public:
|
||||
Capture(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t id, const std::string& name)
|
||||
: ope_(ope), match_action_(ma), id_(id), name_(name) {}
|
||||
typedef std::function<void (const char* s, size_t n, Context& c)> MatchAction;
|
||||
|
||||
Capture(const std::shared_ptr<Ope>& ope, MatchAction ma)
|
||||
: ope_(ope), match_action_(ma) {}
|
||||
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||
const auto& rule = *ope_;
|
||||
auto len = rule.parse(s, n, sv, c, dt);
|
||||
if (success(len) && match_action_) {
|
||||
match_action_(s, len, id_, name_);
|
||||
match_action_(s, len, c);
|
||||
}
|
||||
return len;
|
||||
}
|
||||
@ -1006,9 +1005,7 @@ public:
|
||||
std::shared_ptr<Ope> ope_;
|
||||
|
||||
private:
|
||||
MatchAction match_action_;
|
||||
size_t id_;
|
||||
std::string name_;
|
||||
MatchAction match_action_;
|
||||
};
|
||||
|
||||
class TokenBoundary : public Ope
|
||||
@ -1123,6 +1120,18 @@ public:
|
||||
std::shared_ptr<Ope> ope_;
|
||||
};
|
||||
|
||||
class BackReference : public Ope
|
||||
{
|
||||
public:
|
||||
BackReference(const std::string& name) : name_(name) {}
|
||||
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override;
|
||||
|
||||
void accept(Visitor& v) override;
|
||||
|
||||
std::string name_;
|
||||
};
|
||||
|
||||
/*
|
||||
* Visitor
|
||||
*/
|
||||
@ -1147,6 +1156,7 @@ struct Ope::Visitor
|
||||
virtual void visit(Holder& /*ope*/) {}
|
||||
virtual void visit(DefinitionReference& /*ope*/) {}
|
||||
virtual void visit(Whitespace& /*ope*/) {}
|
||||
virtual void visit(BackReference& /*ope*/) {}
|
||||
};
|
||||
|
||||
struct AssignIDToDefinition : public Ope::Visitor
|
||||
@ -1403,31 +1413,31 @@ private:
|
||||
* Implementations
|
||||
*/
|
||||
|
||||
inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||
c.trace("LiteralString", s, n, sv, dt);
|
||||
|
||||
inline size_t parse_literal(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt,
|
||||
const std::string& lit, bool& init_is_word, bool& is_word)
|
||||
{
|
||||
size_t i = 0;
|
||||
for (; i < lit_.size(); i++) {
|
||||
if (i >= n || s[i] != lit_[i]) {
|
||||
for (; i < lit.size(); i++) {
|
||||
if (i >= n || s[i] != lit[i]) {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
}
|
||||
}
|
||||
|
||||
// Word check
|
||||
static Context dummy_c(nullptr, lit_.data(), lit_.size(), 0, nullptr, nullptr, false, nullptr);
|
||||
static Context dummy_c(nullptr, lit.data(), lit.size(), 0, nullptr, nullptr, false, nullptr);
|
||||
static SemanticValues dummy_sv;
|
||||
static any dummy_dt;
|
||||
|
||||
if (!init_is_word_) { // TODO: Protect with mutex
|
||||
if (!init_is_word) { // TODO: Protect with mutex
|
||||
if (c.wordOpe) {
|
||||
auto len = c.wordOpe->parse(lit_.data(), lit_.size(), dummy_sv, dummy_c, dummy_dt);
|
||||
is_word_ = success(len);
|
||||
auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_sv, dummy_c, dummy_dt);
|
||||
is_word = success(len);
|
||||
}
|
||||
init_is_word_ = true;
|
||||
init_is_word = true;
|
||||
}
|
||||
|
||||
if (is_word_) {
|
||||
if (is_word) {
|
||||
auto ope = std::make_shared<NotPredicate>(c.wordOpe);
|
||||
auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt);
|
||||
if (fail(len)) {
|
||||
@ -1450,6 +1460,11 @@ inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv,
|
||||
return i;
|
||||
}
|
||||
|
||||
inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||
c.trace("LiteralString", s, n, sv, dt);
|
||||
return parse_literal(s, n, sv, c, dt, lit_, init_is_word_, is_word_);
|
||||
}
|
||||
|
||||
inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||
c.in_token = true;
|
||||
auto se = make_scope_exit([&]() { c.in_token = false; });
|
||||
@ -1560,6 +1575,17 @@ inline std::shared_ptr<Ope> DefinitionReference::get_rule() const {
|
||||
return rule_;
|
||||
}
|
||||
|
||||
inline size_t BackReference::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||
c.trace("BackReference", s, n, sv, dt);
|
||||
if (c.captures.find(name_) == c.captures.end()) {
|
||||
throw std::runtime_error("Invalid back reference...");
|
||||
}
|
||||
const auto& lit = c.captures[name_];
|
||||
bool init_is_word = false;
|
||||
bool is_word = false;
|
||||
return parse_literal(s, n, sv, c, dt, lit, init_is_word, is_word);
|
||||
}
|
||||
|
||||
inline void Sequence::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void PrioritizedChoice::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void ZeroOrMore::accept(Visitor& v) { v.visit(*this); }
|
||||
@ -1578,6 +1604,7 @@ inline void WeakHolder::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void Holder::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void DefinitionReference::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void Whitespace::accept(Visitor& v) { v.visit(*this); }
|
||||
inline void BackReference::accept(Visitor& v) { v.visit(*this); }
|
||||
|
||||
inline void AssignIDToDefinition::visit(Holder& ope) {
|
||||
auto p = static_cast<void*>(ope.outer_);
|
||||
@ -1639,8 +1666,8 @@ inline std::shared_ptr<Ope> dot() {
|
||||
return std::make_shared<AnyCharacter>();
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t n, const std::string& s) {
|
||||
return std::make_shared<Capture>(ope, ma, n, s);
|
||||
inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, Capture::MatchAction ma) {
|
||||
return std::make_shared<Capture>(ope, ma);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> tok(const std::shared_ptr<Ope>& ope) {
|
||||
@ -1659,6 +1686,10 @@ inline std::shared_ptr<Ope> wsp(const std::shared_ptr<Ope>& ope) {
|
||||
return std::make_shared<Whitespace>(std::make_shared<Ignore>(ope));
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> bkr(const std::string& name) {
|
||||
return std::make_shared<BackReference>(name);
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
* PEG parser generator
|
||||
*---------------------------------------------------------------------------*/
|
||||
@ -1673,10 +1704,9 @@ public:
|
||||
const char* s,
|
||||
size_t n,
|
||||
std::string& start,
|
||||
MatchAction ma,
|
||||
Log log)
|
||||
{
|
||||
return get_instance().perform_core(s, n, start, ma, log);
|
||||
return get_instance().perform_core(s, n, start, log);
|
||||
}
|
||||
|
||||
// For debugging purposes
|
||||
@ -1698,15 +1728,10 @@ private:
|
||||
struct Data {
|
||||
std::shared_ptr<Grammar> grammar;
|
||||
std::string start;
|
||||
MatchAction match_action;
|
||||
std::vector<std::pair<std::string, const char*>> duplicates;
|
||||
std::unordered_map<std::string, const char*> references;
|
||||
size_t capture_count;
|
||||
|
||||
Data()
|
||||
: grammar(std::make_shared<Grammar>())
|
||||
, capture_count(0)
|
||||
{}
|
||||
Data(): grammar(std::make_shared<Grammar>()) {}
|
||||
};
|
||||
|
||||
struct DetectLeftRecursion : public Ope::Visitor {
|
||||
@ -1793,6 +1818,9 @@ private:
|
||||
}
|
||||
done_ = true;
|
||||
}
|
||||
// Visiting a back reference only sets the done_ flag.
void visit(BackReference& /*ope*/) override { done_ = true; }
|
||||
|
||||
const char* s_;
|
||||
|
||||
@ -1815,7 +1843,7 @@ private:
|
||||
seq(g["OPEN"], g["Expression"], g["CLOSE"]),
|
||||
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
|
||||
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
|
||||
g["Literal"], g["Class"], g["DOT"]);
|
||||
g["BackRef"], g["Literal"], g["Class"], g["DOT"]);
|
||||
|
||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||
@ -1858,8 +1886,10 @@ private:
|
||||
g["BeginTok"] <= seq(chr('<'), g["Spacing"]);
|
||||
g["EndTok"] <= seq(chr('>'), g["Spacing"]);
|
||||
|
||||
g["BeginCap"] <= seq(chr('$'), tok(opt(g["Identifier"])), chr('<'), g["Spacing"]);
|
||||
g["EndCap"] <= seq(lit(">"), g["Spacing"]);
|
||||
g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]);
|
||||
g["EndCap"] <= seq(chr('>'), g["Spacing"]);
|
||||
|
||||
g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]);
|
||||
|
||||
g["IGNORE"] <= chr('~');
|
||||
|
||||
@ -1954,7 +1984,7 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
g["Primary"] = [&](const SemanticValues& sv, any& dt) -> std::shared_ptr<Ope> {
|
||||
g["Primary"] = [&](const SemanticValues& sv, any& dt) {
|
||||
Data& data = *dt.get<Data*>();
|
||||
|
||||
switch (sv.choice()) {
|
||||
@ -1983,7 +2013,9 @@ private:
|
||||
case 3: { // Capture
|
||||
const auto& name = sv[0].get<std::string>();
|
||||
auto ope = sv[1].get<std::shared_ptr<Ope>>();
|
||||
return cap(ope, data.match_action, ++data.capture_count, name);
|
||||
return cap(ope, [name](const char* a_s, size_t a_n, Context& c) {
|
||||
c.captures[name] = std::string(a_s, a_n);
|
||||
});
|
||||
}
|
||||
default: {
|
||||
return sv[0].get<std::shared_ptr<Ope>>();
|
||||
@ -2013,18 +2045,19 @@ private:
|
||||
g["DOT"] = [](const SemanticValues& /*sv*/) { return dot(); };
|
||||
|
||||
g["BeginCap"] = [](const SemanticValues& sv) { return sv.token(); };
|
||||
|
||||
g["BackRef"] = [&](const SemanticValues& sv) {
|
||||
return bkr(sv.token());
|
||||
};
|
||||
}
|
||||
|
||||
std::shared_ptr<Grammar> perform_core(
|
||||
const char* s,
|
||||
size_t n,
|
||||
std::string& start,
|
||||
MatchAction ma,
|
||||
Log log)
|
||||
{
|
||||
Data data;
|
||||
data.match_action = ma;
|
||||
|
||||
any dt = &data;
|
||||
auto r = g["Grammar"].parse(s, n, dt);
|
||||
|
||||
@ -2373,7 +2406,7 @@ public:
|
||||
}
|
||||
|
||||
bool load_grammar(const char* s, size_t n) {
|
||||
grammar_ = ParserGenerator::parse(s, n, start_, match_action, log);
|
||||
grammar_ = ParserGenerator::parse(s, n, start_, log);
|
||||
return grammar_ != nullptr;
|
||||
}
|
||||
|
||||
@ -2516,8 +2549,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
MatchAction match_action;
|
||||
Log log;
|
||||
Log log;
|
||||
|
||||
private:
|
||||
void output_log(const char* s, size_t n, const Definition::Result& r) const {
|
||||
|
108
test/test.cc
108
test/test.cc
@ -83,27 +83,27 @@ TEST_CASE("String capture test2", "[general]")
|
||||
|
||||
TEST_CASE("String capture test3", "[general]")
|
||||
{
|
||||
auto syntax =
|
||||
" ROOT <- _ TOKEN* "
|
||||
" TOKEN <- '[' < (!']' .)+ > ']' _ "
|
||||
" _ <- [ \t\r\n]* "
|
||||
;
|
||||
auto syntax =
|
||||
" ROOT <- _ TOKEN* "
|
||||
" TOKEN <- '[' < (!']' .)+ > ']' _ "
|
||||
" _ <- [ \t\r\n]* "
|
||||
;
|
||||
|
||||
parser pg(syntax);
|
||||
parser pg(syntax);
|
||||
|
||||
std::vector<std::string> tags;
|
||||
std::vector<std::string> tags;
|
||||
|
||||
pg["TOKEN"] = [&](const SemanticValues& sv) {
|
||||
tags.push_back(sv.token());
|
||||
};
|
||||
pg["TOKEN"] = [&](const SemanticValues& sv) {
|
||||
tags.push_back(sv.token());
|
||||
};
|
||||
|
||||
auto ret = pg.parse(" [tag1] [tag:2] [tag-3] ");
|
||||
auto ret = pg.parse(" [tag1] [tag:2] [tag-3] ");
|
||||
|
||||
REQUIRE(ret == true);
|
||||
REQUIRE(tags.size() == 3);
|
||||
REQUIRE(tags[0] == "tag1");
|
||||
REQUIRE(tags[1] == "tag:2");
|
||||
REQUIRE(tags[2] == "tag-3");
|
||||
REQUIRE(ret == true);
|
||||
REQUIRE(tags.size() == 3);
|
||||
REQUIRE(tags[0] == "tag1");
|
||||
REQUIRE(tags[1] == "tag:2");
|
||||
REQUIRE(tags[2] == "tag-3");
|
||||
}
|
||||
|
||||
TEST_CASE("Cyclic grammer test", "[general]")
|
||||
@ -455,7 +455,7 @@ TEST_CASE("Calculator test2", "[general]")
|
||||
;
|
||||
|
||||
string start;
|
||||
auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr, nullptr);
|
||||
auto grammar = ParserGenerator::parse(syntax, strlen(syntax), start, nullptr);
|
||||
auto& g = *grammar;
|
||||
|
||||
// Setup actions
|
||||
@ -649,7 +649,7 @@ TEST_CASE("Literal token on AST test1", "[general]")
|
||||
TEST_CASE("Literal token on AST test2", "[general]")
|
||||
{
|
||||
parser parser(R"(
|
||||
STRING_LITERAL <- '"' (ESC / CHAR)* '"'
|
||||
STRING_LITERAL <- '"' (ESC / CHAR)* '"'
|
||||
ESC <- ('\\"' / '\\t' / '\\n')
|
||||
CHAR <- (!["] .)
|
||||
)");
|
||||
@ -701,6 +701,78 @@ TEST_CASE("Definition duplicates test", "[general]")
|
||||
REQUIRE(!parser);
|
||||
}
|
||||
|
||||
// Exercises the named capture `$delm< ... >` together with the back
// reference `$delm`: the grammar models a C++-style raw string literal whose
// closing delimiter must repeat the text captured by the opening one.
// The START action records the token text so each case can inspect it.
TEST_CASE("Back reference test", "[back reference]")
|
||||
{
|
||||
parser parser(R"(
|
||||
START <- _ LQUOTE < (!RQUOTE .)* > RQUOTE _
|
||||
LQUOTE <- 'R"' $delm< [a-zA-Z]* > '('
|
||||
RQUOTE <- ')' $delm '"'
|
||||
~_ <- [ \t\r\n]*
|
||||
)");
|
||||
|
||||
std::string token;
|
||||
parser["START"] = [&](const SemanticValues& sv) {
|
||||
token = sv.token();
|
||||
|
||||
};
|
||||
|
||||
// Case 1: empty delimiter; expects success and the quoted text as token.
{
|
||||
token.clear();
|
||||
auto ret = parser.parse(R"delm(
|
||||
R"("hello world")"
|
||||
)delm");
|
||||
|
||||
REQUIRE(ret == true);
|
||||
REQUIRE(token == "\"hello world\"");
|
||||
}
|
||||
|
||||
// Case 2: delimiter `foo` on both sides; expects success.
{
|
||||
token.clear();
|
||||
auto ret = parser.parse(R"delm(
|
||||
R"foo("(hello world)")foo"
|
||||
)delm");
|
||||
|
||||
REQUIRE(ret == true);
|
||||
REQUIRE(token == "\"(hello world)\"");
|
||||
}
|
||||
|
||||
// Case 3: the input contains an extra `)foo"` before the final one; the
// assertions expect parse() to report failure while the token holds the
// text up to that first terminator.
{
|
||||
token.clear();
|
||||
auto ret = parser.parse(R"delm(
|
||||
R"foo("(hello world)foo")foo"
|
||||
)delm");
|
||||
|
||||
REQUIRE(ret == false);
|
||||
REQUIRE(token == "\"(hello world");
|
||||
}
|
||||
|
||||
// Case 4: closing delimiter `bar` differs from the captured `foo`; expects
// failure and no token recorded.
{
|
||||
token.clear();
|
||||
auto ret = parser.parse(R"delm(
|
||||
R"foo("(hello world)")bar"
|
||||
)delm");
|
||||
|
||||
REQUIRE(ret == false);
|
||||
REQUIRE(token.empty());
|
||||
}
|
||||
}
|
||||
|
||||
// The grammar captures `$delm< ... >` but RQUOTE refers to `$delm2`, a name
// that is never captured; parse() is expected to throw std::runtime_error.
TEST_CASE("Invalid back reference test", "[back reference]")
|
||||
{
|
||||
parser parser(R"(
|
||||
START <- _ LQUOTE (!RQUOTE .)* RQUOTE _
|
||||
LQUOTE <- 'R"' $delm< [a-zA-Z]* > '('
|
||||
RQUOTE <- ')' $delm2 '"'
|
||||
~_ <- [ \t\r\n]*
|
||||
)");
|
||||
|
||||
REQUIRE_THROWS_AS(
|
||||
parser.parse(R"delm(
|
||||
R"foo("(hello world)")foo"
|
||||
)delm"),
|
||||
std::runtime_error);
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Left recursive test", "[left recursive]")
|
||||
{
|
||||
parser parser(
|
||||
|
Loading…
Reference in New Issue
Block a user