Changed the capture operator and made the anchor operator.

This commit is contained in:
yhirose 2015-02-17 22:35:07 -05:00
parent f0351a9909
commit 56daf08d5b
4 changed files with 130 additions and 87 deletions

View File

@ -7,7 +7,8 @@ C++11 header-only [PEG](http://en.wikipedia.org/wiki/Parsing_expression_grammar)
The PEG syntax is well described on page 2 in the [document](http://pdos.csail.mit.edu/papers/parsing:popl04.pdf). *cpp-peglib* also supports the following additional syntax for now:
* `<` and `>` (Capture operators)
* `<` ... `>` (Anchor operators)
* `$<` ... `>` (Capture operators)
How to use
----------
@ -88,7 +89,7 @@ Here is a complete list of available actions:
`any& c` is a context data which can be used by the user for whatever purposes.
The following example uses `<` and ` >` operators. They are the *capture* operators. Each capture operator creates a semantic value that contains `const char*` of the position. It could be useful to eliminate unnecessary characters.
The following example uses `<` and ` >` operators. They are the *anchor* operators. Each anchor operator creates a semantic value that contains `const char*` of the position. It could be useful to eliminate unnecessary characters.
```c++
auto syntax = R"(
@ -100,9 +101,8 @@ auto syntax = R"(
peg pg(syntax);
pg["TOKEN"] = [](const char* s, size_t l, const vector<any>& v) {
auto b = v[0].get<const char*>(); // '<'
auto e = v[1].get<const char*>(); // '>'
auto token = string(b, e - b); // 'token' doesn't include trailing whitespaces
// 'token' doesn't include trailing whitespaces
auto token = string(s, l);
};
auto ret = pg.parse(" token1, token2 ");
@ -113,13 +113,13 @@ Simple interface
*cpp-peglib* provides std::regex-like simple interface for trivial tasks.
`peglib::peg_match` tries to capture strings in the `< ... >` operator and store them into `peglib::match` object.
`peglib::peg_match` tries to capture strings in the `$< ... >` operator and store them into `peglib::match` object.
```c++
peglib::match m;
auto ret = peglib::peg_match(
R"(
ROOT <- _ ('[' < TAG_NAME > ']' _)*
ROOT <- _ ('[' $< TAG_NAME > ']' _)*
TAG_NAME <- (!']' .)+
_ <- [ \t]*
)",
@ -139,7 +139,7 @@ There are some ways to *search* a peg pattern in a document.
using namespace peglib;
auto syntax = R"(
ROOT <- '[' < [a-z0-9]+ > ']'
ROOT <- '[' $< [a-z0-9]+ > ']'
)";
auto s = " [tag1] [tag2] [tag3] ";
@ -206,6 +206,8 @@ The following are available operators:
| cls | Character class |
| chr | Character |
| dot | Any character |
| anc | Anchor character |
| cap | Capture character |
Sample codes
------------

View File

@ -27,11 +27,11 @@ int main(int argc, const char** argv)
return -1;
}
peglib::peg parser(syntax.data(), syntax.size(), [&](size_t ln, size_t col, const string& msg) {
peglib::peg peg(syntax.data(), syntax.size(), [&](size_t ln, size_t col, const string& msg) {
cerr << syntax_path << ":" << ln << ":" << col << ": " << msg << endl;
});
if (!parser) {
if (!peg) {
return -1;
}
@ -48,10 +48,14 @@ int main(int argc, const char** argv)
return -1;
}
auto ret = parser.lint(source.data(), source.size(), true, [&](size_t ln, size_t col, const string& msg) {
auto ret = peg.lint(source.data(), source.size(), true, [&](size_t ln, size_t col, const string& msg) {
cerr << source_path << ":" << ln << ":" << col << ": " << msg << endl;
});
if (ret) {
peg.parse(source.data(), source.size());
}
return ret ? 0 : -1;
}

162
peglib.h
View File

@ -151,7 +151,15 @@ private:
/*
* Semantic values
*/
typedef std::vector<any> Values;
struct SemanticValues
{
std::vector<any> values;
//std::vector<std::string> names;
const char* s;
size_t l;
SemanticValues() : s(nullptr), l(0) {}
};
/*
* Semantic action
@ -366,7 +374,7 @@ class Ope
{
public:
virtual ~Ope() {};
virtual Result parse(const char* s, size_t l, Values& v, any& c) const = 0;
virtual Result parse(const char* s, size_t l, SemanticValues& v, any& c) const = 0;
};
class Sequence : public Ope
@ -393,7 +401,7 @@ public:
Sequence(const std::vector<std::shared_ptr<Ope>>& opes) : opes_(opes) {}
Sequence(std::vector<std::shared_ptr<Ope>>&& opes) : opes_(std::move(opes)) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
size_t i = 0;
for (const auto& ope : opes_) {
const auto& rule = *ope;
@ -436,18 +444,20 @@ public:
PrioritizedChoice(const std::vector<std::shared_ptr<Ope>>& opes) : opes_(opes) {}
PrioritizedChoice(std::vector<std::shared_ptr<Ope>>&& opes) : opes_(std::move(opes)) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
size_t id = 0;
for (const auto& ope : opes_) {
const auto& rule = *ope;
Values chldsv;
SemanticValues chldsv;
auto r = rule.parse(s, l, chldsv, c);
if (r.ret) {
if (!chldsv.empty()) {
for (const auto& x: chldsv) {
v.push_back(x);
}
//assert(chldsv.values.size() == chldsv.names.size());
if (!chldsv.values.empty()) {
v.values.insert(v.values.end(), chldsv.values.begin(), chldsv.values.end());
//v.names.insert(v.names.end(), chldsv.names.begin(), chldsv.names.end());
}
v.s = chldsv.s;
v.l = chldsv.l;
return success(r.len, id);
}
id++;
@ -466,7 +476,7 @@ class ZeroOrMore : public Ope
public:
ZeroOrMore(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
auto i = 0;
while (l - i > 0) {
const auto& rule = *ope_;
@ -488,7 +498,7 @@ class OneOrMore : public Ope
public:
OneOrMore(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
if (!r.ret) {
@ -519,7 +529,7 @@ class Option : public Ope
public:
Option(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
return success(r.ret ? r.len : 0);
@ -534,7 +544,7 @@ class AndPredicate : public Ope
public:
AndPredicate(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
if (r.ret) {
@ -553,7 +563,7 @@ class NotPredicate : public Ope
public:
NotPredicate(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
if (r.ret) {
@ -572,7 +582,7 @@ class LiteralString : public Ope
public:
LiteralString(const std::string& s) : lit_(s) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
auto i = 0u;
for (; i < lit_.size(); i++) {
if (i >= l || s[i] != lit_[i]) {
@ -591,7 +601,7 @@ class CharacterClass : public Ope
public:
CharacterClass(const std::string& chars) : chars_(chars) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
// TODO: UTF8 support
if (l < 1) {
return fail(s);
@ -623,7 +633,7 @@ class Character : public Ope
public:
Character(char ch) : ch_(ch) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
// TODO: UTF8 support
if (l < 1 || s[0] != ch_) {
return fail(s);
@ -638,7 +648,7 @@ private:
class AnyCharacter : public Ope
{
public:
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
// TODO: UTF8 support
if (l < 1) {
return fail(s);
@ -651,11 +661,10 @@ public:
class Capture : public Ope
{
public:
Capture(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Capture(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t ci)
: ope_(ope), match_action_(ma), capture_id(ci) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
assert(ope_);
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
@ -674,10 +683,21 @@ private:
class Anchor : public Ope
{
public:
Result parse(const char* s, size_t l, Values& v, any& c) const {
return success(0);
Anchor(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
assert(ope_);
const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c);
if (r.ret) {
v.s = s;
v.l = r.len;
}
return r;
}
private:
std::shared_ptr<Ope> ope_;
};
class WeakHolder : public Ope
@ -685,7 +705,7 @@ class WeakHolder : public Ope
public:
WeakHolder(const std::shared_ptr<Ope>& ope) : weak_(ope) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
auto ope = weak_.lock();
assert(ope);
const auto& rule = *ope;
@ -738,17 +758,17 @@ public:
return *this;
}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
return holder_->parse(s, l, v, c);
}
template <typename T>
Result parse(const char* s, size_t l, T& val) const {
Values v;
SemanticValues v;
any c;
auto r = holder_->parse(s, l, v, c);
if (r.ret && !v.empty() && !v.front().is_undefined()) {
val = v[0].get<T>();
if (r.ret && !v.values.empty() && !v.values.front().is_undefined()) {
val = v.values[0].get<T>();
}
return r;
}
@ -761,7 +781,7 @@ public:
Result parse(const char* s) const {
auto l = strlen(s);
Values v;
SemanticValues v;
any c;
return holder_->parse(s, l, v, c);
}
@ -795,13 +815,13 @@ private:
Holder(Definition* outer)
: outer_(outer) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
if (!ope_) {
throw std::logic_error("Uninitialized definition ope was used...");
}
const auto& rule = *ope_;
Values chldsv;
SemanticValues chldsv;
auto r = rule.parse(s, l, chldsv, c);
if (r.ret) {
assert(!outer_->actions.empty());
@ -811,7 +831,12 @@ private:
? outer_->actions[id]
: outer_->actions[0];
v.push_back(reduce(s, r.len, chldsv, c, ac));
auto ts = chldsv.s ? chldsv.s : s;
auto tl = chldsv.s ? chldsv.l : r.len;
auto sv = reduce(ts, tl, chldsv, c, ac);
v.values.push_back(sv);
//v.names.push_back(outer_->name);
}
return r;
}
@ -819,13 +844,13 @@ private:
private:
friend class Definition;
any reduce(const char* s, size_t l, const Values& v, any& c, const Action& action) const {
any reduce(const char* s, size_t l, const SemanticValues& v, any& c, const Action& action) const {
if (action) {
return action(s, l, v, c);
} else if (v.empty()) {
return action(s, l, v.values, c);
} else if (v.values.empty()) {
return any();
} else {
return v.front();
return v.values.front();
}
}
@ -847,7 +872,7 @@ public:
: grammar_(grammar)
, name_(name) {}
Result parse(const char* s, size_t l, Values& v, any& c) const {
Result parse(const char* s, size_t l, SemanticValues& v, any& c) const {
const auto& rule = *grammar_.at(name_).holder_;
return rule.parse(s, l, v, c);
}
@ -916,8 +941,8 @@ inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma)
return std::make_shared<Capture>(ope, ma, (size_t)-1);
}
inline std::shared_ptr<Ope> anc() {
return std::make_shared<Anchor>();
inline std::shared_ptr<Ope> anc(const std::shared_ptr<Ope>& ope) {
return std::make_shared<Anchor>(ope);
}
inline std::shared_ptr<Ope> ref(const std::map<std::string, Definition>& grammar, const std::string& name) {
@ -994,7 +1019,8 @@ private:
g["Suffix"] <= seq(g["Primary"], opt(cho(g["QUESTION"], g["STAR"], g["PLUS"])));
g["Primary"] <= cho(seq(g["Identifier"], npd(g["LEFTARROW"])),
seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["CAPTUREOPEN"], g["Expression"], g["CAPTURECLOSE"]),
seq(g["Begin"], g["Expression"], g["End"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["Literal"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
@ -1002,13 +1028,10 @@ private:
g["IdentStart"] <= cls("a-zA-Z_");
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
g["Literal"] <= cho(seq(cls("'"), g["SQCont"], cls("'"), g["Spacing"]),
seq(cls("\""), g["DQCont"], cls("\""), g["Spacing"]));
g["SQCont"] <= zom(seq(npd(cls("'")), g["Char"]));
g["DQCont"] <= zom(seq(npd(cls("\"")), g["Char"]));
g["Literal"] <= cho(seq(cls("'"), anc(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
seq(cls("\""), anc(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
g["Class"] <= seq(chr('['), g["ClassCont"], chr(']'), g["Spacing"]);
g["ClassCont"] <= zom(seq(npd(chr(']')), g["Range"]));
g["Class"] <= seq(chr('['), anc(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")),
@ -1033,8 +1056,11 @@ private:
g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r'));
g["EndOfFile"] <= npd(dot());
g["CAPTUREOPEN"] <= seq(chr('<'), g["Spacing"]);
g["CAPTURECLOSE"] <= seq(chr('>'), g["Spacing"]);
g["Begin"] <= seq(chr('<'), g["Spacing"]);
g["End"] <= seq(chr('>'), g["Spacing"]);
g["BeginCap"] <= seq(lit("$<"), g["Spacing"]);
g["EndCap"] <= seq(lit(">"), g["Spacing"]);
// Set definition names
for (auto& x: g) {
@ -1130,36 +1156,27 @@ private:
[&](const std::vector<any>& v) {
return v[1];
},
// Anchor
[&](const std::vector<any>& v) {
auto ope = v[1].get<std::shared_ptr<Ope>>();
return anc(ope);
},
// Capture
[&](const char* s, size_t l, const std::vector<any>& v, any& c) {
[&](const std::vector<any>& v, any& c) {
Context& cxt = *c.get<Context*>();
auto ope = v[1].get<std::shared_ptr<Ope>>();
return seq(
ref(*cxt.grammar, "%ANCHOR%"),
cap(ope, cxt.match_action, ++cxt.capture_count),
ref(*cxt.grammar, "%ANCHOR%"));
return cap(ope, cxt.match_action, ++cxt.capture_count);
}
};
g["IdentCont"] = [](const char* s, size_t l) {
return std::string(s, l);
};
g["Literal"] = [](const std::vector<any>& v) {
return lit(v[0].get<std::string>());
g["Literal"] = [this](const char* s, size_t l) {
return lit(resolve_escape_sequence(s, l));
};
g["SQCont"] = [this](const char* s, size_t l) {
return resolve_escape_sequence(s, l);
};
g["DQCont"] = [this](const char* s, size_t l) {
return resolve_escape_sequence(s, l);
};
g["Class"] = [](const std::vector<any>& v) {
return cls(v[0].get<std::string>());
};
g["ClassCont"] = [this](const char* s, size_t l) {
return resolve_escape_sequence(s, l);
g["Class"] = [this](const char* s, size_t l) {
return cls(resolve_escape_sequence(s, l));
};
g["AND"] = [](const char* s, size_t l) { return *s; };
@ -1168,16 +1185,14 @@ private:
g["STAR"] = [](const char* s, size_t l) { return *s; };
g["PLUS"] = [](const char* s, size_t l) { return *s; };
g["DOT"] = []() {
return dot();
};
g["DOT"] = []() { return dot(); };
}
std::shared_ptr<Grammar> perform_core(const char* s, size_t l, std::string& start, MatchAction ma, Log log) {
Context cxt;
cxt.match_action = ma;
Values v;
SemanticValues v;
any c = &cxt;
auto r = g["Grammar"].parse(s, l, v, c);
@ -1205,9 +1220,6 @@ private:
start = cxt.start;
grammar["%ANCHOR%"] <= anc();
grammar["%ANCHOR%"] = [](const char* s, size_t l) { return s; };
return cxt.grammar;
}
@ -1345,7 +1357,7 @@ public:
}
} else if (exact && r.len != l) {
auto line = line_info(s, s + r.len);
log(line.first, line.second, "garbage string at the end");
log(line.first, line.second, "syntax error");
}
return r.ret && (!exact || r.len == l);
}

View File

@ -39,7 +39,7 @@ TEST_CASE("String capture test with match", "[general]")
{
peglib::match m;
auto ret = peglib::peg_match(
" ROOT <- _ ('[' < TAG_NAME > ']' _)* "
" ROOT <- _ ('[' $< TAG_NAME > ']' _)* "
" TAG_NAME <- (!']' .)+ "
" _ <- [ \t]* ",
" [tag1] [tag:2] [tag-3] ",
@ -74,6 +74,31 @@ TEST_CASE("String capture test2", "[general]")
REQUIRE(tags[2] == "tag-3");
}
TEST_CASE("String capture test3", "[general]")
{
auto syntax =
" ROOT <- _ TOKEN* "
" TOKEN <- '[' < (!']' .)+ > ']' _ "
" _ <- [ \t\r\n]* "
;
peg pg(syntax);
std::vector<std::string> tags;
pg["TOKEN"] = [&](const char* s, size_t l, const vector<any>& v) {
tags.push_back(std::string(s, l));
};
auto ret = pg.parse(" [tag1] [tag:2] [tag-3] ");
REQUIRE(ret == true);
REQUIRE(tags.size() == 3);
REQUIRE(tags[0] == "tag1");
REQUIRE(tags[1] == "tag:2");
REQUIRE(tags[2] == "tag-3");
}
TEST_CASE("String capture test with embedded match action", "[general]")
{
rule ROOT, TAG, TAG_NAME, WS;