Added simple interface.

This commit is contained in:
yhirose 2015-02-15 17:52:39 -05:00
parent 4c5fd70503
commit 81ca85cba5
3 changed files with 414 additions and 88 deletions

146
README.md
View File

@ -10,72 +10,20 @@ The PEG syntax is well described on page 2 in the [document](http://pdos.csail.m
How to use How to use
---------- ----------
What if we want to extract only tag names in brackets from ` [tag1] [tag2] [tag3] [tag4]... `? This is a simple calculator sample. It shows how to define a grammar, associate semantic actions with the grammar, and handle semantic values.
PEG grammar for this task could be like this:
```
ROOT <- _ ('[' TAG_NAME ']' _)*
TAG_NAME <- (!']' .)+
_ <- [ \t]*
```
Here is how to parse text with the PEG syntax and retrieve tag names:
```c++ ```c++
// (1) Include the header file
#include "peglib.h"
// (2) Make a parser
peglib::peg parser(R"(
ROOT <- _ ('[' TAG_NAME ']' _)*
TAG_NAME <- (!']' .)+
_ <- [ \t]*
)");
// (3) Setup an action
std::vector<std::string> tags;
parser["TAG_NAME"] = [&](const char* s, size_t l) {
tags.push_back(std::string(s, l));
};
// (4) Parse
auto ret = parser.parse(" [tag1] [tag:2] [tag-3] ");
assert(ret == true);
assert(tags[0] == "tag1");
assert(tags[1] == "tag:2");
assert(tags[2] == "tag-3");
```
This action `[&](const char* s, size_t l)` gives a pointer and length of the matched string.
There are more actions available. Here is a complete list:
```c++
[](const char* s, size_t l, const std::vector<peglib::any>& v, any& c)
[](const char* s, size_t l, const std::vector<peglib::any>& v)
[](const char* s, size_t l)
[](const std::vector<peglib::any>& v, any& c)
[](const std::vector<peglib::any>& v)
[]()
```
`const std::vector<peglib::any>& v` contains semantic values. The `peglib::any` class is very similar to [boost::any](http://www.boost.org/doc/libs/1_57_0/doc/html/any.html). You can obtain a value by casting it to the actual type. To determine the actual type, check the return type of the child action that produced the semantic value.
`any& c` is a context data which can be used by the user for whatever purposes.
This is a complete code of a simple calculator. It shows how to associate actions to definitions and set/get semantic values.
```c++
#include <peglib.h>
#include <assert.h> #include <assert.h>
// (1) Include the header file
#include <peglib.h>
using namespace peglib; using namespace peglib;
using namespace std; using namespace std;
int main(void) { int main(void) {
// (2) Make a parser
auto syntax = R"( auto syntax = R"(
# Grammar for Calculator... # Grammar for Calculator...
Additive <- Multitive '+' Additive / Multitive Additive <- Multitive '+' Additive / Multitive
@ -86,6 +34,7 @@ int main(void) {
peg parser(syntax); peg parser(syntax);
// (3) Setup an action
parser["Additive"] = { parser["Additive"] = {
nullptr, // Default action nullptr, // Default action
[](const vector<any>& v) { [](const vector<any>& v) {
@ -110,6 +59,7 @@ int main(void) {
return stoi(string(s, l), nullptr, 10); return stoi(string(s, l), nullptr, 10);
}; };
// (4) Parse
int val; int val;
parser.parse("1+2*3", val); parser.parse("1+2*3", val);
@ -117,6 +67,85 @@ int main(void) {
} }
``` ```
Here is a complete list of available actions:
```c++
[](const char* s, size_t l, const std::vector<peglib::any>& v, any& c)
[](const char* s, size_t l, const std::vector<peglib::any>& v)
[](const char* s, size_t l)
[](const std::vector<peglib::any>& v, any& c)
[](const std::vector<peglib::any>& v)
[]()
```
`const char* s, size_t l` gives a pointer and length of the matched string.
`const std::vector<peglib::any>& v` contains semantic values. The `peglib::any` class is very similar to [boost::any](http://www.boost.org/doc/libs/1_57_0/doc/html/any.html). You can obtain a value by casting it to the actual type. To determine the actual type, check the return type of the child action that produced the semantic value.
`any& c` is a context data which can be used by the user for whatever purposes.
Simple interface
----------------
*cpp-peglib* provides std::regex-like simple interface for trivial tasks.
In the following example, `< ... >` is the *capture* operator. `peglib::peg_match` captures the strings matched inside `< ... >` and stores them in a `peglib::match` object. Entry 0 of the `match` is the whole input; captured strings start at index 1.
```c++
peglib::match m;
auto ret = peglib::peg_match(
R"(
ROOT <- _ ('[' < TAG_NAME > ']' _)*
TAG_NAME <- (!']' .)+
_ <- [ \t]*
)",
" [tag1] [tag:2] [tag-3] ",
m);
assert(ret == true);
assert(m.size() == 4);
assert(m.str(1) == "tag1");
assert(m.str(2) == "tag:2");
assert(m.str(3) == "tag-3");
```
There are some ways to *search* a peg pattern in a document.
```c++
using namespace peglib;
auto syntax = R"(
ROOT <- '[' < [a-z0-9]+ > ']'
)";
auto s = " [tag1] [tag2] [tag3] ";
// peglib::peg_search
peg pg(syntax);
size_t pos = 0;
auto l = strlen(s);
match m;
while (peg_search(pg, s + pos, l - pos, m)) {
cout << m.str() << endl; // entire match
cout << m.str(1) << endl; // submatch #1
pos += m.length();
}
// peglib::peg_token_iterator
peg_token_iterator it(syntax, s);
while (it != peg_token_iterator()) {
cout << it->str() << endl; // entire match
cout << it->str(1) << endl; // submatch #1
++it;
}
// peglib::peg_token_range
for (auto& m: peg_token_range(syntax, s)) {
cout << m.str() << endl; // entire match
cout << m.str(1) << endl; // submatch #1
}
```
Make a parser with parser operators Make a parser with parser operators
----------------------------------- -----------------------------------
@ -144,7 +173,6 @@ The following are available operators:
|:---------|:-------------------| |:---------|:-------------------|
| seq | Sequence | | seq | Sequence |
| cho | Prioritized Choice | | cho | Prioritized Choice |
| grp | Grouping |
| zom | Zero or More | | zom | Zero or More |
| oom | One or More | | oom | One or More |
| opt | Optional | | opt | Optional |

332
peglib.h
View File

@ -356,6 +356,11 @@ private:
Fty fn_; Fty fn_;
}; };
/*
 * Match action
 *
 * Callback fired by a capture operator when its inner operator succeeds.
 *   s - pointer to the start of the captured text
 *   l - length of the captured text
 *   i - id of the capture that fired
 */
using MatchAction = std::function<void (const char* s, size_t l, size_t i)>;
/* /*
* Result * Result
*/ */
@ -665,25 +670,27 @@ public:
}; };
class Grouping : public Ope class Capture : public Ope
{ {
public: public:
Grouping(const std::shared_ptr<Ope>& ope) : ope_(ope) {} Capture(const std::shared_ptr<Ope>& ope) : ope_(ope) {}
Grouping(const std::shared_ptr<Ope>& ope, std::function<void(const char* s, size_t l)> match) : ope_(ope), match_(match) {} Capture(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t ci)
: ope_(ope), match_action_(ma), capture_id(ci) {}
Result parse(const char* s, size_t l, Values& v, any& c) const { Result parse(const char* s, size_t l, Values& v, any& c) const {
assert(ope_); assert(ope_);
const auto& rule = *ope_; const auto& rule = *ope_;
auto r = rule.parse(s, l, v, c); auto r = rule.parse(s, l, v, c);
if (r.ret && match_) { if (r.ret && match_action_) {
match_(s, r.len); match_action_(s, r.len, capture_id);
} }
return r; return r;
} }
private: private:
std::shared_ptr<Ope> ope_; std::shared_ptr<Ope> ope_;
std::function<void(const char* s, size_t l)> match_; MatchAction match_action_;
size_t capture_id;
}; };
class WeakHolder : public Ope class WeakHolder : public Ope
@ -914,12 +921,12 @@ inline std::shared_ptr<Ope> dot() {
return std::make_shared<AnyCharacter>(); return std::make_shared<AnyCharacter>();
} }
inline std::shared_ptr<Ope> grp(const std::shared_ptr<Ope>& ope) { inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma, size_t ci) {
return std::make_shared<Grouping>(ope); return std::make_shared<Capture>(ope, ma, ci);
} }
inline std::shared_ptr<Ope> grp(const std::shared_ptr<Ope>& ope, std::function<void (const char* s, size_t l)> match) { inline std::shared_ptr<Ope> cap(const std::shared_ptr<Ope>& ope, MatchAction ma) {
return std::make_shared<Grouping>(ope, match); return std::make_shared<Capture>(ope, ma, (size_t)-1);
} }
inline std::shared_ptr<Ope> ref(const std::map<std::string, Definition>& grammar, const std::string& name) { inline std::shared_ptr<Ope> ref(const std::map<std::string, Definition>& grammar, const std::string& name) {
@ -954,9 +961,9 @@ typedef std::function<void (size_t, size_t, const std::string&)> Log;
class PEGParser class PEGParser
{ {
public: public:
static std::shared_ptr<Grammar> parse(const char* s, size_t l, std::string& start, Log log) { static std::shared_ptr<Grammar> parse(const char* s, size_t l, std::string& start, MatchAction ma, Log log) {
static PEGParser instance; static PEGParser instance;
return get().perform_core(s, l, start, log); return get().perform_core(s, l, start, ma, log);
} }
// For debuging purpose // For debuging purpose
@ -976,9 +983,13 @@ private:
} }
struct Context { struct Context {
std::shared_ptr<Grammar> grammar = std::make_shared<Grammar>(); std::shared_ptr<Grammar> grammar;
std::string start; std::string start;
std::map<std::string, const char*> refs; MatchAction match_action;
std::map<std::string, const char*> references;
size_t capture_count;
Context() : grammar(std::make_shared<Grammar>()), capture_count(0) {}
}; };
void make_grammar() { void make_grammar() {
@ -992,6 +1003,7 @@ private:
g["Suffix"] <= seq(g["Primary"], opt(cho(g["QUESTION"], g["STAR"], g["PLUS"]))); g["Suffix"] <= seq(g["Primary"], opt(cho(g["QUESTION"], g["STAR"], g["PLUS"])));
g["Primary"] <= cho(seq(g["Identifier"], npd(g["LEFTARROW"])), g["Primary"] <= cho(seq(g["Identifier"], npd(g["LEFTARROW"])),
seq(g["OPEN"], g["Expression"], g["CLOSE"]), seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["CAPTUREOPEN"], g["Expression"], g["CAPTURECLOSE"]),
g["Literal"], g["Class"], g["DOT"]); g["Literal"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
@ -1030,6 +1042,9 @@ private:
g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r')); g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r'));
g["EndOfFile"] <= npd(dot()); g["EndOfFile"] <= npd(dot());
g["CAPTUREOPEN"] <= seq(chr('<'), g["Spacing"]);
g["CAPTURECLOSE"] <= seq(chr('>'), g["Spacing"]);
// Set definition names // Set definition names
for (auto& x: g) { for (auto& x: g) {
x.second.name = x.first; x.second.name = x.first;
@ -1117,11 +1132,17 @@ private:
}, },
[&](const char* s, size_t l, const std::vector<any>& v, any& c) { [&](const char* s, size_t l, const std::vector<any>& v, any& c) {
Context& cxt = *c.get<Context*>(); Context& cxt = *c.get<Context*>();
cxt.refs[v[0]] = s; cxt.references[v[0]] = s;
return ref(*cxt.grammar, v[0]); return ref(*cxt.grammar, v[0]);
}, },
[&](const std::vector<any>& v) { [&](const std::vector<any>& v) {
return v[1]; return v[1];
},
// Capture
[&](const std::vector<any>& v, any& c) {
Context& cxt = *c.get<Context*>();
auto ope = v[1].get<std::shared_ptr<Ope>>();
return cap(ope, cxt.match_action, ++cxt.capture_count);
} }
}; };
@ -1157,9 +1178,11 @@ private:
}; };
} }
std::shared_ptr<Grammar> perform_core(const char* s, size_t l, std::string& start, Log log) { std::shared_ptr<Grammar> perform_core(const char* s, size_t l, std::string& start, MatchAction ma, Log log) {
Values v;
Context cxt; Context cxt;
cxt.match_action = ma;
Values v;
any c = &cxt; any c = &cxt;
auto r = g["Grammar"].parse(s, l, v, c); auto r = g["Grammar"].parse(s, l, v, c);
@ -1171,7 +1194,7 @@ private:
return nullptr; return nullptr;
} }
for (const auto& x : cxt.refs) { for (const auto& x : cxt.references) {
const auto& name = x.first; const auto& name = x.first;
auto ptr = x.second; auto ptr = x.second;
if (cxt.grammar->find(name) == cxt.grammar->end()) { if (cxt.grammar->find(name) == cxt.grammar->end()) {
@ -1228,13 +1251,27 @@ private:
class peg class peg
{ {
public: public:
peg() = default;
peg(const char* s, size_t l, Log log = nullptr) { peg(const char* s, size_t l, Log log = nullptr) {
grammar_ = PEGParser::parse(s, l, start_, log); grammar_ = PEGParser::parse(
s, l,
start_,
[&](const char* s, size_t l, size_t i) {
if (match_action) match_action(s, l, i);
},
log);
} }
peg(const char* s, Log log = nullptr) { peg(const char* s, Log log = nullptr) {
auto l = strlen(s); auto l = strlen(s);
grammar_ = PEGParser::parse(s, l, start_, log); grammar_ = PEGParser::parse(
s, l,
start_,
[&](const char* s, size_t l, size_t i) {
if (match_action) match_action(s, l, i);
},
log);
} }
operator bool() { operator bool() {
@ -1251,6 +1288,12 @@ public:
return false; return false;
} }
template <typename T>
bool parse(const char* s, T& out, bool exact = true) const {
auto l = strlen(s);
return parse(s, l, out, exact);
}
bool parse(const char* s, size_t l, bool exact = true) const { bool parse(const char* s, size_t l, bool exact = true) const {
if (grammar_ != nullptr) { if (grammar_ != nullptr) {
const auto& rule = (*grammar_)[start_]; const auto& rule = (*grammar_)[start_];
@ -1260,17 +1303,36 @@ public:
return false; return false;
} }
template <typename T>
bool parse(const char* s, T& out, bool exact = true) const {
auto l = strlen(s);
return parse(s, l, out, exact);
}
bool parse(const char* s, bool exact = true) const { bool parse(const char* s, bool exact = true) const {
auto l = strlen(s); auto l = strlen(s);
return parse(s, l, exact); return parse(s, l, exact);
} }
// Scans the buffer [s, s + l) for the first offset at which the start rule
// matches.
//
//   s, l  - buffer to scan and its length
//   mpos  - out: offset (relative to s) where the match begins
//   mlen  - out: length of the match
//
// Returns true when a match was found. On failure, or when no grammar is
// loaded, returns false with mpos and mlen both set to 0.
bool search(const char* s, size_t l, size_t& mpos, size_t& mlen) const {
    // Leave the out-parameters in a defined state on every failure path.
    mpos = 0;
    mlen = 0;

    // The grammar must be checked BEFORE it is dereferenced: a peg built from
    // an invalid syntax (or default-constructed) has no grammar at all.
    if (grammar_ == nullptr) {
        return false;
    }

    const auto& rule = (*grammar_)[start_];

    // Try the start rule at each successive offset; the first success wins.
    for (size_t pos = 0; pos < l; ++pos) {
        auto r = rule.parse(s + pos, l - pos);
        if (r.ret) {
            mpos = pos;
            mlen = r.len;
            return true;
        }
    }

    return false;
}
bool search(const char* s, size_t& mpos, size_t& mlen) const {
auto l = strlen(s);
return search(s, l, mpos, mlen);
}
bool lint(const char* s, size_t l, bool exact, Log log = nullptr) { bool lint(const char* s, size_t l, bool exact, Log log = nullptr) {
assert(grammar_); assert(grammar_);
if (grammar_ != nullptr) { if (grammar_ != nullptr) {
@ -1294,11 +1356,227 @@ public:
return (*grammar_)[s]; return (*grammar_)[s];
} }
MatchAction match_action;
private: private:
std::shared_ptr<Grammar> grammar_; std::shared_ptr<Grammar> grammar_;
std::string start_; std::string start_;
}; };
/*-----------------------------------------------------------------------------
* Utilities
*---------------------------------------------------------------------------*/
// Result container filled by the peg_match / peg_search helpers.
//
// Each entry is a (pointer, length, id) view into the text that was parsed;
// nothing is copied, so the entries are only valid while that text is alive.
// The helpers store the whole match at index 0 (id 0) followed by the
// captured strings in the order they were recorded.
struct match
{
    struct Item {
        const char* s;   // start of the matched text (not NUL-terminated)
        size_t      l;   // number of characters matched
        size_t      id;  // capture id; the helpers use 0 for the whole match

        size_t length() const { return l; }

        std::string str() const { return std::string(s, l); }
    };

    std::vector<Item> matches;

    typedef std::vector<Item>::iterator       iterator;
    typedef std::vector<Item>::const_iterator const_iterator;

    bool empty() const {
        return matches.empty();
    }

    size_t size() const {
        return matches.size();
    }

    // Length of entry n. Like operator[], n is not range-checked.
    // Const-qualified so it can be used on a const match, the same as str().
    size_t length(size_t n = 0) const {
        return matches[n].length();
    }

    // Copy of the text of entry n. n is not range-checked.
    std::string str(size_t n = 0) const {
        return matches[n].str();
    }

    const Item& operator[](size_t n) const {
        return matches[n];
    }

    iterator begin() {
        return matches.begin();
    }

    iterator end() {
        return matches.end();
    }

    const_iterator begin() const {
        return matches.cbegin();
    }

    const_iterator end() const {
        return matches.cend();
    }
};
// Parses the NUL-terminated text `s` with the grammar described by `syntax`
// and collects the captured strings into `m`.
//
// On success returns true; m[0] then spans the whole of `s` and the entries
// after it are the captures in the order they were recorded.
// On failure returns false and leaves `m` empty, so captures recorded before
// the parse failed are never mistaken for a result.
inline bool peg_match(const char* syntax, const char* s, match& m) {
    m.matches.clear();

    peg pg(syntax);
    pg.match_action = [&m](const char* cap_s, size_t cap_l, size_t cap_id) {
        m.matches.push_back(match::Item{ cap_s, cap_l, cap_id });
    };

    if (!pg.parse(s)) {
        // Drop whatever was captured before the parse gave up.
        m.matches.clear();
        return false;
    }

    // The whole input always goes in front of the captures, as entry 0.
    m.matches.insert(m.matches.begin(), match::Item{ s, strlen(s), 0 });
    return true;
}
// Yes/no variant of peg_match: builds a parser from `syntax`, parses the
// NUL-terminated text `s`, and reports only whether the parse succeeded.
inline bool peg_match(const char* syntax, const char* s) {
    return peg(syntax).parse(s);
}
// Searches the buffer [s, s + l) for the first place where the start rule of
// `pg` matches, collecting captures into `m`.
//
// On success returns true; m[0] is the matched text and the entries after it
// are the captures that were recorded. On failure returns false and leaves
// `m` empty.
//
// The match action that was installed on `pg` before the call is put back
// before returning, so `pg` never keeps a callback that refers to `m` after
// `m` may have gone out of scope.
//
// NOTE(review): a capture fires as soon as its inner operator succeeds, and
// peg::search retries at successive offsets without resetting anything, so
// captures from attempts that failed at earlier offsets can precede the
// captures of the successful attempt in `m`.
inline bool peg_search(peg& pg, const char* s, size_t l, match& m) {
    m.matches.clear();

    // Restores the caller's match action on every exit path, exceptions
    // included.
    struct ActionGuard {
        peg&        target;
        MatchAction previous;
        ~ActionGuard() { target.match_action.swap(previous); }
    };
    ActionGuard guard{ pg, pg.match_action };

    pg.match_action = [&m](const char* cap_s, size_t cap_l, size_t cap_id) {
        m.matches.push_back(match::Item{ cap_s, cap_l, cap_id });
    };

    size_t mpos = 0;
    size_t mlen = 0;
    if (!pg.search(s, l, mpos, mlen)) {
        // Nothing matched: discard captures left behind by failed attempts.
        m.matches.clear();
        return false;
    }

    // Entry 0 is the whole match; mpos is relative to s.
    m.matches.insert(m.matches.begin(), match::Item{ s + mpos, mlen, 0 });
    return true;
}
// Overload for NUL-terminated text: the length is taken with strlen() and the
// call is forwarded to the (pointer, length) overload.
inline bool peg_search(peg& pg, const char* s, match& m) {
    return peg_search(pg, s, strlen(s), m);
}
inline bool peg_search(const char* syntax, const char* s, size_t l, match& m) {
peg pg(syntax);
return peg_search(pg, s, l, m);
}
inline bool peg_search(const char* syntax, const char* s, match& m) {
peg pg(syntax);
auto l = strlen(s);
return peg_search(pg, s, l, m);
}
class peg_token_iterator : public std::iterator<std::forward_iterator_tag, match>
{
public:
peg_token_iterator()
: s_(nullptr)
, l_(0)
, pos_(std::numeric_limits<size_t>::max()) {}
peg_token_iterator(const char* syntax, const char* s)
: peg_(syntax)
, s_(s)
, l_(strlen(s))
, pos_(0) {
peg_.match_action = [&](const char* s, size_t l, size_t i) {
m_.matches.push_back(match::Item{ s, l, i });
};
search();
}
peg_token_iterator(const peg_token_iterator& rhs)
: peg_(rhs.peg_)
, s_(rhs.s_)
, l_(rhs.l_)
, pos_(rhs.pos_)
, m_(rhs.m_) {}
peg_token_iterator& operator++() {
search();
return *this;
}
peg_token_iterator operator++(int) {
auto it = *this;
search();
return it;
}
match& operator*() {
return m_;
}
match* operator->() {
return &m_;
}
bool operator==(const peg_token_iterator& rhs) {
return pos_ == rhs.pos_;
}
bool operator!=(const peg_token_iterator& rhs) {
return pos_ != rhs.pos_;
}
private:
void search() {
m_.matches.clear();
size_t mpos, mlen;
if (peg_.search(s_ + pos_, l_ - pos_, mpos, mlen)) {
m_.matches.insert(m_.matches.begin(), match::Item{ s_ + mpos, mlen, 0 });
pos_ += mpos + mlen;
} else {
pos_ = std::numeric_limits<size_t>::max();
}
}
peg peg_;
const char* s_;
size_t l_;
size_t pos_;
match m_;
};
// Range adaptor that makes the matches of a grammar in a NUL-terminated text
// usable in a range-based for loop:
//
//     for (auto& m : peg_token_range(syntax, s)) { ... }
//
// The text is not copied and must outlive the range and its iterators.
struct peg_token_range {
    typedef peg_token_iterator       iterator;
    typedef const peg_token_iterator const_iterator;

    // The begin iterator is constructed in place rather than copied from a
    // temporary: the parser's capture callback and the iterator's match
    // action both capture their owning object by reference, so a copy made
    // from a temporary would depend on copy elision to avoid callbacks bound
    // to an object that no longer exists.
    peg_token_range(const char* syntax, const char* s)
        : beg_iter(syntax, s)
        , end_iter() {}

    // begin()/end() hand out copies and do not modify the range, so they are
    // const-qualified and also work on a const range.
    iterator begin() const {
        return beg_iter;
    }

    iterator end() const {
        return end_iter;
    }

    const_iterator cbegin() const {
        return beg_iter;
    }

    const_iterator cend() const {
        return end_iter;
    }

private:
    peg_token_iterator beg_iter;  // positioned on the first match (or at end)
    peg_token_iterator end_iter;  // end sentinel
};
} // namespace peglib } // namespace peglib
#endif #endif

View File

@ -35,6 +35,23 @@ TEST_CASE("String capture test", "[general]")
REQUIRE(tags[2] == "tag-3"); REQUIRE(tags[2] == "tag-3");
} }
TEST_CASE("String capture test with match", "[general]")
{
peglib::match m;
auto ret = peglib::peg_match(
" ROOT <- _ ('[' < TAG_NAME > ']' _)* "
" TAG_NAME <- (!']' .)+ "
" _ <- [ \t]* ",
" [tag1] [tag:2] [tag-3] ",
m);
REQUIRE(ret == true);
REQUIRE(m.size() == 4);
REQUIRE(m.str(1) == "tag1");
REQUIRE(m.str(2) == "tag:2");
REQUIRE(m.str(3) == "tag-3");
}
using namespace peglib; using namespace peglib;
using namespace std; using namespace std;
@ -64,7 +81,10 @@ TEST_CASE("String capture test with embedded match action", "[general]")
vector<string> tags; vector<string> tags;
ROOT <= seq(WS, zom(TAG)); ROOT <= seq(WS, zom(TAG));
TAG <= seq(chr('['), grp(TAG_NAME, [&](const char* s, size_t l) { tags.push_back(string(s, l)); }), chr(']'), WS); TAG <= seq(chr('['),
cap(TAG_NAME, [&](const char* s, size_t l, size_t id) { tags.push_back(string(s, l)); }),
chr(']'),
WS);
TAG_NAME <= oom(seq(npd(chr(']')), dot())); TAG_NAME <= oom(seq(npd(chr(']')), dot()));
WS <= zom(cls(" \t")); WS <= zom(cls(" \t"));
@ -213,7 +233,7 @@ TEST_CASE("Calculator test2", "[general]")
; ;
string start; string start;
auto grammar = PEGParser::parse(syntax, strlen(syntax), start, nullptr); auto grammar = PEGParser::parse(syntax, strlen(syntax), start, nullptr, nullptr);
auto& g = *grammar; auto& g = *grammar;
// Setup actions // Setup actions