From 81ca85cba5af19cde0497278e16dcb3a3f05c20b Mon Sep 17 00:00:00 2001 From: yhirose Date: Sun, 15 Feb 2015 17:52:39 -0500 Subject: [PATCH] Added simple interface. --- README.md | 146 +++++++++++++--------- peglib.h | 332 ++++++++++++++++++++++++++++++++++++++++++++++----- test/test.cc | 24 +++- 3 files changed, 414 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 582466c..324b7e5 100644 --- a/README.md +++ b/README.md @@ -10,72 +10,20 @@ The PEG syntax is well described on page 2 in the [document](http://pdos.csail.m How to use ---------- -What if we want to extract only tag names in brackets from ` [tag1] [tag2] [tag3] [tag4]... `? - -PEG grammar for this task could be like this: - -``` -ROOT <- _ ('[' TAG_NAME ']' _)* -TAG_NAME <- (!']' .)+ -_ <- [ \t]* -``` - -Here is how to parse text with the PEG syntax and retrieve tag names: - +This is a simple calculator sample. It shows how to define grammar, associate semantic actions to the grammar and handle semantic values. ```c++ -// (1) Include the header file -#include "peglib.h" - -// (2) Make a parser -peglib::peg parser(R"( - ROOT <- _ ('[' TAG_NAME ']' _)* - TAG_NAME <- (!']' .)+ - _ <- [ \t]* -)"); - -// (3) Setup an action -std::vector tags; -parser["TAG_NAME"] = [&](const char* s, size_t l) { - tags.push_back(std::string(s, l)); -}; - -// (4) Parse -auto ret = parser.parse(" [tag1] [tag:2] [tag-3] "); - -assert(ret == true); -assert(tags[0] == "tag1"); -assert(tags[1] == "tag:2"); -assert(tags[2] == "tag-3"); -``` - -This action `[&](const char* s, size_t l)` gives a pointer and length of the matched string. - -There are more actions available. Here is a complete list: - -```c++ -[](const char* s, size_t l, const std::vector& v, any& c) -[](const char* s, size_t l, const std::vector& v) -[](const char* s, size_t l) -[](const std::vector& v, any& c) -[](const std::vector& v) -[]() -``` - -`const std::vector& v` contains semantic values. 
`peglib::any` class is very similar to [boost::any](http://www.boost.org/doc/libs/1_57_0/doc/html/any.html). You can obtain a value by castning it to the actual type. In order to determine the actual type, you have to check the return value type of the child action for the semantic value. - -`any& c` is a context data which can be used by the user for whatever purposes. - -This is a complete code of a simple calculator. It shows how to associate actions to definitions and set/get semantic values. - -```c++ -#include #include +// (1) Include the header file +#include + using namespace peglib; using namespace std; int main(void) { + + // (2) Make a parser auto syntax = R"( # Grammar for Calculator... Additive <- Multitive '+' Additive / Multitive @@ -86,6 +34,7 @@ int main(void) { peg parser(syntax); + // (3) Setup an action parser["Additive"] = { nullptr, // Default action [](const vector& v) { @@ -110,6 +59,7 @@ int main(void) { return stoi(string(s, l), nullptr, 10); }; + // (4) Parse int val; parser.parse("1+2*3", val); @@ -117,6 +67,85 @@ int main(void) { } ``` +Here is a complete list of available actions: + +```c++ +[](const char* s, size_t l, const std::vector& v, any& c) +[](const char* s, size_t l, const std::vector& v) +[](const char* s, size_t l) +[](const std::vector& v, any& c) +[](const std::vector& v) +[]() +``` + +`const char* s, size_t l` gives a pointer and length of the matched string. + +`const std::vector& v` contains semantic values. `peglib::any` class is very similar to [boost::any](http://www.boost.org/doc/libs/1_57_0/doc/html/any.html). You can obtain a value by casting it to the actual type. In order to determine the actual type, you have to check the return value type of the child action for the semantic value. + +`any& c` is a context data which can be used by the user for whatever purposes. + +Simple interface +---------------- + +*cpp-peglib* provides a std::regex-like simple interface for trivial tasks. 
+ +In the following example, `< ... >` means the *capture* operator. `peglib::peg_match` tries to capture strings in the `< ... >` operator and store them into `peglib::match` object. + +```c++ +peglib::match m; +auto ret = peglib::peg_match( + R"( + ROOT <- _ ('[' < TAG_NAME > ']' _)* + TAG_NAME <- (!']' .)+ + _ <- [ \t]* + )", + " [tag1] [tag:2] [tag-3] ", + m); + +assert(ret == true); +assert(m.size() == 4); +assert(m.str(1) == "tag1"); +assert(m.str(2) == "tag:2"); +assert(m.str(3) == "tag-3"); +``` + +There are some ways to *search* a peg pattern in a document. + +```c++ +using namespace peglib; + +auto syntax = R"( +ROOT <- '[' < [a-z0-9]+ > ']' +)"; + +auto s = " [tag1] [tag2] [tag3] "; + +// peglib::peg_search +peg pg(syntax); +size_t pos = 0; +auto l = strlen(s); +match m; +while (peg_search(pg, s + pos, l - pos, m)) { + cout << m.str() << endl; // entire match + cout << m.str(1) << endl; // submatch #1 + pos += m.length(); +} + +// peglib::peg_token_iterator +peg_token_iterator it(syntax, s); +while (it != peg_token_iterator()) { + cout << it->str() << endl; // entire match + cout << it->str(1) << endl; // submatch #1 + ++it; +} + +// peglib::peg_token_range +for (auto& m: peg_token_range(syntax, s)) { + cout << m.str() << endl; // entire match + cout << m.str(1) << endl; // submatch #1 +} +``` + Make a parser with parser operators ----------------------------------- @@ -144,7 +173,6 @@ The following are available operators: |:---------|:-------------------| | seq | Sequence | | cho | Prioritized Choice | -| grp | Grouping | | zom | Zero or More | | oom | One or More | | opt | Optional | diff --git a/peglib.h b/peglib.h index 166dd97..a571312 100644 --- a/peglib.h +++ b/peglib.h @@ -356,6 +356,11 @@ private: Fty fn_; }; +/* + * Match action + */ +typedef std::function MatchAction; + /* * Result */ @@ -665,25 +670,27 @@ public: }; -class Grouping : public Ope +class Capture : public Ope { public: - Grouping(const std::shared_ptr& ope) : ope_(ope) {} - 
Grouping(const std::shared_ptr& ope, std::function match) : ope_(ope), match_(match) {} + Capture(const std::shared_ptr& ope) : ope_(ope) {} + Capture(const std::shared_ptr& ope, MatchAction ma, size_t ci) + : ope_(ope), match_action_(ma), capture_id(ci) {} Result parse(const char* s, size_t l, Values& v, any& c) const { assert(ope_); const auto& rule = *ope_; auto r = rule.parse(s, l, v, c); - if (r.ret && match_) { - match_(s, r.len); + if (r.ret && match_action_) { + match_action_(s, r.len, capture_id); } return r; } private: - std::shared_ptr ope_; - std::function match_; + std::shared_ptr ope_; + MatchAction match_action_; + size_t capture_id; }; class WeakHolder : public Ope @@ -914,12 +921,12 @@ inline std::shared_ptr dot() { return std::make_shared(); } -inline std::shared_ptr grp(const std::shared_ptr& ope) { - return std::make_shared(ope); +inline std::shared_ptr cap(const std::shared_ptr& ope, MatchAction ma, size_t ci) { + return std::make_shared(ope, ma, ci); } -inline std::shared_ptr grp(const std::shared_ptr& ope, std::function match) { - return std::make_shared(ope, match); +inline std::shared_ptr cap(const std::shared_ptr& ope, MatchAction ma) { + return std::make_shared(ope, ma, (size_t)-1); } inline std::shared_ptr ref(const std::map& grammar, const std::string& name) { @@ -954,9 +961,9 @@ typedef std::function Log; class PEGParser { public: - static std::shared_ptr parse(const char* s, size_t l, std::string& start, Log log) { + static std::shared_ptr parse(const char* s, size_t l, std::string& start, MatchAction ma, Log log) { static PEGParser instance; - return get().perform_core(s, l, start, log); + return get().perform_core(s, l, start, ma, log); } // For debuging purpose @@ -976,9 +983,13 @@ private: } struct Context { - std::shared_ptr grammar = std::make_shared(); + std::shared_ptr grammar; std::string start; - std::map refs; + MatchAction match_action; + std::map references; + size_t capture_count; + + Context() : 
grammar(std::make_shared()), capture_count(0) {} }; void make_grammar() { @@ -992,6 +1003,7 @@ private: g["Suffix"] <= seq(g["Primary"], opt(cho(g["QUESTION"], g["STAR"], g["PLUS"]))); g["Primary"] <= cho(seq(g["Identifier"], npd(g["LEFTARROW"])), seq(g["OPEN"], g["Expression"], g["CLOSE"]), + seq(g["CAPTUREOPEN"], g["Expression"], g["CAPTURECLOSE"]), g["Literal"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); @@ -1030,6 +1042,9 @@ private: g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r')); g["EndOfFile"] <= npd(dot()); + g["CAPTUREOPEN"] <= seq(chr('<'), g["Spacing"]); + g["CAPTURECLOSE"] <= seq(chr('>'), g["Spacing"]); + // Set definition names for (auto& x: g) { x.second.name = x.first; @@ -1117,11 +1132,17 @@ private: }, [&](const char* s, size_t l, const std::vector& v, any& c) { Context& cxt = *c.get(); - cxt.refs[v[0]] = s; + cxt.references[v[0]] = s; return ref(*cxt.grammar, v[0]); }, [&](const std::vector& v) { return v[1]; + }, + // Capture + [&](const std::vector& v, any& c) { + Context& cxt = *c.get(); + auto ope = v[1].get>(); + return cap(ope, cxt.match_action, ++cxt.capture_count); } }; @@ -1157,9 +1178,11 @@ private: }; } - std::shared_ptr perform_core(const char* s, size_t l, std::string& start, Log log) { - Values v; + std::shared_ptr perform_core(const char* s, size_t l, std::string& start, MatchAction ma, Log log) { Context cxt; + cxt.match_action = ma; + + Values v; any c = &cxt; auto r = g["Grammar"].parse(s, l, v, c); @@ -1171,7 +1194,7 @@ private: return nullptr; } - for (const auto& x : cxt.refs) { + for (const auto& x : cxt.references) { const auto& name = x.first; auto ptr = x.second; if (cxt.grammar->find(name) == cxt.grammar->end()) { @@ -1228,13 +1251,27 @@ private: class peg { public: + peg() = default; + peg(const char* s, size_t l, Log log = nullptr) { - grammar_ = PEGParser::parse(s, l, start_, log); + grammar_ = PEGParser::parse( + s, l, + start_, + [&](const char* s, size_t l, size_t i) { + 
if (match_action) match_action(s, l, i); + }, + log); } peg(const char* s, Log log = nullptr) { auto l = strlen(s); - grammar_ = PEGParser::parse(s, l, start_, log); + grammar_ = PEGParser::parse( + s, l, + start_, + [&](const char* s, size_t l, size_t i) { + if (match_action) match_action(s, l, i); + }, + log); } operator bool() { @@ -1251,6 +1288,12 @@ public: return false; } + template + bool parse(const char* s, T& out, bool exact = true) const { + auto l = strlen(s); + return parse(s, l, out, exact); + } + bool parse(const char* s, size_t l, bool exact = true) const { if (grammar_ != nullptr) { const auto& rule = (*grammar_)[start_]; @@ -1260,17 +1303,36 @@ public: return false; } - template - bool parse(const char* s, T& out, bool exact = true) const { - auto l = strlen(s); - return parse(s, l, out, exact); - } - bool parse(const char* s, bool exact = true) const { auto l = strlen(s); return parse(s, l, exact); } + bool search(const char* s, size_t l, size_t& mpos, size_t& mlen) const { + const auto& rule = (*grammar_)[start_]; + if (grammar_ != nullptr) { + size_t pos = 0; + while (pos < l) { + size_t len = l - pos; + auto r = rule.parse(s + pos, len); + if (r.ret) { + mpos = pos; + mlen = r.len; + return true; + } + pos++; + } + } + mpos = 0; + mlen = 0; + return false; + } + + bool search(const char* s, size_t& mpos, size_t& mlen) const { + auto l = strlen(s); + return search(s, l, mpos, mlen); + } + bool lint(const char* s, size_t l, bool exact, Log log = nullptr) { assert(grammar_); if (grammar_ != nullptr) { @@ -1294,11 +1356,227 @@ public: return (*grammar_)[s]; } + MatchAction match_action; + private: std::shared_ptr grammar_; std::string start_; }; +/*----------------------------------------------------------------------------- + * Utilities + *---------------------------------------------------------------------------*/ + +struct match +{ + struct Item { + const char* s; + size_t l; + size_t id; + + size_t length() const { return l; } + std::string 
str() const { return std::string(s, l); } + }; + + std::vector matches; + + typedef std::vector::iterator iterator; + typedef std::vector::const_iterator const_iterator; + + bool empty() const { + return matches.empty(); + } + + size_t size() const { + return matches.size(); + } + + size_t length(size_t n = 0) { + return matches[n].length(); + } + + std::string str(size_t n = 0) const { + return matches[n].str(); + } + + const Item& operator[](size_t n) const { + return matches[n]; + } + + iterator begin() { + return matches.begin(); + } + + iterator end() { + return matches.end(); + } + + const_iterator begin() const { + return matches.cbegin(); + } + + const_iterator end() const { + return matches.cend(); + } +}; + +inline bool peg_match(const char* syntax, const char* s, match& m) { + m.matches.clear(); + + peg pg(syntax); + pg.match_action = [&](const char* s, size_t l, size_t i) { + m.matches.push_back(match::Item{ s, l, i }); + }; + + auto ret = pg.parse(s); + if (ret) { + auto l = strlen(s); + m.matches.insert(m.matches.begin(), match::Item{ s, l, 0 }); + } + + return ret; +} + +inline bool peg_match(const char* syntax, const char* s) { + peg pg(syntax); + return pg.parse(s); +} + +inline bool peg_search(peg& pg, const char* s, size_t l, match& m) { + m.matches.clear(); + + pg.match_action = [&](const char* s, size_t l, size_t i) { + m.matches.push_back(match::Item{ s, l, i }); + }; + + size_t mpos, mlen; + auto ret = pg.search(s, l, mpos, mlen); + if (ret) { + m.matches.insert(m.matches.begin(), match::Item{ s + mpos, mlen, 0 }); + return true; + } + + return false; +} + +inline bool peg_search(peg& pg, const char* s, match& m) { + auto l = strlen(s); + return peg_search(pg, s, l, m); +} + +inline bool peg_search(const char* syntax, const char* s, size_t l, match& m) { + peg pg(syntax); + return peg_search(pg, s, l, m); +} + +inline bool peg_search(const char* syntax, const char* s, match& m) { + peg pg(syntax); + auto l = strlen(s); + return peg_search(pg, 
s, l, m); +} + +class peg_token_iterator : public std::iterator +{ +public: + peg_token_iterator() + : s_(nullptr) + , l_(0) + , pos_(std::numeric_limits::max()) {} + + peg_token_iterator(const char* syntax, const char* s) + : peg_(syntax) + , s_(s) + , l_(strlen(s)) + , pos_(0) { + peg_.match_action = [&](const char* s, size_t l, size_t i) { + m_.matches.push_back(match::Item{ s, l, i }); + }; + search(); + } + + peg_token_iterator(const peg_token_iterator& rhs) + : peg_(rhs.peg_) + , s_(rhs.s_) + , l_(rhs.l_) + , pos_(rhs.pos_) + , m_(rhs.m_) {} + + peg_token_iterator& operator++() { + search(); + return *this; + } + + peg_token_iterator operator++(int) { + auto it = *this; + search(); + return it; + } + + match& operator*() { + return m_; + } + + match* operator->() { + return &m_; + } + + bool operator==(const peg_token_iterator& rhs) { + return pos_ == rhs.pos_; + } + + bool operator!=(const peg_token_iterator& rhs) { + return pos_ != rhs.pos_; + } + +private: + void search() { + m_.matches.clear(); + size_t mpos, mlen; + if (peg_.search(s_ + pos_, l_ - pos_, mpos, mlen)) { + m_.matches.insert(m_.matches.begin(), match::Item{ s_ + mpos, mlen, 0 }); + pos_ += mpos + mlen; + } else { + pos_ = std::numeric_limits::max(); + } + } + + peg peg_; + const char* s_; + size_t l_; + size_t pos_; + match m_; +}; + +struct peg_token_range { + typedef peg_token_iterator iterator; + typedef const peg_token_iterator const_iterator; + + peg_token_range(const char* syntax, const char* s) + : beg_iter(peg_token_iterator(syntax, s)) + , end_iter() {} + + iterator begin() { + return beg_iter; + } + + iterator end() { + return end_iter; + } + + const_iterator cbegin() const { + return beg_iter; + } + + const_iterator cend() const { + return end_iter; + } + +private: + peg_token_iterator beg_iter; + peg_token_iterator end_iter; +}; + + } // namespace peglib #endif diff --git a/test/test.cc b/test/test.cc index d2a7442..9ab8989 100644 --- a/test/test.cc +++ b/test/test.cc @@ -35,6 
+35,23 @@ TEST_CASE("String capture test", "[general]") REQUIRE(tags[2] == "tag-3"); } +TEST_CASE("String capture test with match", "[general]") +{ + peglib::match m; + auto ret = peglib::peg_match( + " ROOT <- _ ('[' < TAG_NAME > ']' _)* " + " TAG_NAME <- (!']' .)+ " + " _ <- [ \t]* ", + " [tag1] [tag:2] [tag-3] ", + m); + + REQUIRE(ret == true); + REQUIRE(m.size() == 4); + REQUIRE(m.str(1) == "tag1"); + REQUIRE(m.str(2) == "tag:2"); + REQUIRE(m.str(3) == "tag-3"); +} + using namespace peglib; using namespace std; @@ -64,7 +81,10 @@ TEST_CASE("String capture test with embedded match action", "[general]") vector tags; ROOT <= seq(WS, zom(TAG)); - TAG <= seq(chr('['), grp(TAG_NAME, [&](const char* s, size_t l) { tags.push_back(string(s, l)); }), chr(']'), WS); + TAG <= seq(chr('['), + cap(TAG_NAME, [&](const char* s, size_t l, size_t id) { tags.push_back(string(s, l)); }), + chr(']'), + WS); TAG_NAME <= oom(seq(npd(chr(']')), dot())); WS <= zom(cls(" \t")); @@ -213,7 +233,7 @@ TEST_CASE("Calculator test2", "[general]") ; string start; - auto grammar = PEGParser::parse(syntax, strlen(syntax), start, nullptr); + auto grammar = PEGParser::parse(syntax, strlen(syntax), start, nullptr, nullptr); auto& g = *grammar; // Setup actions