mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2024-12-22 20:05:31 +00:00
Added automatic whitespace skipping feature.
This commit is contained in:
parent
49c81a5f59
commit
33978023cf
@ -40,13 +40,15 @@ int main(int argc, const char** argv)
|
|||||||
};
|
};
|
||||||
|
|
||||||
parser parser(R"(
|
parser parser(R"(
|
||||||
EXPRESSION <- _ TERM (TERM_OPERATOR TERM)*
|
EXPRESSION <- TERM (TERM_OPERATOR TERM)*
|
||||||
TERM <- FACTOR (FACTOR_OPERATOR FACTOR)*
|
TERM <- FACTOR (FACTOR_OPERATOR FACTOR)*
|
||||||
FACTOR <- NUMBER / '(' _ EXPRESSION ')' _
|
FACTOR <- NUMBER / '(' EXPRESSION ')'
|
||||||
TERM_OPERATOR <- < [-+] > _
|
|
||||||
FACTOR_OPERATOR <- < [/*] > _
|
TERM_OPERATOR <- [-+]
|
||||||
NUMBER <- < [0-9]+ > _
|
FACTOR_OPERATOR <- [/*]
|
||||||
~_ <- [ \t\r\n]*
|
NUMBER <- [0-9]+
|
||||||
|
|
||||||
|
%whitespace <- [ \t\r\n]*
|
||||||
)");
|
)");
|
||||||
|
|
||||||
parser.enable_ast();
|
parser.enable_ast();
|
||||||
|
104
peglib.h
104
peglib.h
@ -374,7 +374,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename F, typename R>
|
template<typename F, typename R>
|
||||||
Fty make_adaptor(F fn, R(*mf)(const SemanticValues& sv)) {
|
Fty make_adaptor(F fn, R (*mf)(const SemanticValues& sv)) {
|
||||||
return TypeAdaptor<R>(fn);
|
return TypeAdaptor<R>(fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -427,6 +427,9 @@ inline bool fail(size_t len) {
|
|||||||
/*
|
/*
|
||||||
* Context
|
* Context
|
||||||
*/
|
*/
|
||||||
|
class Definition;
|
||||||
|
class Ope;
|
||||||
|
|
||||||
struct Context
|
struct Context
|
||||||
{
|
{
|
||||||
const char* path;
|
const char* path;
|
||||||
@ -437,27 +440,38 @@ struct Context
|
|||||||
const char* message_pos;
|
const char* message_pos;
|
||||||
std::string message; // TODO: should be `int`.
|
std::string message; // TODO: should be `int`.
|
||||||
|
|
||||||
|
std::vector<Definition*> definition_stack;
|
||||||
|
|
||||||
|
std::vector<std::shared_ptr<SemanticValues>> value_stack;
|
||||||
|
size_t value_stack_size;
|
||||||
|
|
||||||
|
std::shared_ptr<Ope> whiteSpaceOpe;
|
||||||
|
|
||||||
const size_t def_count;
|
const size_t def_count;
|
||||||
const bool enablePackratParsing;
|
const bool enablePackratParsing;
|
||||||
std::vector<bool> cache_register;
|
std::vector<bool> cache_register;
|
||||||
std::vector<bool> cache_success;
|
std::vector<bool> cache_success;
|
||||||
|
|
||||||
std::vector<std::shared_ptr<SemanticValues>> stack;
|
|
||||||
size_t stack_size;
|
|
||||||
|
|
||||||
std::map<std::pair<size_t, size_t>, std::tuple<size_t, any>> cache_result;
|
std::map<std::pair<size_t, size_t>, std::tuple<size_t, any>> cache_result;
|
||||||
|
|
||||||
Context(const char* path, const char* s, size_t l, size_t def_count, bool enablePackratParsing)
|
Context(
|
||||||
|
const char* path,
|
||||||
|
const char* s,
|
||||||
|
size_t l,
|
||||||
|
size_t def_count,
|
||||||
|
std::shared_ptr<Ope> whiteSpaceOpe,
|
||||||
|
bool enablePackratParsing)
|
||||||
: path(path)
|
: path(path)
|
||||||
, s(s)
|
, s(s)
|
||||||
, l(l)
|
, l(l)
|
||||||
, error_pos(nullptr)
|
, error_pos(nullptr)
|
||||||
, message_pos(nullptr)
|
, message_pos(nullptr)
|
||||||
|
, whiteSpaceOpe(whiteSpaceOpe)
|
||||||
|
, value_stack_size(0)
|
||||||
, def_count(def_count)
|
, def_count(def_count)
|
||||||
, enablePackratParsing(enablePackratParsing)
|
, enablePackratParsing(enablePackratParsing)
|
||||||
, cache_register(enablePackratParsing ? def_count * (l + 1) : 0)
|
, cache_register(enablePackratParsing ? def_count * (l + 1) : 0)
|
||||||
, cache_success(enablePackratParsing ? def_count * (l + 1) : 0)
|
, cache_success(enablePackratParsing ? def_count * (l + 1) : 0)
|
||||||
, stack_size(0)
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -493,11 +507,11 @@ struct Context
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline SemanticValues& push() {
|
inline SemanticValues& push() {
|
||||||
assert(stack_size <= stack.size());
|
assert(value_stack_size <= value_stack.size());
|
||||||
if (stack_size == stack.size()) {
|
if (value_stack_size == value_stack.size()) {
|
||||||
stack.emplace_back(std::make_shared<SemanticValues>());
|
value_stack.emplace_back(std::make_shared<SemanticValues>());
|
||||||
}
|
}
|
||||||
auto& sv = *stack[stack_size++];
|
auto& sv = *value_stack[value_stack_size++];
|
||||||
if (!sv.empty()) {
|
if (!sv.empty()) {
|
||||||
sv.clear();
|
sv.clear();
|
||||||
}
|
}
|
||||||
@ -509,7 +523,7 @@ struct Context
|
|||||||
}
|
}
|
||||||
|
|
||||||
void pop() {
|
void pop() {
|
||||||
stack_size--;
|
value_stack_size--;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_error_pos(const char* s) {
|
void set_error_pos(const char* s) {
|
||||||
@ -754,16 +768,7 @@ class LiteralString : public Ope
|
|||||||
public:
|
public:
|
||||||
LiteralString(const std::string& s) : lit_(s) {}
|
LiteralString(const std::string& s) : lit_(s) {}
|
||||||
|
|
||||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override;
|
||||||
auto i = 0u;
|
|
||||||
for (; i < lit_.size(); i++) {
|
|
||||||
if (i >= n || s[i] != lit_[i]) {
|
|
||||||
c.set_error_pos(s);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
|
|
||||||
@ -936,8 +941,6 @@ public:
|
|||||||
std::weak_ptr<Ope> weak_;
|
std::weak_ptr<Ope> weak_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Definition;
|
|
||||||
|
|
||||||
class Holder : public Ope
|
class Holder : public Ope
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -1063,6 +1066,8 @@ struct IsToken : public Ope::Visitor
|
|||||||
bool has_rule;
|
bool has_rule;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Reserved rule name used to look up the grammar's automatic whitespace rule.
// Declared `static` with a const pointer so it has internal linkage: this
// header can then be included from several translation units without
// duplicate-symbol link errors, and the pointer cannot be reassigned.
static const char* const WHITESPACE_DEFINITION_NAME = "%whitespace";
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Definition
|
* Definition
|
||||||
*/
|
*/
|
||||||
@ -1096,6 +1101,7 @@ public:
|
|||||||
Definition(Definition&& rhs)
|
Definition(Definition&& rhs)
|
||||||
: name(std::move(rhs.name))
|
: name(std::move(rhs.name))
|
||||||
, ignoreSemanticValue(rhs.ignoreSemanticValue)
|
, ignoreSemanticValue(rhs.ignoreSemanticValue)
|
||||||
|
, whiteSpaceOpe(rhs.whiteSpaceOpe)
|
||||||
, enablePackratParsing(rhs.enablePackratParsing)
|
, enablePackratParsing(rhs.enablePackratParsing)
|
||||||
, is_token(rhs.is_token)
|
, is_token(rhs.is_token)
|
||||||
, holder_(std::move(rhs.holder_))
|
, holder_(std::move(rhs.holder_))
|
||||||
@ -1200,6 +1206,10 @@ public:
|
|||||||
holder_->accept(v);
|
holder_->accept(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<Ope> get_core_operator() {
|
||||||
|
return holder_->ope_;
|
||||||
|
}
|
||||||
|
|
||||||
std::string name;
|
std::string name;
|
||||||
size_t id;
|
size_t id;
|
||||||
Action action;
|
Action action;
|
||||||
@ -1207,6 +1217,7 @@ public:
|
|||||||
std::function<void (any& dt)> after;
|
std::function<void (any& dt)> after;
|
||||||
std::function<std::string ()> error_message;
|
std::function<std::string ()> error_message;
|
||||||
bool ignoreSemanticValue;
|
bool ignoreSemanticValue;
|
||||||
|
std::shared_ptr<Ope> whiteSpaceOpe;
|
||||||
bool enablePackratParsing;
|
bool enablePackratParsing;
|
||||||
bool is_token;
|
bool is_token;
|
||||||
|
|
||||||
@ -1220,7 +1231,7 @@ private:
|
|||||||
AssignIDToDefinition assignId;
|
AssignIDToDefinition assignId;
|
||||||
holder_->accept(assignId);
|
holder_->accept(assignId);
|
||||||
|
|
||||||
Context cxt(path, s, n, assignId.ids.size(), enablePackratParsing);
|
Context cxt(path, s, n, assignId.ids.size(), whiteSpaceOpe, enablePackratParsing);
|
||||||
auto len = holder_->parse(s, n, sv, cxt, dt);
|
auto len = holder_->parse(s, n, sv, cxt, dt);
|
||||||
return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message };
|
return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message };
|
||||||
}
|
}
|
||||||
@ -1232,6 +1243,28 @@ private:
|
|||||||
* Implementations
|
* Implementations
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||||
|
auto i = 0u;
|
||||||
|
for (; i < lit_.size(); i++) {
|
||||||
|
if (i >= n || s[i] != lit_[i]) {
|
||||||
|
c.set_error_pos(s);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip whitespace
|
||||||
|
const auto d = c.definition_stack.back();
|
||||||
|
if (!d->is_token && c.whiteSpaceOpe) {
|
||||||
|
auto len = c.whiteSpaceOpe->parse(s + i, n - i, sv, c, dt);
|
||||||
|
if (fail(len)) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
i += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||||
if (!ope_) {
|
if (!ope_) {
|
||||||
throw std::logic_error("Uninitialized definition ope was used...");
|
throw std::logic_error("Uninitialized definition ope was used...");
|
||||||
@ -1243,13 +1276,23 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context
|
|||||||
size_t token_boundary_n = n;
|
size_t token_boundary_n = n;
|
||||||
|
|
||||||
c.packrat(s, outer_->id, len, val, [&](any& val) {
|
c.packrat(s, outer_->id, len, val, [&](any& val) {
|
||||||
|
c.definition_stack.push_back(outer_);
|
||||||
|
|
||||||
auto& chldsv = c.push();
|
auto& chldsv = c.push();
|
||||||
|
|
||||||
if (outer_->before) {
|
if (outer_->before) {
|
||||||
outer_->before(dt);
|
outer_->before(dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto& rule = *ope_;
|
auto ope = ope_;
|
||||||
|
|
||||||
|
if (outer_->whiteSpaceOpe) {
|
||||||
|
ope = std::make_shared<Sequence>(outer_->whiteSpaceOpe, ope_);
|
||||||
|
} else if (outer_->is_token && c.whiteSpaceOpe) {
|
||||||
|
ope = std::make_shared<Sequence>(std::make_shared<TokenBoundary>(ope_), c.whiteSpaceOpe);
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& rule = *ope;
|
||||||
len = rule.parse(s, n, chldsv, c, dt);
|
len = rule.parse(s, n, chldsv, c, dt);
|
||||||
|
|
||||||
token_boundary_n = len;
|
token_boundary_n = len;
|
||||||
@ -1282,6 +1325,7 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context
|
|||||||
}
|
}
|
||||||
|
|
||||||
c.pop();
|
c.pop();
|
||||||
|
c.definition_stack.pop_back();
|
||||||
});
|
});
|
||||||
|
|
||||||
if (success(len)) {
|
if (success(len)) {
|
||||||
@ -1622,7 +1666,7 @@ private:
|
|||||||
|
|
||||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||||
g["IdentStart"] <= cls("a-zA-Z_\x80-\xff");
|
g["IdentStart"] <= cls("a-zA-Z_\x80-\xff%");
|
||||||
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
||||||
|
|
||||||
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
||||||
@ -1912,6 +1956,12 @@ private:
|
|||||||
// Set root definition
|
// Set root definition
|
||||||
start = data.start;
|
start = data.start;
|
||||||
|
|
||||||
|
// Automatic whitespace skipping
|
||||||
|
if (grammar.count(WHITESPACE_DEFINITION_NAME)) {
|
||||||
|
auto& rule = (*data.grammar)[start];
|
||||||
|
rule.whiteSpaceOpe = (*data.grammar)[WHITESPACE_DEFINITION_NAME].get_core_operator();
|
||||||
|
}
|
||||||
|
|
||||||
return data.grammar;
|
return data.grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2317,7 +2367,7 @@ public:
|
|||||||
private:
|
private:
|
||||||
void output_log(const char* s, size_t n, Log log, const Definition::Result& r) const {
|
void output_log(const char* s, size_t n, Log log, const Definition::Result& r) const {
|
||||||
if (log) {
|
if (log) {
|
||||||
if (!r.ret) {
|
if (!r.ret) {
|
||||||
if (r.message_pos) {
|
if (r.message_pos) {
|
||||||
auto line = line_info(s, r.message_pos);
|
auto line = line_info(s, r.message_pos);
|
||||||
log(line.first, line.second, r.message);
|
log(line.first, line.second, r.message);
|
||||||
|
36
test/test.cc
36
test/test.cc
@ -255,6 +255,25 @@ TEST_CASE("before/after handlers test", "[general]")
|
|||||||
parser.parse("hello=world", dt);
|
parser.parse("hello=world", dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("WHITESPACE test", "[general]")
|
||||||
|
{
|
||||||
|
peg::parser parser(R"(
|
||||||
|
# Rules
|
||||||
|
ROOT <- ITEM (',' ITEM)*
|
||||||
|
ITEM <- WORD / PHRASE
|
||||||
|
|
||||||
|
# Tokens
|
||||||
|
WORD <- [a-zA-Z0-9_]+
|
||||||
|
PHRASE <- '"' (!'"' .)* '"'
|
||||||
|
|
||||||
|
%whitespace <- [ \t\r\n]*
|
||||||
|
)");
|
||||||
|
|
||||||
|
auto ret = parser.parse(R"( one, "two, three", four )");
|
||||||
|
|
||||||
|
REQUIRE(ret == true);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_CASE("Skip token test", "[general]")
|
TEST_CASE("Skip token test", "[general]")
|
||||||
{
|
{
|
||||||
peg::parser parser(
|
peg::parser parser(
|
||||||
@ -272,6 +291,23 @@ TEST_CASE("Skip token test", "[general]")
|
|||||||
REQUIRE(ret == true);
|
REQUIRE(ret == true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("Skip token test2", "[general]")
|
||||||
|
{
|
||||||
|
peg::parser parser(R"(
|
||||||
|
ROOT <- ITEM (',' ITEM)*
|
||||||
|
ITEM <- ([a-z0-9])+
|
||||||
|
%whitespace <- [ \t]*
|
||||||
|
)");
|
||||||
|
|
||||||
|
parser["ROOT"] = [&](const SemanticValues& sv) {
|
||||||
|
REQUIRE(sv.size() == 2);
|
||||||
|
};
|
||||||
|
|
||||||
|
auto ret = parser.parse(" item1, item2 ");
|
||||||
|
|
||||||
|
REQUIRE(ret == true);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_CASE("Backtracking test", "[general]")
|
TEST_CASE("Backtracking test", "[general]")
|
||||||
{
|
{
|
||||||
parser parser(
|
parser parser(
|
||||||
|
Loading…
Reference in New Issue
Block a user