mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2024-12-22 20:05:31 +00:00
full UTF-8 support, untested
This commit is contained in:
parent
4e102f04cb
commit
da6ac85201
324
peglib.h
324
peglib.h
@ -35,9 +35,9 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// define if the compiler doesn't support unicode characters reliably in the
|
#ifndef DEFAULT_ENCODING
|
||||||
// source code
|
#define DEFAULT_ENCODING UTF8
|
||||||
//#define PEGLIB_NO_UNICODE_CHARS
|
#endif
|
||||||
|
|
||||||
namespace peg {
|
namespace peg {
|
||||||
|
|
||||||
@ -47,6 +47,112 @@ static void* enabler = nullptr; // workaround for Clang version <= 5.0.0
|
|||||||
extern void* enabler;
|
extern void* enabler;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
|
||||||
|
* UTF-8 utilities
|
||||||
|
*---------------------------------------------------------------------------*/
|
||||||
|
|
||||||
|
// wchar works differently on linux and windows so everything was done manually
|
||||||
|
// A decoded character value. -1 is used as the "no character / invalid" marker.
typedef int codepoint;

// Input encodings understood by get_char().
// ANSI here means a single-byte 8-bit encoding (ISO 8859 style): every byte is
// one character.
enum class Encoding { ASCII, ANSI, UTF8 };

/*
 * Decodes the first character of the buffer `s` of length `n` in encoding `enc`.
 *
 * On success the character value is stored in `code` and the number of bytes
 * consumed (1..4) is returned. On failure `code` is set to -1 and
 * static_cast<size_t>(-1) is returned.
 *
 * Failure cases:
 *   - empty input (n == 0)
 *   - ASCII: a byte above 0x7F
 *   - UTF8:  a stray continuation byte, an invalid lead byte, a truncated
 *            sequence, a non-continuation byte inside a sequence, an overlong
 *            encoding, a UTF-16 surrogate (U+D800..U+DFFF) or a value above
 *            U+10FFFF
 *   - an unknown encoding value
 *
 * Declared inline because it is defined in a header.
 */
inline size_t get_char(const char* s, size_t n, Encoding enc, codepoint& code)
{
    const size_t failed = static_cast<size_t>(-1);

    code = -1; // stays -1 on every failure path

    if (n < 1) {
        return failed;
    }

    const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
    const unsigned char lead = us[0];

    switch (enc) {

    case Encoding::ASCII:
        if (lead > 0x7F) {
            return failed;
        }
        code = lead;
        return 1;

    case Encoding::ANSI:
        code = lead;
        return 1;

    case Encoding::UTF8:
        break; // handled below

    default:
        return failed; // unknown encoding
    }

    // 0xxx xxxx is plain ASCII
    if (lead <= 0x7F) {
        code = lead;
        return 1;
    }

    size_t    len = 0;       // total length of the sequence in bytes
    codepoint value = 0;     // payload bits collected so far
    codepoint min_value = 0; // smallest value that needs `len` bytes

    if (lead <= 0xBF) {          // 10xx xxxx is invalid as first UTF8 byte
        return failed;
    } else if (lead <= 0xDF) {   // 110x xxxx, 10xx xxxx
        len = 2;
        value = lead & 0x1F;
        min_value = 0x80;
    } else if (lead <= 0xEF) {   // 1110 xxxx, 10xx xxxx, 10xx xxxx
        len = 3;
        value = lead & 0x0F;
        min_value = 0x800;
    } else if (lead <= 0xF7) {   // 1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx
        len = 4;
        value = lead & 0x07;
        min_value = 0x10000;
    } else {                     // 1111 1xxx never starts a sequence
        return failed;
    }

    if (n < len) { // sequence is cut off by the end of the input
        return failed;
    }

    for (size_t i = 1; i < len; ++i) {
        if ((us[i] & 0xC0) != 0x80) { // every trailing byte must be 10xx xxxx
            return failed;
        }
        value = (value << 6) | (us[i] & 0x3F);
    }

    if (value < min_value                         // overlong encoding
        || value > 0x10FFFF                       // beyond the Unicode range
        || (value >= 0xD800 && value <= 0xDFFF))  // UTF-16 surrogate
    {
        return failed;
    }

    code = value;
    return len;
}
|
||||||
|
|
||||||
/*-----------------------------------------------------------------------------
|
/*-----------------------------------------------------------------------------
|
||||||
* any
|
* any
|
||||||
*---------------------------------------------------------------------------*/
|
*---------------------------------------------------------------------------*/
|
||||||
@ -511,6 +617,7 @@ public:
|
|||||||
std::vector<std::unordered_map<std::string, std::string>> capture_scope_stack;
|
std::vector<std::unordered_map<std::string, std::string>> capture_scope_stack;
|
||||||
|
|
||||||
const size_t def_count;
|
const size_t def_count;
|
||||||
|
const Encoding encoding;
|
||||||
const bool enablePackratParsing;
|
const bool enablePackratParsing;
|
||||||
std::vector<bool> cache_registered;
|
std::vector<bool> cache_registered;
|
||||||
std::vector<bool> cache_success;
|
std::vector<bool> cache_success;
|
||||||
@ -526,6 +633,7 @@ public:
|
|||||||
size_t a_def_count,
|
size_t a_def_count,
|
||||||
std::shared_ptr<Ope> a_whitespaceOpe,
|
std::shared_ptr<Ope> a_whitespaceOpe,
|
||||||
std::shared_ptr<Ope> a_wordOpe,
|
std::shared_ptr<Ope> a_wordOpe,
|
||||||
|
Encoding a_encoding,
|
||||||
bool a_enablePackratParsing,
|
bool a_enablePackratParsing,
|
||||||
Tracer a_tracer)
|
Tracer a_tracer)
|
||||||
: path(a_path)
|
: path(a_path)
|
||||||
@ -540,6 +648,7 @@ public:
|
|||||||
, in_whitespace(false)
|
, in_whitespace(false)
|
||||||
, wordOpe(a_wordOpe)
|
, wordOpe(a_wordOpe)
|
||||||
, def_count(a_def_count)
|
, def_count(a_def_count)
|
||||||
|
, encoding(a_encoding)
|
||||||
, enablePackratParsing(a_enablePackratParsing)
|
, enablePackratParsing(a_enablePackratParsing)
|
||||||
, cache_registered(enablePackratParsing ? def_count * (l + 1) : 0)
|
, cache_registered(enablePackratParsing ? def_count * (l + 1) : 0)
|
||||||
, cache_success(enablePackratParsing ? def_count * (l + 1) : 0)
|
, cache_success(enablePackratParsing ? def_count * (l + 1) : 0)
|
||||||
@ -971,66 +1080,112 @@ public:
|
|||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
|
|
||||||
std::string lit_;
|
std::string lit_;
|
||||||
mutable bool init_is_word_;
|
mutable bool init_is_word_;
|
||||||
mutable bool is_word_;
|
mutable bool is_word_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class CharacterClass : public Ope
|
class CharacterClass : public Ope
|
||||||
, public std::enable_shared_from_this<CharacterClass>
|
, public std::enable_shared_from_this<CharacterClass>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
CharacterClass(const std::string& chars) : chars_(chars) {}
|
CharacterClass(const std::string& chars) {
|
||||||
|
|
||||||
|
chars_ = chars;
|
||||||
|
|
||||||
|
auto i = 0u;
|
||||||
|
codepoint ch = 0;
|
||||||
|
|
||||||
|
while (i < chars.size()){
|
||||||
|
auto len = get_char(chars.c_str()+i, chars.size()-i, Encoding::UTF8, ch);
|
||||||
|
codepoints_.push_back(ch); // might push -1 but that's fine
|
||||||
|
i+=len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||||
c.trace("CharacterClass", s, n, sv, dt);
|
c.trace("CharacterClass", s, n, sv, dt);
|
||||||
// TODO: UTF8 support
|
|
||||||
if (n < 1) {
|
if (c.encoding == Encoding::ASCII || c.encoding == Encoding::ANSI) {
|
||||||
c.set_error_pos(s);
|
|
||||||
return static_cast<size_t>(-1);
|
if (n < 1) {
|
||||||
}
|
c.set_error_pos(s);
|
||||||
auto ch = s[0];
|
return static_cast<size_t>(-1);
|
||||||
auto i = 0u;
|
}
|
||||||
while (i < chars_.size()) {
|
|
||||||
if (i + 2 < chars_.size() && chars_[i + 1] == '-') {
|
auto ch = s[0];
|
||||||
if (chars_[i] <= ch && ch <= chars_[i + 2]) {
|
auto i = 0u;
|
||||||
return 1;
|
while (i < chars_.size()) {
|
||||||
|
if (i + 2 < chars_.size() && chars_[i + 1] == '-') {
|
||||||
|
if (chars_[i] <= ch && ch <= chars_[i + 2]) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
i += 3;
|
||||||
|
} else {
|
||||||
|
if (chars_[i] == ch) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
}
|
}
|
||||||
i += 3;
|
|
||||||
} else {
|
|
||||||
if (chars_[i] == ch) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
i += 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (c.encoding == Encoding::UTF8) {
|
||||||
|
|
||||||
|
codepoint ch = 0;
|
||||||
|
auto len = get_char(s, n, Encoding::UTF8, ch);
|
||||||
|
|
||||||
|
if (len < 1) {
|
||||||
|
c.set_error_pos(s);
|
||||||
|
return static_cast<size_t>(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto i = 0u;
|
||||||
|
while (i < codepoints_.size()) {
|
||||||
|
if (i + 2 < codepoints_.size() && codepoints_[i + 1] == '-') {
|
||||||
|
if (codepoints_[i] != -1 && codepoints_[i] <= ch && ch <= codepoints_[i + 2]) {
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
i += 3;
|
||||||
|
} else {
|
||||||
|
if (codepoints_[i] == ch) {
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
c.set_error_pos(s);
|
c.set_error_pos(s);
|
||||||
return static_cast<size_t>(-1);
|
return static_cast<size_t>(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
|
|
||||||
std::string chars_;
|
std::string chars_; // for ASCII/ANSI
|
||||||
|
std::vector<codepoint> codepoints_; // for UTF8
|
||||||
};
|
};
|
||||||
|
|
||||||
class Character : public Ope
|
class Character : public Ope
|
||||||
, public std::enable_shared_from_this<Character>
|
, public std::enable_shared_from_this<Character>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Character(char ch) : ch_(ch) {}
|
Character(codepoint ch) : ch_(ch) {}
|
||||||
|
|
||||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||||
c.trace("Character", s, n, sv, dt);
|
c.trace("Character", s, n, sv, dt);
|
||||||
// TODO: UTF8 support
|
|
||||||
if (n < 1 || s[0] != ch_) {
|
codepoint code = 0;
|
||||||
|
auto len = get_char(s, n, c.encoding, code);
|
||||||
|
|
||||||
|
if (len < 1 || code != ch_) {
|
||||||
c.set_error_pos(s);
|
c.set_error_pos(s);
|
||||||
return static_cast<size_t>(-1);
|
return static_cast<size_t>(-1);
|
||||||
}
|
}
|
||||||
return 1;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
|
|
||||||
char ch_;
|
codepoint ch_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class AnyCharacter : public Ope
|
class AnyCharacter : public Ope
|
||||||
@ -1039,12 +1194,14 @@ class AnyCharacter : public Ope
|
|||||||
public:
|
public:
|
||||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||||
c.trace("AnyCharacter", s, n, sv, dt);
|
c.trace("AnyCharacter", s, n, sv, dt);
|
||||||
// TODO: UTF8 support
|
|
||||||
if (n < 1) {
|
codepoint code = 0;
|
||||||
|
auto len = get_char(s, n, c.encoding, code);
|
||||||
|
|
||||||
|
if (len < 1) {
|
||||||
c.set_error_pos(s);
|
c.set_error_pos(s);
|
||||||
return static_cast<size_t>(-1);
|
|
||||||
}
|
}
|
||||||
return 1;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
@ -1273,7 +1430,7 @@ inline std::shared_ptr<Ope> cls(const std::string& chars) {
|
|||||||
return std::make_shared<CharacterClass>(chars);
|
return std::make_shared<CharacterClass>(chars);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::shared_ptr<Ope> chr(char dt) {
|
inline std::shared_ptr<Ope> chr(codepoint dt) {
|
||||||
return std::make_shared<Character>(dt);
|
return std::make_shared<Character>(dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1605,6 +1762,7 @@ public:
|
|||||||
|
|
||||||
Definition()
|
Definition()
|
||||||
: ignoreSemanticValue(false)
|
: ignoreSemanticValue(false)
|
||||||
|
, encoding(Encoding::DEFAULT_ENCODING)
|
||||||
, enablePackratParsing(false)
|
, enablePackratParsing(false)
|
||||||
, is_macro(false)
|
, is_macro(false)
|
||||||
, holder_(std::make_shared<Holder>(this))
|
, holder_(std::make_shared<Holder>(this))
|
||||||
@ -1613,6 +1771,7 @@ public:
|
|||||||
Definition(const Definition& rhs)
|
Definition(const Definition& rhs)
|
||||||
: name(rhs.name)
|
: name(rhs.name)
|
||||||
, ignoreSemanticValue(false)
|
, ignoreSemanticValue(false)
|
||||||
|
, encoding(Encoding::DEFAULT_ENCODING)
|
||||||
, enablePackratParsing(false)
|
, enablePackratParsing(false)
|
||||||
, is_macro(false)
|
, is_macro(false)
|
||||||
, holder_(rhs.holder_)
|
, holder_(rhs.holder_)
|
||||||
@ -1626,6 +1785,7 @@ public:
|
|||||||
, ignoreSemanticValue(rhs.ignoreSemanticValue)
|
, ignoreSemanticValue(rhs.ignoreSemanticValue)
|
||||||
, whitespaceOpe(rhs.whitespaceOpe)
|
, whitespaceOpe(rhs.whitespaceOpe)
|
||||||
, wordOpe(rhs.wordOpe)
|
, wordOpe(rhs.wordOpe)
|
||||||
|
, encoding(rhs.encoding)
|
||||||
, enablePackratParsing(rhs.enablePackratParsing)
|
, enablePackratParsing(rhs.enablePackratParsing)
|
||||||
, is_macro(rhs.is_macro)
|
, is_macro(rhs.is_macro)
|
||||||
, holder_(std::move(rhs.holder_))
|
, holder_(std::move(rhs.holder_))
|
||||||
@ -1636,6 +1796,7 @@ public:
|
|||||||
|
|
||||||
Definition(const std::shared_ptr<Ope>& ope)
|
Definition(const std::shared_ptr<Ope>& ope)
|
||||||
: ignoreSemanticValue(false)
|
: ignoreSemanticValue(false)
|
||||||
|
, encoding(Encoding::DEFAULT_ENCODING)
|
||||||
, enablePackratParsing(false)
|
, enablePackratParsing(false)
|
||||||
, is_macro(false)
|
, is_macro(false)
|
||||||
, holder_(std::make_shared<Holder>(this))
|
, holder_(std::make_shared<Holder>(this))
|
||||||
@ -1749,6 +1910,7 @@ public:
|
|||||||
bool ignoreSemanticValue;
|
bool ignoreSemanticValue;
|
||||||
std::shared_ptr<Ope> whitespaceOpe;
|
std::shared_ptr<Ope> whitespaceOpe;
|
||||||
std::shared_ptr<Ope> wordOpe;
|
std::shared_ptr<Ope> wordOpe;
|
||||||
|
Encoding encoding;
|
||||||
bool enablePackratParsing;
|
bool enablePackratParsing;
|
||||||
bool is_macro;
|
bool is_macro;
|
||||||
std::vector<std::string> params;
|
std::vector<std::string> params;
|
||||||
@ -1775,7 +1937,7 @@ private:
|
|||||||
wordOpe->accept(vis);
|
wordOpe->accept(vis);
|
||||||
}
|
}
|
||||||
|
|
||||||
Context cxt(path, s, n, vis.ids.size(), whitespaceOpe, wordOpe, enablePackratParsing, tracer);
|
Context cxt(path, s, n, vis.ids.size(), whitespaceOpe, wordOpe, encoding, enablePackratParsing, tracer);
|
||||||
auto len = ope->parse(s, n, sv, cxt, dt);
|
auto len = ope->parse(s, n, sv, cxt, dt);
|
||||||
return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message };
|
return Result{ success(len), len, cxt.error_pos, cxt.message_pos, cxt.message };
|
||||||
}
|
}
|
||||||
@ -1800,27 +1962,27 @@ inline size_t parse_literal(const char* s, size_t n, SemanticValues& sv, Context
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Word check
|
// Word check
|
||||||
static Context dummy_c(nullptr, lit.data(), lit.size(), 0, nullptr, nullptr, false, nullptr);
|
static Context dummy_c(nullptr, lit.data(), lit.size(), 0, nullptr, nullptr, Encoding::DEFAULT_ENCODING, false, nullptr);
|
||||||
static SemanticValues dummy_sv;
|
static SemanticValues dummy_sv;
|
||||||
static any dummy_dt;
|
static any dummy_dt;
|
||||||
|
|
||||||
if (!init_is_word) { // TODO: Protect with mutex
|
if (!init_is_word) { // TODO: Protect with mutex
|
||||||
if (c.wordOpe) {
|
if (c.wordOpe) {
|
||||||
auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_sv, dummy_c, dummy_dt);
|
auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_sv, dummy_c, dummy_dt);
|
||||||
is_word = success(len);
|
is_word = success(len);
|
||||||
}
|
}
|
||||||
init_is_word = true;
|
init_is_word = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_word) {
|
if (is_word) {
|
||||||
auto ope = std::make_shared<NotPredicate>(c.wordOpe);
|
auto ope = std::make_shared<NotPredicate>(c.wordOpe);
|
||||||
auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt);
|
auto len = ope->parse(s + i, n - i, dummy_sv, dummy_c, dummy_dt);
|
||||||
if (fail(len)) {
|
if (fail(len)) {
|
||||||
return static_cast<size_t>(-1);
|
return static_cast<size_t>(-1);
|
||||||
}
|
}
|
||||||
i += len;
|
i += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip whiltespace
|
// Skip whiltespace
|
||||||
if (!c.in_token) {
|
if (!c.in_token) {
|
||||||
@ -1842,7 +2004,7 @@ inline size_t LiteralString::parse(const char* s, size_t n, SemanticValues& sv,
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
inline size_t TokenBoundary::parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||||
c.in_token = true;
|
c.in_token = true;
|
||||||
auto se = make_scope_exit([&]() { c.in_token = false; });
|
auto se = make_scope_exit([&]() { c.in_token = false; });
|
||||||
const auto& rule = *ope_;
|
const auto& rule = *ope_;
|
||||||
auto len = rule.parse(s, n, sv, c, dt);
|
auto len = rule.parse(s, n, sv, c, dt);
|
||||||
@ -1882,6 +2044,7 @@ inline size_t Holder::parse(const char* s, size_t n, SemanticValues& sv, Context
|
|||||||
any val;
|
any val;
|
||||||
|
|
||||||
c.packrat(s, outer_->id, len, val, [&](any& a_val) {
|
c.packrat(s, outer_->id, len, val, [&](any& a_val) {
|
||||||
|
|
||||||
if (outer_->enter) {
|
if (outer_->enter) {
|
||||||
outer_->enter(s, n, dt);
|
outer_->enter(s, n, dt);
|
||||||
}
|
}
|
||||||
@ -1946,33 +2109,33 @@ inline any Holder::reduce(const SemanticValues& sv, any& dt) const {
|
|||||||
|
|
||||||
inline size_t Reference::parse(
|
inline size_t Reference::parse(
|
||||||
const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const {
|
||||||
if (rule_) {
|
if (rule_) {
|
||||||
// Reference rule
|
// Reference rule
|
||||||
if (rule_->is_macro) {
|
if (rule_->is_macro) {
|
||||||
// Macro
|
// Macro
|
||||||
FindReference vis(c.top_args(), rule_->params);
|
FindReference vis(c.top_args(), rule_->params);
|
||||||
|
|
||||||
// Collect arguments
|
// Collect arguments
|
||||||
std::vector<std::shared_ptr<Ope>> args;
|
std::vector<std::shared_ptr<Ope>> args;
|
||||||
for (auto arg: args_) {
|
for (auto arg: args_) {
|
||||||
arg->accept(vis);
|
arg->accept(vis);
|
||||||
args.push_back(vis.found_ope);
|
args.push_back(vis.found_ope);
|
||||||
}
|
}
|
||||||
|
|
||||||
c.push_args(args);
|
c.push_args(args);
|
||||||
auto se = make_scope_exit([&]() { c.pop_args(); });
|
auto se = make_scope_exit([&]() { c.pop_args(); });
|
||||||
auto ope = get_core_operator();
|
auto ope = get_core_operator();
|
||||||
return ope->parse(s, n, sv, c, dt);
|
return ope->parse(s, n, sv, c, dt);
|
||||||
} else {
|
} else {
|
||||||
// Definition
|
// Definition
|
||||||
auto ope = get_core_operator();
|
auto ope = get_core_operator();
|
||||||
return ope->parse(s, n, sv, c, dt);
|
return ope->parse(s, n, sv, c, dt);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Reference parameter in macro
|
// Reference parameter in macro
|
||||||
const auto& args = c.top_args();
|
const auto& args = c.top_args();
|
||||||
return args[iarg_]->parse(s, n, sv, c, dt);
|
return args[iarg_]->parse(s, n, sv, c, dt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::shared_ptr<Ope> Reference::get_core_operator() const {
|
inline std::shared_ptr<Ope> Reference::get_core_operator() const {
|
||||||
@ -2124,9 +2287,10 @@ public:
|
|||||||
const char* s,
|
const char* s,
|
||||||
size_t n,
|
size_t n,
|
||||||
std::string& start,
|
std::string& start,
|
||||||
Log log)
|
Log log,
|
||||||
|
Encoding enc = Encoding::DEFAULT_ENCODING)
|
||||||
{
|
{
|
||||||
return get_instance().perform_core(s, n, start, log);
|
return get_instance().perform_core(s, n, start, log, enc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// For debuging purpose
|
// For debuging purpose
|
||||||
@ -2173,7 +2337,8 @@ private:
|
|||||||
|
|
||||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||||
g["IdentStart"] <= cls("a-zA-Z_\x80-\xff%");
|
g["IdentStart"] <= cho(seq(npd(cls("\x01-\x7f")), dot()), cls("a-zA-Z_%"));
|
||||||
|
|
||||||
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
||||||
|
|
||||||
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
||||||
@ -2188,11 +2353,7 @@ private:
|
|||||||
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
||||||
seq(npd(chr('\\')), dot()));
|
seq(npd(chr('\\')), dot()));
|
||||||
|
|
||||||
#if !defined(PEGLIB_NO_UNICODE_CHARS)
|
g["LEFTARROW"] <= seq(cho(lit("<-"), lit("\xe2\x86\x90")), g["Spacing"]);
|
||||||
g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8"←")), g["Spacing"]);
|
|
||||||
#else
|
|
||||||
g["LEFTARROW"] <= seq(lit("<-"), g["Spacing"]);
|
|
||||||
#endif
|
|
||||||
~g["SLASH"] <= seq(chr('/'), g["Spacing"]);
|
~g["SLASH"] <= seq(chr('/'), g["Spacing"]);
|
||||||
g["AND"] <= seq(chr('&'), g["Spacing"]);
|
g["AND"] <= seq(chr('&'), g["Spacing"]);
|
||||||
g["NOT"] <= seq(chr('!'), g["Spacing"]);
|
g["NOT"] <= seq(chr('!'), g["Spacing"]);
|
||||||
@ -2413,10 +2574,12 @@ private:
|
|||||||
const char* s,
|
const char* s,
|
||||||
size_t n,
|
size_t n,
|
||||||
std::string& start,
|
std::string& start,
|
||||||
Log log)
|
Log log,
|
||||||
|
Encoding enc)
|
||||||
{
|
{
|
||||||
Data data;
|
Data data;
|
||||||
any dt = &data;
|
any dt = &data;
|
||||||
|
g["Grammar"].encoding = enc;
|
||||||
auto r = g["Grammar"].parse(s, n, dt);
|
auto r = g["Grammar"].parse(s, n, dt);
|
||||||
|
|
||||||
if (!r.ret) {
|
if (!r.ret) {
|
||||||
@ -2763,19 +2926,25 @@ class parser
|
|||||||
public:
|
public:
|
||||||
parser() = default;
|
parser() = default;
|
||||||
|
|
||||||
parser(const char* s, size_t n) {
|
parser(const char* s, size_t n, Encoding enc = Encoding::DEFAULT_ENCODING)
|
||||||
|
: enc_(enc)
|
||||||
|
{
|
||||||
load_grammar(s, n);
|
load_grammar(s, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
parser(const char* s)
|
parser(const char* s, Encoding enc = Encoding::DEFAULT_ENCODING)
|
||||||
: parser(s, strlen(s)) {}
|
: parser(s, strlen(s), enc) {}
|
||||||
|
|
||||||
operator bool() {
|
operator bool() {
|
||||||
return grammar_ != nullptr;
|
return grammar_ != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_grammar(const char* s, size_t n) {
|
bool load_grammar(const char* s, size_t n) {
|
||||||
grammar_ = ParserGenerator::parse(s, n, start_, log);
|
grammar_ = ParserGenerator::parse(s, n, start_, log, enc_);
|
||||||
|
if (grammar_ != nullptr) {
|
||||||
|
auto& rule = (*grammar_)[start_];
|
||||||
|
rule.encoding = enc_;
|
||||||
|
}
|
||||||
return grammar_ != nullptr;
|
return grammar_ != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2948,6 +3117,7 @@ private:
|
|||||||
|
|
||||||
std::shared_ptr<Grammar> grammar_;
|
std::shared_ptr<Grammar> grammar_;
|
||||||
std::string start_;
|
std::string start_;
|
||||||
|
// Encoding passed on to the grammar parser and to the start rule.
// Not const and given a default initializer: a const member without an
// initializer would make `parser() = default;` a deleted constructor and
// would also delete copy assignment.
Encoding enc_ = Encoding::DEFAULT_ENCODING;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace peg
|
} // namespace peg
|
||||||
|
@ -5,7 +5,6 @@
|
|||||||
#include <peglib.h>
|
#include <peglib.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
#if !defined(PEGLIB_NO_UNICODE_CHARS)
|
|
||||||
TEST_CASE("Simple syntax test (with unicode)", "[general]")
|
TEST_CASE("Simple syntax test (with unicode)", "[general]")
|
||||||
{
|
{
|
||||||
peg::parser parser(
|
peg::parser parser(
|
||||||
@ -15,8 +14,8 @@ TEST_CASE("Simple syntax test (with unicode)", "[general]")
|
|||||||
|
|
||||||
bool ret = parser;
|
bool ret = parser;
|
||||||
REQUIRE(ret == true);
|
REQUIRE(ret == true);
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
TEST_CASE("Simple syntax test", "[general]")
|
TEST_CASE("Simple syntax test", "[general]")
|
||||||
{
|
{
|
||||||
@ -1373,6 +1372,7 @@ TEST_CASE("PEG Literal", "[peg]")
|
|||||||
REQUIRE(exact(g, "Literal", "abc") == false);
|
REQUIRE(exact(g, "Literal", "abc") == false);
|
||||||
REQUIRE(exact(g, "Literal", "") == false);
|
REQUIRE(exact(g, "Literal", "") == false);
|
||||||
REQUIRE(exact(g, "Literal", u8"日本語") == false);
|
REQUIRE(exact(g, "Literal", u8"日本語") == false);
|
||||||
|
REQUIRE(exact(g, "Literal", u8"'日本語'") == true);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("PEG Class", "[peg]")
|
TEST_CASE("PEG Class", "[peg]")
|
||||||
@ -1391,6 +1391,7 @@ TEST_CASE("PEG Class", "[peg]")
|
|||||||
REQUIRE(exact(g, "Class", "]") == false);
|
REQUIRE(exact(g, "Class", "]") == false);
|
||||||
REQUIRE(exact(g, "Class", "a]") == false);
|
REQUIRE(exact(g, "Class", "a]") == false);
|
||||||
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
||||||
|
REQUIRE(exact(g, "Class", u8"[あ-ん]") == true);
|
||||||
REQUIRE(exact(g, "Class", "[-+]") == true);
|
REQUIRE(exact(g, "Class", "[-+]") == true);
|
||||||
REQUIRE(exact(g, "Class", "[+-]") == false);
|
REQUIRE(exact(g, "Class", "[+-]") == false);
|
||||||
}
|
}
|
||||||
@ -1436,7 +1437,7 @@ TEST_CASE("PEG Char", "[peg]")
|
|||||||
REQUIRE(exact(g, "Char", " ") == true);
|
REQUIRE(exact(g, "Char", " ") == true);
|
||||||
REQUIRE(exact(g, "Char", " ") == false);
|
REQUIRE(exact(g, "Char", " ") == false);
|
||||||
REQUIRE(exact(g, "Char", "") == false);
|
REQUIRE(exact(g, "Char", "") == false);
|
||||||
REQUIRE(exact(g, "Char", u8"あ") == false);
|
REQUIRE(exact(g, "Char", u8"あ") == true);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("PEG Operators", "[peg]")
|
TEST_CASE("PEG Operators", "[peg]")
|
||||||
|
Loading…
Reference in New Issue
Block a user