mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2025-01-22 13:25:30 +00:00
Merge branch 'unicode'
This commit is contained in:
commit
c006918329
@ -417,7 +417,7 @@ assert(g.parse(" Hello BNF! "));
|
||||
Unicode support
|
||||
---------------
|
||||
|
||||
Since cpp-peglib only accepts 8-bit characters, it probably accepts UTF-8 text. But `.` matches only a byte, not a Unicode character. Also, it doesn't support `\u????`.
|
||||
cpp-peglib accepts UTF8 text. `.` matches a Unicode codepoint. Also, it supports `\u????`.
|
||||
|
||||
peglint - PEG syntax lint utility
|
||||
---------------------------------
|
||||
@ -535,7 +535,7 @@ Tested compilers
|
||||
TODO
|
||||
----
|
||||
|
||||
* Unicode support (`.` matches a Unicode char. `\u????`, `\p{L}`)
|
||||
* Advanced Unicode support ([Unicode regular expression](http://www.unicode.org/reports/tr18/))
|
||||
|
||||
License
|
||||
-------
|
||||
|
395
peglib.h
395
peglib.h
@ -205,6 +205,213 @@ auto make_scope_exit(EF&& exit_function) -> scope_exit<EF> {
|
||||
return scope_exit<typename std::remove_reference<EF>::type>(std::forward<EF>(exit_function));
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
* UTF8 functions
|
||||
*---------------------------------------------------------------------------*/
|
||||
|
||||
/*
 * Returns the number of bytes (1-4) occupied by the UTF-8 sequence that
 * starts at s8[0], as announced by its lead byte.
 *
 * s8: pointer to the first byte of the sequence
 * l : number of readable bytes starting at s8
 *
 * Returns 0 when `l` is 0, when s8[0] is not a valid lead byte (a
 * continuation byte or 0xF8 and above), or when the sequence announced by the
 * lead byte would extend past the `l` available bytes. Only the lead byte is
 * inspected; continuation bytes are not validated here.
 */
inline size_t codepoint_length(const char *s8, size_t l) {
    if (l == 0) {
        return 0;
    }

    const auto b = static_cast<uint8_t>(s8[0]);

    size_t len = 0;
    if ((b & 0x80) == 0) {
        len = 1;
    } else if ((b & 0xE0) == 0xC0) {
        len = 2;
    } else if ((b & 0xF0) == 0xE0) {
        len = 3;
    } else if ((b & 0xF8) == 0xF0) {
        len = 4;
    }

    // A truncated sequence must not be reported as longer than the input,
    // otherwise callers that advance by the returned length run past the end.
    return len <= l ? len : 0;
}
|
||||
|
||||
/*
 * Writes the UTF-8 encoding of `cp` into `buff` and returns the number of
 * bytes written (1-4).
 *
 * `buff` must have room for the bytes written (at most 4). Returns 0 and
 * leaves `buff` untouched for code points that cannot be encoded: the
 * surrogate block D800-DFFF and everything from 0x110000 upwards.
 */
inline size_t encode_codepoint(char32_t cp, char *buff) {
    // Reject the unencodable values first so the rest is a plain size ladder.
    const bool surrogate = (cp >= 0xD800 && cp < 0xE000);
    if (surrogate || cp >= 0x110000) {
        return 0;
    }

    // Continuation byte carrying the low six bits of `bits`.
    const auto trail = [](char32_t bits) {
        return static_cast<char>(0x80 | (bits & 0x3F));
    };

    if (cp < 0x0080) {
        buff[0] = static_cast<char>(cp & 0x7F);
        return 1;
    }

    if (cp < 0x0800) {
        buff[0] = static_cast<char>(0xC0 | ((cp >> 6) & 0x1F));
        buff[1] = trail(cp);
        return 2;
    }

    if (cp < 0x10000) {
        buff[0] = static_cast<char>(0xE0 | ((cp >> 12) & 0xF));
        buff[1] = trail(cp >> 6);
        buff[2] = trail(cp);
        return 3;
    }

    buff[0] = static_cast<char>(0xF0 | ((cp >> 18) & 0x7));
    buff[1] = trail(cp >> 12);
    buff[2] = trail(cp >> 6);
    buff[3] = trail(cp);
    return 4;
}
|
||||
|
||||
inline std::string encode_codepoint(char32_t cp) {
|
||||
char buff[4];
|
||||
auto l = encode_codepoint(cp, buff);
|
||||
return std::string(buff, l);
|
||||
}
|
||||
|
||||
/*
 * Decodes the UTF-8 sequence starting at s8[0].
 *
 * s8   : pointer to the first byte of the sequence
 * l    : number of readable bytes starting at s8
 * bytes: receives the length of the decoded sequence (1-4)
 * cp   : receives the decoded code point
 *
 * Returns true on success. Returns false, leaving `bytes` and `cp` untouched,
 * when `l` is 0, when s8[0] is not a lead byte, when fewer bytes are
 * available than the lead byte announces, or when one of the following bytes
 * is not a continuation byte (10xxxxxx). Overlong forms, surrogates and
 * values above U+10FFFF are not rejected.
 */
inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes,
                             char32_t &cp) {
    if (l == 0) {
        return false;
    }

    const auto lead = static_cast<uint8_t>(s8[0]);

    // Sequence length and payload bits carried by the lead byte.
    size_t need = 0;
    char32_t value = 0;
    if ((lead & 0x80) == 0) {
        need = 1;
        value = lead;
    } else if ((lead & 0xE0) == 0xC0) {
        need = 2;
        value = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        need = 3;
        value = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        need = 4;
        value = lead & 0x07;
    } else {
        return false;
    }

    if (l < need) {
        return false;
    }

    // Each trailing byte contributes six bits and must look like 10xxxxxx;
    // without this check "\xC3" followed by 'a' would decode to a bogus code
    // point and consume the 'a'.
    for (size_t i = 1; i < need; i++) {
        const auto b = static_cast<uint8_t>(s8[i]);
        if ((b & 0xC0) != 0x80) {
            return false;
        }
        value = (value << 6) | (b & 0x3F);
    }

    bytes = need;
    cp = value;
    return true;
}
|
||||
|
||||
inline size_t decode_codepoint(const char *s8, size_t l, char32_t &out) {
|
||||
size_t bytes;
|
||||
if (decode_codepoint(s8, l, bytes, out)) {
|
||||
return bytes;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline char32_t decode_codepoint(const char *s8, size_t l) {
|
||||
char32_t out = 0;
|
||||
decode_codepoint(s8, l, out);
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::u32string decode(const char *s8, size_t l) {
|
||||
std::u32string out;
|
||||
size_t i = 0;
|
||||
while (i < l) {
|
||||
auto beg = i++;
|
||||
while (i < l && (s8[i] & 0xc0) == 0x80) {
|
||||
i++;
|
||||
}
|
||||
out += decode_codepoint(&s8[beg], (i - beg));
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
* resolve_escape_sequence
|
||||
*---------------------------------------------------------------------------*/
|
||||
|
||||
/*
 * Tests whether `c` is a hexadecimal digit (0-9, a-f, A-F).
 * On success stores its numeric value (0-15) in `v` and returns true;
 * otherwise returns false and leaves `v` unchanged.
 */
inline bool is_hex(char c, int& v) {
    int digit = -1;
    if (c >= '0' && c <= '9') {
        digit = c - '0';
    } else if (c >= 'a' && c <= 'f') {
        digit = 10 + (c - 'a');
    } else if (c >= 'A' && c <= 'F') {
        digit = 10 + (c - 'A');
    }

    if (digit < 0) {
        return false;
    }
    v = digit;
    return true;
}
|
||||
|
||||
/*
 * Tests whether `c` is a decimal digit (0-9).
 * On success stores its numeric value in `v` and returns true; otherwise
 * returns false and leaves `v` unchanged.
 */
inline bool is_digit(char c, int& v) {
    const bool ok = !(c < '0' || c > '9');
    if (ok) {
        v = c - '0';
    }
    return ok;
}
|
||||
|
||||
inline std::pair<int, size_t> parse_hex_number(const char* s, size_t n, size_t i) {
|
||||
int ret = 0;
|
||||
int val;
|
||||
while (i < n && is_hex(s[i], val)) {
|
||||
ret = static_cast<int>(ret * 16 + val);
|
||||
i++;
|
||||
}
|
||||
return std::make_pair(ret, i);
|
||||
}
|
||||
|
||||
/*
 * Reads the run of octal digits ('0'-'7') that starts at s[i] and stops at
 * the first other character or at index n.
 *
 * Returns {value, index of the first character not consumed}. When no octal
 * digit is found the result is {0, i}.
 *
 * '8' and '9' end the run: they are not octal digits, and folding them in
 * with base 8 would yield a value that matches no valid octal spelling.
 * The value is accumulated in unsigned arithmetic so that an overly long
 * digit run wraps around instead of triggering signed-overflow undefined
 * behaviour.
 */
inline std::pair<int, size_t> parse_octal_number(const char* s, size_t n, size_t i) {
    unsigned int acc = 0;
    while (i < n && '0' <= s[i] && s[i] <= '7') {
        acc = acc * 8u + static_cast<unsigned int>(s[i] - '0');
        i++;
    }
    return std::make_pair(static_cast<int>(acc), i);
}
|
||||
|
||||
inline std::string resolve_escape_sequence(const char* s, size_t n) {
|
||||
std::string r;
|
||||
r.reserve(n);
|
||||
|
||||
size_t i = 0;
|
||||
while (i < n) {
|
||||
auto ch = s[i];
|
||||
if (ch == '\\') {
|
||||
i++;
|
||||
switch (s[i]) {
|
||||
case 'n': r += '\n'; i++; break;
|
||||
case 'r': r += '\r'; i++; break;
|
||||
case 't': r += '\t'; i++; break;
|
||||
case '\'': r += '\''; i++; break;
|
||||
case '"': r += '"'; i++; break;
|
||||
case '[': r += '['; i++; break;
|
||||
case ']': r += ']'; i++; break;
|
||||
case '\\': r += '\\'; i++; break;
|
||||
case 'x':
|
||||
case 'u': {
|
||||
char32_t cp;
|
||||
std::tie(cp, i) = parse_hex_number(s, n, i + 1);
|
||||
r += encode_codepoint(cp);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
char32_t cp;
|
||||
std::tie(cp, i) = parse_octal_number(s, n, i);
|
||||
r += encode_codepoint(cp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
r += ch;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
* PEG
|
||||
*---------------------------------------------------------------------------*/
|
||||
@ -979,37 +1186,51 @@ class CharacterClass : public Ope
|
||||
, public std::enable_shared_from_this<CharacterClass>
|
||||
{
|
||||
public:
|
||||
CharacterClass(const std::string& chars) : chars_(chars) {}
|
||||
CharacterClass(const std::string& s) {
|
||||
auto chars = decode(s.c_str(), s.length());
|
||||
auto i = 0u;
|
||||
while (i < chars.size()) {
|
||||
if (i + 2 < chars.size() && chars[i + 1] == '-') {
|
||||
auto cp1 = chars[i];
|
||||
auto cp2 = chars[i + 2];
|
||||
ranges_.emplace_back(std::make_pair(cp1, cp2));
|
||||
i += 3;
|
||||
} else {
|
||||
auto cp = chars[i];
|
||||
ranges_.emplace_back(std::make_pair(cp, cp));
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges) : ranges_(ranges) {}
|
||||
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||
c.trace("CharacterClass", s, n, sv, dt);
|
||||
// TODO: UTF8 support
|
||||
|
||||
if (n < 1) {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
}
|
||||
auto ch = s[0];
|
||||
auto i = 0u;
|
||||
while (i < chars_.size()) {
|
||||
if (i + 2 < chars_.size() && chars_[i + 1] == '-') {
|
||||
if (chars_[i] <= ch && ch <= chars_[i + 2]) {
|
||||
return 1;
|
||||
|
||||
char32_t cp;
|
||||
auto len = decode_codepoint(s, n, cp);
|
||||
|
||||
if (!ranges_.empty()) {
|
||||
for (const auto& range: ranges_) {
|
||||
if (range.first <= cp && cp <= range.second) {
|
||||
return len;
|
||||
}
|
||||
i += 3;
|
||||
} else {
|
||||
if (chars_[i] == ch) {
|
||||
return 1;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
}
|
||||
|
||||
void accept(Visitor& v) override;
|
||||
|
||||
std::string chars_;
|
||||
std::vector<std::pair<char32_t, char32_t>> ranges_;
|
||||
};
|
||||
|
||||
class Character : public Ope
|
||||
@ -1020,7 +1241,6 @@ public:
|
||||
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||
c.trace("Character", s, n, sv, dt);
|
||||
// TODO: UTF8 support
|
||||
if (n < 1 || s[0] != ch_) {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
@ -1039,12 +1259,12 @@ class AnyCharacter : public Ope
|
||||
public:
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||
c.trace("AnyCharacter", s, n, sv, dt);
|
||||
// TODO: UTF8 support
|
||||
if (n < 1) {
|
||||
auto len = codepoint_length(s, n);
|
||||
if (len < 1) {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
}
|
||||
return 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
void accept(Visitor& v) override;
|
||||
@ -1282,8 +1502,12 @@ inline std::shared_ptr<Ope> lit(const std::string& lit) {
|
||||
return std::make_shared<LiteralString>(lit);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> cls(const std::string& chars) {
|
||||
return std::make_shared<CharacterClass>(chars);
|
||||
inline std::shared_ptr<Ope> cls(const std::string& s) {
|
||||
return std::make_shared<CharacterClass>(s);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
||||
return std::make_shared<CharacterClass>(ranges);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> chr(char dt) {
|
||||
@ -2205,7 +2429,10 @@ private:
|
||||
|
||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||
g["IdentStart"] <= cls("a-zA-Z_\x80-\xff%");
|
||||
|
||||
const static std::vector<std::pair<char32_t, char32_t>> range = {{ 0x0080, 0xFFFF }};
|
||||
g["IdentStart"] <= cho(cls("a-zA-Z_%"), cls(range));
|
||||
|
||||
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
||||
|
||||
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
||||
@ -2218,12 +2445,13 @@ private:
|
||||
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
|
||||
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
|
||||
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
||||
seq(lit("\\u"), cls("0-9a-fA-F"), cls("0-9a-fA-F"), cls("0-9a-fA-F"), cls("0-9a-fA-F")),
|
||||
seq(npd(chr('\\')), dot()));
|
||||
|
||||
#if !defined(PEGLIB_NO_UNICODE_CHARS)
|
||||
g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8"←")), g["Spacing"]);
|
||||
#else
|
||||
#if defined(PEGLIB_NO_UNICODE_CHARS)
|
||||
g["LEFTARROW"] <= seq(lit("<-"), g["Spacing"]);
|
||||
#else
|
||||
g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8"←")), g["Spacing"]);
|
||||
#endif
|
||||
~g["SLASH"] <= seq(chr('/'), g["Spacing"]);
|
||||
g["AND"] <= seq(chr('&'), g["Spacing"]);
|
||||
@ -2235,7 +2463,7 @@ private:
|
||||
~g["CLOSE"] <= seq(chr(')'), g["Spacing"]);
|
||||
g["DOT"] <= seq(chr('.'), g["Spacing"]);
|
||||
|
||||
g["Spacing"] <= zom(cho(g["Space"], g["Comment"]));
|
||||
~g["Spacing"] <= zom(cho(g["Space"], g["Comment"]));
|
||||
g["Comment"] <= seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]);
|
||||
g["Space"] <= cho(chr(' '), chr('\t'), g["EndOfLine"]);
|
||||
g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r'));
|
||||
@ -2407,13 +2635,41 @@ private:
|
||||
return std::string(sv.c_str(), sv.length());
|
||||
};
|
||||
|
||||
g["Literal"] = [this](const SemanticValues& sv) {
|
||||
g["IdentStart"] = [](const SemanticValues& /*sv*/) {
|
||||
return std::string();
|
||||
};
|
||||
|
||||
g["IdentRest"] = [](const SemanticValues& /*sv*/) {
|
||||
return std::string();
|
||||
};
|
||||
|
||||
g["Literal"] = [](const SemanticValues& sv) {
|
||||
const auto& tok = sv.tokens.front();
|
||||
return lit(resolve_escape_sequence(tok.first, tok.second));
|
||||
};
|
||||
g["Class"] = [this](const SemanticValues& sv) {
|
||||
const auto& tok = sv.tokens.front();
|
||||
return cls(resolve_escape_sequence(tok.first, tok.second));
|
||||
g["Class"] = [](const SemanticValues& sv) {
|
||||
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
||||
return cls(ranges);
|
||||
};
|
||||
g["Range"] = [](const SemanticValues& sv) {
|
||||
switch (sv.choice()) {
|
||||
case 0: {
|
||||
auto s1 = sv[0].get<std::string>();
|
||||
auto s2 = sv[1].get<std::string>();
|
||||
auto cp1 = decode_codepoint(s1.c_str(), s1.length());
|
||||
auto cp2 = decode_codepoint(s2.c_str(), s2.length());
|
||||
return std::make_pair(cp1, cp2);
|
||||
}
|
||||
case 1: {
|
||||
auto s = sv[0].get<std::string>();
|
||||
auto cp = decode_codepoint(s.c_str(), s.length());
|
||||
return std::make_pair(cp, cp);
|
||||
}
|
||||
}
|
||||
return std::make_pair<char32_t, char32_t>(0, 0);
|
||||
};
|
||||
g["Char"] = [](const SemanticValues& sv) {
|
||||
return resolve_escape_sequence(sv.c_str(), sv.length());
|
||||
};
|
||||
|
||||
g["AND"] = [](const SemanticValues& sv) { return *sv.c_str(); };
|
||||
@ -2563,85 +2819,6 @@ private:
|
||||
return data.grammar;
|
||||
}
|
||||
|
||||
bool is_hex(char c, int& v) {
|
||||
if ('0' <= c && c <= '9') {
|
||||
v = c - '0';
|
||||
return true;
|
||||
} else if ('a' <= c && c <= 'f') {
|
||||
v = c - 'a' + 10;
|
||||
return true;
|
||||
} else if ('A' <= c && c <= 'F') {
|
||||
v = c - 'A' + 10;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_digit(char c, int& v) {
|
||||
if ('0' <= c && c <= '9') {
|
||||
v = c - '0';
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::pair<char, size_t> parse_hex_number(const char* s, size_t n, size_t i) {
|
||||
char ret = 0;
|
||||
int val;
|
||||
while (i < n && is_hex(s[i], val)) {
|
||||
ret = static_cast<char>(ret * 16 + val);
|
||||
i++;
|
||||
}
|
||||
return std::make_pair(ret, i);
|
||||
}
|
||||
|
||||
std::pair<char, size_t> parse_octal_number(const char* s, size_t n, size_t i) {
|
||||
char ret = 0;
|
||||
int val;
|
||||
while (i < n && is_digit(s[i], val)) {
|
||||
ret = static_cast<char>(ret * 8 + val);
|
||||
i++;
|
||||
}
|
||||
return std::make_pair(ret, i);
|
||||
}
|
||||
|
||||
std::string resolve_escape_sequence(const char* s, size_t n) {
|
||||
std::string r;
|
||||
r.reserve(n);
|
||||
|
||||
size_t i = 0;
|
||||
while (i < n) {
|
||||
auto ch = s[i];
|
||||
if (ch == '\\') {
|
||||
i++;
|
||||
switch (s[i]) {
|
||||
case 'n': r += '\n'; i++; break;
|
||||
case 'r': r += '\r'; i++; break;
|
||||
case 't': r += '\t'; i++; break;
|
||||
case '\'': r += '\''; i++; break;
|
||||
case '"': r += '"'; i++; break;
|
||||
case '[': r += '['; i++; break;
|
||||
case ']': r += ']'; i++; break;
|
||||
case '\\': r += '\\'; i++; break;
|
||||
case 'x': {
|
||||
std::tie(ch, i) = parse_hex_number(s, n, i + 1);
|
||||
r += ch;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
std::tie(ch, i) = parse_octal_number(s, n, i);
|
||||
r += ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
r += ch;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
Grammar g;
|
||||
};
|
||||
|
||||
|
68
test/test.cc
68
test/test.cc
@ -356,13 +356,13 @@ TEST_CASE("Backtracking with AST", "[general]")
|
||||
REQUIRE(ast->nodes.size() == 2);
|
||||
}
|
||||
|
||||
TEST_CASE("Octal/Hex value test", "[general]")
|
||||
TEST_CASE("Octal/Hex/Unicode value test", "[general]")
|
||||
{
|
||||
peg::parser parser(
|
||||
R"( ROOT <- '\132\x7a' )"
|
||||
R"( ROOT <- '\132\x7a\u30f3' )"
|
||||
);
|
||||
|
||||
auto ret = parser.parse("Zz");
|
||||
auto ret = parser.parse("Zzン");
|
||||
|
||||
REQUIRE(ret == true);
|
||||
}
|
||||
@ -1002,12 +1002,10 @@ TEST_CASE("Semantic predicate test", "[predicate]")
|
||||
};
|
||||
|
||||
long val;
|
||||
auto ret = parser.parse("100", val);
|
||||
REQUIRE(ret == true);
|
||||
REQUIRE(parser.parse("100", val));
|
||||
REQUIRE(val == 100);
|
||||
|
||||
ret = parser.parse("200", val);
|
||||
REQUIRE(ret == false);
|
||||
REQUIRE(!parser.parse("200", val));
|
||||
}
|
||||
|
||||
TEST_CASE("Japanese character", "[unicode]")
|
||||
@ -1023,10 +1021,59 @@ TEST_CASE("Japanese character", "[unicode]")
|
||||
助詞 <- 'が' / 'を' / 'た' / 'ます' / 'に'
|
||||
)");
|
||||
|
||||
auto ret = parser.parse(u8R"(サーバーを復旧します。)");
|
||||
bool ret = parser;
|
||||
REQUIRE(ret == true);
|
||||
|
||||
REQUIRE(parser.parse(u8R"(サーバーを復旧します。)"));
|
||||
}
|
||||
|
||||
// `.` between 'a' and 'b' has to match the single non-ASCII character in the
// input as one unit for the whole input to be accepted.
TEST_CASE("dot with a code", "[unicode]")
{
    const char* grammar = " S <- 'a' . 'b' ";
    peg::parser parser(grammar);

    const bool accepted = parser.parse(u8R"(aあb)");
    REQUIRE(accepted);
}
|
||||
|
||||
// Same grammar as "dot with a code", exercised with a different non-ASCII
// character between 'a' and 'b'.
TEST_CASE("dot with a char", "[unicode]")
{
    const char* grammar = " S <- 'a' . 'b' ";
    peg::parser parser(grammar);

    const bool accepted = parser.parse(u8R"(aåb)");
    REQUIRE(accepted);
}
|
||||
|
||||
// A character class mixing non-ASCII ranges, ASCII ranges and single
// characters: inputs are checked just outside, at both ends of, and inside
// each range, plus the stand-alone members.
TEST_CASE("character class", "[unicode]")
{
    peg::parser parser(R"(
S <- 'a' [い-おAさC-Eた-とは] 'b'
)");

    // The grammar itself must load.
    const bool loaded = parser;
    REQUIRE(loaded);

    // Range い-お
    REQUIRE_FALSE(parser.parse(u8R"(aあb)"));
    REQUIRE(parser.parse(u8R"(aいb)"));
    REQUIRE(parser.parse(u8R"(aうb)"));
    REQUIRE(parser.parse(u8R"(aおb)"));
    REQUIRE_FALSE(parser.parse(u8R"(aかb)"));

    // Single A, range C-E
    REQUIRE(parser.parse(u8R"(aAb)"));
    REQUIRE_FALSE(parser.parse(u8R"(aBb)"));
    REQUIRE(parser.parse(u8R"(aEb)"));
    REQUIRE_FALSE(parser.parse(u8R"(aFb)"));

    // Range た-と
    REQUIRE_FALSE(parser.parse(u8R"(aそb)"));
    REQUIRE(parser.parse(u8R"(aたb)"));
    REQUIRE(parser.parse(u8R"(aちb)"));
    REQUIRE(parser.parse(u8R"(aとb)"));
    REQUIRE_FALSE(parser.parse(u8R"(aなb)"));

    // Single は, and a character that is in no range
    REQUIRE(parser.parse(u8R"(aはb)"));
    REQUIRE_FALSE(parser.parse(u8R"(a?b)"));
}
|
||||
|
||||
#if 0 // TODO: Unicode Grapheme support
|
||||
TEST_CASE("dot with a grapheme", "[unicode]")
|
||||
{
|
||||
peg::parser parser(" S <- 'a' . 'b' ");
|
||||
REQUIRE(parser.parse(u8R"(aसिb)"));
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST_CASE("Macro simple test", "[macro]")
|
||||
{
|
||||
parser parser(R"(
|
||||
@ -1397,6 +1444,8 @@ TEST_CASE("PEG Literal", "[peg]")
|
||||
REQUIRE(exact(g, "Literal", "\"'\"abc\"'\" ") == false);
|
||||
REQUIRE(exact(g, "Literal", "abc") == false);
|
||||
REQUIRE(exact(g, "Literal", "") == false);
|
||||
REQUIRE(exact(g, "Literal", u8"'日本語'") == true);
|
||||
REQUIRE(exact(g, "Literal", u8"\"日本語\"") == true);
|
||||
REQUIRE(exact(g, "Literal", u8"日本語") == false);
|
||||
}
|
||||
|
||||
@ -1415,6 +1464,7 @@ TEST_CASE("PEG Class", "[peg]")
|
||||
REQUIRE(exact(g, "Class", "[a") == false);
|
||||
REQUIRE(exact(g, "Class", "]") == false);
|
||||
REQUIRE(exact(g, "Class", "a]") == false);
|
||||
REQUIRE(exact(g, "Class", u8"[あ-ん]") == true);
|
||||
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
||||
REQUIRE(exact(g, "Class", "[-+]") == true);
|
||||
REQUIRE(exact(g, "Class", "[+-]") == false);
|
||||
@ -1461,7 +1511,7 @@ TEST_CASE("PEG Char", "[peg]")
|
||||
REQUIRE(exact(g, "Char", " ") == true);
|
||||
REQUIRE(exact(g, "Char", " ") == false);
|
||||
REQUIRE(exact(g, "Char", "") == false);
|
||||
REQUIRE(exact(g, "Char", u8"あ") == false);
|
||||
REQUIRE(exact(g, "Char", u8"あ") == true);
|
||||
}
|
||||
|
||||
TEST_CASE("PEG Operators", "[peg]")
|
||||
|
Loading…
Reference in New Issue
Block a user