Treat UTF-8 bytes 0x80-0xFF as valid identifier characters.

2025-07-12 02:02:08 +00:00 · 2015-08-08 20:30:05 -04:00 · 2015-08-08 20:30:05 -04:00 · a3cfd1b8ad
commit a3cfd1b8ad
parent de5cfa955d
2 changed files with 19 additions and 1 deletion
--- a/peglib.h
+++ b/peglib.h
@@ -1588,7 +1588,7 @@ private:

        g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
        g["IdentCont"]  <= seq(g["IdentStart"], zom(g["IdentRest"]));
-        g["IdentStart"] <= cls("a-zA-Z_");
+        g["IdentStart"] <= cls("a-zA-Z_\x80-\xff");
        g["IdentRest"]  <= cho(g["IdentStart"], cls("0-9"));

        g["Literal"]    <= cho(seq(cls("'"), anc(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
--- a/test/test.cc
+++ b/test/test.cc
@@ -653,6 +653,24 @@ TEST_CASE("Semantic predicate test", "[predicate]")
    REQUIRE(ret == false);
 }

+TEST_CASE("Japanese character", "[unicode]")
+{
+    peglib::peg parser(R"(
+        文 <- 修飾語? 主語 述語 '。'
+        主語 <- 名詞 助詞
+        述語 <- 動詞 助詞
+        修飾語 <- 形容詞
+        名詞 <- 'サーバー' / 'クライアント'
+        形容詞 <- '古い' / '新しい'
+        動詞 <- '落ち' / '復旧し'
+        助詞 <- 'が' / 'を' / 'た' / 'ます' / 'に'
+    )");
+
+    auto ret = parser.parse(R"(サーバーを復旧します。)");
+
+    REQUIRE(ret == true);
+}
+
 bool exact(Grammar& g, const char* d, const char* s) {
    auto n = strlen(s);
    auto r = g[d].parse(s, n);