diff --git a/README.md b/README.md index f92d9f2..c0abb30 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,9 @@ You can also try the online version, PEG Playground at https://yhirose.github.io The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf) by Bryan Ford. *cpp-peglib* also supports the following additional syntax for now: * `'...'i` (Case-insensitive literal operator) + * `[...]i` (Case-insensitive character class operator) * `[^...]` (Negated character class operator) + * `[^...]i` (Case-insensitive negated character class operator) * `{2,5}` (Regex-like repetition operator) * `<` ... `>` (Token boundary operator) * `~` (Ignore operator) diff --git a/docs/native.wasm b/docs/native.wasm index 84f4e48..9cbaa0e 100755 Binary files a/docs/native.wasm and b/docs/native.wasm differ diff --git a/grammar/cpp-peglib.peg b/grammar/cpp-peglib.peg index 060e1d6..b37dd03 100644 --- a/grammar/cpp-peglib.peg +++ b/grammar/cpp-peglib.peg @@ -28,7 +28,9 @@ Primary <- / LiteralI / Dictionary / Literal + / NegatedClassI / NegatedClass + / ClassI / Class / DOT @@ -56,7 +58,9 @@ LiteralI <- # NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. Class <- '[' !'^' <(!']' Range)+> ']' Spacing +ClassI <- '[' !'^' <(!']' Range)+> ']i' Spacing NegatedClass <- "[^" <(!']' Range)+> ']' Spacing +NegatedClassI <- "[^" <(!']' Range)+> ']i' Spacing Range <- (Char '-' ! ']' Char) / Char diff --git a/peglib.h b/peglib.h index 52eb303..c61dc17 100644 --- a/peglib.h +++ b/peglib.h @@ -1265,7 +1265,8 @@ public: class CharacterClass : public Ope, public std::enable_shared_from_this { public: - CharacterClass(const std::string &s, bool negated) : negated_(negated) { + CharacterClass(const std::string &s, bool negated, bool ignore_case) + : negated_(negated), ignore_case_(ignore_case) { auto chars = decode(s.data(), s.length()); auto i = 0u; while (i < chars.size()) { @@ -1284,8 +1285,8 @@ public: } CharacterClass(const std::vector> &ranges, - bool negated) - : ranges_(ranges), negated_(negated) { + bool negated, bool ignore_case) + : ranges_(ranges), negated_(negated), ignore_case_(ignore_case) { assert(!ranges_.empty()); } @@ -1300,7 +1301,7 @@ public: auto len = decode_codepoint(s, n, cp); for (const auto &range : ranges_) { - if (range.first <= cp && cp <= range.second) { + if (in_range(range, cp)) { if (negated_) { c.set_error_pos(s); return static_cast(-1); @@ -1320,8 +1321,20 @@ public: void accept(Visitor &v) override; +private: + bool in_range(const std::pair &range, char32_t cp) const { + if (ignore_case_) { + auto cpl = std::tolower(cp); + return std::tolower(range.first) <= cpl && + cpl <= std::tolower(range.second); + } else { + return range.first <= cp && cp <= range.second; + } + } + std::vector> ranges_; bool negated_; + bool ignore_case_; }; class Character : public Ope, public std::enable_shared_from_this { @@ -1646,21 +1659,23 @@ inline std::shared_ptr liti(std::string &&s) { } inline std::shared_ptr cls(const std::string &s) { - return std::make_shared(s, false); + return std::make_shared(s, false, false); } inline std::shared_ptr -cls(const std::vector> &ranges) { - return std::make_shared(ranges, false); +cls(const std::vector> &ranges, + bool ignore_case = false) { + return std::make_shared(ranges, false, ignore_case); } inline std::shared_ptr ncls(const std::string &s) { - return std::make_shared(s, true); + return std::make_shared(s, true, false); } inline std::shared_ptr -ncls(const std::vector> &ranges) { - return std::make_shared(ranges, true); +ncls(const std::vector> &ranges, + bool ignore_case = false) { + return std::make_shared(ranges, true, ignore_case); } inline std::shared_ptr chr(char dt) { @@ -2934,9 +2949,7 @@ inline size_t Recovery::parse_core(const char *s, size_t n, len = rule.parse(s, n, dummy_vs, c, dummy_dt); } - if (success(len)) { - c.recovered = true; - } + if (success(len)) { c.recovered = true; } // Cut if (!c.cut_stack.empty()) { @@ -3241,16 +3254,16 @@ private: seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); - g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], - npd(g["LEFTARROW"])), - seq(g["Ignore"], g["Identifier"], - npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), - seq(g["OPEN"], g["Expression"], g["CLOSE"]), - seq(g["BeginTok"], g["Expression"], g["EndTok"]), - g["CapScope"], - seq(g["BeginCap"], g["Expression"], g["EndCap"]), - g["BackRef"], g["LiteralI"], g["Dictionary"], - g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]); + g["Primary"] <= + cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], + npd(g["LEFTARROW"])), + seq(g["Ignore"], g["Identifier"], + npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), + seq(g["OPEN"], g["Expression"], g["CLOSE"]), + seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"], + seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"], + g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClassI"], + g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); @@ -3281,9 +3294,16 @@ private: g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); + g["ClassI"] <= seq(chr('['), npd(chr('^')), + tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"), + g["Spacing"]); + g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); + g["NegatedClassI"] <= seq(lit("[^"), + tok(oom(seq(npd(chr(']')), g["Range"]))), + lit("]i"), g["Spacing"]); // NOTE: This is different from The original Brian Ford's paper, and this // modification allows us to specify `[+-]` as a valid char class. @@ -3634,10 +3654,18 @@ private: auto ranges = vs.transform>(); return cls(ranges); }; + g["ClassI"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return cls(ranges, true); + }; g["NegatedClass"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); return ncls(ranges); }; + g["NegatedClassI"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return ncls(ranges, true); + }; g["Range"] = [](const SemanticValues &vs) { switch (vs.choice()) { case 0: { diff --git a/test/test1.cc b/test/test1.cc index 748b6f6..f3f1ef9 100644 --- a/test/test1.cc +++ b/test/test1.cc @@ -431,13 +431,13 @@ TEST(GeneralTest, Octal_Hex_Unicode_value_test) { EXPECT_TRUE(ret); } -TEST(GeneralTest, Ignore_case_test) { +TEST(GeneralTest, Ignore_case_literal_test) { parser parser(R"( - ROOT <- HELLO WORLD - HELLO <- 'hello'i - WORLD <- 'world'i - %whitespace <- [ \t\r\n]* - )"); + ROOT <- HELLO WORLD + HELLO <- 'hello'i + WORLD <- 'world'i + %whitespace <- [ \t\r\n]* + )"); parser["HELLO"] = [](const SemanticValues &vs) { EXPECT_EQ("Hello", vs.token()); @@ -451,6 +451,23 @@ TEST(GeneralTest, Ignore_case_test) { EXPECT_TRUE(ret); } +TEST(GeneralTest, Ignore_case_character_class_test) { + parser parser(R"(ROOT <- [a-z]i+)"); + + EXPECT_TRUE(parser.parse("abc")); + EXPECT_TRUE(parser.parse("ABC")); + EXPECT_TRUE(parser.parse("Abc")); + EXPECT_TRUE(parser.parse("Abc")); + EXPECT_FALSE(parser.parse("123")); +} + +TEST(GeneralTest, Ignore_case_negate_character_class_test) { + parser parser(R"(ROOT <- [^a-z]i+)"); + + EXPECT_TRUE(parser.parse("123")); + EXPECT_FALSE(parser.parse("ABC")); +} + TEST(GeneralTest, mutable_lambda_test) { std::vector vec;