diff --git a/README.md b/README.md index f57c5ff..27647be 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ You can also try the online version, PEG Playground at https://yhirose.github.io The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now: * `'...'i` (Case-insensitive literal operator) + * `[^...]` (Negated character class operator) * `<` ... `>` (Token boundary operator) * `~` (Ignore operator) * `\x20` (Hex number char) @@ -371,26 +372,27 @@ auto ret = ROOT.parse(" [tag1] [tag:2] [tag-3] "); The following are available operators: -| Operator | Description | -| :------- | :-------------------- | -| seq | Sequence | -| cho | Prioritized Choice | -| zom | Zero or More | -| oom | One or More | -| opt | Optional | -| apd | And predicate | -| npd | Not predicate | -| lit | Literal string | +| Operator | Description | +| :------- | :------------------------------ | +| seq | Sequence | +| cho | Prioritized Choice | +| zom | Zero or More | +| oom | One or More | +| opt | Optional | +| apd | And predicate | +| npd | Not predicate | +| lit | Literal string | | liti | Case-insensitive Literal string | -| cls | Character class | -| chr | Character | -| dot | Any character | -| tok | Token boundary | -| ign | Ignore semantic value | -| csc | Capture scope | -| cap | Capture | -| bkr | Back reference | -| usr | User defined parser | +| cls | Character class | +| ncls | Negated Character class | +| chr | Character | +| dot | Any character | +| tok | Token boundary | +| ign | Ignore semantic value | +| csc | Capture scope | +| cap | Capture | +| bkr | Back reference | +| usr | User defined parser | Adjust definitions ------------------ diff --git a/peglib.h b/peglib.h index bc14c39..dc451f9 100644 --- a/peglib.h +++ b/peglib.h @@ -1278,7 +1278,7 @@ class CharacterClass : public Ope , public std::enable_shared_from_this { public: - CharacterClass(const std::string& s) { + CharacterClass(const std::string& s, bool negated): negated_(negated) { auto chars = decode(s.c_str(), s.length()); auto i = 0u; while (i < chars.size()) { @@ -1293,9 +1293,12 @@ public: i += 1; } } + assert(!ranges_.empty()); } - CharacterClass(const std::vector>& ranges) : ranges_(ranges) {} + CharacterClass(const std::vector>& ranges, bool negated) : ranges_(ranges), negated_(negated) { + assert(!ranges_.empty()); + } size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override { c.trace("CharacterClass", s, n, sv, dt); @@ -1308,21 +1311,29 @@ public: char32_t cp = 0; auto len = decode_codepoint(s, n, cp); - if (!ranges_.empty()) { - for (const auto& range: ranges_) { - if (range.first <= cp && cp <= range.second) { + for (const auto& range: ranges_) { + if (range.first <= cp && cp <= range.second) { + if (negated_) { + c.set_error_pos(s); + return static_cast(-1); + } else { return len; } } } - c.set_error_pos(s); - return static_cast(-1); + if (negated_) { + return len; + } else { + c.set_error_pos(s); + return static_cast(-1); + } } void accept(Visitor& v) override; std::vector> ranges_; + bool negated_; }; class Character : public Ope @@ -1599,11 +1610,19 @@ inline std::shared_ptr liti(const std::string& s) { } inline std::shared_ptr cls(const std::string& s) { - return std::make_shared(s); + return std::make_shared(s, false); } inline std::shared_ptr cls(const std::vector>& ranges) { - return std::make_shared(ranges); + return std::make_shared(ranges, false); +} + +inline std::shared_ptr ncls(const std::string& s) { + return std::make_shared(s, true); +} + +inline std::shared_ptr ncls(const std::vector>& ranges) { + return std::make_shared(ranges, true); } inline std::shared_ptr chr(char dt) { @@ -2694,7 +2713,7 @@ private: seq(g["BeginTok"], g["Expression"], g["EndTok"]), seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]), seq(g["BeginCap"], g["Expression"], g["EndCap"]), - g["BackRef"], g["LiteralI"], g["Literal"], g["Class"], g["DOT"]); + g["BackRef"], g["LiteralI"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); @@ -2710,10 +2729,12 @@ private: g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]), seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"])); - g["Class"] <= seq(chr('['), tok(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); + // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. + g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); + g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]); - g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")), + g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\^")), seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")), seq(chr('\\'), cls("0-7"), opt(cls("0-7"))), seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))), @@ -2923,6 +2944,10 @@ private: auto ranges = sv.transform>(); return cls(ranges); }; + g["NegatedClass"] = [](const SemanticValues& sv) { + auto ranges = sv.transform>(); + return ncls(ranges); + }; g["Range"] = [](const SemanticValues& sv) { switch (sv.choice()) { case 0: { diff --git a/test/test.cc b/test/test.cc index d288c53..189ac42 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1000,6 +1000,21 @@ TEST_CASE("Semantic value tag", "[general]") } } +TEST_CASE("Negated Class test", "[general]") +{ + peg::parser parser(R"( + ROOT <- [^a-z_]+ + )"); + + bool ret = parser; + REQUIRE(ret == true); + + REQUIRE(parser.parse("ABC123")); + REQUIRE_FALSE(parser.parse("ABcZ")); + REQUIRE_FALSE(parser.parse("ABCZ_")); + REQUIRE_FALSE(parser.parse("")); +} + TEST_CASE("Packrat parser test with %whitespace%", "[packrat]") { peg::parser parser(R"( @@ -1758,7 +1773,7 @@ TEST_CASE("PEG Literal", "[peg]") TEST_CASE("PEG Class", "[peg]") { auto g = ParserGenerator::grammar(); - REQUIRE(exact(g, "Class", "[]") == true); + REQUIRE(exact(g, "Class", "[]") == false); // NOTE: This is different from the Brian Ford's paper, but same as RegExp REQUIRE(exact(g, "Class", "[a]") == true); REQUIRE(exact(g, "Class", "[a-z]") == true); REQUIRE(exact(g, "Class", "[az]") == true); @@ -1774,6 +1789,29 @@ TEST_CASE("PEG Class", "[peg]") REQUIRE(exact(g, "Class", u8"あ-ん") == false); REQUIRE(exact(g, "Class", "[-+]") == true); REQUIRE(exact(g, "Class", "[+-]") == false); + REQUIRE(exact(g, "Class", "[\\^]") == true); +} + +TEST_CASE("PEG Negated Class", "[peg]") +{ + auto g = ParserGenerator::grammar(); + REQUIRE(exact(g, "NegatedClass", "[^]") == false); + REQUIRE(exact(g, "NegatedClass", "[^a]") == true); + REQUIRE(exact(g, "NegatedClass", "[^a-z]") == true); + REQUIRE(exact(g, "NegatedClass", "[^az]") == true); + REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-]") == true); + REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-0-9]") == true); + REQUIRE(exact(g, "NegatedClass", "[^a-]") == false); + REQUIRE(exact(g, "NegatedClass", "[^-a]") == true); + REQUIRE(exact(g, "NegatedClass", "[^") == false); + REQUIRE(exact(g, "NegatedClass", "[^a") == false); + REQUIRE(exact(g, "NegatedClass", "^]") == false); + REQUIRE(exact(g, "NegatedClass", "^a]") == false); + REQUIRE(exact(g, "NegatedClass", u8"[^あ-ん]") == true); + REQUIRE(exact(g, "NegatedClass", u8"^あ-ん") == false); + REQUIRE(exact(g, "NegatedClass", "[^-+]") == true); + REQUIRE(exact(g, "NegatedClass", "[^+-]") == false); + REQUIRE(exact(g, "NegatedClass", "[^^]") == true); } TEST_CASE("PEG Range", "[peg]")