Added Negated Character Class

This commit is contained in:
yhirose 2020-01-26 23:38:39 -05:00
parent 3dc0205ffa
commit 2180657eea
3 changed files with 97 additions and 32 deletions

View File

@ -11,6 +11,7 @@ You can also try the online version, PEG Playground at https://yhirose.github.io
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now: The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now:
* `'...'i` (Case-insensitive literal operator) * `'...'i` (Case-insensitive literal operator)
* `[^...]` (Negated character class operator)
* `<` ... `>` (Token boundary operator) * `<` ... `>` (Token boundary operator)
* `~` (Ignore operator) * `~` (Ignore operator)
* `\x20` (Hex number char) * `\x20` (Hex number char)
@ -371,26 +372,27 @@ auto ret = ROOT.parse(" [tag1] [tag:2] [tag-3] ");
The following are available operators: The following are available operators:
| Operator | Description | | Operator | Description |
| :------- | :-------------------- | | :------- | :------------------------------ |
| seq | Sequence | | seq | Sequence |
| cho | Prioritized Choice | | cho | Prioritized Choice |
| zom | Zero or More | | zom | Zero or More |
| oom | One or More | | oom | One or More |
| opt | Optional | | opt | Optional |
| apd | And predicate | | apd | And predicate |
| npd | Not predicate | | npd | Not predicate |
| lit | Literal string | | lit | Literal string |
| liti | Case-insensitive Literal string | | liti | Case-insensitive Literal string |
| cls | Character class | | cls | Character class |
| chr | Character | | ncls | Negated Character class |
| dot | Any character | | chr | Character |
| tok | Token boundary | | dot | Any character |
| ign | Ignore semantic value | | tok | Token boundary |
| csc | Capture scope | | ign | Ignore semantic value |
| cap | Capture | | csc | Capture scope |
| bkr | Back reference | | cap | Capture |
| usr | User defined parser | | bkr | Back reference |
| usr | User defined parser |
Adjust definitions Adjust definitions
------------------ ------------------

View File

@ -1278,7 +1278,7 @@ class CharacterClass : public Ope
, public std::enable_shared_from_this<CharacterClass> , public std::enable_shared_from_this<CharacterClass>
{ {
public: public:
CharacterClass(const std::string& s) { CharacterClass(const std::string& s, bool negated): negated_(negated) {
auto chars = decode(s.c_str(), s.length()); auto chars = decode(s.c_str(), s.length());
auto i = 0u; auto i = 0u;
while (i < chars.size()) { while (i < chars.size()) {
@ -1293,9 +1293,12 @@ public:
i += 1; i += 1;
} }
} }
assert(!ranges_.empty());
} }
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges) : ranges_(ranges) {} CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges, bool negated) : ranges_(ranges), negated_(negated) {
assert(!ranges_.empty());
}
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override { size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
c.trace("CharacterClass", s, n, sv, dt); c.trace("CharacterClass", s, n, sv, dt);
@ -1308,21 +1311,29 @@ public:
char32_t cp = 0; char32_t cp = 0;
auto len = decode_codepoint(s, n, cp); auto len = decode_codepoint(s, n, cp);
if (!ranges_.empty()) { for (const auto& range: ranges_) {
for (const auto& range: ranges_) { if (range.first <= cp && cp <= range.second) {
if (range.first <= cp && cp <= range.second) { if (negated_) {
c.set_error_pos(s);
return static_cast<size_t>(-1);
} else {
return len; return len;
} }
} }
} }
c.set_error_pos(s); if (negated_) {
return static_cast<size_t>(-1); return len;
} else {
c.set_error_pos(s);
return static_cast<size_t>(-1);
}
} }
void accept(Visitor& v) override; void accept(Visitor& v) override;
std::vector<std::pair<char32_t, char32_t>> ranges_; std::vector<std::pair<char32_t, char32_t>> ranges_;
bool negated_;
}; };
class Character : public Ope class Character : public Ope
@ -1599,11 +1610,19 @@ inline std::shared_ptr<Ope> liti(const std::string& s) {
} }
inline std::shared_ptr<Ope> cls(const std::string& s) { inline std::shared_ptr<Ope> cls(const std::string& s) {
return std::make_shared<CharacterClass>(s); return std::make_shared<CharacterClass>(s, false);
} }
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) { inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
return std::make_shared<CharacterClass>(ranges); return std::make_shared<CharacterClass>(ranges, false);
}
// Creates a negated character class operator from a class specification
// string `s` by constructing a CharacterClass with its `negated` flag set
// to true. Counterpart of cls(const std::string&), which passes false.
inline std::shared_ptr<Ope> ncls(const std::string& s) {
return std::make_shared<CharacterClass>(s, true);
}
inline std::shared_ptr<Ope> ncls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
return std::make_shared<CharacterClass>(ranges, true);
} }
inline std::shared_ptr<Ope> chr(char dt) { inline std::shared_ptr<Ope> chr(char dt) {
@ -2694,7 +2713,7 @@ private:
seq(g["BeginTok"], g["Expression"], g["EndTok"]), seq(g["BeginTok"], g["Expression"], g["EndTok"]),
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]), seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]), seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["BackRef"], g["LiteralI"], g["Literal"], g["Class"], g["DOT"]); g["BackRef"], g["LiteralI"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@ -2710,10 +2729,12 @@ private:
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]), g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"])); seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
g["Class"] <= seq(chr('['), tok(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'.
g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]); g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")), g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\^")),
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")), seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))), seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))), seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
@ -2923,6 +2944,10 @@ private:
auto ranges = sv.transform<std::pair<char32_t, char32_t>>(); auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
return cls(ranges); return cls(ranges);
}; };
// Semantic action for the "NegatedClass" rule: converts the collected
// semantic values into (first, last) code point pairs and builds a negated
// character class operator from them via ncls().
g["NegatedClass"] = [](const SemanticValues& sv) {
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
return ncls(ranges);
};
g["Range"] = [](const SemanticValues& sv) { g["Range"] = [](const SemanticValues& sv) {
switch (sv.choice()) { switch (sv.choice()) {
case 0: { case 0: {

View File

@ -1000,6 +1000,21 @@ TEST_CASE("Semantic value tag", "[general]")
} }
} }
// End-to-end check of the `[^...]` syntax: ROOT accepts one or more
// characters that are neither lowercase a-z nor '_'.
TEST_CASE("Negated Class test", "[general]")
{
peg::parser parser(R"(
ROOT <- [^a-z_]+
)");
// The grammar itself must load successfully.
bool ret = parser;
REQUIRE(ret == true);
// Only uppercase letters and digits: every character is outside the class.
REQUIRE(parser.parse("ABC123"));
// Contains 'c', which falls in the excluded a-z range.
// NOTE(review): presumably parse() fails unless the whole input is consumed - confirm.
REQUIRE_FALSE(parser.parse("ABcZ"));
// Contains '_', which is explicitly excluded.
REQUIRE_FALSE(parser.parse("ABCZ_"));
// Empty input: '+' requires at least one matching character.
REQUIRE_FALSE(parser.parse(""));
}
TEST_CASE("Packrat parser test with %whitespace%", "[packrat]") TEST_CASE("Packrat parser test with %whitespace%", "[packrat]")
{ {
peg::parser parser(R"( peg::parser parser(R"(
@ -1758,7 +1773,7 @@ TEST_CASE("PEG Literal", "[peg]")
TEST_CASE("PEG Class", "[peg]") TEST_CASE("PEG Class", "[peg]")
{ {
auto g = ParserGenerator::grammar(); auto g = ParserGenerator::grammar();
REQUIRE(exact(g, "Class", "[]") == true); REQUIRE(exact(g, "Class", "[]") == false); // NOTE: This is different from the Brian Ford's paper, but same as RegExp
REQUIRE(exact(g, "Class", "[a]") == true); REQUIRE(exact(g, "Class", "[a]") == true);
REQUIRE(exact(g, "Class", "[a-z]") == true); REQUIRE(exact(g, "Class", "[a-z]") == true);
REQUIRE(exact(g, "Class", "[az]") == true); REQUIRE(exact(g, "Class", "[az]") == true);
@ -1774,6 +1789,29 @@ TEST_CASE("PEG Class", "[peg]")
REQUIRE(exact(g, "Class", u8"あ-ん") == false); REQUIRE(exact(g, "Class", u8"あ-ん") == false);
REQUIRE(exact(g, "Class", "[-+]") == true); REQUIRE(exact(g, "Class", "[-+]") == true);
REQUIRE(exact(g, "Class", "[+-]") == false); REQUIRE(exact(g, "Class", "[+-]") == false);
REQUIRE(exact(g, "Class", "[\\^]") == true);
}
TEST_CASE("PEG Negated Class", "[peg]")
{
auto g = ParserGenerator::grammar();
REQUIRE(exact(g, "NegatedClass", "[^]") == false);
REQUIRE(exact(g, "NegatedClass", "[^a]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-z]") == true);
REQUIRE(exact(g, "NegatedClass", "[^az]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-0-9]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-]") == false);
REQUIRE(exact(g, "NegatedClass", "[^-a]") == true);
REQUIRE(exact(g, "NegatedClass", "[^") == false);
REQUIRE(exact(g, "NegatedClass", "[^a") == false);
REQUIRE(exact(g, "NegatedClass", "^]") == false);
REQUIRE(exact(g, "NegatedClass", "^a]") == false);
REQUIRE(exact(g, "NegatedClass", u8"[^あ-ん]") == true);
REQUIRE(exact(g, "NegatedClass", u8"^あ-ん") == false);
REQUIRE(exact(g, "NegatedClass", "[^-+]") == true);
REQUIRE(exact(g, "NegatedClass", "[^+-]") == false);
REQUIRE(exact(g, "NegatedClass", "[^^]") == true);
} }
TEST_CASE("PEG Range", "[peg]") TEST_CASE("PEG Range", "[peg]")