Added Negated Character Class

This commit is contained in:
yhirose 2020-01-26 23:38:39 -05:00
parent 3dc0205ffa
commit 2180657eea
3 changed files with 97 additions and 32 deletions

View File

@ -11,6 +11,7 @@ You can also try the online version, PEG Playground at https://yhirose.github.io
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now:
* `'...'i` (Case-insensitive literal operator)
* `[^...]` (Negated character class operator)
* `<` ... `>` (Token boundary operator)
* `~` (Ignore operator)
* `\x20` (Hex number char)
@ -371,26 +372,27 @@ auto ret = ROOT.parse(" [tag1] [tag:2] [tag-3] ");
The following are available operators:
| Operator | Description |
| :------- | :-------------------- |
| seq | Sequence |
| cho | Prioritized Choice |
| zom | Zero or More |
| oom | One or More |
| opt | Optional |
| apd | And predicate |
| npd | Not predicate |
| lit | Literal string |
| Operator | Description |
| :------- | :------------------------------ |
| seq | Sequence |
| cho | Prioritized Choice |
| zom | Zero or More |
| oom | One or More |
| opt | Optional |
| apd | And predicate |
| npd | Not predicate |
| lit | Literal string |
| liti | Case-insensitive Literal string |
| cls | Character class |
| chr | Character |
| dot | Any character |
| tok | Token boundary |
| ign | Ignore semantic value |
| csc | Capture scope |
| cap | Capture |
| bkr | Back reference |
| usr | User defined parser |
| cls | Character class |
| ncls | Negated Character class |
| chr | Character |
| dot | Any character |
| tok | Token boundary |
| ign | Ignore semantic value |
| csc | Capture scope |
| cap | Capture |
| bkr | Back reference |
| usr | User defined parser |
Adjust definitions
------------------

View File

@ -1278,7 +1278,7 @@ class CharacterClass : public Ope
, public std::enable_shared_from_this<CharacterClass>
{
public:
CharacterClass(const std::string& s) {
CharacterClass(const std::string& s, bool negated): negated_(negated) {
auto chars = decode(s.c_str(), s.length());
auto i = 0u;
while (i < chars.size()) {
@ -1293,9 +1293,12 @@ public:
i += 1;
}
}
assert(!ranges_.empty());
}
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges) : ranges_(ranges) {}
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges, bool negated) : ranges_(ranges), negated_(negated) {
assert(!ranges_.empty());
}
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
c.trace("CharacterClass", s, n, sv, dt);
@ -1308,21 +1311,29 @@ public:
char32_t cp = 0;
auto len = decode_codepoint(s, n, cp);
if (!ranges_.empty()) {
for (const auto& range: ranges_) {
if (range.first <= cp && cp <= range.second) {
for (const auto& range: ranges_) {
if (range.first <= cp && cp <= range.second) {
if (negated_) {
c.set_error_pos(s);
return static_cast<size_t>(-1);
} else {
return len;
}
}
}
c.set_error_pos(s);
return static_cast<size_t>(-1);
if (negated_) {
return len;
} else {
c.set_error_pos(s);
return static_cast<size_t>(-1);
}
}
void accept(Visitor& v) override;
std::vector<std::pair<char32_t, char32_t>> ranges_;
bool negated_;
};
class Character : public Ope
@ -1599,11 +1610,19 @@ inline std::shared_ptr<Ope> liti(const std::string& s) {
}
inline std::shared_ptr<Ope> cls(const std::string& s) {
return std::make_shared<CharacterClass>(s);
return std::make_shared<CharacterClass>(s, false);
}
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
return std::make_shared<CharacterClass>(ranges);
return std::make_shared<CharacterClass>(ranges, false);
}
inline std::shared_ptr<Ope> ncls(const std::string& s) {
return std::make_shared<CharacterClass>(s, true);
}
inline std::shared_ptr<Ope> ncls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
return std::make_shared<CharacterClass>(ranges, true);
}
inline std::shared_ptr<Ope> chr(char dt) {
@ -2694,7 +2713,7 @@ private:
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["BackRef"], g["LiteralI"], g["Literal"], g["Class"], g["DOT"]);
g["BackRef"], g["LiteralI"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@ -2710,10 +2729,12 @@ private:
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
g["Class"] <= seq(chr('['), tok(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
// NOTE: Bryan Ford's original paper uses 'zom' instead of 'oom'.
g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")),
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\^")),
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
@ -2923,6 +2944,10 @@ private:
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
return cls(ranges);
};
g["NegatedClass"] = [](const SemanticValues& sv) {
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
return ncls(ranges);
};
g["Range"] = [](const SemanticValues& sv) {
switch (sv.choice()) {
case 0: {

View File

@ -1000,6 +1000,21 @@ TEST_CASE("Semantic value tag", "[general]")
}
}
TEST_CASE("Negated Class test", "[general]")
{
peg::parser parser(R"(
ROOT <- [^a-z_]+
)");
bool ret = parser;
REQUIRE(ret == true);
REQUIRE(parser.parse("ABC123"));
REQUIRE_FALSE(parser.parse("ABcZ"));
REQUIRE_FALSE(parser.parse("ABCZ_"));
REQUIRE_FALSE(parser.parse(""));
}
TEST_CASE("Packrat parser test with %whitespace%", "[packrat]")
{
peg::parser parser(R"(
@ -1758,7 +1773,7 @@ TEST_CASE("PEG Literal", "[peg]")
TEST_CASE("PEG Class", "[peg]")
{
auto g = ParserGenerator::grammar();
REQUIRE(exact(g, "Class", "[]") == true);
REQUIRE(exact(g, "Class", "[]") == false); // NOTE: This is different from the Brian Ford's paper, but same as RegExp
REQUIRE(exact(g, "Class", "[a]") == true);
REQUIRE(exact(g, "Class", "[a-z]") == true);
REQUIRE(exact(g, "Class", "[az]") == true);
@ -1774,6 +1789,29 @@ TEST_CASE("PEG Class", "[peg]")
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
REQUIRE(exact(g, "Class", "[-+]") == true);
REQUIRE(exact(g, "Class", "[+-]") == false);
REQUIRE(exact(g, "Class", "[\\^]") == true);
}
TEST_CASE("PEG Negated Class", "[peg]")
{
auto g = ParserGenerator::grammar();
REQUIRE(exact(g, "NegatedClass", "[^]") == false);
REQUIRE(exact(g, "NegatedClass", "[^a]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-z]") == true);
REQUIRE(exact(g, "NegatedClass", "[^az]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-0-9]") == true);
REQUIRE(exact(g, "NegatedClass", "[^a-]") == false);
REQUIRE(exact(g, "NegatedClass", "[^-a]") == true);
REQUIRE(exact(g, "NegatedClass", "[^") == false);
REQUIRE(exact(g, "NegatedClass", "[^a") == false);
REQUIRE(exact(g, "NegatedClass", "^]") == false);
REQUIRE(exact(g, "NegatedClass", "^a]") == false);
REQUIRE(exact(g, "NegatedClass", u8"[^あ-ん]") == true);
REQUIRE(exact(g, "NegatedClass", u8"^あ-ん") == false);
REQUIRE(exact(g, "NegatedClass", "[^-+]") == true);
REQUIRE(exact(g, "NegatedClass", "[^+-]") == false);
REQUIRE(exact(g, "NegatedClass", "[^^]") == true);
}
TEST_CASE("PEG Range", "[peg]")