mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2025-01-22 13:25:30 +00:00
Added Negated Character Class
This commit is contained in:
parent
3dc0205ffa
commit
2180657eea
40
README.md
40
README.md
@ -11,6 +11,7 @@ You can also try the online version, PEG Playground at https://yhirose.github.io
|
||||
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now:
|
||||
|
||||
* `'...'i` (Case-insensitive literal operator)
|
||||
* `[^...]` (Negated character class operator)
|
||||
* `<` ... `>` (Token boundary operator)
|
||||
* `~` (Ignore operator)
|
||||
* `\x20` (Hex number char)
|
||||
@ -371,26 +372,27 @@ auto ret = ROOT.parse(" [tag1] [tag:2] [tag-3] ");
|
||||
|
||||
The following are available operators:
|
||||
|
||||
| Operator | Description |
|
||||
| :------- | :-------------------- |
|
||||
| seq | Sequence |
|
||||
| cho | Prioritized Choice |
|
||||
| zom | Zero or More |
|
||||
| oom | One or More |
|
||||
| opt | Optional |
|
||||
| apd | And predicate |
|
||||
| npd | Not predicate |
|
||||
| lit | Literal string |
|
||||
| Operator | Description |
|
||||
| :------- | :------------------------------ |
|
||||
| seq | Sequence |
|
||||
| cho | Prioritized Choice |
|
||||
| zom | Zero or More |
|
||||
| oom | One or More |
|
||||
| opt | Optional |
|
||||
| apd | And predicate |
|
||||
| npd | Not predicate |
|
||||
| lit | Literal string |
|
||||
| liti | Case-insensitive Literal string |
|
||||
| cls | Character class |
|
||||
| chr | Character |
|
||||
| dot | Any character |
|
||||
| tok | Token boundary |
|
||||
| ign | Ignore semantic value |
|
||||
| csc | Capture scope |
|
||||
| cap | Capture |
|
||||
| bkr | Back reference |
|
||||
| usr | User defined parser |
|
||||
| cls | Character class |
|
||||
| ncls | Negated Character class |
|
||||
| chr | Character |
|
||||
| dot | Any character |
|
||||
| tok | Token boundary |
|
||||
| ign | Ignore semantic value |
|
||||
| csc | Capture scope |
|
||||
| cap | Capture |
|
||||
| bkr | Back reference |
|
||||
| usr | User defined parser |
|
||||
|
||||
Adjust definitions
|
||||
------------------
|
||||
|
49
peglib.h
49
peglib.h
@ -1278,7 +1278,7 @@ class CharacterClass : public Ope
|
||||
, public std::enable_shared_from_this<CharacterClass>
|
||||
{
|
||||
public:
|
||||
CharacterClass(const std::string& s) {
|
||||
CharacterClass(const std::string& s, bool negated): negated_(negated) {
|
||||
auto chars = decode(s.c_str(), s.length());
|
||||
auto i = 0u;
|
||||
while (i < chars.size()) {
|
||||
@ -1293,9 +1293,12 @@ public:
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
assert(!ranges_.empty());
|
||||
}
|
||||
|
||||
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges) : ranges_(ranges) {}
|
||||
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges, bool negated) : ranges_(ranges), negated_(negated) {
|
||||
assert(!ranges_.empty());
|
||||
}
|
||||
|
||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||
c.trace("CharacterClass", s, n, sv, dt);
|
||||
@ -1308,21 +1311,29 @@ public:
|
||||
char32_t cp = 0;
|
||||
auto len = decode_codepoint(s, n, cp);
|
||||
|
||||
if (!ranges_.empty()) {
|
||||
for (const auto& range: ranges_) {
|
||||
if (range.first <= cp && cp <= range.second) {
|
||||
for (const auto& range: ranges_) {
|
||||
if (range.first <= cp && cp <= range.second) {
|
||||
if (negated_) {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
} else {
|
||||
return len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
if (negated_) {
|
||||
return len;
|
||||
} else {
|
||||
c.set_error_pos(s);
|
||||
return static_cast<size_t>(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void accept(Visitor& v) override;
|
||||
|
||||
std::vector<std::pair<char32_t, char32_t>> ranges_;
|
||||
bool negated_;
|
||||
};
|
||||
|
||||
class Character : public Ope
|
||||
@ -1599,11 +1610,19 @@ inline std::shared_ptr<Ope> liti(const std::string& s) {
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> cls(const std::string& s) {
|
||||
return std::make_shared<CharacterClass>(s);
|
||||
return std::make_shared<CharacterClass>(s, false);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
||||
return std::make_shared<CharacterClass>(ranges);
|
||||
return std::make_shared<CharacterClass>(ranges, false);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> ncls(const std::string& s) {
|
||||
return std::make_shared<CharacterClass>(s, true);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> ncls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
||||
return std::make_shared<CharacterClass>(ranges, true);
|
||||
}
|
||||
|
||||
inline std::shared_ptr<Ope> chr(char dt) {
|
||||
@ -2694,7 +2713,7 @@ private:
|
||||
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
|
||||
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
|
||||
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
|
||||
g["BackRef"], g["LiteralI"], g["Literal"], g["Class"], g["DOT"]);
|
||||
g["BackRef"], g["LiteralI"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
|
||||
|
||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||
@ -2710,10 +2729,12 @@ private:
|
||||
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
||||
seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
|
||||
|
||||
g["Class"] <= seq(chr('['), tok(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
||||
// NOTE: Bryan Ford's original paper uses 'zom' instead of 'oom'.
|
||||
g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
||||
g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
||||
|
||||
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
|
||||
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")),
|
||||
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\^")),
|
||||
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
|
||||
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
|
||||
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
||||
@ -2923,6 +2944,10 @@ private:
|
||||
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
||||
return cls(ranges);
|
||||
};
|
||||
g["NegatedClass"] = [](const SemanticValues& sv) {
|
||||
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
||||
return ncls(ranges);
|
||||
};
|
||||
g["Range"] = [](const SemanticValues& sv) {
|
||||
switch (sv.choice()) {
|
||||
case 0: {
|
||||
|
40
test/test.cc
40
test/test.cc
@ -1000,6 +1000,21 @@ TEST_CASE("Semantic value tag", "[general]")
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Negated Class test", "[general]")
|
||||
{
|
||||
peg::parser parser(R"(
|
||||
ROOT <- [^a-z_]+
|
||||
)");
|
||||
|
||||
bool ret = parser;
|
||||
REQUIRE(ret == true);
|
||||
|
||||
REQUIRE(parser.parse("ABC123"));
|
||||
REQUIRE_FALSE(parser.parse("ABcZ"));
|
||||
REQUIRE_FALSE(parser.parse("ABCZ_"));
|
||||
REQUIRE_FALSE(parser.parse(""));
|
||||
}
|
||||
|
||||
TEST_CASE("Packrat parser test with %whitespace%", "[packrat]")
|
||||
{
|
||||
peg::parser parser(R"(
|
||||
@ -1758,7 +1773,7 @@ TEST_CASE("PEG Literal", "[peg]")
|
||||
TEST_CASE("PEG Class", "[peg]")
|
||||
{
|
||||
auto g = ParserGenerator::grammar();
|
||||
REQUIRE(exact(g, "Class", "[]") == true);
|
||||
REQUIRE(exact(g, "Class", "[]") == false); // NOTE: This is different from the Brian Ford's paper, but same as RegExp
|
||||
REQUIRE(exact(g, "Class", "[a]") == true);
|
||||
REQUIRE(exact(g, "Class", "[a-z]") == true);
|
||||
REQUIRE(exact(g, "Class", "[az]") == true);
|
||||
@ -1774,6 +1789,29 @@ TEST_CASE("PEG Class", "[peg]")
|
||||
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
||||
REQUIRE(exact(g, "Class", "[-+]") == true);
|
||||
REQUIRE(exact(g, "Class", "[+-]") == false);
|
||||
REQUIRE(exact(g, "Class", "[\\^]") == true);
|
||||
}
|
||||
|
||||
TEST_CASE("PEG Negated Class", "[peg]")
|
||||
{
|
||||
auto g = ParserGenerator::grammar();
|
||||
REQUIRE(exact(g, "NegatedClass", "[^]") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a-z]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^az]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-0-9]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a-]") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^-a]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^a") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "^]") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "^a]") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", u8"[^あ-ん]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", u8"^あ-ん") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^-+]") == true);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^+-]") == false);
|
||||
REQUIRE(exact(g, "NegatedClass", "[^^]") == true);
|
||||
}
|
||||
|
||||
TEST_CASE("PEG Range", "[peg]")
|
||||
|
Loading…
Reference in New Issue
Block a user