mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2024-12-22 20:05:31 +00:00
Added Negated Character Class
This commit is contained in:
parent
3dc0205ffa
commit
2180657eea
40
README.md
40
README.md
@ -11,6 +11,7 @@ You can also try the online version, PEG Playground at https://yhirose.github.io
|
|||||||
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now:
|
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf). *cpp-peglib* also supports the following additional syntax for now:
|
||||||
|
|
||||||
* `'...'i` (Case-insensitive literal operator)
|
* `'...'i` (Case-insensitive literal operator)
|
||||||
|
* `[^...]` (Negated character class operator)
|
||||||
* `<` ... `>` (Token boundary operator)
|
* `<` ... `>` (Token boundary operator)
|
||||||
* `~` (Ignore operator)
|
* `~` (Ignore operator)
|
||||||
* `\x20` (Hex number char)
|
* `\x20` (Hex number char)
|
||||||
@ -371,26 +372,27 @@ auto ret = ROOT.parse(" [tag1] [tag:2] [tag-3] ");
|
|||||||
|
|
||||||
The following are available operators:
|
The following are available operators:
|
||||||
|
|
||||||
| Operator | Description |
|
| Operator | Description |
|
||||||
| :------- | :-------------------- |
|
| :------- | :------------------------------ |
|
||||||
| seq | Sequence |
|
| seq | Sequence |
|
||||||
| cho | Prioritized Choice |
|
| cho | Prioritized Choice |
|
||||||
| zom | Zero or More |
|
| zom | Zero or More |
|
||||||
| oom | One or More |
|
| oom | One or More |
|
||||||
| opt | Optional |
|
| opt | Optional |
|
||||||
| apd | And predicate |
|
| apd | And predicate |
|
||||||
| npd | Not predicate |
|
| npd | Not predicate |
|
||||||
| lit | Literal string |
|
| lit | Literal string |
|
||||||
| liti | Case-insensitive Literal string |
|
| liti | Case-insensitive Literal string |
|
||||||
| cls | Character class |
|
| cls | Character class |
|
||||||
| chr | Character |
|
| ncls | Negated Character class |
|
||||||
| dot | Any character |
|
| chr | Character |
|
||||||
| tok | Token boundary |
|
| dot | Any character |
|
||||||
| ign | Ignore semantic value |
|
| tok | Token boundary |
|
||||||
| csc | Capture scope |
|
| ign | Ignore semantic value |
|
||||||
| cap | Capture |
|
| csc | Capture scope |
|
||||||
| bkr | Back reference |
|
| cap | Capture |
|
||||||
| usr | User defined parser |
|
| bkr | Back reference |
|
||||||
|
| usr | User defined parser |
|
||||||
|
|
||||||
Adjust definitions
|
Adjust definitions
|
||||||
------------------
|
------------------
|
||||||
|
49
peglib.h
49
peglib.h
@ -1278,7 +1278,7 @@ class CharacterClass : public Ope
|
|||||||
, public std::enable_shared_from_this<CharacterClass>
|
, public std::enable_shared_from_this<CharacterClass>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
CharacterClass(const std::string& s) {
|
CharacterClass(const std::string& s, bool negated): negated_(negated) {
|
||||||
auto chars = decode(s.c_str(), s.length());
|
auto chars = decode(s.c_str(), s.length());
|
||||||
auto i = 0u;
|
auto i = 0u;
|
||||||
while (i < chars.size()) {
|
while (i < chars.size()) {
|
||||||
@ -1293,9 +1293,12 @@ public:
|
|||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
assert(!ranges_.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges) : ranges_(ranges) {}
|
CharacterClass(const std::vector<std::pair<char32_t, char32_t>>& ranges, bool negated) : ranges_(ranges), negated_(negated) {
|
||||||
|
assert(!ranges_.empty());
|
||||||
|
}
|
||||||
|
|
||||||
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
size_t parse(const char* s, size_t n, SemanticValues& sv, Context& c, any& dt) const override {
|
||||||
c.trace("CharacterClass", s, n, sv, dt);
|
c.trace("CharacterClass", s, n, sv, dt);
|
||||||
@ -1308,21 +1311,29 @@ public:
|
|||||||
char32_t cp = 0;
|
char32_t cp = 0;
|
||||||
auto len = decode_codepoint(s, n, cp);
|
auto len = decode_codepoint(s, n, cp);
|
||||||
|
|
||||||
if (!ranges_.empty()) {
|
for (const auto& range: ranges_) {
|
||||||
for (const auto& range: ranges_) {
|
if (range.first <= cp && cp <= range.second) {
|
||||||
if (range.first <= cp && cp <= range.second) {
|
if (negated_) {
|
||||||
|
c.set_error_pos(s);
|
||||||
|
return static_cast<size_t>(-1);
|
||||||
|
} else {
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
c.set_error_pos(s);
|
if (negated_) {
|
||||||
return static_cast<size_t>(-1);
|
return len;
|
||||||
|
} else {
|
||||||
|
c.set_error_pos(s);
|
||||||
|
return static_cast<size_t>(-1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept(Visitor& v) override;
|
void accept(Visitor& v) override;
|
||||||
|
|
||||||
std::vector<std::pair<char32_t, char32_t>> ranges_;
|
std::vector<std::pair<char32_t, char32_t>> ranges_;
|
||||||
|
bool negated_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Character : public Ope
|
class Character : public Ope
|
||||||
@ -1599,11 +1610,19 @@ inline std::shared_ptr<Ope> liti(const std::string& s) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline std::shared_ptr<Ope> cls(const std::string& s) {
|
inline std::shared_ptr<Ope> cls(const std::string& s) {
|
||||||
return std::make_shared<CharacterClass>(s);
|
return std::make_shared<CharacterClass>(s, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
inline std::shared_ptr<Ope> cls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
||||||
return std::make_shared<CharacterClass>(ranges);
|
return std::make_shared<CharacterClass>(ranges, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::shared_ptr<Ope> ncls(const std::string& s) {
|
||||||
|
return std::make_shared<CharacterClass>(s, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::shared_ptr<Ope> ncls(const std::vector<std::pair<char32_t, char32_t>>& ranges) {
|
||||||
|
return std::make_shared<CharacterClass>(ranges, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::shared_ptr<Ope> chr(char dt) {
|
inline std::shared_ptr<Ope> chr(char dt) {
|
||||||
@ -2694,7 +2713,7 @@ private:
|
|||||||
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
|
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
|
||||||
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
|
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
|
||||||
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
|
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
|
||||||
g["BackRef"], g["LiteralI"], g["Literal"], g["Class"], g["DOT"]);
|
g["BackRef"], g["LiteralI"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
|
||||||
|
|
||||||
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
|
||||||
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
|
||||||
@ -2710,10 +2729,12 @@ private:
|
|||||||
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
g["Literal"] <= cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]),
|
||||||
seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
|
seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"]));
|
||||||
|
|
||||||
g["Class"] <= seq(chr('['), tok(zom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
// NOTE: Bryan Ford's original paper uses 'zom' (zero or more) here; 'oom' (one or more) is used instead so that an empty class is rejected.
|
||||||
|
g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
||||||
|
g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]);
|
||||||
|
|
||||||
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
|
g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
|
||||||
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\")),
|
g["Char"] <= cho(seq(chr('\\'), cls("nrt'\"[]\\^")),
|
||||||
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
|
seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
|
||||||
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
|
seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
|
||||||
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
|
||||||
@ -2923,6 +2944,10 @@ private:
|
|||||||
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
||||||
return cls(ranges);
|
return cls(ranges);
|
||||||
};
|
};
|
||||||
|
g["NegatedClass"] = [](const SemanticValues& sv) {
|
||||||
|
auto ranges = sv.transform<std::pair<char32_t, char32_t>>();
|
||||||
|
return ncls(ranges);
|
||||||
|
};
|
||||||
g["Range"] = [](const SemanticValues& sv) {
|
g["Range"] = [](const SemanticValues& sv) {
|
||||||
switch (sv.choice()) {
|
switch (sv.choice()) {
|
||||||
case 0: {
|
case 0: {
|
||||||
|
40
test/test.cc
40
test/test.cc
@ -1000,6 +1000,21 @@ TEST_CASE("Semantic value tag", "[general]")
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("Negated Class test", "[general]")
|
||||||
|
{
|
||||||
|
peg::parser parser(R"(
|
||||||
|
ROOT <- [^a-z_]+
|
||||||
|
)");
|
||||||
|
|
||||||
|
bool ret = parser;
|
||||||
|
REQUIRE(ret == true);
|
||||||
|
|
||||||
|
REQUIRE(parser.parse("ABC123"));
|
||||||
|
REQUIRE_FALSE(parser.parse("ABcZ"));
|
||||||
|
REQUIRE_FALSE(parser.parse("ABCZ_"));
|
||||||
|
REQUIRE_FALSE(parser.parse(""));
|
||||||
|
}
|
||||||
|
|
||||||
TEST_CASE("Packrat parser test with %whitespace%", "[packrat]")
|
TEST_CASE("Packrat parser test with %whitespace%", "[packrat]")
|
||||||
{
|
{
|
||||||
peg::parser parser(R"(
|
peg::parser parser(R"(
|
||||||
@ -1758,7 +1773,7 @@ TEST_CASE("PEG Literal", "[peg]")
|
|||||||
TEST_CASE("PEG Class", "[peg]")
|
TEST_CASE("PEG Class", "[peg]")
|
||||||
{
|
{
|
||||||
auto g = ParserGenerator::grammar();
|
auto g = ParserGenerator::grammar();
|
||||||
REQUIRE(exact(g, "Class", "[]") == true);
|
REQUIRE(exact(g, "Class", "[]") == false); // NOTE: This differs from Bryan Ford's paper, but is the same as RegExp
|
||||||
REQUIRE(exact(g, "Class", "[a]") == true);
|
REQUIRE(exact(g, "Class", "[a]") == true);
|
||||||
REQUIRE(exact(g, "Class", "[a-z]") == true);
|
REQUIRE(exact(g, "Class", "[a-z]") == true);
|
||||||
REQUIRE(exact(g, "Class", "[az]") == true);
|
REQUIRE(exact(g, "Class", "[az]") == true);
|
||||||
@ -1774,6 +1789,29 @@ TEST_CASE("PEG Class", "[peg]")
|
|||||||
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
REQUIRE(exact(g, "Class", u8"あ-ん") == false);
|
||||||
REQUIRE(exact(g, "Class", "[-+]") == true);
|
REQUIRE(exact(g, "Class", "[-+]") == true);
|
||||||
REQUIRE(exact(g, "Class", "[+-]") == false);
|
REQUIRE(exact(g, "Class", "[+-]") == false);
|
||||||
|
REQUIRE(exact(g, "Class", "[\\^]") == true);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CASE("PEG Negated Class", "[peg]")
|
||||||
|
{
|
||||||
|
auto g = ParserGenerator::grammar();
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^]") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a-z]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^az]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a-zA-Z-0-9]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a-]") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^-a]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^a") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "^]") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "^a]") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", u8"[^あ-ん]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", u8"^あ-ん") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^-+]") == true);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^+-]") == false);
|
||||||
|
REQUIRE(exact(g, "NegatedClass", "[^^]") == true);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("PEG Range", "[peg]")
|
TEST_CASE("PEG Range", "[peg]")
|
||||||
|
Loading…
Reference in New Issue
Block a user