Added [...]i and [^...]i (Fix #220)

This commit is contained in:
yhirose 2022-06-25 16:05:23 -04:00
parent 0366380091
commit 924588bb69
5 changed files with 80 additions and 29 deletions

View File

@ -13,7 +13,9 @@ You can also try the online version, PEG Playground at https://yhirose.github.io
The PEG syntax is well described on page 2 in the [document](http://www.brynosaurus.com/pub/lang/peg.pdf) by Bryan Ford. *cpp-peglib* also supports the following additional syntax for now:
* `'...'i` (Case-insensitive literal operator)
* `[...]i` (Case-insensitive character class operator)
* `[^...]` (Negated character class operator)
* `[^...]i` (Case-insensitive negated character class operator)
* `{2,5}` (Regex-like repetition operator)
* `<` ... `>` (Token boundary operator)
* `~` (Ignore operator)

Binary file not shown.

View File

@ -28,7 +28,9 @@ Primary <-
/ LiteralI
/ Dictionary
/ Literal
/ NegatedClassI
/ NegatedClass
/ ClassI
/ Class
/ DOT
@ -56,7 +58,9 @@ LiteralI <-
# NOTE: Bryan Ford's original paper uses 'zom' instead of 'oom'.
# Character class `[...]`: '[' that is not followed by '^', then one or more
# Range items (each guarded so that ']' ends the list), then ']'.
Class <- '[' !'^' <(!']' Range)+> ']' Spacing
# Case-insensitive character class `[...]i`: same as Class but closed by ']i'.
ClassI <- '[' !'^' <(!']' Range)+> ']i' Spacing
# Negated character class `[^...]`: opens with "[^" and closes with ']'.
NegatedClass <- "[^" <(!']' Range)+> ']' Spacing
# Case-insensitive negated character class `[^...]i`: same as NegatedClass
# but closed by ']i'.
NegatedClassI <- "[^" <(!']' Range)+> ']i' Spacing
Range <- (Char '-' ! ']' Char) / Char

View File

@ -1265,7 +1265,8 @@ public:
class CharacterClass : public Ope,
public std::enable_shared_from_this<CharacterClass> {
public:
CharacterClass(const std::string &s, bool negated) : negated_(negated) {
CharacterClass(const std::string &s, bool negated, bool ignore_case)
: negated_(negated), ignore_case_(ignore_case) {
auto chars = decode(s.data(), s.length());
auto i = 0u;
while (i < chars.size()) {
@ -1284,8 +1285,8 @@ public:
}
CharacterClass(const std::vector<std::pair<char32_t, char32_t>> &ranges,
bool negated)
: ranges_(ranges), negated_(negated) {
bool negated, bool ignore_case)
: ranges_(ranges), negated_(negated), ignore_case_(ignore_case) {
assert(!ranges_.empty());
}
@ -1300,7 +1301,7 @@ public:
auto len = decode_codepoint(s, n, cp);
for (const auto &range : ranges_) {
if (range.first <= cp && cp <= range.second) {
if (in_range(range, cp)) {
if (negated_) {
c.set_error_pos(s);
return static_cast<size_t>(-1);
@ -1320,8 +1321,20 @@ public:
void accept(Visitor &v) override;
private:
// Returns true when `cp` lies inside the inclusive range
// [range.first, range.second].
//
// When ignore_case_ is set, the code point and both range bounds are
// lower-cased with std::tolower before they are compared; otherwise the raw
// code points are compared.
bool in_range(const std::pair<char32_t, char32_t> &range, char32_t cp) const {
  if (!ignore_case_) { return range.first <= cp && cp <= range.second; }

  // std::tolower has undefined behavior unless its argument is representable
  // as unsigned char (or equals EOF). Code points above 0xFF are therefore
  // never passed to it and are compared unchanged.
  auto fold = [](char32_t ch) {
    return ch <= 0xFF
               ? static_cast<char32_t>(std::tolower(static_cast<int>(ch)))
               : ch;
  };

  auto cpl = fold(cp);
  return fold(range.first) <= cpl && cpl <= fold(range.second);
}
std::vector<std::pair<char32_t, char32_t>> ranges_;
bool negated_;
bool ignore_case_;
};
class Character : public Ope, public std::enable_shared_from_this<Character> {
@ -1646,21 +1659,23 @@ inline std::shared_ptr<Ope> liti(std::string &&s) {
}
inline std::shared_ptr<Ope> cls(const std::string &s) {
return std::make_shared<CharacterClass>(s, false);
return std::make_shared<CharacterClass>(s, false, false);
}
inline std::shared_ptr<Ope>
cls(const std::vector<std::pair<char32_t, char32_t>> &ranges) {
return std::make_shared<CharacterClass>(ranges, false);
cls(const std::vector<std::pair<char32_t, char32_t>> &ranges,
bool ignore_case = false) {
return std::make_shared<CharacterClass>(ranges, false, ignore_case);
}
inline std::shared_ptr<Ope> ncls(const std::string &s) {
return std::make_shared<CharacterClass>(s, true);
return std::make_shared<CharacterClass>(s, true, false);
}
inline std::shared_ptr<Ope>
ncls(const std::vector<std::pair<char32_t, char32_t>> &ranges) {
return std::make_shared<CharacterClass>(ranges, true);
ncls(const std::vector<std::pair<char32_t, char32_t>> &ranges,
bool ignore_case = false) {
return std::make_shared<CharacterClass>(ranges, true, ignore_case);
}
inline std::shared_ptr<Ope> chr(char dt) {
@ -2934,9 +2949,7 @@ inline size_t Recovery::parse_core(const char *s, size_t n,
len = rule.parse(s, n, dummy_vs, c, dummy_dt);
}
if (success(len)) {
c.recovered = true;
}
if (success(len)) { c.recovered = true; }
// Cut
if (!c.cut_stack.empty()) {
@ -3241,16 +3254,16 @@ private:
seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"])));
g["Suffix"] <= seq(g["Primary"], opt(g["Loop"]));
g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]);
g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"],
npd(g["LEFTARROW"])),
seq(g["Ignore"], g["Identifier"],
npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["BeginTok"], g["Expression"], g["EndTok"]),
g["CapScope"],
seq(g["BeginCap"], g["Expression"], g["EndCap"]),
g["BackRef"], g["LiteralI"], g["Dictionary"],
g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
g["Primary"] <=
cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"],
npd(g["LEFTARROW"])),
seq(g["Ignore"], g["Identifier"],
npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"],
seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"],
g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClassI"],
g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@ -3281,9 +3294,16 @@ private:
// Grammar rule for `[...]`: '[' not followed by '^', a token made of one or
// more Range items (each guarded so that ']' ends the list), then ']' and
// trailing Spacing.
g["Class"] <= seq(chr('['), npd(chr('^')),
tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'),
g["Spacing"]);
// Grammar rule for `[...]i`: same shape as Class, but closed by the literal
// "]i" instead of ']'.
g["ClassI"] <= seq(chr('['), npd(chr('^')),
tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"),
g["Spacing"]);
// Grammar rule for `[^...]`: opens with the literal "[^" and closes with ']'.
g["NegatedClass"] <= seq(lit("[^"),
tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'),
g["Spacing"]);
// Grammar rule for `[^...]i`: same shape as NegatedClass, but closed by the
// literal "]i" instead of ']'.
g["NegatedClassI"] <= seq(lit("[^"),
tok(oom(seq(npd(chr(']')), g["Range"]))),
lit("]i"), g["Spacing"]);
// NOTE: This differs from Bryan Ford's original paper; the modification
// allows us to specify `[+-]` as a valid char class.
@ -3634,10 +3654,18 @@ private:
auto ranges = vs.transform<std::pair<char32_t, char32_t>>();
return cls(ranges);
};
// Semantic action for `[...]i`: collects the parsed ranges and builds a
// non-negated character class with case-insensitive matching enabled.
g["ClassI"] = [](const SemanticValues &vs) {
  using CodepointRange = std::pair<char32_t, char32_t>;
  const auto codepoint_ranges = vs.transform<CodepointRange>();
  return cls(codepoint_ranges, /*ignore_case=*/true);
};
// Semantic action for `[^...]`: collects the parsed ranges and builds a
// negated character class.
g["NegatedClass"] = [](const SemanticValues &vs) {
  using CodepointRange = std::pair<char32_t, char32_t>;
  const auto codepoint_ranges = vs.transform<CodepointRange>();
  return ncls(codepoint_ranges);
};
// Semantic action for `[^...]i`: collects the parsed ranges and builds a
// negated character class with case-insensitive matching enabled.
g["NegatedClassI"] = [](const SemanticValues &vs) {
  using CodepointRange = std::pair<char32_t, char32_t>;
  const auto codepoint_ranges = vs.transform<CodepointRange>();
  return ncls(codepoint_ranges, /*ignore_case=*/true);
};
g["Range"] = [](const SemanticValues &vs) {
switch (vs.choice()) {
case 0: {

View File

@ -431,13 +431,13 @@ TEST(GeneralTest, Octal_Hex_Unicode_value_test) {
EXPECT_TRUE(ret);
}
TEST(GeneralTest, Ignore_case_test) {
TEST(GeneralTest, Ignore_case_literal_test) {
parser parser(R"(
ROOT <- HELLO WORLD
HELLO <- 'hello'i
WORLD <- 'world'i
%whitespace <- [ \t\r\n]*
)");
ROOT <- HELLO WORLD
HELLO <- 'hello'i
WORLD <- 'world'i
%whitespace <- [ \t\r\n]*
)");
parser["HELLO"] = [](const SemanticValues &vs) {
EXPECT_EQ("Hello", vs.token());
@ -451,6 +451,23 @@ TEST(GeneralTest, Ignore_case_test) {
EXPECT_TRUE(ret);
}
// Checks that the case-insensitive character class `[a-z]i` accepts letters
// in any casing and still rejects input outside the class.
TEST(GeneralTest, Ignore_case_character_class_test) {
  parser parser(R"(ROOT <- [a-z]i+)");

  EXPECT_TRUE(parser.parse("abc")); // all lower case
  EXPECT_TRUE(parser.parse("ABC")); // all upper case
  EXPECT_TRUE(parser.parse("Abc")); // upper case first, lower case after
  // Previously a second, identical "Abc" check; use a distinct casing so the
  // assertion adds coverage (lower case first, upper case after).
  EXPECT_TRUE(parser.parse("aBC"));
  EXPECT_FALSE(parser.parse("123")); // digits are not in [a-z]
}
// Checks that the case-insensitive negated class `[^a-z]i` accepts input
// outside the class and rejects letters regardless of their casing.
TEST(GeneralTest, Ignore_case_negate_character_class_test) {
  parser parser(R"(ROOT <- [^a-z]i+)");

  EXPECT_TRUE(parser.parse("123"));  // digits are outside [a-z]
  EXPECT_FALSE(parser.parse("ABC")); // upper case is excluded via the `i` flag
  // Lower-case and mixed-case letters must be rejected as well; the original
  // test only exercised the upper-case path.
  EXPECT_FALSE(parser.parse("abc"));
  EXPECT_FALSE(parser.parse("Abc"));
}
TEST(GeneralTest, mutable_lambda_test) {
std::vector<std::string_view> vec;