This commit is contained in:
yhirose 2022-06-02 15:42:03 -04:00
parent 05205a0ca1
commit bd99157bb5
3 changed files with 113 additions and 49 deletions

Binary file not shown.

134
peglib.h
View File

@ -2811,7 +2811,10 @@ inline size_t BackReference::parse_core(const char *s, size_t n,
return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false); return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false);
} }
} }
throw std::runtime_error("Invalid back reference...");
c.error_info.message_pos = s;
c.error_info.message = "undefined back reference '$" + name_ + "'...";
return static_cast<size_t>(-1);
} }
inline Definition & inline Definition &
@ -3176,10 +3179,16 @@ private:
std::shared_ptr<Grammar> grammar; std::shared_ptr<Grammar> grammar;
std::string start; std::string start;
const char *start_pos = nullptr; const char *start_pos = nullptr;
std::vector<std::pair<std::string, const char *>> duplicates_of_definition; std::vector<std::pair<std::string, const char *>> duplicates_of_definition;
std::vector<std::pair<std::string, const char *>> duplicates_of_instruction; std::vector<std::pair<std::string, const char *>> duplicates_of_instruction;
std::map<std::string, std::vector<Instruction>> instructions; std::map<std::string, std::vector<Instruction>> instructions;
std::set<std::string_view> captures;
std::vector<std::pair<std::string, const char *>> undefined_back_references;
std::vector<std::set<std::string_view>> captures_stack{{}};
std::set<std::string_view> captures_in_current_definition;
bool enablePackratParsing = true; bool enablePackratParsing = true;
Data() : grammar(std::make_shared<Grammar>()) {} Data() : grammar(std::make_shared<Grammar>()) {}
@ -3200,17 +3209,16 @@ private:
seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"])));
g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"]));
g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]);
g["Primary"] <= g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"],
cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], npd(g["LEFTARROW"])),
npd(g["LEFTARROW"])), seq(g["Ignore"], g["Identifier"],
seq(g["Ignore"], g["Identifier"], npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), seq(g["OPEN"], g["Expression"], g["CLOSE"]),
seq(g["OPEN"], g["Expression"], g["CLOSE"]), seq(g["BeginTok"], g["Expression"], g["EndTok"]),
seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"],
seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]), seq(g["BeginCap"], g["Expression"], g["EndCap"]),
seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"], g["BackRef"], g["LiteralI"], g["Dictionary"],
g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClass"], g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]);
g["Class"], g["DOT"]);
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@ -3268,6 +3276,8 @@ private:
seq(g["COMMA"], g["Number"])); seq(g["COMMA"], g["Number"]));
g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]); g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]);
g["CapScope"] <= seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]);
g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8""))), g["Spacing"]); g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8""))), g["Spacing"]);
~g["SLASH"] <= seq(chr('/'), g["Spacing"]); ~g["SLASH"] <= seq(chr('/'), g["Spacing"]);
~g["PIPE"] <= seq(chr('|'), g["Spacing"]); ~g["PIPE"] <= seq(chr('|'), g["Spacing"]);
@ -3416,7 +3426,7 @@ private:
g["Definition"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) { g["Definition"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) {
auto &data = *std::any_cast<Data *>(dt); auto &data = *std::any_cast<Data *>(dt);
data.captures.clear(); data.captures_in_current_definition.clear();
}; };
g["Expression"] = [&](const SemanticValues &vs) { g["Expression"] = [&](const SemanticValues &vs) {
@ -3517,29 +3527,6 @@ private:
} }
}; };
// Semantic action for the "RepetitionRange" rule. Produces a (min, max) pair
// of size_t repetition counts, selected by which alternative matched
// (vs.choice()). A missing upper bound becomes numeric_limits<size_t>::max()
// and a missing lower bound becomes numeric_limits<size_t>::min().
g["RepetitionRange"] = [&](const SemanticValues &vs) {
switch (vs.choice()) {
case 0: { // Number COMMA Number
auto min = std::any_cast<size_t>(vs[0]);
auto max = std::any_cast<size_t>(vs[1]);
return std::pair(min, max);
}
case 1: // Number COMMA
return std::pair(std::any_cast<size_t>(vs[0]),
std::numeric_limits<size_t>::max());
case 2: { // Number
auto n = std::any_cast<size_t>(vs[0]);
return std::pair(n, n);
}
default: // COMMA Number
return std::pair(std::numeric_limits<size_t>::min(),
std::any_cast<size_t>(vs[0]));
}
};
// Semantic action for the "Number" rule: yields whatever
// vs.token_to_number<size_t>() returns for the matched token.
g["Number"] = [&](const SemanticValues &vs) {
return vs.token_to_number<size_t>();
};
g["Primary"] = [&](const SemanticValues &vs, std::any &dt) { g["Primary"] = [&](const SemanticValues &vs, std::any &dt) {
auto &data = *std::any_cast<Data *>(dt); auto &data = *std::any_cast<Data *>(dt);
@ -3577,7 +3564,8 @@ private:
const auto &name = std::any_cast<std::string_view>(vs[0]); const auto &name = std::any_cast<std::string_view>(vs[0]);
auto ope = std::any_cast<std::shared_ptr<Ope>>(vs[1]); auto ope = std::any_cast<std::shared_ptr<Ope>>(vs[1]);
data.captures.insert(name); data.captures_stack.back().insert(name);
data.captures_in_current_definition.insert(name);
return cap(ope, [name](const char *a_s, size_t a_n, Context &c) { return cap(ope, [name](const char *a_s, size_t a_n, Context &c) {
auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1]; auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1];
@ -3641,6 +3629,40 @@ private:
return resolve_escape_sequence(vs.sv().data(), vs.sv().length()); return resolve_escape_sequence(vs.sv().data(), vs.sv().length());
}; };
// Semantic action for the "RepetitionRange" rule.
//
// Returns a std::pair<size_t, size_t> holding the (min, max) repetition
// counts. Which alternative matched is read from vs.choice():
//   0: Number COMMA Number -> (first, second)
//   1: Number COMMA        -> (first, numeric_limits<size_t>::max())
//   2: Number              -> (first, first)
//   otherwise: COMMA Number -> (numeric_limits<size_t>::min(), first)
g["RepetitionRange"] = [&](const SemanticValues &vs) {
  constexpr auto no_lower_bound = std::numeric_limits<size_t>::min();
  constexpr auto no_upper_bound = std::numeric_limits<size_t>::max();

  // Every alternative carries at least one Number, always in slot 0.
  const auto first = std::any_cast<size_t>(vs[0]);
  const auto alternative = vs.choice();

  if (alternative == 0) { // Number COMMA Number
    return std::pair(first, std::any_cast<size_t>(vs[1]));
  }
  if (alternative == 1) { // Number COMMA
    return std::pair(first, no_upper_bound);
  }
  if (alternative == 2) { // Number
    return std::pair(first, first);
  }
  // COMMA Number
  return std::pair(no_lower_bound, first);
};
// Semantic action for the "Number" rule: hands back the value produced by
// vs.token_to_number<size_t>() for the matched token.
g["Number"] = [&](const SemanticValues &vs) {
  const auto value = vs.token_to_number<size_t>();
  return value;
};
// Enter hook for the "CapScope" rule: pushes a new, empty set of capture
// names onto Data::captures_stack. `dt` is expected to hold a Data*.
g["CapScope"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) {
  auto *data = std::any_cast<Data *>(dt);
  data->captures_stack.emplace_back();
};
// Leave hook for the "CapScope" rule: drops the top set of capture names
// from Data::captures_stack. `dt` is expected to hold a Data*.
g["CapScope"].leave = [](const char * /*s*/, size_t /*n*/,
                         size_t /*matchlen*/, std::any & /*value*/,
                         std::any &dt) {
  auto *data = std::any_cast<Data *>(dt);
  data->captures_stack.pop_back();
};
g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
@ -3655,9 +3677,31 @@ private:
g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) { g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) {
auto &data = *std::any_cast<Data *>(dt); auto &data = *std::any_cast<Data *>(dt);
if (data.captures.find(vs.token()) == data.captures.end()) {
// Undefined back reference check
{
auto found = false;
auto it = data.captures_stack.rbegin();
while (it != data.captures_stack.rend()) {
if (it->find(vs.token()) != it->end()) {
found = true;
break;
}
++it;
}
if (!found) {
auto ptr = vs.token().data() - 1; // include '$' symbol
data.undefined_back_references.emplace_back(vs.token(), ptr);
}
}
// NOTE: Disable packrat parsing if a back reference is not defined in
// captures in the current definition rule.
if (data.captures_in_current_definition.find(vs.token()) ==
data.captures_in_current_definition.end()) {
data.enablePackratParsing = false; data.enablePackratParsing = false;
} }
return bkr(vs.token_to_string()); return bkr(vs.token_to_string());
}; };
@ -3831,6 +3875,18 @@ private:
ret = false; ret = false;
} }
// Check undefined back references
if (!data.undefined_back_references.empty()) {
for (const auto &[name, ptr] : data.undefined_back_references) {
if (log) {
auto line = line_info(s, ptr);
log(line.first, line.second,
"The back reference '" + name + "' is undefined.");
}
}
ret = false;
}
// Set root definition // Set root definition
auto &start_rule = grammar[data.start]; auto &start_rule = grammar[data.start];

View File

@ -569,6 +569,11 @@ TEST(BackreferenceTest, Backreference_test) {
} }
} }
// A grammar whose only rule uses `$bref` with no capture of that name must
// leave the freshly constructed parser in a false (invalid) state.
TEST(BackreferenceTest, Undefined_backreference_test) {
  parser pg("S <- $bref");
  EXPECT_FALSE(pg);
}
TEST(BackreferenceTest, Invalid_backreference_test) { TEST(BackreferenceTest, Invalid_backreference_test) {
parser parser(R"( parser parser(R"(
START <- _ LQUOTE (!RQUOTE .)* RQUOTE _ START <- _ LQUOTE (!RQUOTE .)* RQUOTE _
@ -577,10 +582,10 @@ TEST(BackreferenceTest, Invalid_backreference_test) {
~_ <- [ \t\r\n]* ~_ <- [ \t\r\n]*
)"); )");
EXPECT_THROW(parser.parse(R"delm( EXPECT_FALSE(parser);
EXPECT_FALSE(parser.parse(R"delm(
R"foo("(hello world)")foo" R"foo("(hello world)")foo"
)delm"), )delm"));
std::runtime_error);
} }
TEST(BackreferenceTest, Nested_capture_test) { TEST(BackreferenceTest, Nested_capture_test) {
@ -595,6 +600,7 @@ TEST(BackreferenceTest, Nested_capture_test) {
TEXT_DATA <- ![<] . TEXT_DATA <- ![<] .
)"); )");
EXPECT_TRUE(!!parser);
EXPECT_TRUE(parser.parse("This is <b>a <u>test</u> text</b>.")); EXPECT_TRUE(parser.parse("This is <b>a <u>test</u> text</b>."));
EXPECT_FALSE(parser.parse("This is <b>a <u>test</b> text</u>.")); EXPECT_FALSE(parser.parse("This is <b>a <u>test</b> text</u>."));
EXPECT_FALSE(parser.parse("This is <b>a <u>test text</b>.")); EXPECT_FALSE(parser.parse("This is <b>a <u>test text</b>."));
@ -614,7 +620,8 @@ TEST(BackreferenceTest, Backreference_with_Prioritized_Choice_test) {
CORRECT <- 'correct' CORRECT <- 'correct'
)"); )");
EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); EXPECT_TRUE(!!parser);
EXPECT_FALSE(parser.parse("branchthatiscorrect"));
} }
TEST(BackreferenceTest, Backreference_with_Zero_or_More_test) { TEST(BackreferenceTest, Backreference_with_Zero_or_More_test) {
@ -630,15 +637,15 @@ TEST(BackreferenceTest, Backreference_with_Zero_or_More_test) {
CORRECT <- 'correct' CORRECT <- 'correct'
)"); )");
EXPECT_TRUE(!!parser);
EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect"));
EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect"));
EXPECT_FALSE( EXPECT_FALSE(
parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect")); parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect"));
EXPECT_TRUE( EXPECT_TRUE(
parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect")); parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect"));
EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); EXPECT_FALSE(parser.parse("branchthatiscorrect"));
EXPECT_THROW(parser.parse("branchthatiswron_branchthatiscorrect"), EXPECT_FALSE(parser.parse("branchthatiswron_branchthatiscorrect"));
std::runtime_error);
} }
TEST(BackreferenceTest, Backreference_with_One_or_More_test) { TEST(BackreferenceTest, Backreference_with_One_or_More_test) {
@ -654,6 +661,7 @@ TEST(BackreferenceTest, Backreference_with_One_or_More_test) {
CORRECT <- 'correct' CORRECT <- 'correct'
)"); )");
EXPECT_TRUE(!!parser);
EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect"));
EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect"));
EXPECT_FALSE( EXPECT_FALSE(
@ -677,15 +685,15 @@ TEST(BackreferenceTest, Backreference_with_Option_test) {
CORRECT <- 'correct' CORRECT <- 'correct'
)"); )");
EXPECT_TRUE(!!parser);
EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect"));
EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect"));
EXPECT_FALSE( EXPECT_FALSE(
parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect")); parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect"));
EXPECT_FALSE( EXPECT_FALSE(
parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect")); parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect"));
EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); EXPECT_FALSE(parser.parse("branchthatiscorrect"));
EXPECT_THROW(parser.parse("branchthatiswron_branchthatiscorrect"), EXPECT_FALSE(parser.parse("branchthatiswron_branchthatiscorrect"));
std::runtime_error);
} }
TEST(RepetitionTest, Repetition_0) { TEST(RepetitionTest, Repetition_0) {