diff --git a/docs/native.wasm b/docs/native.wasm index a53b8d7..03951e2 100755 Binary files a/docs/native.wasm and b/docs/native.wasm differ diff --git a/peglib.h b/peglib.h index 82a719f..f344202 100644 --- a/peglib.h +++ b/peglib.h @@ -2811,7 +2811,10 @@ inline size_t BackReference::parse_core(const char *s, size_t n, return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false); } } - throw std::runtime_error("Invalid back reference..."); + + c.error_info.message_pos = s; + c.error_info.message = "undefined back reference '$" + name_ + "'..."; + return static_cast(-1); } inline Definition & @@ -3176,10 +3179,16 @@ private: std::shared_ptr grammar; std::string start; const char *start_pos = nullptr; + std::vector> duplicates_of_definition; + std::vector> duplicates_of_instruction; std::map> instructions; - std::set captures; + + std::vector> undefined_back_references; + std::vector> captures_stack{{}}; + + std::set captures_in_current_definition; bool enablePackratParsing = true; Data() : grammar(std::make_shared()) {} @@ -3200,17 +3209,16 @@ private: seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); - g["Primary"] <= - cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], - npd(g["LEFTARROW"])), - seq(g["Ignore"], g["Identifier"], - npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), - seq(g["OPEN"], g["Expression"], g["CLOSE"]), - seq(g["BeginTok"], g["Expression"], g["EndTok"]), - seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]), - seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"], - g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClass"], - g["Class"], g["DOT"]); + g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], + npd(g["LEFTARROW"])), + seq(g["Ignore"], g["Identifier"], + npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), + seq(g["OPEN"], g["Expression"], g["CLOSE"]), + seq(g["BeginTok"], g["Expression"], g["EndTok"]), + g["CapScope"], + seq(g["BeginCap"], g["Expression"], g["EndCap"]), + g["BackRef"], g["LiteralI"], g["Dictionary"], + g["Literal"], g["NegatedClass"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"])); @@ -3268,6 +3276,8 @@ private: seq(g["COMMA"], g["Number"])); g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]); + g["CapScope"] <= seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]); + g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8"←"))), g["Spacing"]); ~g["SLASH"] <= seq(chr('/'), g["Spacing"]); ~g["PIPE"] <= seq(chr('|'), g["Spacing"]); @@ -3416,7 +3426,7 @@ private: g["Definition"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) { auto &data = *std::any_cast(dt); - data.captures.clear(); + data.captures_in_current_definition.clear(); }; g["Expression"] = [&](const SemanticValues &vs) { @@ -3517,29 +3527,6 @@ private: } }; - g["RepetitionRange"] = [&](const SemanticValues &vs) { - switch (vs.choice()) { - case 0: { // Number COMMA Number - auto min = std::any_cast(vs[0]); - auto max = std::any_cast(vs[1]); - return std::pair(min, max); - } - case 1: // Number COMMA - return std::pair(std::any_cast(vs[0]), - std::numeric_limits::max()); - case 2: { // Number - auto n = std::any_cast(vs[0]); - return std::pair(n, n); - } - default: // COMMA Number - return std::pair(std::numeric_limits::min(), - std::any_cast(vs[0])); - } - }; - g["Number"] = [&](const SemanticValues &vs) { - return vs.token_to_number(); - }; - g["Primary"] = [&](const SemanticValues &vs, std::any &dt) { auto &data = *std::any_cast(dt); @@ -3577,7 +3564,8 @@ private: const auto &name = std::any_cast(vs[0]); auto ope = std::any_cast>(vs[1]); - data.captures.insert(name); + data.captures_stack.back().insert(name); + data.captures_in_current_definition.insert(name); return cap(ope, [name](const char *a_s, size_t a_n, Context &c) { auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1]; @@ -3641,6 +3629,40 @@ private: return resolve_escape_sequence(vs.sv().data(), vs.sv().length()); }; + g["RepetitionRange"] = [&](const SemanticValues &vs) { + switch (vs.choice()) { + case 0: { // Number COMMA Number + auto min = std::any_cast(vs[0]); + auto max = std::any_cast(vs[1]); + return std::pair(min, max); + } + case 1: // Number COMMA + return std::pair(std::any_cast(vs[0]), + std::numeric_limits::max()); + case 2: { // Number + auto n = std::any_cast(vs[0]); + return std::pair(n, n); + } + default: // COMMA Number + return std::pair(std::numeric_limits::min(), + std::any_cast(vs[0])); + } + }; + g["Number"] = [&](const SemanticValues &vs) { + return vs.token_to_number(); + }; + + g["CapScope"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) { + auto &data = *std::any_cast(dt); + data.captures_stack.emplace_back(); + }; + g["CapScope"].leave = [](const char * /*s*/, size_t /*n*/, + size_t /*matchlen*/, std::any & /*value*/, + std::any &dt) { + auto &data = *std::any_cast(dt); + data.captures_stack.pop_back(); + }; + g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; @@ -3655,9 +3677,31 @@ private: g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) { auto &data = *std::any_cast(dt); - if (data.captures.find(vs.token()) == data.captures.end()) { + + // Undefined back reference check + { + auto found = false; + auto it = data.captures_stack.rbegin(); + while (it != data.captures_stack.rend()) { + if (it->find(vs.token()) != it->end()) { + found = true; + break; + } + ++it; + } + if (!found) { + auto ptr = vs.token().data() - 1; // include '$' symbol + data.undefined_back_references.emplace_back(vs.token(), ptr); + } + } + + // NOTE: Disable packrat parsing if a back reference is not defined in + // captures in the current definition rule. + if (data.captures_in_current_definition.find(vs.token()) == + data.captures_in_current_definition.end()) { data.enablePackratParsing = false; } + return bkr(vs.token_to_string()); }; @@ -3831,6 +3875,18 @@ private: ret = false; } + // Check undefined back references + if (!data.undefined_back_references.empty()) { + for (const auto &[name, ptr] : data.undefined_back_references) { + if (log) { + auto line = line_info(s, ptr); + log(line.first, line.second, + "The back reference '" + name + "' is undefined."); + } + } + ret = false; + } + // Set root definition auto &start_rule = grammar[data.start]; diff --git a/test/test2.cc b/test/test2.cc index a471a7b..fdf8a7c 100644 --- a/test/test2.cc +++ b/test/test2.cc @@ -569,6 +569,11 @@ TEST(BackreferenceTest, Backreference_test) { } } +TEST(BackreferenceTest, Undefined_backreference_test) { + parser parser("S <- $bref"); + EXPECT_FALSE(parser); +} + TEST(BackreferenceTest, Invalid_backreference_test) { parser parser(R"( START <- _ LQUOTE (!RQUOTE .)* RQUOTE _ @@ -577,10 +582,10 @@ TEST(BackreferenceTest, Invalid_backreference_test) { ~_ <- [ \t\r\n]* )"); - EXPECT_THROW(parser.parse(R"delm( + EXPECT_FALSE(parser); + EXPECT_FALSE(parser.parse(R"delm( R"foo("(hello world)")foo" - )delm"), - std::runtime_error); + )delm")); } TEST(BackreferenceTest, Nested_capture_test) { @@ -595,6 +600,7 @@ TEST(BackreferenceTest, Nested_capture_test) { TEXT_DATA <- ![<] . )"); + EXPECT_TRUE(!!parser); EXPECT_TRUE(parser.parse("This is a test text.")); EXPECT_FALSE(parser.parse("This is a test text.")); EXPECT_FALSE(parser.parse("This is a test text.")); @@ -614,7 +620,8 @@ TEST(BackreferenceTest, Backreference_with_Prioritized_Choice_test) { CORRECT <- 'correct' )"); - EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); + EXPECT_TRUE(!!parser); + EXPECT_FALSE(parser.parse("branchthatiscorrect")); } TEST(BackreferenceTest, Backreference_with_Zero_or_More_test) { @@ -630,15 +637,15 @@ TEST(BackreferenceTest, Backreference_with_Zero_or_More_test) { CORRECT <- 'correct' )"); + EXPECT_TRUE(!!parser); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE( parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect")); EXPECT_TRUE( parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect")); - EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); - EXPECT_THROW(parser.parse("branchthatiswron_branchthatiscorrect"), - std::runtime_error); + EXPECT_FALSE(parser.parse("branchthatiscorrect")); + EXPECT_FALSE(parser.parse("branchthatiswron_branchthatiscorrect")); } TEST(BackreferenceTest, Backreference_with_One_or_More_test) { @@ -654,6 +661,7 @@ TEST(BackreferenceTest, Backreference_with_One_or_More_test) { CORRECT <- 'correct' )"); + EXPECT_TRUE(!!parser); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE( @@ -677,15 +685,15 @@ TEST(BackreferenceTest, Backreference_with_Option_test) { CORRECT <- 'correct' )"); + EXPECT_TRUE(!!parser); EXPECT_TRUE(parser.parse("branchthatiswrongbranchthatiscorrect")); EXPECT_FALSE(parser.parse("branchthatiswrongbranchthatIscorrect")); EXPECT_FALSE( parser.parse("branchthatiswrongbranchthatIswrongbranchthatiscorrect")); EXPECT_FALSE( parser.parse("branchthatiswrongbranchthatIswrongbranchthatIscorrect")); - EXPECT_THROW(parser.parse("branchthatiscorrect"), std::runtime_error); - EXPECT_THROW(parser.parse("branchthatiswron_branchthatiscorrect"), - std::runtime_error); + EXPECT_FALSE(parser.parse("branchthatiscorrect")); + EXPECT_FALSE(parser.parse("branchthatiswron_branchthatiscorrect")); } TEST(RepetitionTest, Repetition_0) {