mirror of
https://github.com/yhirose/cpp-peglib.git
synced 2025-01-09 09:15:30 +00:00
Fixed error report problems
This commit is contained in:
parent
23b284d0a3
commit
8643927c99
@ -526,7 +526,7 @@ cpp-peglib supports the furthest failure error position report as described in t
|
||||
|
||||
For better error reporting and recovery, cpp-peglib supports a 'recovery' operator with a label, which can be associated with a recovery expression and a custom error message. This idea comes from the fantastic ["Syntax Error Recovery in Parsing Expression Grammars"](https://arxiv.org/pdf/1806.11150.pdf) paper by Sergio Medeiros and Fabio Mascarenhas.
|
||||
|
||||
The custom message supports `%t` which is a place holder for the unexpected token.
|
||||
The custom message supports `%t`, which is a placeholder for the unexpected token, and `%c`, which is a placeholder for the unexpected Unicode character.
|
||||
|
||||
Here is an example of Java-like grammar:
|
||||
|
||||
|
BIN
docs/native.wasm
BIN
docs/native.wasm
Binary file not shown.
@ -61,6 +61,7 @@ body {
|
||||
height: 160px;
|
||||
border: 1px solid lightgray;
|
||||
padding: 8px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
.editor-info li {
|
||||
cursor: pointer;
|
||||
|
153
peglib.h
153
peglib.h
@ -87,6 +87,14 @@ inline size_t codepoint_length(const char *s8, size_t l) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Counts the code points in the first `l` bytes of the UTF-8 buffer `s8`.
///
/// The scan advances by the byte length that codepoint_length() reports for
/// the data at the current offset. codepoint_length() can report a length of
/// 0; in that case the current byte is counted as a single code point and
/// skipped, so the scan always makes progress and terminates.
///
/// @param s8 Pointer to the start of the buffer.
/// @param l  Number of bytes available at `s8`.
/// @return   Number of code points counted in the buffer.
inline size_t codepoint_count(const char *s8, size_t l) {
  size_t count = 0;
  for (size_t i = 0; i < l;) {
    auto len = codepoint_length(s8 + i, l - i);
    // A zero length would leave `i` unchanged and loop forever; step over
    // exactly one byte instead.
    if (len == 0) { len = 1; }
    i += len;
    count++;
  }
  return count;
}
|
||||
|
||||
inline size_t encode_codepoint(char32_t cp, char *buff) {
|
||||
if (cp < 0x0080) {
|
||||
buff[0] = static_cast<char>(cp & 0x7F);
|
||||
@ -161,16 +169,16 @@ inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes,
|
||||
return false;
|
||||
}
|
||||
|
||||
inline size_t decode_codepoint(const char *s8, size_t l, char32_t &out) {
|
||||
inline size_t decode_codepoint(const char *s8, size_t l, char32_t &cp) {
|
||||
size_t bytes;
|
||||
if (decode_codepoint(s8, l, bytes, out)) { return bytes; }
|
||||
if (decode_codepoint(s8, l, bytes, cp)) { return bytes; }
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline char32_t decode_codepoint(const char *s8, size_t l) {
|
||||
char32_t out = 0;
|
||||
decode_codepoint(s8, l, out);
|
||||
return out;
|
||||
char32_t cp = 0;
|
||||
decode_codepoint(s8, l, cp);
|
||||
return cp;
|
||||
}
|
||||
|
||||
inline std::u32string decode(const char *s8, size_t l) {
|
||||
@ -418,7 +426,7 @@ inline std::pair<size_t, size_t> line_info(const char *start, const char *cur) {
|
||||
p++;
|
||||
}
|
||||
|
||||
auto col = p - col_ptr + 1;
|
||||
auto col = codepoint_count(col_ptr, p - col_ptr) + 1;
|
||||
|
||||
return std::pair(no, col);
|
||||
}
|
||||
@ -551,7 +559,7 @@ private:
|
||||
/*
|
||||
* Semantic action
|
||||
*/
|
||||
template <typename F, typename... Args> std::any call(F fn, Args &&...args) {
|
||||
template <typename F, typename... Args> std::any call(F fn, Args &&... args) {
|
||||
using R = decltype(fn(std::forward<Args>(args)...));
|
||||
if constexpr (std::is_void<R>::value) {
|
||||
fn(std::forward<Args>(args)...);
|
||||
@ -638,6 +646,7 @@ struct ErrorInfo {
|
||||
std::vector<std::pair<const char *, bool>> expected_tokens;
|
||||
const char *message_pos = nullptr;
|
||||
std::string message;
|
||||
mutable const char *last_output_pos = nullptr;
|
||||
|
||||
void clear() {
|
||||
error_pos = nullptr;
|
||||
@ -655,59 +664,71 @@ struct ErrorInfo {
|
||||
|
||||
void output_log(const Log &log, const char *s, size_t n) const {
|
||||
if (message_pos) {
|
||||
auto line = line_info(s, message_pos);
|
||||
std::string msg;
|
||||
if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
|
||||
!unexpected_token.empty()) {
|
||||
msg = replace_all(message, "%t", unexpected_token);
|
||||
} else {
|
||||
msg = message;
|
||||
}
|
||||
log(line.first, line.second, msg);
|
||||
} else if (error_pos) {
|
||||
auto line = line_info(s, error_pos);
|
||||
|
||||
std::string msg;
|
||||
if (expected_tokens.empty()) {
|
||||
msg = "syntax error.";
|
||||
} else {
|
||||
msg = "syntax error";
|
||||
|
||||
// unexpected token
|
||||
if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
|
||||
if (message_pos > last_output_pos) {
|
||||
last_output_pos = message_pos;
|
||||
auto line = line_info(s, message_pos);
|
||||
std::string msg;
|
||||
if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
|
||||
!unexpected_token.empty()) {
|
||||
msg += ", unexpected '";
|
||||
msg += unexpected_token;
|
||||
msg += "'";
|
||||
msg = replace_all(message, "%t", unexpected_token);
|
||||
|
||||
auto unexpected_char = unexpected_token.substr(
|
||||
0, codepoint_length(unexpected_token.data(),
|
||||
unexpected_token.size()));
|
||||
|
||||
msg = replace_all(msg, "%c", unexpected_char);
|
||||
} else {
|
||||
msg = message;
|
||||
}
|
||||
log(line.first, line.second, msg);
|
||||
}
|
||||
} else if (error_pos) {
|
||||
if (error_pos > last_output_pos) {
|
||||
last_output_pos = error_pos;
|
||||
auto line = line_info(s, error_pos);
|
||||
|
||||
auto first_item = true;
|
||||
size_t i = 0;
|
||||
while (i < expected_tokens.size()) {
|
||||
auto [token, is_literal] =
|
||||
expected_tokens[expected_tokens.size() - i - 1];
|
||||
std::string msg;
|
||||
if (expected_tokens.empty()) {
|
||||
msg = "syntax error.";
|
||||
} else {
|
||||
msg = "syntax error";
|
||||
|
||||
// Skip rules start with '_'
|
||||
if (!is_literal && token[0] != '_') {
|
||||
msg += (first_item ? ", expecting " : ", ");
|
||||
if (is_literal) {
|
||||
msg += "'";
|
||||
msg += token;
|
||||
msg += "'";
|
||||
} else {
|
||||
msg += "<";
|
||||
msg += token;
|
||||
msg += ">";
|
||||
}
|
||||
first_item = false;
|
||||
// unexpected token
|
||||
if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
|
||||
!unexpected_token.empty()) {
|
||||
msg += ", unexpected '";
|
||||
msg += unexpected_token;
|
||||
msg += "'";
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
msg += ".";
|
||||
}
|
||||
auto first_item = true;
|
||||
size_t i = 0;
|
||||
while (i < expected_tokens.size()) {
|
||||
auto [token, is_literal] =
|
||||
expected_tokens[expected_tokens.size() - i - 1];
|
||||
|
||||
log(line.first, line.second, msg);
|
||||
// Skip rules start with '_'
|
||||
if (!is_literal && token[0] != '_') {
|
||||
msg += (first_item ? ", expecting " : ", ");
|
||||
if (is_literal) {
|
||||
msg += "'";
|
||||
msg += token;
|
||||
msg += "'";
|
||||
} else {
|
||||
msg += "<";
|
||||
msg += token;
|
||||
msg += ">";
|
||||
}
|
||||
first_item = false;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
msg += ".";
|
||||
}
|
||||
|
||||
log(line.first, line.second, msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -724,7 +745,16 @@ private:
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return escape_characters(error_pos, std::min<size_t>(i, 8));
|
||||
|
||||
size_t count = 8;
|
||||
size_t j = 0;
|
||||
while (count > 0 && j < i) {
|
||||
j += codepoint_length(&error_pos[j], i - j);
|
||||
count--;
|
||||
}
|
||||
|
||||
// return escape_characters(error_pos, std::min<size_t>(i, 8));
|
||||
return escape_characters(error_pos, j);
|
||||
}
|
||||
return std::string();
|
||||
}
|
||||
@ -944,7 +974,7 @@ public:
|
||||
class Sequence : public Ope {
|
||||
public:
|
||||
template <typename... Args>
|
||||
Sequence(const Args &...args)
|
||||
Sequence(const Args &... args)
|
||||
: opes_{static_cast<std::shared_ptr<Ope>>(args)...} {}
|
||||
Sequence(const std::vector<std::shared_ptr<Ope>> &opes) : opes_(opes) {}
|
||||
Sequence(std::vector<std::shared_ptr<Ope>> &&opes) : opes_(opes) {}
|
||||
@ -987,7 +1017,7 @@ public:
|
||||
class PrioritizedChoice : public Ope {
|
||||
public:
|
||||
template <typename... Args>
|
||||
PrioritizedChoice(bool for_label, const Args &...args)
|
||||
PrioritizedChoice(bool for_label, const Args &... args)
|
||||
: opes_{static_cast<std::shared_ptr<Ope>>(args)...},
|
||||
for_label_(for_label) {}
|
||||
PrioritizedChoice(const std::vector<std::shared_ptr<Ope>> &opes)
|
||||
@ -1545,16 +1575,16 @@ public:
|
||||
/*
|
||||
* Factories
|
||||
*/
|
||||
template <typename... Args> std::shared_ptr<Ope> seq(Args &&...args) {
|
||||
template <typename... Args> std::shared_ptr<Ope> seq(Args &&... args) {
|
||||
return std::make_shared<Sequence>(static_cast<std::shared_ptr<Ope>>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args> std::shared_ptr<Ope> cho(Args &&...args) {
|
||||
template <typename... Args> std::shared_ptr<Ope> cho(Args &&... args) {
|
||||
return std::make_shared<PrioritizedChoice>(
|
||||
false, static_cast<std::shared_ptr<Ope>>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&...args) {
|
||||
template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&... args) {
|
||||
return std::make_shared<PrioritizedChoice>(
|
||||
true, static_cast<std::shared_ptr<Ope>>(args)...);
|
||||
}
|
||||
@ -2757,7 +2787,7 @@ inline size_t Recovery::parse_core(const char *s, size_t n,
|
||||
auto label = dynamic_cast<Reference *>(rule.args_[0].get());
|
||||
if (label) {
|
||||
if (!label->rule_->error_message.empty()) {
|
||||
c.error_info.message_pos = c.error_info.error_pos;
|
||||
c.error_info.message_pos = s;
|
||||
c.error_info.message = label->rule_->error_message;
|
||||
}
|
||||
}
|
||||
@ -3043,7 +3073,8 @@ private:
|
||||
|
||||
const static std::vector<std::pair<char32_t, char32_t>> range = {
|
||||
{0x0080, 0xFFFF}};
|
||||
g["IdentStart"] <= cho(cls("a-zA-Z_%"), cls(range));
|
||||
g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))),
|
||||
cho(cls("a-zA-Z_%"), cls(range)));
|
||||
|
||||
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));
|
||||
|
||||
|
219
test/test2.cc
219
test/test2.cc
@ -1141,16 +1141,16 @@ comment <- ('#' (!nl .)*)
|
||||
nl <- '\r'? '\n'
|
||||
|
||||
header <- (!__ .)* { message "invalid section header, missing ']'." }
|
||||
entry <- (!(__ / HEADER) .)+ { message "invalid token '%t', expecting another phrase." }
|
||||
entry <- (!(__ / HEADER) .)+ { message "invalid entry." }
|
||||
)");
|
||||
|
||||
REQUIRE(!!pg); // OK
|
||||
|
||||
std::vector<std::string> errors{
|
||||
R"(3:6: invalid token '|', expecting another phrase.)",
|
||||
R"(7:4: invalid token '\n', expecting another phrase.)",
|
||||
R"(3:1: invalid entry.)",
|
||||
R"(7:1: invalid entry.)",
|
||||
R"(10:11: invalid section header, missing ']'.)",
|
||||
R"(18:17: invalid token '=', expecting another phrase.)",
|
||||
R"(18:1: invalid entry.)",
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
@ -1294,6 +1294,217 @@ R"(+ START
|
||||
)");
|
||||
}
|
||||
|
||||
TEST_CASE("Error recovery 3", "[error]") {
|
||||
parser pg(R"~(
|
||||
# Grammar
|
||||
START <- __? SECTION*
|
||||
|
||||
SECTION <- HEADER __ ENTRIES __?
|
||||
|
||||
HEADER <- '['^missing_bracket _ CATEGORY (':' _ ATTRIBUTES)? ']'^missing_bracket ___
|
||||
|
||||
CATEGORY <- < (&[-_a-zA-Z0-9\u0080-\uFFFF ] (![\u0080-\uFFFF])^vernacular_char .)+ > _
|
||||
ATTRIBUTES <- ATTRIBUTE (',' _ ATTRIBUTE)*
|
||||
ATTRIBUTE <- < [-_a-zA-Z0-9]+ > _
|
||||
|
||||
ENTRIES <- (ENTRY (__ ENTRY)*)? { no_ast_opt }
|
||||
|
||||
ENTRY <- ONE_WAY PHRASE^expect_phrase (or _ PHRASE^expect_phrase)* ___
|
||||
/ PHRASE (or^missing_or _ PHRASE^expect_phrase) (or _ PHRASE^expect_phrase)* ___ { no_ast_opt }
|
||||
|
||||
ONE_WAY <- PHRASE assign _
|
||||
PHRASE <- WORD (' ' WORD)* _ { no_ast_opt }
|
||||
WORD <- < (![ \t\r\n=|[\]#] (![*?] / %recover(wildcard)) .)+ >
|
||||
|
||||
~assign <- '=' ____
|
||||
~or <- '|' (!'|')^duplicate_or ____
|
||||
|
||||
~_ <- [ \t]*
|
||||
~__ <- _ (comment? nl _)+
|
||||
~___ <- (!operators)^invalid_ope
|
||||
~____ <- (!operators)^invalid_ope_comb
|
||||
|
||||
operators <- [|=]+
|
||||
comment <- ('#' (!nl .)*)
|
||||
nl <- '\r'? '\n'
|
||||
|
||||
# Recovery
|
||||
duplicate_or <- skip_puncs { message "Duplicate OR operator (|)" }
|
||||
missing_or <- '' { message "Missing OR operator (|)" }
|
||||
missing_bracket <- skip_puncs { message "Missing opening/closing square bracket" }
|
||||
expect_phrase <- skip { message "Expect phrase" }
|
||||
invalid_ope_comb <- skip_puncs { message "Use of invalid operator combination" }
|
||||
invalid_ope <- skip { message "Use of invalid operator" }
|
||||
wildcard <- '' { message "Wildcard characters (%c) should not be used" }
|
||||
vernacular_char <- '' { message "Section name %c must be in English" }
|
||||
|
||||
skip <- (!(__) .)*
|
||||
skip_puncs <- [|=]* _
|
||||
)~");
|
||||
|
||||
REQUIRE(!!pg); // OK
|
||||
|
||||
std::vector<std::string> errors{
|
||||
R"(3:7: Wildcard characters (*) should not be used)",
|
||||
R"(4:6: Wildcard characters (?) should not be used)",
|
||||
R"(5:6: Duplicate OR operator (|))",
|
||||
R"(9:4: Missing OR operator (|))",
|
||||
R"(11:16: Expect phrase)",
|
||||
R"(13:11: Missing opening/closing square bracket)",
|
||||
R"(16:10: Section name 日 must be in English)",
|
||||
R"(16:11: Section name 本 must be in English)",
|
||||
R"(16:12: Section name 語 must be in English)",
|
||||
R"(16:13: Section name で must be in English)",
|
||||
R"(16:14: Section name す must be in English)",
|
||||
R"(21:17: Use of invalid operator)",
|
||||
R"(24:10: Use of invalid operator combination)",
|
||||
R"(26:10: Missing OR operator (|))",
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
pg.log = [&](size_t ln, size_t col, const std::string &msg) {
|
||||
std::stringstream ss;
|
||||
ss << ln << ":" << col << ": " << msg;
|
||||
REQUIRE(ss.str() == errors[i++]);
|
||||
};
|
||||
|
||||
pg.enable_ast();
|
||||
|
||||
std::shared_ptr<Ast> ast;
|
||||
REQUIRE_FALSE(pg.parse(R"([Section 1]
|
||||
111 = 222 | 333
|
||||
AAA BB* | CCC
|
||||
AAA B?B | CCC
|
||||
aaa || bbb
|
||||
ccc = ddd
|
||||
|
||||
[Section 2]
|
||||
eee
|
||||
fff | ggg
|
||||
fff | ggg 111 |
|
||||
|
||||
[Section 3
|
||||
hhh | iii
|
||||
|
||||
[Section 日本語です]
|
||||
ppp | qqq
|
||||
|
||||
[Section 4]
|
||||
jjj | kkk
|
||||
lll = mmm | nnn = ooo
|
||||
|
||||
[Section 5]
|
||||
ppp qqq |= rrr
|
||||
|
||||
Section 6]
|
||||
sss | ttt
|
||||
)", ast));
|
||||
|
||||
ast = pg.optimize_ast(ast);
|
||||
|
||||
REQUIRE(ast_to_s(ast) ==
|
||||
R"(+ START
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 1)
|
||||
+ ENTRIES
|
||||
+ ENTRY/0
|
||||
+ ONE_WAY/0[PHRASE]
|
||||
- WORD (111)
|
||||
+ PHRASE
|
||||
- WORD (222)
|
||||
+ PHRASE
|
||||
- WORD (333)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (AAA)
|
||||
- WORD (BB*)
|
||||
+ PHRASE
|
||||
- WORD (CCC)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (AAA)
|
||||
- WORD (B?B)
|
||||
+ PHRASE
|
||||
- WORD (CCC)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (aaa)
|
||||
+ PHRASE
|
||||
- WORD (bbb)
|
||||
+ ENTRY/0
|
||||
+ ONE_WAY/0[PHRASE]
|
||||
- WORD (ccc)
|
||||
+ PHRASE
|
||||
- WORD (ddd)
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 2)
|
||||
+ ENTRIES
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (eee)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (fff)
|
||||
+ PHRASE
|
||||
- WORD (ggg)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (fff)
|
||||
+ PHRASE
|
||||
- WORD (ggg)
|
||||
- WORD (111)
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 3)
|
||||
+ ENTRIES
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (hhh)
|
||||
+ PHRASE
|
||||
- WORD (iii)
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 日本語です)
|
||||
+ ENTRIES
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (ppp)
|
||||
+ PHRASE
|
||||
- WORD (qqq)
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 4)
|
||||
+ ENTRIES
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (jjj)
|
||||
+ PHRASE
|
||||
- WORD (kkk)
|
||||
+ ENTRY/0
|
||||
+ ONE_WAY/0[PHRASE]
|
||||
- WORD (lll)
|
||||
+ PHRASE
|
||||
- WORD (mmm)
|
||||
+ PHRASE
|
||||
- WORD (nnn)
|
||||
+ SECTION
|
||||
- HEADER/0[CATEGORY] (Section 5)
|
||||
+ ENTRIES
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (ppp)
|
||||
- WORD (qqq)
|
||||
+ PHRASE
|
||||
- WORD (rrr)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (Section)
|
||||
- WORD (6)
|
||||
+ ENTRY/1
|
||||
+ PHRASE
|
||||
- WORD (sss)
|
||||
+ PHRASE
|
||||
- WORD (ttt)
|
||||
)");
|
||||
}
|
||||
|
||||
TEST_CASE("Error recovery Java", "[error]") {
|
||||
parser pg(R"(
|
||||
Prog ← PUBLIC CLASS NAME LCUR PUBLIC STATIC VOID MAIN LPAR STRING LBRA RBRA NAME RPAR BlockStmt RCUR
|
||||
|
Loading…
Reference in New Issue
Block a user