1
0
mirror of https://github.com/yhirose/cpp-peglib.git synced 2025-01-09 09:15:30 +00:00

Fixed error report problems

This commit is contained in:
yhirose 2021-02-04 10:45:18 -05:00
parent 23b284d0a3
commit 8643927c99
5 changed files with 309 additions and 66 deletions

View File

@ -526,7 +526,7 @@ cpp-peglib supports the furthest failure error position report as described in t
For better error reporting and recovery, cpp-peglib supports a 'recovery' operator with a label, which can be associated with a recovery expression and a custom error message. This idea comes from the fantastic ["Syntax Error Recovery in Parsing Expression Grammars"](https://arxiv.org/pdf/1806.11150.pdf) paper by Sergio Medeiros and Fabio Mascarenhas.
The custom message supports `%t` which is a place holder for the unexpected token.
The custom message supports `%t`, which is a placeholder for the unexpected token, and `%c` for the unexpected Unicode char.
Here is an example of Java-like grammar:

Binary file not shown.

View File

@ -61,6 +61,7 @@ body {
height: 160px;
border: 1px solid lightgray;
padding: 8px;
overflow-y: auto;
}
.editor-info li {
cursor: pointer;

153
peglib.h
View File

@ -87,6 +87,14 @@ inline size_t codepoint_length(const char *s8, size_t l) {
return 0;
}
/*
 * Counts the code points in the `l` bytes starting at `s8`.
 *
 * The buffer is walked one code point at a time using codepoint_length().
 * Returns the number of steps taken, i.e. the number of code points, where
 * every byte that codepoint_length() cannot size counts as one unit.
 */
inline size_t codepoint_count(const char *s8, size_t l) {
  size_t count = 0;
  size_t i = 0;
  while (i < l) {
    auto len = codepoint_length(s8 + i, l - i);
    // codepoint_length() can return 0 (its fallback path). Advancing by 0
    // would never terminate, so step over a single byte in that case.
    i += (len > 0 ? len : 1);
    count++;
  }
  return count;
}
inline size_t encode_codepoint(char32_t cp, char *buff) {
if (cp < 0x0080) {
buff[0] = static_cast<char>(cp & 0x7F);
@ -161,16 +169,16 @@ inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes,
return false;
}
inline size_t decode_codepoint(const char *s8, size_t l, char32_t &out) {
inline size_t decode_codepoint(const char *s8, size_t l, char32_t &cp) {
size_t bytes;
if (decode_codepoint(s8, l, bytes, out)) { return bytes; }
if (decode_codepoint(s8, l, bytes, cp)) { return bytes; }
return 0;
}
inline char32_t decode_codepoint(const char *s8, size_t l) {
char32_t out = 0;
decode_codepoint(s8, l, out);
return out;
char32_t cp = 0;
decode_codepoint(s8, l, cp);
return cp;
}
inline std::u32string decode(const char *s8, size_t l) {
@ -418,7 +426,7 @@ inline std::pair<size_t, size_t> line_info(const char *start, const char *cur) {
p++;
}
auto col = p - col_ptr + 1;
auto col = codepoint_count(col_ptr, p - col_ptr) + 1;
return std::pair(no, col);
}
@ -551,7 +559,7 @@ private:
/*
* Semantic action
*/
template <typename F, typename... Args> std::any call(F fn, Args &&...args) {
template <typename F, typename... Args> std::any call(F fn, Args &&... args) {
using R = decltype(fn(std::forward<Args>(args)...));
if constexpr (std::is_void<R>::value) {
fn(std::forward<Args>(args)...);
@ -638,6 +646,7 @@ struct ErrorInfo {
std::vector<std::pair<const char *, bool>> expected_tokens;
const char *message_pos = nullptr;
std::string message;
mutable const char *last_output_pos = nullptr;
void clear() {
error_pos = nullptr;
@ -655,59 +664,71 @@ struct ErrorInfo {
void output_log(const Log &log, const char *s, size_t n) const {
if (message_pos) {
auto line = line_info(s, message_pos);
std::string msg;
if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
!unexpected_token.empty()) {
msg = replace_all(message, "%t", unexpected_token);
} else {
msg = message;
}
log(line.first, line.second, msg);
} else if (error_pos) {
auto line = line_info(s, error_pos);
std::string msg;
if (expected_tokens.empty()) {
msg = "syntax error.";
} else {
msg = "syntax error";
// unexpected token
if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
if (message_pos > last_output_pos) {
last_output_pos = message_pos;
auto line = line_info(s, message_pos);
std::string msg;
if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
!unexpected_token.empty()) {
msg += ", unexpected '";
msg += unexpected_token;
msg += "'";
msg = replace_all(message, "%t", unexpected_token);
auto unexpected_char = unexpected_token.substr(
0, codepoint_length(unexpected_token.data(),
unexpected_token.size()));
msg = replace_all(msg, "%c", unexpected_char);
} else {
msg = message;
}
log(line.first, line.second, msg);
}
} else if (error_pos) {
if (error_pos > last_output_pos) {
last_output_pos = error_pos;
auto line = line_info(s, error_pos);
auto first_item = true;
size_t i = 0;
while (i < expected_tokens.size()) {
auto [token, is_literal] =
expected_tokens[expected_tokens.size() - i - 1];
std::string msg;
if (expected_tokens.empty()) {
msg = "syntax error.";
} else {
msg = "syntax error";
// Skip rules start with '_'
if (!is_literal && token[0] != '_') {
msg += (first_item ? ", expecting " : ", ");
if (is_literal) {
msg += "'";
msg += token;
msg += "'";
} else {
msg += "<";
msg += token;
msg += ">";
}
first_item = false;
// unexpected token
if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
!unexpected_token.empty()) {
msg += ", unexpected '";
msg += unexpected_token;
msg += "'";
}
i++;
}
msg += ".";
}
auto first_item = true;
size_t i = 0;
while (i < expected_tokens.size()) {
auto [token, is_literal] =
expected_tokens[expected_tokens.size() - i - 1];
log(line.first, line.second, msg);
// Skip rules start with '_'
if (!is_literal && token[0] != '_') {
msg += (first_item ? ", expecting " : ", ");
if (is_literal) {
msg += "'";
msg += token;
msg += "'";
} else {
msg += "<";
msg += token;
msg += ">";
}
first_item = false;
}
i++;
}
msg += ".";
}
log(line.first, line.second, msg);
}
}
}
@ -724,7 +745,16 @@ private:
i++;
}
}
return escape_characters(error_pos, std::min<size_t>(i, 8));
size_t count = 8;
size_t j = 0;
while (count > 0 && j < i) {
j += codepoint_length(&error_pos[j], i - j);
count--;
}
// return escape_characters(error_pos, std::min<size_t>(i, 8));
return escape_characters(error_pos, j);
}
return std::string();
}
@ -944,7 +974,7 @@ public:
class Sequence : public Ope {
public:
template <typename... Args>
Sequence(const Args &...args)
Sequence(const Args &... args)
: opes_{static_cast<std::shared_ptr<Ope>>(args)...} {}
Sequence(const std::vector<std::shared_ptr<Ope>> &opes) : opes_(opes) {}
Sequence(std::vector<std::shared_ptr<Ope>> &&opes) : opes_(opes) {}
@ -987,7 +1017,7 @@ public:
class PrioritizedChoice : public Ope {
public:
template <typename... Args>
PrioritizedChoice(bool for_label, const Args &...args)
PrioritizedChoice(bool for_label, const Args &... args)
: opes_{static_cast<std::shared_ptr<Ope>>(args)...},
for_label_(for_label) {}
PrioritizedChoice(const std::vector<std::shared_ptr<Ope>> &opes)
@ -1545,16 +1575,16 @@ public:
/*
* Factories
*/
template <typename... Args> std::shared_ptr<Ope> seq(Args &&...args) {
template <typename... Args> std::shared_ptr<Ope> seq(Args &&... args) {
return std::make_shared<Sequence>(static_cast<std::shared_ptr<Ope>>(args)...);
}
template <typename... Args> std::shared_ptr<Ope> cho(Args &&...args) {
template <typename... Args> std::shared_ptr<Ope> cho(Args &&... args) {
return std::make_shared<PrioritizedChoice>(
false, static_cast<std::shared_ptr<Ope>>(args)...);
}
template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&...args) {
template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&... args) {
return std::make_shared<PrioritizedChoice>(
true, static_cast<std::shared_ptr<Ope>>(args)...);
}
@ -2757,7 +2787,7 @@ inline size_t Recovery::parse_core(const char *s, size_t n,
auto label = dynamic_cast<Reference *>(rule.args_[0].get());
if (label) {
if (!label->rule_->error_message.empty()) {
c.error_info.message_pos = c.error_info.error_pos;
c.error_info.message_pos = s;
c.error_info.message = label->rule_->error_message;
}
}
@ -3043,7 +3073,8 @@ private:
const static std::vector<std::pair<char32_t, char32_t>> range = {
{0x0080, 0xFFFF}};
g["IdentStart"] <= cho(cls("a-zA-Z_%"), cls(range));
g["IdentStart"] <= seq(npd(lit(u8(u8""))), npd(lit(u8(u8""))),
cho(cls("a-zA-Z_%"), cls(range)));
g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));

View File

@ -1141,16 +1141,16 @@ comment <- ('#' (!nl .)*)
nl <- '\r'? '\n'
header <- (!__ .)* { message "invalid section header, missing ']'." }
entry <- (!(__ / HEADER) .)+ { message "invalid token '%t', expecting another phrase." }
entry <- (!(__ / HEADER) .)+ { message "invalid entry." }
)");
REQUIRE(!!pg); // OK
std::vector<std::string> errors{
R"(3:6: invalid token '|', expecting another phrase.)",
R"(7:4: invalid token '\n', expecting another phrase.)",
R"(3:1: invalid entry.)",
R"(7:1: invalid entry.)",
R"(10:11: invalid section header, missing ']'.)",
R"(18:17: invalid token '=', expecting another phrase.)",
R"(18:1: invalid entry.)",
};
size_t i = 0;
@ -1294,6 +1294,217 @@ R"(+ START
)");
}
TEST_CASE("Error recovery 3", "[error]") {
parser pg(R"~(
# Grammar
START <- __? SECTION*
SECTION <- HEADER __ ENTRIES __?
HEADER <- '['^missing_bracket _ CATEGORY (':' _ ATTRIBUTES)? ']'^missing_bracket ___
CATEGORY <- < (&[-_a-zA-Z0-9\u0080-\uFFFF ] (![\u0080-\uFFFF])^vernacular_char .)+ > _
ATTRIBUTES <- ATTRIBUTE (',' _ ATTRIBUTE)*
ATTRIBUTE <- < [-_a-zA-Z0-9]+ > _
ENTRIES <- (ENTRY (__ ENTRY)*)? { no_ast_opt }
ENTRY <- ONE_WAY PHRASE^expect_phrase (or _ PHRASE^expect_phrase)* ___
/ PHRASE (or^missing_or _ PHRASE^expect_phrase) (or _ PHRASE^expect_phrase)* ___ { no_ast_opt }
ONE_WAY <- PHRASE assign _
PHRASE <- WORD (' ' WORD)* _ { no_ast_opt }
WORD <- < (![ \t\r\n=|[\]#] (![*?] / %recover(wildcard)) .)+ >
~assign <- '=' ____
~or <- '|' (!'|')^duplicate_or ____
~_ <- [ \t]*
~__ <- _ (comment? nl _)+
~___ <- (!operators)^invalid_ope
~____ <- (!operators)^invalid_ope_comb
operators <- [|=]+
comment <- ('#' (!nl .)*)
nl <- '\r'? '\n'
# Recovery
duplicate_or <- skip_puncs { message "Duplicate OR operator (|)" }
missing_or <- '' { message "Missing OR operator (|)" }
missing_bracket <- skip_puncs { message "Missing opening/closing square bracket" }
expect_phrase <- skip { message "Expect phrase" }
invalid_ope_comb <- skip_puncs { message "Use of invalid operator combination" }
invalid_ope <- skip { message "Use of invalid operator" }
wildcard <- '' { message "Wildcard characters (%c) should not be used" }
vernacular_char <- '' { message "Section name %c must be in English" }
skip <- (!(__) .)*
skip_puncs <- [|=]* _
)~");
REQUIRE(!!pg); // OK
std::vector<std::string> errors{
R"(3:7: Wildcard characters (*) should not be used)",
R"(4:6: Wildcard characters (?) should not be used)",
R"(5:6: Duplicate OR operator (|))",
R"(9:4: Missing OR operator (|))",
R"(11:16: Expect phrase)",
R"(13:11: Missing opening/closing square bracket)",
R"(16:10: Section name 日 must be in English)",
R"(16:11: Section name 本 must be in English)",
R"(16:12: Section name 語 must be in English)",
R"(16:13: Section name で must be in English)",
R"(16:14: Section name す must be in English)",
R"(21:17: Use of invalid operator)",
R"(24:10: Use of invalid operator combination)",
R"(26:10: Missing OR operator (|))",
};
size_t i = 0;
pg.log = [&](size_t ln, size_t col, const std::string &msg) {
std::stringstream ss;
ss << ln << ":" << col << ": " << msg;
REQUIRE(ss.str() == errors[i++]);
};
pg.enable_ast();
std::shared_ptr<Ast> ast;
REQUIRE_FALSE(pg.parse(R"([Section 1]
111 = 222 | 333
AAA BB* | CCC
AAA B?B | CCC
aaa || bbb
ccc = ddd
[Section 2]
eee
fff | ggg
fff | ggg 111 |
[Section 3
hhh | iii
[Section ]
ppp | qqq
[Section 4]
jjj | kkk
lll = mmm | nnn = ooo
[Section 5]
ppp qqq |= rrr
Section 6]
sss | ttt
)", ast));
ast = pg.optimize_ast(ast);
REQUIRE(ast_to_s(ast) ==
R"(+ START
+ SECTION
- HEADER/0[CATEGORY] (Section 1)
+ ENTRIES
+ ENTRY/0
+ ONE_WAY/0[PHRASE]
- WORD (111)
+ PHRASE
- WORD (222)
+ PHRASE
- WORD (333)
+ ENTRY/1
+ PHRASE
- WORD (AAA)
- WORD (BB*)
+ PHRASE
- WORD (CCC)
+ ENTRY/1
+ PHRASE
- WORD (AAA)
- WORD (B?B)
+ PHRASE
- WORD (CCC)
+ ENTRY/1
+ PHRASE
- WORD (aaa)
+ PHRASE
- WORD (bbb)
+ ENTRY/0
+ ONE_WAY/0[PHRASE]
- WORD (ccc)
+ PHRASE
- WORD (ddd)
+ SECTION
- HEADER/0[CATEGORY] (Section 2)
+ ENTRIES
+ ENTRY/1
+ PHRASE
- WORD (eee)
+ ENTRY/1
+ PHRASE
- WORD (fff)
+ PHRASE
- WORD (ggg)
+ ENTRY/1
+ PHRASE
- WORD (fff)
+ PHRASE
- WORD (ggg)
- WORD (111)
+ SECTION
- HEADER/0[CATEGORY] (Section 3)
+ ENTRIES
+ ENTRY/1
+ PHRASE
- WORD (hhh)
+ PHRASE
- WORD (iii)
+ SECTION
- HEADER/0[CATEGORY] (Section )
+ ENTRIES
+ ENTRY/1
+ PHRASE
- WORD (ppp)
+ PHRASE
- WORD (qqq)
+ SECTION
- HEADER/0[CATEGORY] (Section 4)
+ ENTRIES
+ ENTRY/1
+ PHRASE
- WORD (jjj)
+ PHRASE
- WORD (kkk)
+ ENTRY/0
+ ONE_WAY/0[PHRASE]
- WORD (lll)
+ PHRASE
- WORD (mmm)
+ PHRASE
- WORD (nnn)
+ SECTION
- HEADER/0[CATEGORY] (Section 5)
+ ENTRIES
+ ENTRY/1
+ PHRASE
- WORD (ppp)
- WORD (qqq)
+ PHRASE
- WORD (rrr)
+ ENTRY/1
+ PHRASE
- WORD (Section)
- WORD (6)
+ ENTRY/1
+ PHRASE
- WORD (sss)
+ PHRASE
- WORD (ttt)
)");
}
TEST_CASE("Error recovery Java", "[error]") {
parser pg(R"(
Prog PUBLIC CLASS NAME LCUR PUBLIC STATIC VOID MAIN LPAR STRING LBRA RBRA NAME RPAR BlockStmt RCUR