Fixed error report problems

2025-04-03 01:12:08 +00:00 · 2021-02-04 10:45:18 -05:00 · 2021-02-04 10:45:18 -05:00 · 8643927c99
commit 8643927c99
parent 23b284d0a3
5 changed files with 309 additions and 66 deletions
--- a/README.md
+++ b/README.md
@ -526,7 +526,7 @@ cpp-peglib supports the furthest failure error posision report as descrived in t

 For better error reporting and recovery, cpp-peglib supports a 'recovery' operator with a label which can be associated with a recovery expression and a custom error message. This idea comes from the fantastic ["Syntax Error Recovery in Parsing Expression Grammars"](https://arxiv.org/pdf/1806.11150.pdf) paper by Sergio Medeiros and Fabio Mascarenhas.

-The custom message supports `%t` which is a place holder for the unexpected token.
+The custom message supports `%t`, which is a placeholder for the unexpected token, and `%c`, which is a placeholder for the unexpected Unicode character.

 Here is an example of Java-like grammar:

--- a/docs/native.wasm
+++ b/docs/native.wasm
--- a/docs/style.css
+++ b/docs/style.css
@ -61,6 +61,7 @@ body {
  height: 160px;
  border: 1px solid lightgray;
  padding: 8px;
+  overflow-y: auto;
 }
 .editor-info li {
  cursor: pointer;
--- a/peglib.h
+++ b/peglib.h
@ -87,6 +87,14 @@ inline size_t codepoint_length(const char *s8, size_t l) {
  return 0;
 }

+/*
+ * Count the UTF-8 code points in the first `l` bytes of `s8`.
+ *
+ * Each step asks codepoint_length() how many bytes the next code point
+ * occupies. That function can return 0 (presumably for a malformed or
+ * truncated sequence -- confirm against its definition); such a byte is
+ * counted as one unit and skipped so the scan always makes progress.
+ */
+inline size_t codepoint_count(const char *s8, size_t l) {
+  size_t count = 0;
+  size_t i = 0;
+  while (i < l) {
+    auto len = codepoint_length(s8 + i, l - i);
+    // A zero length would leave `i` unchanged and loop forever.
+    i += (len > 0 ? len : 1);
+    count++;
+  }
+  return count;
+}
+
 inline size_t encode_codepoint(char32_t cp, char *buff) {
  if (cp < 0x0080) {
    buff[0] = static_cast<char>(cp & 0x7F);
@ -161,16 +169,16 @@ inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes,
  return false;
 }

-inline size_t decode_codepoint(const char *s8, size_t l, char32_t &out) {
+inline size_t decode_codepoint(const char *s8, size_t l, char32_t &cp) {
  size_t bytes;
-  if (decode_codepoint(s8, l, bytes, out)) { return bytes; }
+  if (decode_codepoint(s8, l, bytes, cp)) { return bytes; }
  return 0;
 }

 inline char32_t decode_codepoint(const char *s8, size_t l) {
-  char32_t out = 0;
-  decode_codepoint(s8, l, out);
-  return out;
+  char32_t cp = 0;
+  decode_codepoint(s8, l, cp);
+  return cp;
 }

 inline std::u32string decode(const char *s8, size_t l) {
@ -418,7 +426,7 @@ inline std::pair<size_t, size_t> line_info(const char *start, const char *cur) {
    p++;
  }

-  auto col = p - col_ptr + 1;
+  auto col = codepoint_count(col_ptr, p - col_ptr) + 1;

  return std::pair(no, col);
 }
@ -551,7 +559,7 @@ private:
 /*
 * Semantic action
 */
-template <typename F, typename... Args> std::any call(F fn, Args &&...args) {
+template <typename F, typename... Args> std::any call(F fn, Args &&... args) {
  using R = decltype(fn(std::forward<Args>(args)...));
  if constexpr (std::is_void<R>::value) {
    fn(std::forward<Args>(args)...);
@ -638,6 +646,7 @@ struct ErrorInfo {
  std::vector<std::pair<const char *, bool>> expected_tokens;
  const char *message_pos = nullptr;
  std::string message;
+  mutable const char *last_output_pos = nullptr;

  void clear() {
    error_pos = nullptr;
@ -655,59 +664,71 @@ struct ErrorInfo {

  void output_log(const Log &log, const char *s, size_t n) const {
    if (message_pos) {
-      auto line = line_info(s, message_pos);
-      std::string msg;
-      if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
-          !unexpected_token.empty()) {
-        msg = replace_all(message, "%t", unexpected_token);
-      } else {
-        msg = message;
-      }
-      log(line.first, line.second, msg);
-    } else if (error_pos) {
-      auto line = line_info(s, error_pos);
-
-      std::string msg;
-      if (expected_tokens.empty()) {
-        msg = "syntax error.";
-      } else {
-        msg = "syntax error";
-
-        // unexpected token
-        if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
+      if (message_pos > last_output_pos) {
+        last_output_pos = message_pos;
+        auto line = line_info(s, message_pos);
+        std::string msg;
+        if (auto unexpected_token = heuristic_error_token(s, n, message_pos);
            !unexpected_token.empty()) {
-          msg += ", unexpected '";
-          msg += unexpected_token;
-          msg += "'";
+          msg = replace_all(message, "%t", unexpected_token);
+
+          auto unexpected_char = unexpected_token.substr(
+              0, codepoint_length(unexpected_token.data(),
+                                  unexpected_token.size()));
+
+          msg = replace_all(msg, "%c", unexpected_char);
+        } else {
+          msg = message;
        }
+        log(line.first, line.second, msg);
+      }
+    } else if (error_pos) {
+      if (error_pos > last_output_pos) {
+        last_output_pos = error_pos;
+        auto line = line_info(s, error_pos);

-        auto first_item = true;
-        size_t i = 0;
-        while (i < expected_tokens.size()) {
-          auto [token, is_literal] =
-              expected_tokens[expected_tokens.size() - i - 1];
+        std::string msg;
+        if (expected_tokens.empty()) {
+          msg = "syntax error.";
+        } else {
+          msg = "syntax error";

-          // Skip rules start with '_'
-          if (!is_literal && token[0] != '_') {
-            msg += (first_item ? ", expecting " : ", ");
-            if (is_literal) {
-              msg += "'";
-              msg += token;
-              msg += "'";
-            } else {
-              msg += "<";
-              msg += token;
-              msg += ">";
-            }
-            first_item = false;
+          // unexpected token
+          if (auto unexpected_token = heuristic_error_token(s, n, error_pos);
+              !unexpected_token.empty()) {
+            msg += ", unexpected '";
+            msg += unexpected_token;
+            msg += "'";
          }

-          i++;
-        }
-        msg += ".";
-      }
+          auto first_item = true;
+          size_t i = 0;
+          while (i < expected_tokens.size()) {
+            auto [token, is_literal] =
+                expected_tokens[expected_tokens.size() - i - 1];

-      log(line.first, line.second, msg);
+            // Skip rules start with '_'
+            if (!is_literal && token[0] != '_') {
+              msg += (first_item ? ", expecting " : ", ");
+              if (is_literal) {
+                msg += "'";
+                msg += token;
+                msg += "'";
+              } else {
+                msg += "<";
+                msg += token;
+                msg += ">";
+              }
+              first_item = false;
+            }
+
+            i++;
+          }
+          msg += ".";
+        }
+
+        log(line.first, line.second, msg);
+      }
    }
  }

@ -724,7 +745,16 @@ private:
          i++;
        }
      }
-      return escape_characters(error_pos, std::min<size_t>(i, 8));
+
+      size_t count = 8;
+      size_t j = 0;
+      while (count > 0 && j < i) {
+        j += codepoint_length(&error_pos[j], i - j);
+        count--;
+      }
+
+      // return escape_characters(error_pos, std::min<size_t>(i, 8));
+      return escape_characters(error_pos, j);
    }
    return std::string();
  }
@ -944,7 +974,7 @@ public:
 class Sequence : public Ope {
 public:
  template <typename... Args>
-  Sequence(const Args &...args)
+  Sequence(const Args &... args)
      : opes_{static_cast<std::shared_ptr<Ope>>(args)...} {}
  Sequence(const std::vector<std::shared_ptr<Ope>> &opes) : opes_(opes) {}
  Sequence(std::vector<std::shared_ptr<Ope>> &&opes) : opes_(opes) {}
@ -987,7 +1017,7 @@ public:
 class PrioritizedChoice : public Ope {
 public:
  template <typename... Args>
-  PrioritizedChoice(bool for_label, const Args &...args)
+  PrioritizedChoice(bool for_label, const Args &... args)
      : opes_{static_cast<std::shared_ptr<Ope>>(args)...},
        for_label_(for_label) {}
  PrioritizedChoice(const std::vector<std::shared_ptr<Ope>> &opes)
@ -1545,16 +1575,16 @@ public:
 /*
 * Factories
 */
-template <typename... Args> std::shared_ptr<Ope> seq(Args &&...args) {
+template <typename... Args> std::shared_ptr<Ope> seq(Args &&... args) {
  return std::make_shared<Sequence>(static_cast<std::shared_ptr<Ope>>(args)...);
 }

-template <typename... Args> std::shared_ptr<Ope> cho(Args &&...args) {
+template <typename... Args> std::shared_ptr<Ope> cho(Args &&... args) {
  return std::make_shared<PrioritizedChoice>(
      false, static_cast<std::shared_ptr<Ope>>(args)...);
 }

-template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&...args) {
+template <typename... Args> std::shared_ptr<Ope> cho4label_(Args &&... args) {
  return std::make_shared<PrioritizedChoice>(
      true, static_cast<std::shared_ptr<Ope>>(args)...);
 }
@ -2757,7 +2787,7 @@ inline size_t Recovery::parse_core(const char *s, size_t n,
    auto label = dynamic_cast<Reference *>(rule.args_[0].get());
    if (label) {
      if (!label->rule_->error_message.empty()) {
-        c.error_info.message_pos = c.error_info.error_pos;
+        c.error_info.message_pos = s;
        c.error_info.message = label->rule_->error_message;
      }
    }
@ -3043,7 +3073,8 @@ private:

    const static std::vector<std::pair<char32_t, char32_t>> range = {
        {0x0080, 0xFFFF}};
-    g["IdentStart"] <= cho(cls("a-zA-Z_%"), cls(range));
+    g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))),
+                           cho(cls("a-zA-Z_%"), cls(range)));

    g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));

--- a/test/test2.cc
+++ b/test/test2.cc
@ -1141,16 +1141,16 @@ comment    <- ('#' (!nl .)*)
 nl         <- '\r'? '\n'

 header <- (!__ .)* { message "invalid section header, missing ']'." }
-entry  <- (!(__ / HEADER) .)+ { message "invalid token '%t', expecting another phrase." }
+entry  <- (!(__ / HEADER) .)+ { message "invalid entry." }
  )");

  REQUIRE(!!pg); // OK

  std::vector<std::string> errors{
-    R"(3:6: invalid token '|', expecting another phrase.)",
-    R"(7:4: invalid token '\n', expecting another phrase.)",
+    R"(3:1: invalid entry.)",
+    R"(7:1: invalid entry.)",
    R"(10:11: invalid section header, missing ']'.)",
-    R"(18:17: invalid token '=', expecting another phrase.)",
+    R"(18:1: invalid entry.)",
  };

  size_t i = 0;
@ -1294,6 +1294,217 @@ R"(+ START
 )");
 }

+// Parses a sectioned phrase list with labelled recovery rules and checks
+// both the reported diagnostics (in order) and the optimized AST that is
+// produced from the input.
+TEST_CASE("Error recovery 3", "[error]") {
+  parser pg(R"~(
+# Grammar
+START      <- __? SECTION*

+SECTION    <- HEADER __ ENTRIES __?

+HEADER     <- '['^missing_bracket _ CATEGORY (':' _  ATTRIBUTES)? ']'^missing_bracket ___

+CATEGORY   <- < (&[-_a-zA-Z0-9\u0080-\uFFFF ] (![\u0080-\uFFFF])^vernacular_char .)+ > _
+ATTRIBUTES <- ATTRIBUTE (',' _ ATTRIBUTE)*
+ATTRIBUTE  <- < [-_a-zA-Z0-9]+ > _

+ENTRIES    <- (ENTRY (__ ENTRY)*)? { no_ast_opt }

+ENTRY      <- ONE_WAY PHRASE^expect_phrase (or _ PHRASE^expect_phrase)* ___
+            / PHRASE (or^missing_or _ PHRASE^expect_phrase) (or _ PHRASE^expect_phrase)* ___ { no_ast_opt }

+ONE_WAY    <- PHRASE assign _
+PHRASE     <- WORD (' ' WORD)* _ { no_ast_opt }
+WORD       <- < (![ \t\r\n=|[\]#] (![*?] / %recover(wildcard)) .)+ >

+~assign    <- '=' ____
+~or        <- '|' (!'|')^duplicate_or ____

+~_         <- [ \t]*
+~__        <- _ (comment? nl _)+
+~___       <- (!operators)^invalid_ope
+~____      <- (!operators)^invalid_ope_comb

+operators  <- [|=]+
+comment    <- ('#' (!nl .)*)
+nl         <- '\r'? '\n'

+# Recovery
+duplicate_or     <- skip_puncs { message "Duplicate OR operator (|)" }
+missing_or       <- '' { message "Missing OR operator (|)" }
+missing_bracket  <- skip_puncs { message "Missing opening/closing square bracket" }
+expect_phrase    <- skip { message "Expect phrase" }
+invalid_ope_comb <- skip_puncs { message "Use of invalid operator combination" }
+invalid_ope      <- skip { message "Use of invalid operator" }
+wildcard         <- '' { message "Wildcard characters (%c) should not be used" }
+vernacular_char  <- '' { message "Section name %c must be in English" }

+skip             <- (!(__) .)*
+skip_puncs       <- [|=]* _
+  )~");

+  REQUIRE(!!pg); // OK

+  // Diagnostics expected from the input below, in the order they are logged.
+  std::vector<std::string> errors{
+    R"(3:7: Wildcard characters (*) should not be used)",
+    R"(4:6: Wildcard characters (?) should not be used)",
+    R"(5:6: Duplicate OR operator (|))",
+    R"(9:4: Missing OR operator (|))",
+    R"(11:16: Expect phrase)",
+    R"(13:11: Missing opening/closing square bracket)",
+    R"(16:10: Section name 日 must be in English)",
+    R"(16:11: Section name 本 must be in English)",
+    R"(16:12: Section name 語 must be in English)",
+    R"(16:13: Section name で must be in English)",
+    R"(16:14: Section name す must be in English)",
+    R"(21:17: Use of invalid operator)",
+    R"(24:10: Use of invalid operator combination)",
+    R"(26:10: Missing OR operator (|))",
+  };

+  size_t i = 0;
+  pg.log = [&](size_t ln, size_t col, const std::string &msg) {
+    std::stringstream ss;
+    ss << ln << ":" << col << ": " << msg;
+    // Guard the index: an unexpected extra diagnostic must fail the test
+    // instead of reading past the end of `errors`.
+    REQUIRE(i < errors.size());
+    REQUIRE(ss.str() == errors[i++]);
+  };

+  pg.enable_ast();

+  std::shared_ptr<Ast> ast;
+  REQUIRE_FALSE(pg.parse(R"([Section 1]
+111 = 222 | 333
+AAA BB* | CCC
+AAA B?B | CCC
+aaa || bbb
+ccc = ddd

+[Section 2]
+eee
+fff | ggg
+fff | ggg 111 |

+[Section 3
+hhh | iii

+[Section 日本語です]
+ppp | qqq

+[Section 4]
+jjj | kkk
+lll = mmm | nnn = ooo

+[Section 5]
+ppp qqq |= rrr

+Section 6]
+sss | ttt
+  )", ast));

+  ast = pg.optimize_ast(ast);

+  REQUIRE(ast_to_s(ast) ==
+R"(+ START
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 1)
+    + ENTRIES
+      + ENTRY/0
+        + ONE_WAY/0[PHRASE]
+          - WORD (111)
+        + PHRASE
+          - WORD (222)
+        + PHRASE
+          - WORD (333)
+      + ENTRY/1
+        + PHRASE
+          - WORD (AAA)
+          - WORD (BB*)
+        + PHRASE
+          - WORD (CCC)
+      + ENTRY/1
+        + PHRASE
+          - WORD (AAA)
+          - WORD (B?B)
+        + PHRASE
+          - WORD (CCC)
+      + ENTRY/1
+        + PHRASE
+          - WORD (aaa)
+        + PHRASE
+          - WORD (bbb)
+      + ENTRY/0
+        + ONE_WAY/0[PHRASE]
+          - WORD (ccc)
+        + PHRASE
+          - WORD (ddd)
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 2)
+    + ENTRIES
+      + ENTRY/1
+        + PHRASE
+          - WORD (eee)
+      + ENTRY/1
+        + PHRASE
+          - WORD (fff)
+        + PHRASE
+          - WORD (ggg)
+      + ENTRY/1
+        + PHRASE
+          - WORD (fff)
+        + PHRASE
+          - WORD (ggg)
+          - WORD (111)
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 3)
+    + ENTRIES
+      + ENTRY/1
+        + PHRASE
+          - WORD (hhh)
+        + PHRASE
+          - WORD (iii)
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 日本語です)
+    + ENTRIES
+      + ENTRY/1
+        + PHRASE
+          - WORD (ppp)
+        + PHRASE
+          - WORD (qqq)
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 4)
+    + ENTRIES
+      + ENTRY/1
+        + PHRASE
+          - WORD (jjj)
+        + PHRASE
+          - WORD (kkk)
+      + ENTRY/0
+        + ONE_WAY/0[PHRASE]
+          - WORD (lll)
+        + PHRASE
+          - WORD (mmm)
+        + PHRASE
+          - WORD (nnn)
+  + SECTION
+    - HEADER/0[CATEGORY] (Section 5)
+    + ENTRIES
+      + ENTRY/1
+        + PHRASE
+          - WORD (ppp)
+          - WORD (qqq)
+        + PHRASE
+          - WORD (rrr)
+      + ENTRY/1
+        + PHRASE
+          - WORD (Section)
+          - WORD (6)
+      + ENTRY/1
+        + PHRASE
+          - WORD (sss)
+        + PHRASE
+          - WORD (ttt)
+)");
+}
+
 TEST_CASE("Error recovery Java", "[error]") {
  parser pg(R"(
 Prog       ← PUBLIC CLASS NAME LCUR PUBLIC STATIC VOID MAIN LPAR STRING LBRA RBRA NAME RPAR BlockStmt RCUR