Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] Raw-strings in c++ can break monarch #4775

Open
1 of 2 tasks
OfekShilon opened this issue Dec 7, 2024 · 1 comment
Open
1 of 2 tasks

[Bug] Raw-strings in c++ can break monarch #4775

OfekShilon opened this issue Dec 7, 2024 · 1 comment
Assignees
Labels
bug Issue identified by VS Code Team member as probable bug

Comments

@OfekShilon
Copy link
Contributor

OfekShilon commented Dec 7, 2024

Reproducible in vscode.dev or in VS Code Desktop?

  • Not reproducible in vscode.dev or VS Code Desktop

Reproducible in the monaco editor playground?

Monaco Editor Playground Link

https://microsoft.github.io/monaco-editor/monarch.html
Unfortunately the monarch playground does not supply full links. See full code and steps below.

Monaco Editor Playground Code

Paste the language definition from monaco's cpp.ts into the 'Language Syntax Definition' pane:

// Create your own language definition here
// You can safely look at other samples without losing modifications.
// Modifications are not saved on browser refresh/close though -- copy often!
//
// Monarch language definition for C++ (token postfix ".cpp"), pasted from
// monaco's cpp.ts as the reproduction case for this issue. The returned object
// holds the bracket pairs, the keyword and operator lists, a few shared regex
// fragments, and the tokenizer state machine (entry state: root).
return {
  // Set defaultToken to invalid to see what you do not tokenize yet
    defaultToken: "",
  tokenPostfix: ".cpp",
  // Bracket pairs; referenced from tokenizer rules through "@brackets".
  brackets: [
    { token: "delimiter.curly", open: "{", close: "}" },
    { token: "delimiter.parenthesis", open: "(", close: ")" },
    { token: "delimiter.square", open: "[", close: "]" },
    { token: "delimiter.angle", open: "<", close: ">" }
  ],
  // Identifiers found in this list are tokenized as "keyword.<name>" by the
  // identifier rule in the root state.
  keywords: [
    "abstract",
    "amp",
    "array",
    "auto",
    "bool",
    "break",
    "case",
    "catch",
    "char",
    "class",
    "const",
    "constexpr",
    "const_cast",
    "continue",
    "cpu",
    "decltype",
    "default",
    "delegate",
    "delete",
    "do",
    "double",
    "dynamic_cast",
    "each",
    "else",
    "enum",
    "event",
    "explicit",
    "export",
    "extern",
    "false",
    "final",
    "finally",
    "float",
    "for",
    "friend",
    "gcnew",
    "generic",
    "goto",
    "if",
    "in",
    "initonly",
    "inline",
    "int",
    "interface",
    "interior_ptr",
    "internal",
    "literal",
    "long",
    "mutable",
    "namespace",
    "new",
    "noexcept",
    "nullptr",
    "__nullptr",
    "operator",
    "override",
    "partial",
    "pascal",
    "pin_ptr",
    "private",
    "property",
    "protected",
    "public",
    "ref",
    "register",
    "reinterpret_cast",
    "restrict",
    "return",
    "safe_cast",
    "sealed",
    "short",
    "signed",
    "sizeof",
    "static",
    "static_assert",
    "static_cast",
    "struct",
    "switch",
    "template",
    "this",
    "thread_local",
    "throw",
    "tile_static",
    "true",
    "try",
    "typedef",
    "typeid",
    "typename",
    "union",
    "unsigned",
    "using",
    "virtual",
    "void",
    "volatile",
    "wchar_t",
    "where",
    "while",
    // reserved words with one leading underscore
    "_asm",
    "_based",
    "_cdecl",
    "_declspec",
    "_fastcall",
    "_if_exists",
    "_if_not_exists",
    "_inline",
    "_multiple_inheritance",
    "_pascal",
    "_single_inheritance",
    "_stdcall",
    "_virtual_inheritance",
    "_w64",
    // reserved words with two leading underscores
    "__abstract",
    "__alignof",
    "__asm",
    "__assume",
    "__based",
    "__box",
    "__builtin_alignof",
    "__cdecl",
    "__clrcall",
    "__declspec",
    "__delegate",
    "__event",
    "__except",
    "__fastcall",
    "__finally",
    "__forceinline",
    "__gc",
    "__hook",
    "__identifier",
    "__if_exists",
    "__if_not_exists",
    "__inline",
    "__int128",
    "__int16",
    "__int32",
    "__int64",
    "__int8",
    "__interface",
    "__leave",
    "__m128",
    "__m128d",
    "__m128i",
    "__m256",
    "__m256d",
    "__m256i",
    "__m512",
    "__m512d",
    "__m512i",
    "__m64",
    "__multiple_inheritance",
    "__newslot",
    "__nogc",
    "__noop",
    "__nounwind",
    "__novtordisp",
    "__pascal",
    "__pin",
    "__pragma",
    "__property",
    "__ptr32",
    "__ptr64",
    "__raise",
    "__restrict",
    "__resume",
    "__sealed",
    "__single_inheritance",
    "__stdcall",
    "__super",
    "__thiscall",
    "__try",
    "__try_cast",
    "__typeof",
    "__unaligned",
    "__unhook",
    "__uuidof",
    "__value",
    "__virtual_inheritance",
    "__w64",
    "__wchar_t"
  ],
  // A run of symbol characters found in this list is tokenized as "delimiter"
  // by the @symbols rule in the root state; any other run gets the empty token.
  operators: [
    "=",
    ">",
    "<",
    "!",
    "~",
    "?",
    ":",
    "==",
    "<=",
    ">=",
    "!=",
    "&&",
    "||",
    "++",
    "--",
    "+",
    "-",
    "*",
    "/",
    "&",
    "|",
    "^",
    "%",
    "<<",
    ">>",
    "+=",
    "-=",
    "*=",
    "/=",
    "&=",
    "|=",
    "^=",
    "%=",
    "<<=",
    ">>="
  ],
  // we include these common regular expressions
  // (referenced from the tokenizer rules below as @symbols, @escapes, ...)
  symbols: /[=><!~?:&|+\-*\/\^%]+/,
  escapes: /\\(?:[0abfnrtv\\"']|x[0-9A-Fa-f]{1,4}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})/,
  integersuffix: /([uU](ll|LL|l|L)|(ll|LL|l|L)?[uU]?)/,
  floatsuffix: /[fFlL]?/,
  // optional prefix in front of the R of a raw string (used by the first root rule)
  encoding: /u|u8|U|L/,
  // The main tokenizer for our languages
  tokenizer: {
    root: [
      // C++ 11 Raw String
      // Group 1 captures the delimiter between the opening quote and the opening
      // parenthesis (any run of characters other than space, parentheses,
      // backslash and tab). It is carried into the next state through the state
      // name "raw.<delimiter>", so the raw state can look for the matching end.
      [/@encoding?R\"(?:([^ ()\\\t]*))\(/, { token: "string.raw.begin", next: "@raw.$1" }],
      // identifiers and keywords
      [
        /[a-zA-Z_]\w*/,
        {
          cases: {
            "@keywords": { token: "keyword.$0" },
            "@default": "identifier"
          }
        }
      ],
      // The preprocessor checks must be before whitespace as they check /^\s*#/ which
      // otherwise fails to match later after other whitespace has been removed.
      // Inclusion
      [/^\s*#\s*include/, { token: "keyword.directive.include", next: "@include" }],
      // Preprocessor directive
      [/^\s*#\s*\w+/, "keyword.directive"],
      // whitespace
      { include: "@whitespace" },
      // [[ attributes ]].
      [/\[\s*\[/, { token: "annotation", next: "@annotation" }],
      // delimiters and operators
      [/[{}()<>\[\]]/, "@brackets"],
      [
        /@symbols/,
        {
          cases: {
            "@operators": "delimiter",
            "@default": ""
          }
        }
      ],
      // numbers
      [/\d*\d+[eE]([\-+]?\d+)?(@floatsuffix)/, "number.float"],
      [/\d*\.\d+([eE][\-+]?\d+)?(@floatsuffix)/, "number.float"],
      [/0[xX][0-9a-fA-F']*[0-9a-fA-F](@integersuffix)/, "number.hex"],
      [/0[0-7']*[0-7](@integersuffix)/, "number.octal"],
      [/0[bB][0-1']*[0-1](@integersuffix)/, "number.binary"],
      [/\d[\d']*\d(@integersuffix)/, "number"],
      [/\d(@integersuffix)/, "number"],
      // delimiter: after number because of .\d floats
      [/[;,.]/, "delimiter"],
      // strings
      // non-terminated string: an opening quote with no closing quote before the end of the line
      [/"([^"\\]|\\.)*$/, "string.invalid"],
      // otherwise an opening quote enters the string state
      [/"/, "string", "@string"],
      // characters
      // a single plain character, then a single escape sequence, then a stray quote
      [/'[^\\']'/, "string"],
      [/(')(@escapes)(')/, ["string", "string.escape", "string"]],
      [/'/, "string.invalid"]
    ],
    // Shared whitespace and comment rules; included by the root and annotation states.
    whitespace: [
      [/[ \t\r\n]+/, ""],
      // doc comment opener (slash followed by two stars, not directly followed by a slash)
      [/\/\*\*(?!\/)/, "comment.doc", "@doccomment"],
      // ordinary block comment opener
      [/\/\*/, "comment", "@comment"],
      // a line comment ending in a backslash continues in the linecomment state
      [/\/\/.*\\$/, "comment", "@linecomment"],
      [/\/\/.*$/, "comment"]
    ],
    // Inside a block comment: pops when the closing star-slash is reached.
    comment: [
      [/[^\/*]+/, "comment"],
      [/\*\//, "comment", "@pop"],
      [/[\/*]/, "comment"]
    ],
    //For use with continuous line comments
    linecomment: [
      // a line that does not end in a backslash finishes the comment
      [/.*[^\\]$/, "comment", "@pop"],
      [/[^]+/, "comment"]
    ],
    //Identical copy of comment above, except for the addition of .doc
    doccomment: [
      [/[^\/*]+/, "comment.doc"],
      [/\*\//, "comment.doc", "@pop"],
      [/[\/*]/, "comment.doc"]
    ],
    // Inside a double-quoted string: pops at the closing quote.
    string: [
      [/[^\\"]+/, "string"],
      [/@escapes/, "string.escape"],
      [/\\./, "string.escape.invalid"],
      [/"/, "string", "@pop"]
    ],
    // Inside a raw string, entered from root as "raw.<delimiter>".
    raw: [
      [/[^)]+/, "string.raw"],
      // $S2 expands to the delimiter captured by the root rule, so this rule
      // matches the closing parenthesis, the delimiter, and the closing quote.
      // NOTE: the delimiter text is inserted into the pattern as-is. When it
      // contains regex metacharacters (for example the "[" in R"[())"), the
      // resulting pattern is invalid and the tokenizer throws
      // "Unterminated character class" - the failure reported in this issue.
      [/\)$S2\"/, { token: "string.raw.end", next: "@pop" }],
      // any other closing parenthesis is part of the raw string body
      [/\)/, "string.raw"]
    ],
    // Inside a [[ ... ]] attribute: pops at the closing pair of square brackets.
    annotation: [
      { include: "@whitespace" },
      [/using|alignas/, "keyword"],
      [/[a-zA-Z0-9_]+/, "annotation"],
      [/[,:]/, "delimiter"],
      [/[()]/, "@brackets"],
      [/\]\s*\]/, { token: "annotation", next: "@pop" }]
    ],
    // After an include directive: matches the header name and pops.
    include: [
      // angle-bracket form
      [
        /(\s*)(<)([^<>]*)(>)/,
        [
          "",
          "keyword.directive.include.begin",
          "string.include.identifier",
          { token: "keyword.directive.include.end", next: "@pop" }
        ]
      ],
      // double-quote form
      [
        /(\s*)(")([^"]*)(")/,
        [
          "",
          "keyword.directive.include.begin",
          "string.include.identifier",
          { token: "keyword.directive.include.end", next: "@pop" }
        ]
      ]
    ]
  },
};

And paste into the right 'Language Editor' pane :

R"[())"

Reproduction Steps

Place the cursor anywhere on the Language Editor line (R"[())"), press F1 and select 'Inspect Tokens'; the internal exception then appears:

Image

Image

This originates in a reported compiler-explorer bug.

Actual (Problematic) Behavior

Typical exception stack (from monarch playground):

errors.ts:26  Uncaught Error: Invalid regular expression: /^(?:\)[\")/: Unterminated character class

SyntaxError: Invalid regular expression: /^(?:\)[\")/: Unterminated character class
    at new RegExp (<anonymous>)
    at Rule.regex (monarchCompile.ts:132:17)
    at Rule.resolveRegex (monarchCompile.ts:417:16)
    at MonarchTokenizer._myTokenize (monarchLexer.ts:646:39)
    at MonarchTokenizer._tokenize (monarchLexer.ts:500:16)
    at MonarchTokenizer.tokenizeEncoded (monarchLexer.ts:492:29)
    at safeTokenize (textModelTokens.ts:403:28)
    at TokenizerWithStateStoreAndTextModel.updateTokensUntilLine (textModelTokens.ts:69:14)
    at DefaultBackgroundTokenizer._tokenizeOneInvalidLine (textModelTokens.ts:515:33)
    at DefaultBackgroundTokenizer._backgroundTokenizeForAtLeast1ms (textModelTokens.ts:492:37)
    at new RegExp (<anonymous>)
    at Rule.regex (monarchCompile.ts:132:17)
    at Rule.resolveRegex (monarchCompile.ts:417:16)
    at MonarchTokenizer._myTokenize (monarchLexer.ts:646:39)
    at MonarchTokenizer._tokenize (monarchLexer.ts:500:16)
    at MonarchTokenizer.tokenizeEncoded (monarchLexer.ts:492:29)
    at safeTokenize (textModelTokens.ts:403:28)
    at TokenizerWithStateStoreAndTextModel.updateTokensUntilLine (textModelTokens.ts:69:14)
    at DefaultBackgroundTokenizer._tokenizeOneInvalidLine (textModelTokens.ts:515:33)
    at DefaultBackgroundTokenizer._backgroundTokenizeForAtLeast1ms (textModelTokens.ts:492:37)
    at errors.ts:26:12

Expected Behavior

Graceful failure, perhaps not using user-supplied code as part of a regex.

Additional Context

No response

@rzhao271 rzhao271 added the bug Issue identified by VS Code Team member as probable bug label Dec 18, 2024
@OfekShilon OfekShilon changed the title [Bug] Invalid c++ code breaks monarch [Bug] Raw-strings in c++ can break monarch Dec 21, 2024
@OfekShilon
Copy link
Contributor Author

OfekShilon commented Dec 22, 2024

I submitted a fix suggestion in microsoft/vscode#236809. @hediet

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Issue identified by VS Code Team member as probable bug
Projects
None yet
Development

No branches or pull requests

3 participants