start writing lexer tests; fix some unicode issues
mcy committed Oct 28, 2024
1 parent b046b8a commit 81995a0
Showing 24 changed files with 459 additions and 27 deletions.
11 changes: 7 additions & 4 deletions experimental/parser/diagnostics.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

package ast
package parser

import (
"fmt"
@@ -67,7 +67,10 @@ func (e ErrUnrecognized) Error() string {
}

func (e ErrUnrecognized) Diagnose(d *report.Diagnostic) {
d.With(report.Snippet(e.Token))
d.With(
report.Snippet(e.Token),
report.Debugf("%v, %v, %q", e.Token.ID(), e.Token.Span(), e.Token.Text()),
)
}

// ErrUnterminated diagnoses a delimiter for which we found one half of a matched
@@ -107,12 +110,12 @@ func (e ErrUnterminated) Diagnose(d *report.Diagnostic) {
openTok, closeTok := e.OpenClose()

if text == openTok {
d.With(report.Snippetf(e.Span, "expected to be closed by `%s", closeTok))
d.With(report.Snippetf(e.Span, "expected to be closed by `%s`", closeTok))
if e.Mismatch.IndexedFile != nil {
d.With(report.Snippetf(e.Mismatch, "closed by this instead"))
}
} else {
d.With(report.Snippetf(e.Span, "expected to be opened by `%s", openTok))
d.With(report.Snippetf(e.Span, "expected to be opened by `%s`", openTok))
}
if text == "*/" {
d.With(report.Note("Protobuf does not support nested block comments"))
77 changes: 65 additions & 12 deletions experimental/parser/lexer.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

package ast
package parser

import (
"fmt"
@@ -23,6 +23,7 @@ import (

"github.com/bufbuild/protocompile/experimental/report"
"github.com/bufbuild/protocompile/experimental/token"
"github.com/rivo/uniseg"
)

// Lex performs lexical analysis on the file contained in ctx, and appends any
@@ -42,12 +43,23 @@ type lexer struct {
*token.Stream // Embedded so we don't have to call Stream() everywhere.
*report.Report

// This is outlined so that it's easy to print in the panic handler.
lexerState
}

type lexerState struct {
cursor, count int
openStack []token.Token
}

// Lex performs lexical analysis, and places any diagnostics in report.
func (l *lexer) Lex() {
defer func() {
if panicked := recover(); panicked != nil {
panic(fmt.Sprintf("panic while lexing: %s; %#v", panicked, l.lexerState))
}
}()

// Check that the file isn't too big. We give up immediately if that's
// the case.
if len(l.Text()) > MaxFileSize {
@@ -81,10 +93,12 @@ func (l *lexer) Lex() {
prevCount = l.count

switch {
case unicode.IsSpace(r):
case unicode.In(r, unicode.Pattern_White_Space):
// Whitespace. Consume as much whitespace as possible and mint a
// whitespace token.
l.TakeWhile(unicode.IsSpace)
l.TakeWhile(func(r rune) bool {
return unicode.In(r, unicode.Pattern_White_Space)
})
l.Push(l.cursor-start, token.Space)
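
Aside: the change from unicode.IsSpace to the Pattern_White_Space property narrows which runes count as whitespace. A minimal illustration using only the standard unicode package (not code from this commit):

	unicode.IsSpace('\u00A0')                         // true: no-break space
	unicode.In('\u00A0', unicode.Pattern_White_Space) // false
	unicode.IsSpace('\u3000')                         // true: ideographic space
	unicode.In('\u3000', unicode.Pattern_White_Space) // false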

case r == '/' && l.Peek() == '/':
@@ -110,7 +124,7 @@
// to lex a partial comment is hopeless.

var text string
if comment, ok := l.SeekInclusive("\n"); ok {
if comment, ok := l.SeekInclusive("*/"); ok {
text = comment
} else {
// Create a span for the /*, that's what we're gonna highlight.
@@ -119,6 +133,8 @@
}
l.Push(len("/*")+len(text), token.Comment)
case r == '*' && l.Peek() == '/':
l.cursor++ // Skip the /.

// The user definitely thought nested comments were allowed. :/
tok := l.Push(len("*/"), token.Unrecognized)
l.Error(ErrUnterminated{Span: tok.Span()})
@@ -172,23 +188,43 @@ func (l *lexer) Lex() {
l.cursor -= utf8.RuneLen(r)
l.LexNumber()

case r == '_' || unicode.IsLetter(r): // Consume fairly-open-ended identifiers, legalize to ASCII later.
l.TakeWhile(func(r rune) bool {
return r == '_' || unicode.IsDigit(r) || unicode.IsLetter(r)
case r == '_' || xidStart(r):
// Back up behind the rune we just popped.
l.cursor -= utf8.RuneLen(r)
rawId := l.TakeWhile(xidContinue)

[Check failure on line 194 in experimental/parser/lexer.go, GitHub Actions / ci (1.23.x): var-naming: var rawId should be rawID (revive)]

// Eject any trailing unprintable characters.
id := strings.TrimRightFunc(rawId, func(r rune) bool {
return !unicode.IsPrint(r)
})
tok := l.Push(l.cursor-start, token.Ident)
if id == "" {
// This "identifier" appears to consist entirely of unprintable
// characters (e.g. combining marks).
tok := l.Push(len(rawId), token.Unrecognized)
l.Error(ErrUnrecognized{Token: tok})
continue
}

l.cursor -= len(rawId) - len(id)
tok := l.Push(len(id), token.Ident)

// Legalize non-ASCII runes.
if !isASCIIIdent(tok.Text()) {
l.Error(ErrNonASCIIIdent{Token: tok})
}
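
Aside: xidStart and xidContinue are not part of this diff. A rough approximation of the Unicode ID_Start/ID_Continue properties they presumably implement (per UAX #31, using only stdlib range tables; the real XID_* properties additionally require NFKC stability, which this sketch omits):

	// Sketch only; the repo's actual definitions may differ.
	func xidStartApprox(r rune) bool {
		return unicode.In(r, unicode.L, unicode.Nl, unicode.Other_ID_Start) &&
			!unicode.In(r, unicode.Pattern_Syntax, unicode.Pattern_White_Space)
	}

	func xidContinueApprox(r rune) bool {
		return xidStartApprox(r) ||
			unicode.In(r, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue)
	}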

default: // Consume as much stuff we don't understand as possible, diagnose it.
l.TakeWhile(func(r rune) bool {
default:
// Back up behind the rune we just popped.
l.cursor -= utf8.RuneLen(r)

// Consume as many grapheme clusters as possible, and diagnose it.
unknown := l.TakeGraphemesWhile(func(g string) bool {
r, _ := utf8.DecodeRuneInString(g)
return !strings.ContainsRune(";,/:=-.([{<>}])_\"'", r) &&
unicode.IsDigit(r) && unicode.IsLetter(r)
!xidContinue(r) &&
!unicode.In(r, unicode.Pattern_White_Space)
})
tok := l.Push(l.cursor-start, token.Unrecognized)
tok := l.Push(len(unknown), token.Unrecognized)
l.Error(ErrUnrecognized{Token: tok})
}
}
@@ -429,6 +465,8 @@ escapeLoop:
buf.WriteByte('\n')
case 'r':
buf.WriteByte('\r')
case 't':
buf.WriteByte('\t')
case 'v':
buf.WriteByte('\v') // U+000B
case '\\', '\'', '"', '?':
@@ -567,6 +605,21 @@ func (l *lexer) TakeWhile(f func(rune) bool) string {
return l.Text()[start:l.cursor]
}

// TakeGraphemesWhile consumes grapheme clusters while they match the given function.
// Returns the consumed text.
func (l *lexer) TakeGraphemesWhile(f func(string) bool) string {
start := l.cursor

for gs := uniseg.NewGraphemes(l.Rest()); gs.Next(); {
g := gs.Str()
if !f(g) {
break
}
l.cursor += len(g)
}
return l.Text()[start:l.cursor]
}
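
Aside: a short usage sketch of the rivo/uniseg iterator this helper is built on. The cat emoji from the non-ascii test below ("🐈" + U+200D + "⬛") is three runes but a single grapheme cluster, which is why the unrecognized-token path above consumes clusters rather than runes:

	gs := uniseg.NewGraphemes("a🐈\u200d⬛")
	for gs.Next() {
		fmt.Printf("%q\n", gs.Str()) // prints "a", then the single cluster "🐈\u200d⬛"
	}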

// SeekInclusive seeks until the given needle is found; returns the prefix up to
// and including that needle, and updates the cursor to point after it.
func (l *lexer) SeekInclusive(needle string) (string, bool) {
80 changes: 80 additions & 0 deletions experimental/parser/lexer_test.go
@@ -0,0 +1,80 @@
// Copyright 2020-2024 Buf Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser_test

import (
"fmt"
"strings"
"testing"

"github.com/bufbuild/protocompile/experimental/ast"
"github.com/bufbuild/protocompile/experimental/parser"
"github.com/bufbuild/protocompile/experimental/report"
"github.com/bufbuild/protocompile/experimental/token"
"github.com/bufbuild/protocompile/internal/golden"
)

func TestRender(t *testing.T) {
t.Parallel()

corpus := golden.Corpus{
Root: "testdata/lexer",
Refresh: "PROTOCOMPILE_REFRESH",
Extension: "proto",
Outputs: []golden.Output{
{Extension: "tokens.tsv"},
{Extension: "stderr.txt"},
},
}

corpus.Run(t, func(t *testing.T, path, text string, outputs []string) {
errs := &report.Report{Tracing: 10}
ctx := ast.NewContext(report.File{Path: path, Text: text})
parser.Lex(ctx, errs)

stderr, _, _ := report.Renderer{
Colorize: true,
ShowDebug: true,
}.RenderString(errs)
t.Log(stderr)
outputs[1], _, _ = report.Renderer{}.RenderString(errs)

var tsv strings.Builder
tsv.WriteString("#\t\tkind\t\toffsets\t\tlinecol\t\ttext\n")
ctx.Stream().All()(func(tok token.Token) bool {
sp := tok.Span()
start := ctx.Stream().IndexedFile.Search(sp.Start)
fmt.Fprintf(
&tsv, "%v\t\t%v\t\t%03d:%03d\t\t%03d:%03d\t\t%q",
int32(tok.ID())-1, tok.Kind(),
sp.Start, sp.End,
start.Line, start.Column,
tok.Text(),
)

if v, ok := tok.AsInt(); ok {
fmt.Fprintf(&tsv, "\tint:%d", v)
} else if v, ok := tok.AsFloat(); ok {
fmt.Fprintf(&tsv, "\tfloat:%g", v)
} else if v, ok := tok.AsString(); ok {
fmt.Fprintf(&tsv, "\tstring:%q", v)
}

tsv.WriteByte('\n')
return true
})
outputs[0] = tsv.String()
})
}
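
Aside: ctx.Stream().All() is called with a yield callback, which matches Go 1.23's iter.Seq pattern (the CI job above runs 1.23.x). Assuming All returns an iter.Seq[token.Token], the same loop could be written with range-over-func:

	for tok := range ctx.Stream().All() {
		// format tok into the TSV as above
	}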
1 change: 1 addition & 0 deletions experimental/parser/testdata/lexer/comments/eof.proto
@@ -0,0 +1 @@
// This comment does not end in a newline.
@@ -0,0 +1,2 @@
# kind offsets linecol text
0 Comment 000:042 001:001 "// This comment does not end in a newline."
4 changes: 4 additions & 0 deletions experimental/parser/testdata/lexer/comments/nested.proto
@@ -0,0 +1,4 @@
/*
Nesting
/* is not allowed */
*/
@@ -0,0 +1,8 @@
error: encountered unterminated `*/` delimiter
--> testdata/lexer/comments/nested.proto:4:1
|
4 | */
| ^^ expected to be opened by `/*`
= note: Protobuf does not support nested block comments

encountered 1 error
@@ -0,0 +1,4 @@
# kind offsets linecol text
0 Comment 000:039 001:001 "/*\n Nesting\n /* is not allowed */"
1 Space 039:040 003:025 "\n"
2 Unrecognized 040:042 004:001 "*/"
8 changes: 8 additions & 0 deletions experimental/parser/testdata/lexer/comments/ok.proto
@@ -0,0 +1,8 @@
// Single line comment.
//go:style-intrinsic
/*
Multiline comment
*/

//
/**/ // Empty
@@ -0,0 +1,9 @@
# kind offsets linecol text
0 Comment 000:024 001:001 "// Single line comment.\n"
1 Comment 024:045 002:001 "//go:style-intrinsic\n"
2 Comment 045:072 003:001 "/*\n Multiline comment\n*/"
3 Space 072:074 005:003 "\n\n"
4 Comment 074:077 007:001 "//\n"
5 Comment 077:081 008:001 "/**/"
6 Space 081:082 008:005 " "
7 Comment 082:091 008:006 "// Empty\n"
@@ -0,0 +1,2 @@
/*
Oops I dropped my * /
@@ -0,0 +1,7 @@
error: encountered unterminated `/*` delimiter
--> testdata/lexer/comments/unterminated.proto:1:1
|
1 | /*
| ^^ expected to be closed by `*/`

encountered 1 error
@@ -0,0 +1,2 @@
# kind offsets linecol text
0 Comment 000:028 001:001 "/*\n Oops I dropped my * /"
4 changes: 4 additions & 0 deletions experimental/parser/testdata/lexer/idents/non-ascii.proto
@@ -0,0 +1,4 @@
kitty_🐈‍⬛
黑猫
काली बिल्ली
黑猫_suffix
@@ -0,0 +1,31 @@
error: unrecognized token
--> testdata/lexer/idents/non-ascii.proto:1:7
|
1 | kitty_🐈<U+200D>⬛
| ^^^^^^^^^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:2:1
|
2 | 黑猫
| ^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:3:1
|
3 | काली बिल्ली
| ^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:3:6
|
3 | काली बिल्ली
| ^^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:4:1
|
4 | 黑猫_suffix
| ^^^^^^^^^^^

encountered 5 errors
@@ -0,0 +1,11 @@
# kind offsets linecol text
0 Ident 000:006 001:001 "kitty_"
1 Unrecognized 006:016 001:007 "🐈\u200d⬛"
2 Space 016:017 001:009 "\n"
3 Ident 017:023 002:001 "黑猫"
4 Space 023:025 002:005 " \n"
5 Ident 025:037 003:001 "काली"
6 Space 037:038 003:005 " "
7 Ident 038:056 003:006 "बिल्ली"
8 Space 056:057 003:011 "\n"
9 Ident 057:070 004:001 "黑猫_suffix"
10 changes: 10 additions & 0 deletions experimental/parser/testdata/lexer/idents/ok.proto
@@ -0,0 +1,10 @@
a b c
string message
foo_bar fooBar
_ __ ____
__snake__
SCREAMING
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
_0123456789
_0_
(Diffs for the remaining changed files were not loaded on this page.)
