start writing lexer tests; fix some unicode issues
mcy committed Oct 28, 2024
1 parent b046b8a commit 81995a0
Showing 24 changed files with 459 additions and 27 deletions.
11 changes: 7 additions & 4 deletions experimental/parser/diagnostics.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

package ast
package parser

import (
"fmt"
@@ -67,7 +67,10 @@ func (e ErrUnrecognized) Error() string {
}

func (e ErrUnrecognized) Diagnose(d *report.Diagnostic) {
d.With(report.Snippet(e.Token))
d.With(
report.Snippet(e.Token),
report.Debugf("%v, %v, %q", e.Token.ID(), e.Token.Span(), e.Token.Text()),
)
}

// ErrUnterminated diagnoses a delimiter for which we found one half of a matched
@@ -107,12 +110,12 @@ func (e ErrUnterminated) Diagnose(d *report.Diagnostic) {
openTok, closeTok := e.OpenClose()

if text == openTok {
d.With(report.Snippetf(e.Span, "expected to be closed by `%s", closeTok))
d.With(report.Snippetf(e.Span, "expected to be closed by `%s`", closeTok))
if e.Mismatch.IndexedFile != nil {
d.With(report.Snippetf(e.Mismatch, "closed by this instead"))
}
} else {
d.With(report.Snippetf(e.Span, "expected to be opened by `%s", openTok))
d.With(report.Snippetf(e.Span, "expected to be opened by `%s`", openTok))
}
if text == "*/" {
d.With(report.Note("Protobuf does not support nested block comments"))
77 changes: 65 additions & 12 deletions experimental/parser/lexer.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

package ast
package parser

import (
"fmt"
@@ -23,6 +23,7 @@ import (

"github.com/bufbuild/protocompile/experimental/report"
"github.com/bufbuild/protocompile/experimental/token"
"github.com/rivo/uniseg"
)

// Lex performs lexical analysis on the file contained in ctx, and appends any
@@ -42,12 +43,23 @@ type lexer struct {
*token.Stream // Embedded so we don't have to call Stream() everywhere.
*report.Report

// This is outlined so that it's easy to print in the panic handler.
lexerState
}

type lexerState struct {
cursor, count int
openStack []token.Token
}

// Lex performs lexical analysis, and places any diagnostics in report.
func (l *lexer) Lex() {
defer func() {
if panicked := recover(); panicked != nil {
panic(fmt.Sprintf("panic while lexing: %s; %#v", panicked, l.lexerState))
}
}()

// Check that the file isn't too big. We give up immediately if that's
// the case.
if len(l.Text()) > MaxFileSize {
@@ -81,10 +93,12 @@ func (l *lexer) Lex() {
prevCount = l.count

switch {
case unicode.IsSpace(r):
case unicode.In(r, unicode.Pattern_White_Space):
// Whitespace. Consume as much whitespace as possible and mint a
// whitespace token.
l.TakeWhile(unicode.IsSpace)
l.TakeWhile(func(r rune) bool {
return unicode.In(r, unicode.Pattern_White_Space)
})
l.Push(l.cursor-start, token.Space)
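
Aside: the change from unicode.IsSpace to the Pattern_White_Space property narrows which runes count as whitespace. A minimal illustration using only the standard unicode package (not code from this commit):

	unicode.IsSpace('\u00A0')                         // true: no-break space
	unicode.In('\u00A0', unicode.Pattern_White_Space) // false
	unicode.IsSpace('\u3000')                         // true: ideographic space
	unicode.In('\u3000', unicode.Pattern_White_Space) // false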

case r == '/' && l.Peek() == '/':
@@ -110,7 +124,7 @@
// to lex a partial comment is hopeless.

var text string
if comment, ok := l.SeekInclusive("\n"); ok {
if comment, ok := l.SeekInclusive("*/"); ok {
text = comment
} else {
// Create a span for the /*, that's what we're gonna highlight.
@@ -119,6 +133,8 @@
}
l.Push(len("/*")+len(text), token.Comment)
case r == '*' && l.Peek() == '/':
l.cursor++ // Skip the /.

// The user definitely thought nested comments were allowed. :/
tok := l.Push(len("*/"), token.Unrecognized)
l.Error(ErrUnterminated{Span: tok.Span()})
@@ -172,23 +188,43 @@ func (l *lexer) Lex() {
l.cursor -= utf8.RuneLen(r)
l.LexNumber()

case r == '_' || unicode.IsLetter(r): // Consume fairly-open-ended identifiers, legalize to ASCII later.
l.TakeWhile(func(r rune) bool {
return r == '_' || unicode.IsDigit(r) || unicode.IsLetter(r)
case r == '_' || xidStart(r):
// Back up behind the rune we just popped.
l.cursor -= utf8.RuneLen(r)
rawId := l.TakeWhile(xidContinue)

[Check failure on line 194 in experimental/parser/lexer.go, GitHub Actions / ci (1.23.x): var-naming: var rawId should be rawID (revive)]

// Eject any trailing unprintable characters.
id := strings.TrimRightFunc(rawId, func(r rune) bool {
return !unicode.IsPrint(r)
})
tok := l.Push(l.cursor-start, token.Ident)
if id == "" {
// This "identifier" appears to consist entirely of unprintable
// characters (e.g. combining marks).
tok := l.Push(len(rawId), token.Unrecognized)
l.Error(ErrUnrecognized{Token: tok})
continue
}

l.cursor -= len(rawId) - len(id)
tok := l.Push(len(id), token.Ident)

// Legalize non-ASCII runes.
if !isASCIIIdent(tok.Text()) {
l.Error(ErrNonASCIIIdent{Token: tok})
}
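
Aside: xidStart and xidContinue are not part of this diff. A rough approximation of the Unicode ID_Start/ID_Continue properties they presumably implement (per UAX #31, using only stdlib range tables; the real XID_* properties additionally require NFKC stability, which this sketch omits):

	// Sketch only; the repo's actual definitions may differ.
	func xidStartApprox(r rune) bool {
		return unicode.In(r, unicode.L, unicode.Nl, unicode.Other_ID_Start) &&
			!unicode.In(r, unicode.Pattern_Syntax, unicode.Pattern_White_Space)
	}

	func xidContinueApprox(r rune) bool {
		return xidStartApprox(r) ||
			unicode.In(r, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue)
	}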

default: // Consume as much stuff we don't understand as possible, diagnose it.
l.TakeWhile(func(r rune) bool {
default:
// Back up behind the rune we just popped.
l.cursor -= utf8.RuneLen(r)

// Consume as many grapheme clusters as possible, and diagnose it.
unknown := l.TakeGraphemesWhile(func(g string) bool {
r, _ := utf8.DecodeRuneInString(g)
return !strings.ContainsRune(";,/:=-.([{<>}])_\"'", r) &&
unicode.IsDigit(r) && unicode.IsLetter(r)
!xidContinue(r) &&
!unicode.In(r, unicode.Pattern_White_Space)
})
tok := l.Push(l.cursor-start, token.Unrecognized)
tok := l.Push(len(unknown), token.Unrecognized)
l.Error(ErrUnrecognized{Token: tok})
}
}
@@ -429,6 +465,8 @@ escapeLoop:
buf.WriteByte('\n')
case 'r':
buf.WriteByte('\r')
case 't':
buf.WriteByte('\t')
case 'v':
buf.WriteByte('\v') // U+000B
case '\\', '\'', '"', '?':
@@ -567,6 +605,21 @@ func (l *lexer) TakeWhile(f func(rune) bool) string {
return l.Text()[start:l.cursor]
}

// TakeGraphemesWhile consumes grapheme clusters while they match the given function.
// Returns the consumed text.
func (l *lexer) TakeGraphemesWhile(f func(string) bool) string {
start := l.cursor

for gs := uniseg.NewGraphemes(l.Rest()); gs.Next(); {
g := gs.Str()
if !f(g) {
break
}
l.cursor += len(g)
}
return l.Text()[start:l.cursor]
}
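
Aside: a short usage sketch of the rivo/uniseg iterator this helper is built on. The cat emoji from the non-ascii test below ("🐈" + U+200D + "⬛") is three runes but a single grapheme cluster, which is why the unrecognized-token path above consumes clusters rather than runes:

	gs := uniseg.NewGraphemes("a🐈\u200d⬛")
	for gs.Next() {
		fmt.Printf("%q\n", gs.Str()) // prints "a", then the single cluster "🐈\u200d⬛"
	}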

// SeekInclusive seeks until the given needle is found; returns the prefix up to
// and including that needle, and updates the cursor to point after it.
func (l *lexer) SeekInclusive(needle string) (string, bool) {
80 changes: 80 additions & 0 deletions experimental/parser/lexer_test.go
@@ -0,0 +1,80 @@
// Copyright 2020-2024 Buf Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parser_test

import (
"fmt"
"strings"
"testing"

"github.com/bufbuild/protocompile/experimental/ast"
"github.com/bufbuild/protocompile/experimental/parser"
"github.com/bufbuild/protocompile/experimental/report"
"github.com/bufbuild/protocompile/experimental/token"
"github.com/bufbuild/protocompile/internal/golden"
)

func TestRender(t *testing.T) {
t.Parallel()

corpus := golden.Corpus{
Root: "testdata/lexer",
Refresh: "PROTOCOMPILE_REFRESH",
Extension: "proto",
Outputs: []golden.Output{
{Extension: "tokens.tsv"},
{Extension: "stderr.txt"},
},
}

corpus.Run(t, func(t *testing.T, path, text string, outputs []string) {
errs := &report.Report{Tracing: 10}
ctx := ast.NewContext(report.File{Path: path, Text: text})
parser.Lex(ctx, errs)

stderr, _, _ := report.Renderer{
Colorize: true,
ShowDebug: true,
}.RenderString(errs)
t.Log(stderr)
outputs[1], _, _ = report.Renderer{}.RenderString(errs)

var tsv strings.Builder
tsv.WriteString("#\t\tkind\t\toffsets\t\tlinecol\t\ttext\n")
ctx.Stream().All()(func(tok token.Token) bool {
sp := tok.Span()
start := ctx.Stream().IndexedFile.Search(sp.Start)
fmt.Fprintf(
&tsv, "%v\t\t%v\t\t%03d:%03d\t\t%03d:%03d\t\t%q",
int32(tok.ID())-1, tok.Kind(),
sp.Start, sp.End,
start.Line, start.Column,
tok.Text(),
)

if v, ok := tok.AsInt(); ok {
fmt.Fprintf(&tsv, "\tint:%d", v)
} else if v, ok := tok.AsFloat(); ok {
fmt.Fprintf(&tsv, "\tfloat:%g", v)
} else if v, ok := tok.AsString(); ok {
fmt.Fprintf(&tsv, "\tstring:%q", v)
}

tsv.WriteByte('\n')
return true
})
outputs[0] = tsv.String()
})
}
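
Aside: ctx.Stream().All() is called with a yield callback, which matches Go 1.23's iter.Seq pattern (the CI job above runs 1.23.x). Assuming All returns an iter.Seq[token.Token], the same loop could be written with range-over-func:

	for tok := range ctx.Stream().All() {
		// format tok into the TSV as above
	}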
1 change: 1 addition & 0 deletions experimental/parser/testdata/lexer/comments/eof.proto
@@ -0,0 +1 @@
// This comment does not end in a newline.
@@ -0,0 +1,2 @@
# kind offsets linecol text
0 Comment 000:042 001:001 "// This comment does not end in a newline."
4 changes: 4 additions & 0 deletions experimental/parser/testdata/lexer/comments/nested.proto
@@ -0,0 +1,4 @@
/*
Nesting
/* is not allowed */
*/
@@ -0,0 +1,8 @@
error: encountered unterminated `*/` delimiter
--> testdata/lexer/comments/nested.proto:4:1
|
4 | */
| ^^ expected to be opened by `/*`
= note: Protobuf does not support nested block comments

encountered 1 error
@@ -0,0 +1,4 @@
# kind offsets linecol text
0 Comment 000:039 001:001 "/*\n Nesting\n /* is not allowed */"
1 Space 039:040 003:025 "\n"
2 Unrecognized 040:042 004:001 "*/"
8 changes: 8 additions & 0 deletions experimental/parser/testdata/lexer/comments/ok.proto
@@ -0,0 +1,8 @@
// Single line comment.
//go:style-intrinsic
/*
Multiline comment
*/

//
/**/ // Empty
@@ -0,0 +1,9 @@
# kind offsets linecol text
0 Comment 000:024 001:001 "// Single line comment.\n"
1 Comment 024:045 002:001 "//go:style-intrinsic\n"
2 Comment 045:072 003:001 "/*\n Multiline comment\n*/"
3 Space 072:074 005:003 "\n\n"
4 Comment 074:077 007:001 "//\n"
5 Comment 077:081 008:001 "/**/"
6 Space 081:082 008:005 " "
7 Comment 082:091 008:006 "// Empty\n"
@@ -0,0 +1,2 @@
/*
Oops I dropped my * /
@@ -0,0 +1,7 @@
error: encountered unterminated `/*` delimiter
--> testdata/lexer/comments/unterminated.proto:1:1
|
1 | /*
| ^^ expected to be closed by `*/`

encountered 1 error
@@ -0,0 +1,2 @@
# kind offsets linecol text
0 Comment 000:028 001:001 "/*\n Oops I dropped my * /"
4 changes: 4 additions & 0 deletions experimental/parser/testdata/lexer/idents/non-ascii.proto
@@ -0,0 +1,4 @@
kitty_🐈‍⬛
黑猫
काली बिल्ली
黑猫_suffix
@@ -0,0 +1,31 @@
error: unrecognized token
--> testdata/lexer/idents/non-ascii.proto:1:7
|
1 | kitty_🐈<U+200D>⬛
| ^^^^^^^^^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:2:1
|
2 | 黑猫
| ^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:3:1
|
3 | काली बिल्ली
| ^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:3:6
|
3 | काली बिल्ली
| ^^^^^

error: non-ASCII identifiers are not allowed
--> testdata/lexer/idents/non-ascii.proto:4:1
|
4 | 黑猫_suffix
| ^^^^^^^^^^^

encountered 5 errors
@@ -0,0 +1,11 @@
# kind offsets linecol text
0 Ident 000:006 001:001 "kitty_"
1 Unrecognized 006:016 001:007 "🐈\u200d⬛"
2 Space 016:017 001:009 "\n"
3 Ident 017:023 002:001 "黑猫"
4 Space 023:025 002:005 " \n"
5 Ident 025:037 003:001 "काली"
6 Space 037:038 003:005 " "
7 Ident 038:056 003:006 "बिल्ली"
8 Space 056:057 003:011 "\n"
9 Ident 057:070 004:001 "黑猫_suffix"
10 changes: 10 additions & 0 deletions experimental/parser/testdata/lexer/idents/ok.proto
@@ -0,0 +1,10 @@
a b c
string message
foo_bar fooBar
_ __ ____
__snake__
SCREAMING
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
_0123456789
_0_
(Diffs for the remaining changed files were not loaded on this page.)
