fix fake-eof and related behavior in lex/parse
The lexer now emits an explicit tFAIL token after an error. That token
is interpreted accordingly by the parser. The assumption is that,
unlike parse errors, it doesn't make sense to continue processing input
after a lex error, as there is usually a cascade of mess afterwards.
Even if I could do better than this assumption, it matches the implicit
behavior of the previous code, which was misbehaving anyway - see below.

In addition to the lexer emitting tFAIL and the parser interpreting it
(e.g. there are two final tokens now, examined by parser.matchEnd and
parser.checkEnd; error printing is fixed as well), there is also a check
inside block parsing to return early after tFAIL. This can be extended
to other parts of the parser in the future.
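
The end check boils down to a single comparison against the token
ordering. Below is a minimal standalone sketch of that idea (the
constant values mirror the new token.go; everything else here is
hypothetical and not the project's actual parser):

    package main

    import "fmt"

    // With tFAIL = 0 and tEOF = 1, one comparison covers both finalizers.
    type tokenType int

    const (
    	tFAIL tokenType = iota
    	tEOF  // values <= tEOF end parsing
    	tERR
    	tINT
    )

    func checkEnd(typ tokenType) bool { return typ <= tEOF }

    func main() {
    	fmt.Println(checkEnd(tFAIL), checkEnd(tEOF), checkEnd(tINT)) // true true false
    }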

Now about a previous subtle bug: tEOF after tERR was read by
parser.advance only because it was the zero value coming from a closed
channel. It showed up as a 'line 1:1: error at end: ...' message, even
though 1:1 is definitely not the end of the stream. The parser was
relying on this behavior, so, for example, after starting the token
values at iota+1 it went into an infinite loop.
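
Why the old code seemed to work: in Go, receiving from a closed channel
returns the zero value, and with the old ordering (tEOF = iota) that
zero value happened to equal tEOF. A minimal hypothetical sketch, not
the project's actual lexer/parser:

    package main

    import "fmt"

    // Old ordering: the zero value of tokenType is tEOF.
    type tokenType int

    const (
    	tEOF tokenType = iota
    	tERR
    )

    type token struct {
    	typ tokenType
    	pos int
    }

    func main() {
    	ch := make(chan token, 1)
    	ch <- token{typ: tERR, pos: 1} // lexer reports an error, then closes the channel
    	close(ch)

    	<-ch                                  // parser reads the tERR token...
    	t, ok := <-ch                         // ...then gets the zero value from the closed channel
    	fmt.Println(t.typ == tEOF, ok, t.pos) // true false 0: a fake EOF at position 0
    }

Starting the token values at iota+1 breaks that accidental equality, so
the parser never sees its end condition and loops forever - hence the
explicit tFAIL/tEOF handling above.
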
wkhere committed May 24, 2024
1 parent 2203435 commit eedaead
Showing 5 changed files with 89 additions and 63 deletions.
14 changes: 8 additions & 6 deletions lex.go
@@ -185,8 +185,10 @@ func isAlphaNum(r rune) bool {

// state finalizers

func (l *lexer) errorf(format string, args ...any) stateFn {
func (l *lexer) fail(format string, args ...any) stateFn {
l.emitError(format, args...)
l.ignore()
l.emit(tFAIL)
return nil
}

@@ -266,7 +268,7 @@ func lexStart(l *lexer) stateFn {
case isDigit(r):
return lexNumber
default:
return l.errorf("unknown char %#U", r)
return l.fail("unknown char %#U", r)
}
}

@@ -282,7 +284,7 @@ func lexTwoRunes(r1 rune, match twoRuneMatch) stateFn {
l.emit(r1t)
return lexStart
}
return l.errorf(
return l.fail(
"expected char %q to start token %q",
r1, fmt.Sprintf("%c%c", r1, match.r2),
)
@@ -353,14 +355,14 @@ func lexFloat(l *lexer) stateFn {
if l.accept(".") {
ok := l.acceptRun(digits)
if !ok {
return l.errorf("need more digits after a dot")
return l.fail("need more digits after a dot")
}
}
if l.accept("eE") {
l.accept("+-")
ok := l.acceptRun(digits)
if !ok {
return l.errorf("need more digits for an exponent")
return l.fail("need more digits for an exponent")
}
}
l.emit(tFLOAT)
@@ -377,7 +379,7 @@ loop:
}
fallthrough
case eof, '\n':
return l.errorf("unterminated quoted string")
return l.fail("unterminated quoted string")
case '"':
break loop
}
24 changes: 13 additions & 11 deletions lex_test.go
@@ -18,7 +18,8 @@ func (s tstream) collect() (a []token) {
// shorter syntax in tab literals:
type tt = []token

func teof(pos int) token { return token{tEOF, "", nil, pos} }
func teof(pos int) token { return token{tEOF, "", nil, pos} }
func tfail(pos int) token { return token{tFAIL, "", nil, pos} }
func terrchar(c rune, line int) token {
return token{tERR, "", fmt.Errorf("unknown char %#U", c), line}
}
@@ -30,12 +31,12 @@ var lexTab = []struct {
}{
{0, "", tt{teof(0)}},

{1, "@", tt{terrchar('@', 1)}},
{2, `"`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 1}}},
{3, "\"\n", tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}}},
{4, "\"\n", tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}}},
{5, `"\`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}}},
{6, `"\a`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 3}}},
{1, "@", tt{terrchar('@', 1), tfail(1)}},
{2, `"`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 1}, tfail(1)}},
{3, "\"\n", tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}, tfail(2)}},
{4, "\"\n", tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}, tfail(2)}},
{5, `"\`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 2}, tfail(2)}},
{6, `"\a`, tt{{tERR, "", fmt.Errorf("unterminated quoted string"), 3}, tfail(3)}},

{7, `1234`, tt{{tINT, "1234", nil, 4}, teof(4)}},
{8, `12.34`, tt{{tFLOAT, "12.34", nil, 5}, teof(5)}},
@@ -46,19 +47,20 @@ var lexTab = []struct {
{13, `12.34e10`, tt{{tFLOAT, "12.34e10", nil, 8}, teof(8)}},
{14, `12.34e+10`, tt{{tFLOAT, "12.34e+10", nil, 9}, teof(9)}},
{15, `12.34e-10`, tt{{tFLOAT, "12.34e-10", nil, 9}, teof(9)}},
{16, `12.`, tt{{tERR, "", fmt.Errorf("need more digits after a dot"), 3}}},
{17, `12e`, tt{{tERR, "", fmt.Errorf("need more digits for an exponent"), 3}}},
{16, `12.`, tt{{tERR, "", fmt.Errorf("need more digits after a dot"), 3}, tfail(3)}},
{17, `12e`, tt{{tERR, "", fmt.Errorf("need more digits for an exponent"), 3}, tfail(3)}},

{18, `0x10`, tt{{tINT, "0x10", nil, 4}, teof(4)}},
{19, `0X10`, tt{{tINT, "0X10", nil, 4}, teof(4)}},
{20, `0x10.0`, tt{{tINT, "0x10", nil, 4}, terrchar('.', 5)}},
{20, `0x10.0`, tt{{tINT, "0x10", nil, 4}, terrchar('.', 5), tfail(5)}},

{21, `>`, tt{{tGT, ">", nil, 1}, teof(1)}},
{22, `>=`, tt{{tGE, ">=", nil, 2}, teof(2)}},
{23, `< 5`, tt{{tLT, "<", nil, 1}, {tINT, "5", nil, 3}, teof(3)}},
{24, `<= 5`, tt{{tLE, "<=", nil, 2}, {tINT, "5", nil, 4}, teof(4)}},
{25, `!<`, tt{
{tERR, "", fmt.Errorf(`expected char '!' to start token "!="`), 1},
tfail(1),
}},

{26, `{}`, tt{{tLCURLY, "{", nil, 1}, {tRCURLY, "}", nil, 2}, teof(2)}},
Expand Down Expand Up @@ -134,7 +136,7 @@ func ExampleLexer() {
// {tINT "0" 1}{tEOF "" 1}
// {tINT "1" 1}{tEOF "" 1}
// {tMINUS "-" 1}{tFLOAT "3.14" 5}{tEOF "" 5}
// {tERR "unknown char U+0040 '@'" 1}
// {tERR "unknown char U+0040 '@'" 1}{tFAIL "" 1}
}

func runExample(s string) {
40 changes: 30 additions & 10 deletions parse.go
@@ -32,7 +32,7 @@ func parse(inputs <-chan string, name string, cf parseConfig) (

p.advance()

for !p.match(tEOF) {
for !p.matchEnd() {
decl(p)

p.match(tSEMICOLON) // no check as it is optional
@@ -55,6 +55,7 @@ type parser struct {

prev, current token
hadError bool
hadLexFail bool
panicMode bool

identRefs map[string]int // map ident names to const indices
@@ -156,7 +157,7 @@ func blockStmt(p *parser) {
p.beginScope()
defer p.endScope()

for !p.check(tRCURLY) && !p.check(tEOF) {
for !p.check(tRCURLY) && !p.checkEnd() {
decl(p)
if p.panicMode {
p.advance()
@@ -165,6 +166,9 @@
p.match(tSEMICOLON) // optional
}

if p.hadLexFail {
return
}
p.consume(tRCURLY, "expected '}'")
}

@@ -248,8 +252,9 @@ func init() {

tSEMICOLON: {nil, nil, precNone},

tERR: {nil, nil, precNone},
tEOF: {nil, nil, precNone},
tERR: {nil, nil, precNone},
tEOF: {nil, nil, precNone},
tFAIL: {nil, nil, precNone},
}
}

@@ -400,12 +405,15 @@ func (p *parser) advance() {
p.prev = p.current

for {
var ok bool
p.current, ok = p.lexer.nextToken()
current, ok := p.lexer.nextToken()
if !ok {
return
}
p.current = current
p.stats.tokens++
if p.current.typ == tFAIL {
p.hadLexFail = true
}
if p.current.typ != tERR {
break
}
@@ -430,14 +438,26 @@ func (p *parser) match(typ tokenType) bool {
return true
}

func (p *parser) matchEnd() bool {
if !p.checkEnd() {
return false
}
p.advance()
return true
}

func (p *parser) check(typ tokenType) bool {
return p.current.typ == typ
}

func (p *parser) checkEnd() bool {
return p.current.typ <= tEOF
}

func (p *parser) sync() {
p.panicMode = false

for p.current.typ != tEOF {
for !p.checkEnd() {
switch p.current.typ {
case tVAR, tDEF, tPRINT, tEVAL: // tokens delimiting a statement
return
@@ -701,10 +721,10 @@ func (p *parser) errorAt(t *token, msg string) {

p.log.Printf("line %s: error", p.linePos.format(t.pos))

switch {
case t.typ == tEOF:
switch t.typ {
case tEOF:
p.log.Print(" at end")
case t.typ == tERR: // nop
case tERR, tFAIL: // nop
default:
p.log.Printf(" at '%s'", t.val)
}
3 changes: 2 additions & 1 deletion token.go
@@ -12,7 +12,8 @@ type token struct {
}

const (
tEOF tokenType = iota
tFAIL tokenType = iota
tEOF // values <= tEOF are finalizers
tERR

tINT
71 changes: 36 additions & 35 deletions tokentype_string.go

(Generated file; diff not rendered.)
