"""Objects for the lexing phase of the compiler.
The lexing phase takes the entire contents of a raw input file and
generates a flat list of tokens present in that input file.
"""
import re
import shivyc.token_kinds as token_kinds
from shivyc.errors import CompilerError, Position, Range, error_collector
from shivyc.tokens import Token
from shivyc.token_kinds import symbol_kinds, keyword_kinds


class Tagged:
    """Class representing tagged characters.

    c (char) - the character that is tagged
    p (Position) - position of the tagged character
    r (Range) - a length-one range for the character

    """

    def __init__(self, c, p):
        """Initialize object."""
        self.c = c
        self.p = p
        self.r = Range(p, p)


def tokenize(code, filename):
    """Convert given code into a flat list of Tokens.

    code (str) - Raw contents of the input file.
    filename (str) - Name of the input file, used for error reporting.
    return - List of Token objects.
    """
    # Store tokens as they are generated
    tokens = []

    lines = split_to_tagged_lines(code, filename)
    join_extended_lines(lines)

    in_comment = False
    for line in lines:
        try:
            line_tokens, in_comment = tokenize_line(line, in_comment)
            tokens += line_tokens
        except CompilerError as e:
            error_collector.add(e)

    return tokens
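
# Illustrative usage sketch (not part of the original module). The file name
# "example.c" is hypothetical; note that lexing errors are accumulated in
# error_collector rather than raised out of tokenize:
#
#     tokens = tokenize("int main() { return 42; }\n", "example.c")
#     # tokens is the flat list: int, main, (, ), {, return, 42, ;, }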


def split_to_tagged_lines(text, filename):
    """Split the input text into tagged lines.

    No newline escaping or other preprocessing is done by this function.

    text (str) - Input file contents as a string.
    filename (str) - Input file name.
    return - Tagged lines. List of list of Tagged objects, where each second
    order list is a separate line in the input program. No newline characters.
    """
    lines = text.splitlines()

    tagged_lines = []
    for line_num, line in enumerate(lines):
        tagged_line = []
        for col, char in enumerate(line):
            p = Position(filename, line_num + 1, col + 1, line)
            tagged_line.append(Tagged(char, p))
        tagged_lines.append(tagged_line)

    return tagged_lines
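
# For example, split_to_tagged_lines("ab\ncd", "f.c") produces two tagged
# lines, and the Tagged object for "c" carries Position("f.c", 2, 1, "cd"),
# since lines and columns are numbered from 1.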


def join_extended_lines(lines):
    """Join together any lines which end in an escaped newline.

    This function modifies the given lines object in place.

    lines - List of list of Tagged objects, where each embedded list is a
    separate line in the input program.
    """
    # TODO: GCC supports \ followed by whitespace. Should ShivyC do this too?

    i = 0
    while i < len(lines):
        if lines[i] and lines[i][-1].c == "\\":
            # There is a next line to collapse into this one
            if i + 1 < len(lines):
                del lines[i][-1]  # remove trailing backslash
                lines[i] += lines[i + 1]  # concatenate with next line
                del lines[i + 1]  # remove next line

                # Decrement i, so this line is checked for a new trailing
                # backslash.
                i -= 1

            # There is no next line to collapse into this one
            else:
                # TODO: print warning?
                del lines[i][-1]  # remove trailing backslash

        i += 1
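
# For example, given the physical lines "int x = \" and "5;" (shown as plain
# strings here for readability; the function actually operates on lists of
# Tagged objects), the trailing backslash is removed and the second line is
# spliced onto the first, leaving the single logical line "int x = 5;".
# Because i is decremented after each join, a run of consecutive escaped
# newlines collapses into one logical line.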


def tokenize_line(line, in_comment):
    """Tokenize the given single line.

    line - List of Tagged objects.
    in_comment - Whether the first character in this line is part of a
    C-style comment body.
    return - List of Token objects, and boolean indicating whether the next
    character is part of a comment body.
    """
    tokens = []

    # line[chunk_start:chunk_end] is the section of the line currently
    # being considered for conversion into a token; this string will be
    # called the 'chunk'. Everything before the chunk has already been
    # tokenized, and everything after has not yet been examined
    chunk_start = 0
    chunk_end = 0

    # Flag that is set True if the line begins with `#` and `include`,
    # perhaps with comments and whitespace in between.
    include_line = False

    # Flag that is set True if the line is an include directive and the
    # filename has been seen and successfully parsed.
    seen_filename = False

    while chunk_end < len(line):
        symbol_kind = match_symbol_kind_at(line, chunk_end)
        next_symbol_kind = match_symbol_kind_at(line, chunk_end + 1)

        # Set include_line flag True as soon as a `#include` is detected.
        if match_include_command(tokens):
            include_line = True

        if in_comment:
            # If next characters end the comment...
            if (symbol_kind == token_kinds.star and
                    next_symbol_kind == token_kinds.slash):
                in_comment = False
                chunk_start = chunk_end + 2
                chunk_end = chunk_start
            # Otherwise, just skip one character.
            else:
                chunk_start = chunk_end + 1
                chunk_end = chunk_start

        # If next characters start a comment, process previous chunk and set
        # in_comment to true.
        elif (symbol_kind == token_kinds.slash and
              next_symbol_kind == token_kinds.star):
            add_chunk(line[chunk_start:chunk_end], tokens)
            in_comment = True

        # If next two characters are //, we skip the rest of this line.
        elif (symbol_kind == token_kinds.slash and
              next_symbol_kind == token_kinds.slash):
            break

        # Skip spaces and process previous chunk.
        elif line[chunk_end].c.isspace():
            add_chunk(line[chunk_start:chunk_end], tokens)
            chunk_start = chunk_end + 1
            chunk_end = chunk_start

        # If this is an include line, and not a comment or whitespace,
        # expect the line to match an include filename.
        elif include_line:

            # If the filename has already been seen, there should be no more
            # tokens.
            if seen_filename:
                descrip = "extra tokens at end of include directive"
                raise CompilerError(descrip, line[chunk_end].r)

            filename, end = read_include_filename(line, chunk_end)
            tokens.append(Token(token_kinds.include_file, filename,
                                r=Range(line[chunk_end].p, line[end].p)))

            chunk_start = end + 1
            chunk_end = chunk_start
            seen_filename = True

        # If next character is a quote, we read the whole string as a token.
        elif symbol_kind in {token_kinds.dquote, token_kinds.squote}:
            if symbol_kind == token_kinds.dquote:
                quote_str = '"'
                kind = token_kinds.string
                add_null = True
            else:
                quote_str = "'"
                kind = token_kinds.char_string
                add_null = False

            chars, end = read_string(line, chunk_end + 1, quote_str, add_null)
            rep = chunk_to_str(line[chunk_end:end + 1])
            r = Range(line[chunk_end].p, line[end].p)

            if kind == token_kinds.char_string and len(chars) == 0:
                err = "empty character constant"
                error_collector.add(CompilerError(err, r))
            elif kind == token_kinds.char_string and len(chars) > 1:
                err = "multiple characters in character constant"
                error_collector.add(CompilerError(err, r))
            tokens.append(Token(kind, chars, rep, r=r))

            chunk_start = end + 1
            chunk_end = chunk_start

        # If next character is another symbol, add previous chunk and then
        # add the symbol.
        elif symbol_kind:
            symbol_start_index = chunk_end
            symbol_end_index = chunk_end + len(symbol_kind.text_repr) - 1

            r = Range(line[symbol_start_index].p, line[symbol_end_index].p)
            symbol_token = Token(symbol_kind, r=r)

            add_chunk(line[chunk_start:chunk_end], tokens)
            tokens.append(symbol_token)

            chunk_start = chunk_end + len(symbol_kind.text_repr)
            chunk_end = chunk_start

        # Include another character in the chunk.
        else:
            chunk_end += 1

    # Flush out anything that is left in the chunk to the output
    add_chunk(line[chunk_start:chunk_end], tokens)

    # Catch a `#include` on a line by itself.
    if (include_line or match_include_command(tokens)) and not seen_filename:
        read_include_filename(line, chunk_end)

    return tokens, in_comment
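
# Sketch of how the comment flag threads between calls: for the two lines
# `a /* start` and `still comment */ b`, the first call returns ([a], True)
# and the second call, entered with in_comment=True, returns ([b], False).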


def chunk_to_str(chunk):
    """Convert the given chunk to a string.

    chunk - list of Tagged characters.
    return - string representation of the list of Tagged characters

    """
    return "".join(c.c for c in chunk)


def match_symbol_kind_at(content, start):
    """Return the longest matching symbol token kind.

    content - List of Tagged objects in which to search for match.
    start (int) - Index, inclusive, at which to start searching for a match.
    returns (TokenKind or None) - Symbol token kind found, or None if no
    match is found.
    """
    for symbol_kind in symbol_kinds:
        try:
            for i, c in enumerate(symbol_kind.text_repr):
                if content[start + i].c != c:
                    break
            else:
                return symbol_kind
        except IndexError:
            pass

    return None
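
# Note that the longest-match behavior relies on symbol_kinds being ordered
# with longer representations first: the loop returns the first kind whose
# text_repr matches, so at input `+=` the two-character kind must be tried
# before the one-character `+` kind.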


def match_include_command(tokens):
    """Check if end of `tokens` is a `#include` directive."""
    return (len(tokens) == 2 and
            tokens[-2].kind == token_kinds.pound and
            tokens[-1].kind == token_kinds.identifier and
            tokens[-1].content == "include")
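
# Because of the len(tokens) == 2 check, a directive is recognized only when
# `#` and `include` are the first two tokens on the line, as in
# `#include <stdio.h>`; a `#include` appearing mid-line is not treated as a
# directive.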


def read_string(line, start, delim, null):
    """Return the lexed string as a list of character values.

    Also returns the index of the string end quote.

    line[start] should be the first character after the opening quote of the
    string to be lexed. This function continues reading characters until
    an unescaped closing quote is reached. The length returned is the
    number of input characters that were read, not the length of the
    string. The latter is the length of the lexed string list.

    The lexed string is a list of integers, where each integer is the
    numeric value of the corresponding character (its ASCII code, or the
    value of an escape sequence). If `null` is True, the returned lexed
    string includes a null-terminator.

    line - List of Tagged objects for each character in the line.
    start - Index at which to start reading the string.
    delim - Delimiter with which the string ends, like `"` or `'`
    null - Whether to add a null-terminator to the returned character list
    """
    i = start
    chars = []

    escapes = {"'": 39,
               '"': 34,
               "?": 63,
               "\\": 92,
               "a": 7,
               "b": 8,
               "f": 12,
               "n": 10,
               "r": 13,
               "t": 9,
               "v": 11}

    octdigits = "01234567"
    hexdigits = "0123456789abcdefABCDEF"

    while True:
        if i >= len(line):
            descrip = "missing terminating quote"
            raise CompilerError(descrip, line[start - 1].r)
        elif line[i].c == delim:
            if null:
                chars.append(0)
            return chars, i
        elif (i + 1 < len(line)
              and line[i].c == "\\"
              and line[i + 1].c in escapes):
            chars.append(escapes[line[i + 1].c])
            i += 2
        elif (i + 1 < len(line)
              and line[i].c == "\\"
              and line[i + 1].c in octdigits):
            octal = line[i + 1].c
            i += 2
            while (i < len(line)
                   and len(octal) < 3
                   and line[i].c in octdigits):
                octal += line[i].c
                i += 1
            chars.append(int(octal, 8))
        elif (i + 2 < len(line)
              and line[i].c == "\\"
              and line[i + 1].c == "x"
              and line[i + 2].c in hexdigits):
            hexa = line[i + 2].c
            i += 3
            while i < len(line) and line[i].c in hexdigits:
                hexa += line[i].c
                i += 1
            chars.append(int(hexa, 16))
        else:
            chars.append(ord(line[i].c))
            i += 1
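
# Worked example: lexing the contents of "ab\n" (start pointing just past the
# opening quote, null=True) returns ([97, 98, 10, 0], i): 'a' and 'b' map to
# their character codes, the escape \n maps to 10, and the null-terminator 0
# is appended. An octal escape like \101 and a hex escape like \x41 both
# yield 65.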


def read_include_filename(line, start):
    """Read a filename that follows a #include directive.

    Expects line[start] to be one of `<` or `"`, then reads characters until a
    matching symbol is reached. Then, returns as a string the characters
    read including the initial and final symbol markers. The index returned
    is that of the closing token in the filename.
    """
    if start < len(line) and line[start].c == '"':
        end = '"'
    elif start < len(line) and line[start].c == "<":
        end = ">"
    else:
        descrip = "expected \"FILENAME\" or <FILENAME> after include directive"
        if start < len(line):
            char = line[start]
        else:
            char = line[-1]
        raise CompilerError(descrip, char.r)

    i = start + 1
    try:
        while line[i].c != end:
            i += 1
    except IndexError:
        descrip = "missing terminating character for include filename"
        raise CompilerError(descrip, line[start].r)

    return chunk_to_str(line[start:i + 1]), i
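
# For example, on the line `#include <stdio.h>` with start at the `<`, this
# returns ("<stdio.h>", index of the closing `>`). The marker characters are
# kept in the returned string, so later stages can distinguish the quoted
# and angle-bracket include forms.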


def add_chunk(chunk, tokens):
    """Convert chunk into a token if possible and add to tokens.

    If chunk is non-empty but cannot be made into a token, this function
    records a compiler error. We don't need to check for symbol kind tokens
    here because they are converted before they are shifted into the chunk.

    chunk - Chunk to convert into a token, as list of Tagged characters.
    tokens (List[Token]) - List of the tokens thus far parsed.

    """
    if chunk:
        range_ = Range(chunk[0].p, chunk[-1].p)

        keyword_kind = match_keyword_kind(chunk)
        if keyword_kind:
            tokens.append(Token(keyword_kind, r=range_))
            return

        number_string = match_number_string(chunk)
        if number_string:
            tokens.append(Token(token_kinds.number, number_string, r=range_))
            return

        identifier_name = match_identifier_name(chunk)
        if identifier_name:
            tokens.append(Token(
                token_kinds.identifier, identifier_name, r=range_))
            return

        descrip = f"unrecognized token at '{chunk_to_str(chunk)}'"
        raise CompilerError(descrip, range_)
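
# For example, a chunk spelling `int` becomes a keyword token, `42` a number
# token, and `x1` an identifier token; a chunk like `4x` matches none of
# these and raises an "unrecognized token" error.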


def match_keyword_kind(token_repr):
    """Find the keyword token kind whose representation is exactly token_repr.

    token_repr - Token representation to match exactly, as list of Tagged
    characters.
    returns (TokenKind, or None) - Keyword token kind that matched.

    """
    token_str = chunk_to_str(token_repr)
    for keyword_kind in keyword_kinds:
        if keyword_kind.text_repr == token_str:
            return keyword_kind
    return None


def match_number_string(token_repr):
    """Return a string that represents the given constant number.

    token_repr - List of Tagged characters.
    returns (str, or None) - String representation of the number.

    """
    token_str = chunk_to_str(token_repr)
    return token_str if token_str.isdigit() else None


def match_identifier_name(token_repr):
    """Return a string that represents the name of an identifier.

    token_repr - List of Tagged characters.
    returns (str, or None) - String name of the identifier.

    """
    token_str = chunk_to_str(token_repr)
    if re.match(r"[_a-zA-Z][_a-zA-Z0-9]*$", token_str):
        return token_str
    else:
        return None
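

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original module. It assumes
    # the shivyc package is importable and that Token objects print readably;
    # error_collector.show() is assumed to be the driver's way of printing
    # accumulated issues.
    sample = r'''
#include <stdio.h>
int main() {
    char c = '\n'; /* a C-style comment */
    return 0; // a line comment
}
'''
    for tok in tokenize(sample, "<sample>"):
        print(tok)
    error_collector.show()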