This repository has been archived by the owner on Jun 15, 2019. It is now read-only.
forked from zaach/ebnf-parser
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ebnf.y
144 lines (113 loc) · 3.47 KB
/
ebnf.y
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* EBNF grammar spec */
%code imports %{
import XRegExp from '@gerhobbelt/xregexp'; // for helping out the `%options xregexp` in the lexer
%}
%lex
// ---------------------------------------------------------------------------
// Lexer macro definitions.
//
// Each line binds a NAME to a regex fragment; `{NAME}` elsewhere in the lexer
// spec refers back to that fragment.
// ---------------------------------------------------------------------------

// Plain ASCII letters. (The upper bound must be `Z`: the range `A-z` would
// also cover the punctuation characters `[ \ ] ^ _` and the backtick.)
ASCII_LETTER                            [a-zA-Z]
// \p{Alphabetic} already includes [a-zA-Z], hence we don't need to merge
// with {UNICODE_LETTER} (though jison has code to optimize if you *did*
// include the `[a-zA-Z]` anyway):
UNICODE_LETTER                          [\p{Alphabetic}]
// A letter or an underscore: the characters allowed at the start of a name.
ALPHA                                   [{UNICODE_LETTER}_]
DIGIT                                   [\p{Number}]
WHITESPACE                              [\s\r\n\p{Separator}]
// Any character allowed after the first one in an identifier.
ALNUM                                   [{ALPHA}{DIGIT}]
// NAME permits dashes in the middle, but never as the first or last character.
NAME                                    [{ALPHA}](?:[{ALNUM}-]*{ALNUM})?
// ID is an {ALPHA} followed by any number of {ALNUM} characters (no dashes).
ID                                      [{ALPHA}]{ALNUM}*
DECIMAL_NUMBER                          [1-9][0-9]*
HEX_NUMBER                              "0"[xX][0-9a-fA-F]+
// Line break in any of the three common encodings.
BR                                      \r\n|\n|\r
// WhiteSpace MUST NOT match CR/LF and the regex `\s` DOES, so we cannot use
// that one directly. Instead we define the {WS} macro here:
WS                                      [^\S\r\n]
// Quoted string content: support *escaped* quotes inside strings:
QUOTED_STRING_CONTENT                   (?:\\\'|\\[^\']|[^\\\'])*
DOUBLEQUOTED_STRING_CONTENT             (?:\\\"|\\[^\"]|[^\\\"])*

// Lexer generator options. `xregexp` is what makes the `\p{...}` Unicode
// classes above usable; see the lexer generator documentation for the exact
// semantics of the other two.
%options easy_keyword_rules
%options ranges
%options xregexp
%%
// ---------------------------------------------------------------------------
// Lexer rules: `pattern  action`.
//
// Every action is kept on the same line as its pattern, so that it cannot be
// mistaken for the start of a new rule.
// ---------------------------------------------------------------------------

\s+                                     /* skip whitespace */

// Identifiers and the special `$end` marker are both plain grammar symbols.
{ID}                                    return 'SYMBOL';
"$end"                                  return 'SYMBOL';

// `[name]` alias: hand only the bare name (without the brackets) to the parser.
// NOTE(review): this relies on the `{ID}` macro expansion providing capture
// group 1 in `this.matches` -- confirm against the lexer generator in use.
"["{ID}"]"                              yytext = this.matches[1]; return 'ALIAS';

// Stringified tokens are always `'`-surrounded by the bnf.y grammar unless the token
// itself contains an `'`.
//
// Note about edge case: EBNF grammars should not barf a hairball if someone
// ever decided that the combo of quotes, i.e. `'"` would be a legal token in their grammar,
// e.g. `rule: A '\'"' B`.
//
// And, yes, we assume that the `bnf.y` parser is our regular input source, so we may
// be a bit stricter here in what we lex than in the userland-facing `bnf.l` lexer.
\'{QUOTED_STRING_CONTENT}\'             return 'SYMBOL';
\"{DOUBLEQUOTED_STRING_CONTENT}\"       return 'SYMBOL';
"."                                     return 'SYMBOL';

// Grouping, repetition and alternation operators are returned as themselves.
"("                                     return '(';
")"                                     return ')';
"*"                                     return '*';
"?"                                     return '?';
"|"                                     return '|';
"+"                                     return '+';

<<EOF>>                                 return 'EOF';
/lex
%start production
%%
/*
 * Start rule: a single handle followed by end-of-input.
 * The parse result is the handle's value (an array, possibly empty).
 */
production
    : handle EOF
        { return $1; }
    ;
/*
 * One or more handles separated by `|`.
 * Value: an array holding each alternative's handle, in source order.
 */
handle_list
    : handle
        { $$ = [$1]; }
    | handle_list '|' handle
        {
            /* Append to the list built so far and keep using that same array. */
            $1.push($3);
            $$ = $1;
        }
    ;
/*
 * A handle is either empty or a non-empty `rule` sequence.
 * Value: an array of suffixed expressions; `[]` for the empty handle.
 */
handle
    : %epsilon
        { $$ = []; }
    | rule
        { $$ = $1; }
    ;
/*
 * One or more suffixed expressions in sequence.
 * Value: an array of the expressions' values, in source order.
 */
rule
    : suffixed_expression
        { $$ = [$1]; }
    | rule suffixed_expression
        {
            /* Left-recursive accumulation into the existing array. */
            $1.push($2);
            $$ = $1;
        }
    ;
/*
 * An expression with an optional repetition suffix and an optional `[alias]`.
 *
 * Value:
 *   - with alias:              ['xalias', suffix, expression, alias]
 *                              (suffix is `undefined` when none was given)
 *   - with suffix, no alias:   [suffix, expression]
 *   - neither:                 the expression value itself, unwrapped
 */
suffixed_expression
    : expression suffix ALIAS
        { $$ = ['xalias', $2, $1, $3]; }
    | expression suffix
        { $$ = ($2 ? [$2, $1] : $1); }
    ;
/*
 * A primary expression: a single symbol, or a parenthesised list of
 * alternatives.
 *
 * Value: ['symbol', text] for a SYMBOL token, or ['()', alternatives] for a
 * group, where `alternatives` is the array produced by `handle_list`.
 */
expression
    : SYMBOL
        { $$ = ['symbol', $1]; }
    | '(' handle_list ')'
        { $$ = ['()', $2]; }
    ;
/*
 * Optional repetition suffix following an expression.
 * Value: the matched operator token's value for `*`, `?` or `+`, or
 * `undefined` when no suffix is present.
 */
suffix
: %epsilon
/* No suffix: an explicit `undefined` lets the consumer test for absence. */
{ $$ = undefined; }
| '*'
{ $$ = $1; }
| '?'
{ $$ = $1; }
| '+'
{ $$ = $1; }
;