-
Notifications
You must be signed in to change notification settings - Fork 6
/
Clojure.g4
248 lines (172 loc) · 7.33 KB
/
Clojure.g4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
grammar Clojure;
/*
* NOTES to myself and to other developers:
*
* - You have to remember that the parser cannot check for semantics
* - You have to find the right balance of dividing enforcement between the
* grammar and your own code.
*
* The parser should only check the syntax. So the rule of thumb is that when
* in doubt you let the parser pass the content up to your program. Then, in
* your program, you check the semantics and make sure that the rules actually
* have a proper meaning.
*
* https://tomassetti.me/antlr-mega-tutorial/#lexers-and-parser
*/
// Entry rule: any number of inputs followed by the end of the file.
code
    : input* EOF
    ;

// Useful rule to tell actual Clojure content (form) apart from anything else (ignore).
input
    : ignore
    | form
    ;

// Content that carries no value: whitespace, comments and discarded forms.
ignore
    : whitespace
    | comment
    | discard
    ;

// A value-carrying element: a literal, a collection or a reader macro.
form
    : literal
    | collection
    | reader_macro
    ;
// Sets and namespaced maps are not considered collections from the grammar's
// perspective since they start with # -> dispatch macro.
collection
    : list
    | vector
    | map
    ;

// Each collection is a pair of delimiters around any number of inputs.
list
    : '(' input* ')'
    ;

vector
    : '[' input* ']'
    ;

map
    : '{' input* '}'
    ;
// Atomic values. Each parser rule below wraps one or more lexer tokens so that
// the parse tree carries a named node for every kind of literal.
literal
    : keyword
    | macro_keyword
    | string
    | number
    | character
    | symbol
    ;

keyword
    : KEYWORD
    ;

macro_keyword
    : MACRO_KEYWORD
    ;

string
    : STRING
    ;

// Every numeric token kind is reported as a number.
number
    : OCTAL
    | HEXADECIMAL
    | RADIX
    | RATIO
    | LONG
    | DOUBLE
    ;

// Every character token kind is reported as a character.
character
    : NAMED_CHAR
    | OCTAL_CHAR
    | UNICODE_CHAR
    | UNICODE
    ;

symbol
    : SYMBOL
    ;
// Any of the reader macros: the single-prefix ones plus everything reachable
// through the # dispatch rule.
reader_macro
    : unquote
    | metadata
    | backtick
    | quote
    | dispatch
    | unquote_splicing
    | deref
    ;
// One or more metadata entries (either the ^ or the deprecated #^ spelling),
// each optionally followed by ignorable content, and then the element that
// closes the metadata expression.
metadata
    : ( (metadata_entry | deprecated_metadata_entry) ignore* )+
      ( symbol
      | collection
      | set
      | namespaced_map
      | tag
      | fn
      | unquote
      | unquote_splicing
      | conditional
      | conditional_splicing
      | deref
      | quote
      | backtick
      | var_quote
      )
    ;
// A single ^ marker, optional ignorable content and the metadata value itself.
metadata_entry
    : '^' ignore* (map | symbol | string | keyword | macro_keyword | conditional)
    ;

/**
 * The `#^` spelling of a metadata entry, which is deprecated according to
 * https://github.com/clojure/clojure-site/blob/7493bdb10222719923519bfd6d2699a26677ee82/content/guides/weird_characters.adoc#-and----metadata
 *
 * It is a rule of its own because supporting roundtrip of parser rules requires
 * identifying exactly which characters were used, which would not be possible
 * with something like '#'? '^'
 */
deprecated_metadata_entry
    : '#^' ignore* (map | symbol | string | keyword | macro_keyword | conditional)
    ;
// Prefix reader macros: a marker, optional ignorable content, then one form.
backtick
    : '`' ignore* form
    ;

quote
    : '\'' ignore* form
    ;

unquote
    : '~' ignore* form
    ;

unquote_splicing
    : '~@' ignore* form
    ;

deref
    : '@' ignore* form
    ;
// Everything introduced by the # dispatch character.
dispatch
    : fn
    | regex
    | set
    | conditional
    | conditional_splicing
    | namespaced_map
    | var_quote
    | tag
    | symbolic
    | eval
    ;
// Anonymous function literal; no whitespace allowed between # and the list.
fn
    : '#' list
    ;

// Regular expression literal: # immediately followed by a string token.
regex
    : '#' STRING
    ;

// Set literal; no whitespace allowed, '#{' is a single token.
set
    : '#{' input* '}'
    ;

// A map prefixed by # and a keyword, a macro keyword or a bare '::'.
namespaced_map
    : '#' (keyword | macro_keyword | auto_resolve) ignore* map
    ;

auto_resolve
    : '::'
    ;

var_quote
    : '#\'' ignore* form
    ;

// A form that is read and then thrown away; it is part of the `ignore` rule.
discard
    : '#_' ignore* form
    ;

// Tagged literal: # followed by a symbol and the form it applies to.
tag
    : '#' symbol ignore* form
    ;

// Reader conditionals: only whitespace (not comments nor discarded forms)
// is accepted between the marker and the list.
conditional
    : '#?' whitespace* list
    ;

conditional_splicing
    : '#?@' whitespace* list
    ;
/* Symbolic value: ## followed by a symbol token.
 * Arbitrary symbols are accepted on purpose, following LispReader, which
 * just reads the form and throws if the symbol is not known.
 */
symbolic
    : '##' ignore* SYMBOL
    ;

// I assume symbol and list from the lisp reader, but tools.reader seems to
// indicate something else.
eval
    : '#=' ignore* (symbol | list | conditional)
    ;
// Parser-level wrappers around the WHITESPACE and COMMENT tokens.
whitespace
    : WHITESPACE
    ;

comment
    : COMMENT
    ;
// Number tokens. Check LispReader for the patterns used to match numbers.

// Leading zero followed by at least one octal digit.
OCTAL
    : SIGN? ZERO [0-7]+ BIG_INT?
    ;

// Leading 0x / 0X followed by at least one hexadecimal digit.
HEXADECIMAL
    : SIGN? ZERO [xX] [0-9A-Fa-f]+ BIG_INT?
    ;

// The base prefix accepts 2-9, 10-29 and 30-36.
// No BIG_INT suffix here: radix cannot be read as a big int?
// Is this a bug in LispReader?
RADIX
    : SIGN? ([2-9] | [1-2] [0-9] | '3' [0-6]) [rR] [0-9a-zA-Z]+
    ;

RATIO
    : SIGN? DIGIT+ '/' DIGIT+
    ;

LONG
    : SIGN? DECIMAL BIG_INT?
    ;

fragment BIG_INT
    : 'N'
    ;

// An 'M' suffix, a fraction or an exponent is mandatory, which forbids numbers
// like 0002341349 (invalid octal) from matching double.
DOUBLE
    : SIGN? DECIMAL+ ('M' | (FRACTION | EXPONENT | FRACTION EXPONENT) 'M'?)
    ;

// A dot followed by zero or more digits.
fragment FRACTION
    : '.' DIGIT*
    ;

fragment EXPONENT
    : [eE] SIGN? DIGIT+
    ;

// Either a lone zero or a digit run that does not start with zero.
fragment DECIMAL
    : ZERO
    | [1-9] DIGIT*
    ;

fragment ZERO
    : '0'
    ;
// Double-quoted text: runs of characters that are neither a quote nor a
// backslash, interleaved with backslash escapes of any single character.
STRING
    : '"' ~["\\]* ('\\' . ~["\\]*)* '"'
    ;

// Any unicode whitespace "character"; the comma is accepted as well.
WHITESPACE
    : [\p{White_Space},]+
    ;

// Starts at ';' or '#!' and runs up to (not including) the end of the line.
COMMENT
    : (';' | '#!') ~[\r\n]*
    ;
// Character tokens: a backslash followed by a name, a single character,
// a unicode escape or an octal escape.
NAMED_CHAR
    : ESCAPE
      ( 'newline'
      | 'return'
      | 'space'
      | 'tab'
      | 'formfeed'
      | 'backspace'
      )
    ;

// This is supposed to be the JavaScript friendly version of #'\P{M}\p{M}*+'
// mentioned here: https://www.regular-expressions.info/unicode.html
// It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block
// ticking all 'Combining Diacritical Marks' boxes.
UNICODE_CHAR
    : ESCAPE ~[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF]
    ;

// \u followed by exactly four hexadecimal digits.
UNICODE
    : ESCAPE 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
    ;

// Octal character must be between 0 and 377, hence the [0-3] on three digits.
// https://github.com/clojure/clojure/blob/06097b1369c502090be6489be27cc280633cb1bd/src/jvm/clojure/lang/LispReader.java#L604
// https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
OCTAL_CHAR
    : ESCAPE 'o' ([0-7] | [0-7] [0-7] | [0-3] [0-7] [0-7])
    ;

fragment ESCAPE
    : '\\'
    ;
// Keyword tokens.
//
// Parcera currently (01.11.20) doesn't support multiple / inside
// keywords nor symbols as documented in https://clojure.org/reference/reader#_symbols
// See https://github.com/carocad/parcera/pull/94 for a more in-depth discussion.
// ::/ is NOT a valid macro keyword, unlike :/
MACRO_KEYWORD: '::' (SIMPLE_KEYWORD '/')? SIMPLE_KEYWORD;
// ':' plus an optional `SIMPLE_KEYWORD '/'` prefix (presumably the namespace)
// and then either a simple keyword or a lone '/'
KEYWORD: ':' (SIMPLE_KEYWORD '/')? (SIMPLE_KEYWORD | '/');
fragment SIMPLE_KEYWORD: // a single character like + -
KEYWORD_HEAD
// a keyword can contain : on the body
| (KEYWORD_HEAD KEYWORD_BODY+);
// characters allowed after the first one: everything KEYWORD_HEAD allows plus ':'
fragment KEYWORD_BODY: KEYWORD_HEAD | ':';
// characters allowed in first position; unlike symbols, digits are accepted here
fragment KEYWORD_HEAD: ALLOWED_NAME_CHARACTER | DIGIT | [#'] | SIGN;
// Symbol token: an optional `SIMPLE_SYMBOL '/'` prefix followed by either a
// simple symbol or a lone '/'
SYMBOL: (SIMPLE_SYMBOL '/')? (SIMPLE_SYMBOL | '/');
fragment SIMPLE_SYMBOL: // a single character like + - etc (a lone / is handled by SYMBOL itself)
(ALLOWED_NAME_CHARACTER | SIGN)
// a symbol that starts with +- cannot be followed by a number
| (SIGN SYMBOL_HEAD SYMBOL_BODY*)
// a symbol that doesn't start with +- can be followed by a number like 't2#'
| (ALLOWED_NAME_CHARACTER (SYMBOL_HEAD | DIGIT | ':') SYMBOL_BODY*);
// symbols can contain : # ' as part of their names
fragment SYMBOL_BODY: SYMBOL_HEAD | DIGIT | ':';
// same as SYMBOL_BODY minus digits and ':'
fragment SYMBOL_HEAD: ALLOWED_NAME_CHARACTER | [#'] | SIGN;
// https://stackoverflow.com/a/15503680
// Used to avoid the parser matching a single invalid token as the composition
// of two valid tokens. No parser rule references SENTINEL, so input that lexes
// to it cannot be part of a valid parse. Examples of what would otherwise happen:
// +9hello -> [:number +9] [:symbol hello]
// \o423 -> [:character \o42] [:number 3]
SENTINEL: (ESCAPE (ALLOWED_NAME_CHARACTER | DIGIT)+) // invalid literal chars
| '::/' // invalid macro keyword
| (':' (ALLOWED_NAME_CHARACTER | DIGIT | SIGN | ':' | '/')+) // invalid keyword
| (ALLOWED_NAME_CHARACTER | DIGIT | SIGN | '/')+; // invalid symbol
// This is the set of characters that are allowed by all symbols and keywords.
// However, it is stricter than necessary so that we can re-use it for both:
// characters such as : # ' / digits and signs are excluded here and added back
// by the individual rules that accept them.
fragment ALLOWED_NAME_CHARACTER: ~[\p{White_Space},()[\]{}"@~^;`\\:#'/0-9+-];
// leading sign of a number; also usable inside symbol and keyword names
fragment SIGN: [+-];
// a single ASCII decimal digit
fragment DIGIT: [0-9];