forked from ocsigen/js_of_ocaml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
js_lexer.mll
346 lines (304 loc) · 11.8 KB
/
js_lexer.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
{
(* Js_of_ocaml compiler
* Copyright (C) 2013 Hugo Heuzard
*)
(* Yoann Padioleau
*
* Copyright (C) 2010 Facebook
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation, with the
* special exception on linking described in file license.txt.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
* license.txt for more details.
*)
open Js_token
let tok lexbuf = Lexing.lexeme lexbuf
let keyword_table =
let h = Hashtbl.create 17 in
List.iter (fun (s,f) -> Hashtbl.add h s f ) [
"catch", (fun ii -> T_CATCH ii);
"finally", (fun ii -> T_FINALLY ii);
"in", (fun ii -> T_IN ii);
"instanceof", (fun ii -> T_INSTANCEOF ii);
"else", (fun ii -> T_ELSE ii);
"while", (fun ii -> T_WHILE ii);
"break", (fun ii -> T_BREAK ii);
"case", (fun ii -> T_CASE ii);
"continue", (fun ii -> T_CONTINUE ii);
"default", (fun ii -> T_DEFAULT ii);
"delete", (fun ii -> T_DELETE ii);
"do", (fun ii -> T_DO ii);
"else", (fun ii -> T_ELSE ii);
"for", (fun ii -> T_FOR ii);
"function", (fun ii -> T_FUNCTION ii);
"if", (fun ii -> T_IF ii);
"new", (fun ii -> T_NEW ii);
"return", (fun ii -> T_RETURN ii);
"switch", (fun ii -> T_SWITCH ii);
"this", (fun ii -> T_THIS ii);
"throw", (fun ii -> T_THROW ii);
"try", (fun ii -> T_TRY ii);
"typeof", (fun ii -> T_TYPEOF ii);
"var", (fun ii -> T_VAR ii);
"void", (fun ii -> T_VOID ii);
"while", (fun ii -> T_WHILE ii);
"with", (fun ii -> T_WITH ii);
"null", (fun ii -> T_NULL ii);
"false", (fun ii -> T_FALSE ii);
"true", (fun ii -> T_TRUE ii);
"debugger", (fun ii -> T_DEBUGGER ii);
];
h
let hexa_to_int = function
| '0'..'9' as x -> Char.code x - Char.code '0'
| 'a'..'f' as x -> Char.code x - Char.code 'a' + 10
| 'A'..'F' as x -> Char.code x - Char.code 'A' + 10
| _ -> assert false;;
}
(*****************************************************************************)
let NEWLINE = ("\r"|"\n"|"\r\n")
let hexa = ['0'-'9''a'-'f''A'-'F']
let inputCharacter = [^ '\r' '\n' ]
(*****************************************************************************)
rule initial tokinfo prev = parse
(* ----------------------------------------------------------------------- *)
(* spacing/comments *)
(* ----------------------------------------------------------------------- *)
| "/*" {
let info = tokinfo lexbuf in
let buf = Buffer.create 127 in
let nl = ref false in
st_comment buf nl lexbuf;
let content = Buffer.contents buf in
if !nl
then TCommentML(info,content)
else TComment(info,content)
}
(* don't keep the trailing \n; it will be in another token *)
| "//" (inputCharacter* as cmt) { TComment(tokinfo lexbuf,cmt) }
| ([' ' '\t' ]+ as cmt) { TCommentSpace(tokinfo lexbuf,cmt) }
| NEWLINE {
lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with
Lexing.pos_lnum = lexbuf.Lexing.lex_curr_p.Lexing.pos_lnum + 1 };
TCommentNewline(tokinfo lexbuf,"") }
(* ----------------------------------------------------------------------- *)
(* symbols *)
(* ----------------------------------------------------------------------- *)
| "{" { T_LCURLY (tokinfo lexbuf); }
| "}" { T_RCURLY (tokinfo lexbuf); }
| "(" { T_LPAREN (tokinfo lexbuf); }
| ")" { T_RPAREN (tokinfo lexbuf); }
| "[" { T_LBRACKET (tokinfo lexbuf); }
| "]" { T_RBRACKET (tokinfo lexbuf); }
| "." { T_PERIOD (tokinfo lexbuf); }
| ";" { T_SEMICOLON (tokinfo lexbuf); }
| "," { T_COMMA (tokinfo lexbuf); }
| ":" { T_COLON (tokinfo lexbuf); }
| "?" { T_PLING (tokinfo lexbuf); }
| "&&" { T_AND (tokinfo lexbuf); }
| "||" { T_OR (tokinfo lexbuf); }
| "===" { T_STRICT_EQUAL (tokinfo lexbuf); }
| "!==" { T_STRICT_NOT_EQUAL (tokinfo lexbuf); }
| "<=" { T_LESS_THAN_EQUAL (tokinfo lexbuf); }
| ">=" { T_GREATER_THAN_EQUAL (tokinfo lexbuf); }
| "==" { T_EQUAL (tokinfo lexbuf); }
| "!=" { T_NOT_EQUAL (tokinfo lexbuf); }
| "++" {
let cpi = tokinfo lexbuf in
match prev with
| Some p when (Js_token.info_of_tok p).Parse_info.line = cpi.Parse_info.line ->
T_INCR_NB(cpi)
| _ -> T_INCR(cpi) }
| "--" {
let cpi = tokinfo lexbuf in
match prev with
| Some p when (Js_token.info_of_tok p).Parse_info.line = cpi.Parse_info.line ->
T_DECR_NB(cpi)
| _ -> T_DECR(cpi) }
| "<<=" { T_LSHIFT_ASSIGN (tokinfo lexbuf); }
| "<<" { T_LSHIFT (tokinfo lexbuf); }
| ">>=" { T_RSHIFT_ASSIGN (tokinfo lexbuf); }
| ">>>=" { T_RSHIFT3_ASSIGN (tokinfo lexbuf); }
| ">>>" { T_RSHIFT3 (tokinfo lexbuf); }
| ">>" { T_RSHIFT (tokinfo lexbuf); }
| "+=" { T_PLUS_ASSIGN (tokinfo lexbuf); }
| "-=" { T_MINUS_ASSIGN (tokinfo lexbuf); }
| "*=" { T_MULT_ASSIGN (tokinfo lexbuf); }
| "%=" { T_MOD_ASSIGN (tokinfo lexbuf); }
| "&=" { T_BIT_AND_ASSIGN (tokinfo lexbuf); }
| "|=" { T_BIT_OR_ASSIGN (tokinfo lexbuf); }
| "^=" { T_BIT_XOR_ASSIGN (tokinfo lexbuf); }
| "<" { T_LESS_THAN (tokinfo lexbuf); }
| ">" { T_GREATER_THAN (tokinfo lexbuf); }
| "+" { T_PLUS (tokinfo lexbuf); }
| "-" { T_MINUS (tokinfo lexbuf); }
| "*" { T_MULT (tokinfo lexbuf); }
(* for '/' see below the regexp handling *)
| "%" { T_MOD (tokinfo lexbuf); }
| "|" { T_BIT_OR (tokinfo lexbuf); }
| "&" { T_BIT_AND (tokinfo lexbuf); }
| "^" { T_BIT_XOR (tokinfo lexbuf); }
| "!" { T_NOT (tokinfo lexbuf); }
| "~" { T_BIT_NOT (tokinfo lexbuf); }
| "=" { T_ASSIGN (tokinfo lexbuf); }
(* ----------------------------------------------------------------------- *)
(* Keywords and ident *)
(* ----------------------------------------------------------------------- *)
| ['a'-'z''A'-'Z''$''_']['a'-'z''A'-'Z''$''_''0'-'9']* {
let s = tok lexbuf in
let info = tokinfo lexbuf in
try
let f = Hashtbl.find keyword_table s in
f info (* need case insensitive ? *)
with
| Not_found -> T_IDENTIFIER (s, info)
}
(* ----------------------------------------------------------------------- *)
(* Constant *)
(* ----------------------------------------------------------------------- *)
| "0" ['X''x'] hexa+ {
let s = tok lexbuf in
let info = tokinfo lexbuf in
T_NUMBER (s, float_of_string s, info)
}
| '0'['0'-'7']+ {
let s = tok lexbuf in
let s' = String.sub s 1 (String.length s - 1 ) in
let info = tokinfo lexbuf in
T_NUMBER (s, Int64.(to_float (of_string ("0o"^s'))), info)
}
| ['0'-'9']*'.'?['0'-'9']+['e''E']['-''+']?['0'-'9']+ (* {1,3} *) {
let s = tok lexbuf in
let info = tokinfo lexbuf in
T_NUMBER (s, float_of_string s,info)
}
| ['0'-'9']+'.'? |
['0'-'9']*'.'['0'-'9']+ {
let s = tok lexbuf in
let info = tokinfo lexbuf in
T_NUMBER (s, float_of_string s, info)
}
(* ----------------------------------------------------------------------- *)
(* Strings *)
(* ----------------------------------------------------------------------- *)
| ("'"|'"') as quote {
let info = tokinfo lexbuf in
let buf = Buffer.create 127 in
string_quote quote buf lexbuf;
let s = Buffer.contents buf in
(* s does not contain the enclosing "'" but the info does *)
T_STRING (s, info)
}
(* ----------------------------------------------------------------------- *)
(* Regexp *)
(* ----------------------------------------------------------------------- *)
(* take care of ambiguity with start of comment //, and with
* '/' as a divisor operator
*
* it can not be '/' [^ '/']* '/' because then
* comments will not be recognized as lex tries
* to find the longest match.
*
* It can not be
* '/' [^'*''/'] ([^'/''\n'])* '/' ['A'-'Z''a'-'z']*
* because a / (b/c) will be recognized as a regexp.
*
*)
| "/" | "/=" {
let s = tok lexbuf in
let info = tokinfo lexbuf in
match prev with
| Some (
T_IDENTIFIER _
| T_NUMBER _ | T_STRING _ | T_REGEX _
| T_FALSE _ | T_TRUE _ | T_NULL _
| T_THIS _
| T_INCR _ | T_DECR _
| T_RBRACKET _ | T_RPAREN _
) -> begin match s with
| "/" -> T_DIV (info);
| "/=" -> T_DIV_ASSIGN info
| _ -> assert false
end
| _ ->
(* raise (Token t); *)
let buf = Buffer.create 127 in
Buffer.add_string buf s;
regexp buf lexbuf;
T_REGEX (Buffer.contents buf, info)
}
(* ----------------------------------------------------------------------- *)
(* eof *)
(* ----------------------------------------------------------------------- *)
| eof { EOF (tokinfo lexbuf) }
| _ {
(* Format.eprintf "LEXER:unrecognised symbol, in token rule: %s@." (tok lexbuf); *)
TUnknown (tokinfo lexbuf, tok lexbuf)
}
(*****************************************************************************)
and string_escape quote buf = parse
| '\\'{ Buffer.add_string buf "\\\\" }
| 'x' hexa hexa
| 'u' hexa hexa hexa hexa {
Buffer.add_char buf '\\';
Buffer.add_string buf (Lexing.lexeme lexbuf) }
| (_ as c)
{ if c <> '\'' && c <> '\"' then Buffer.add_char buf '\\';
Buffer.add_char buf c }
| eof { Format.eprintf "LEXER: WIERD end of file in string_escape@."; ()}
and string_quote q buf = parse
| ("'"|'"') as q' {
if q = q'
then ()
else (Buffer.add_char buf q'; string_quote q buf lexbuf) }
| '\\' {
string_escape q buf lexbuf;
string_quote q buf lexbuf
}
| (_ as x) { Buffer.add_char buf x; string_quote q buf lexbuf }
| eof { Format.eprintf "LEXER: WIERD end of file in quoted string@."; ()}
(*****************************************************************************)
and regexp buf = parse
| '\\' (_ as x) { Buffer.add_char buf '\\';
Buffer.add_char buf x;
regexp buf lexbuf }
| '/' { Buffer.add_char buf '/'; regexp_maybe_ident buf lexbuf }
| '[' { Buffer.add_char buf '['; regexp_class buf lexbuf }
| (_ as x) { Buffer.add_char buf x; regexp buf lexbuf }
| eof { Format.eprintf "LEXER: WIERD end of file in regexp@."; ()}
and regexp_class buf = parse
| ']' { Buffer.add_char buf ']';
regexp buf lexbuf }
| '\\' (_ as x) { Buffer.add_char buf '\\';
Buffer.add_char buf x;
regexp_class buf lexbuf }
| (_ as x) { Buffer.add_char buf x; regexp_class buf lexbuf }
| eof { Format.eprintf "LEXER: WIERD end of file in regexp_class@."; ()}
and regexp_maybe_ident buf = parse
| ['A'-'Z''a'-'z']* { Buffer.add_string buf (tok lexbuf) }
(*****************************************************************************)
and st_comment buf nl = parse
| "*/" { Buffer.add_string buf (tok lexbuf) }
(* noteopti: *)
| NEWLINE { Buffer.add_string buf (tok lexbuf);
nl := true;
st_comment buf nl lexbuf }
| [^'*' '\n' '\r' ]+ { Buffer.add_string buf (tok lexbuf);st_comment buf nl lexbuf }
| '*' { Buffer.add_char buf '*';st_comment buf nl lexbuf }
| eof { Format.eprintf "LEXER: end of file in comment@."; Buffer.add_string buf "*/"}
| _ {
let s = tok lexbuf in
Format.eprintf "LEXER: unrecognised symbol in comment: %s@." s;
Buffer.add_string buf s;
st_comment buf nl lexbuf
}
and pos = parse
| '#' [' ' '\t' ]+ (['0'-'9']+ as line) [' ' '\t' ]+ (("'"|'"') as quote) {
let buf = Buffer.create 127 in
string_quote quote buf lexbuf;
Buffer.contents buf, int_of_string line }