GitHub Repository: godotengine/godot
Path: blob/master/modules/gdscript/gdscript_tokenizer_buffer.cpp
/**************************************************************************/
/*  gdscript_tokenizer_buffer.cpp                                         */
/**************************************************************************/
/*                         This file is part of:                          */
/*                             GODOT ENGINE                               */
/*                        https://godotengine.org                         */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
/*                                                                        */
/* Permission is hereby granted, free of charge, to any person obtaining  */
/* a copy of this software and associated documentation files (the        */
/* "Software"), to deal in the Software without restriction, including    */
/* without limitation the rights to use, copy, modify, merge, publish,    */
/* distribute, sublicense, and/or sell copies of the Software, and to     */
/* permit persons to whom the Software is furnished to do so, subject to  */
/* the following conditions:                                              */
/*                                                                        */
/* The above copyright notice and this permission notice shall be         */
/* included in all copies or substantial portions of the Software.        */
/*                                                                        */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
/**************************************************************************/

#include "gdscript_tokenizer_buffer.h"

#include "core/io/compression.h"
#include "core/io/marshalls.h"
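
// Serializes a single token into `r_buffer` starting at `p_start`. Identifiers
// and literals are interned into the shared maps and only their index is
// stored. Plain tokens take 5 bytes (1 type byte + 4 start-line bytes); tokens
// carrying a map index take 8 bytes (4 bytes holding
// type | index << TOKEN_BITS | TOKEN_BYTE_MASK, then 4 line bytes).
// Returns the number of bytes written.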
int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) {
	int pos = p_start;

	int token_type = p_token.type & TOKEN_MASK;

	switch (p_token.type) {
		case GDScriptTokenizer::Token::ANNOTATION:
		case GDScriptTokenizer::Token::IDENTIFIER: {
			// Add identifier to map.
			int identifier_pos;
			StringName id = p_token.get_identifier();
			if (r_identifiers_map.has(id)) {
				identifier_pos = r_identifiers_map[id];
			} else {
				identifier_pos = r_identifiers_map.size();
				r_identifiers_map[id] = identifier_pos;
			}
			token_type |= identifier_pos << TOKEN_BITS;
		} break;
		case GDScriptTokenizer::Token::ERROR:
		case GDScriptTokenizer::Token::LITERAL: {
			// Add literal to map.
			int constant_pos;
			if (r_constants_map.has(p_token.literal)) {
				constant_pos = r_constants_map[p_token.literal];
			} else {
				constant_pos = r_constants_map.size();
				r_constants_map[p_token.literal] = constant_pos;
			}
			token_type |= constant_pos << TOKEN_BITS;
		} break;
		default:
			break;
	}

	// Encode token.
	int token_len;
	if (token_type & ~TOKEN_MASK) {
		token_len = 8;
		r_buffer.resize(pos + token_len);
		encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]);
		pos += 4;
	} else {
		token_len = 5;
		r_buffer.resize(pos + token_len);
		r_buffer.write[pos] = token_type;
		pos++;
	}
	encode_uint32(p_token.start_line, &r_buffer.write[pos]);
	return token_len;
}
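
// Deserializes one token from `p_buffer`. The TOKEN_BYTE_MASK bit selects
// between the 5-byte and 8-byte encodings; identifier and literal payloads
// are looked up in the `identifiers` and `constants` tables, with
// out-of-bounds indices reported as ERROR tokens.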
GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) {
	Token token;
	const uint8_t *b = p_buffer;

	uint32_t token_type = decode_uint32(b);
	token.type = (Token::Type)(token_type & TOKEN_MASK);
	if (token_type & TOKEN_BYTE_MASK) {
		b += 4;
	} else {
		b++;
	}
	token.start_line = decode_uint32(b);
	token.end_line = token.start_line;

	token.literal = token.get_name();
	if (token.type == Token::CONST_NAN) {
		token.literal = String("NAN"); // Special case since name and notation are different.
	}

	switch (token.type) {
		case GDScriptTokenizer::Token::ANNOTATION:
		case GDScriptTokenizer::Token::IDENTIFIER: {
			// Get name from map.
			int identifier_pos = token_type >> TOKEN_BITS;
			if (unlikely(identifier_pos >= identifiers.size())) {
				Token error;
				error.type = Token::ERROR;
				error.literal = "Identifier index out of bounds.";
				return error;
			}
			token.literal = identifiers[identifier_pos];
		} break;
		case GDScriptTokenizer::Token::ERROR:
		case GDScriptTokenizer::Token::LITERAL: {
			// Get literal from map.
			int constant_pos = token_type >> TOKEN_BITS;
			if (unlikely(constant_pos >= constants.size())) {
				Token error;
				error.type = Token::ERROR;
				error.literal = "Constant index out of bounds.";
				return error;
			}
			token.literal = constants[constant_pos];
		} break;
		default:
			break;
	}

	return token;
}
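
// Loads a binary token stream produced by parse_code_string(). Layout:
// 4-byte "GDSC" magic, 4-byte version, 4-byte decompressed size (0 means the
// contents are stored uncompressed, otherwise they are Zstandard-compressed),
// then identifier/constant/line/token counts, the identifier strings
// (UTF-32, each byte XOR-ed with 0xb6), the constant Variants, the token
// line and column maps, and finally the token stream itself.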
Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
	const uint8_t *buf = p_buffer.ptr();
	ERR_FAIL_COND_V(p_buffer.size() < 12 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);

	int version = decode_uint32(&buf[4]);
	ERR_FAIL_COND_V_MSG(version != TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is not compatible with this engine version.");

	int decompressed_size = decode_uint32(&buf[8]);

	Vector<uint8_t> contents;
	if (decompressed_size == 0) {
		contents = p_buffer.slice(12);
	} else {
		contents.resize(decompressed_size);
		const int64_t result = Compression::decompress(contents.ptrw(), contents.size(), &buf[12], p_buffer.size() - 12, Compression::MODE_ZSTD);
		ERR_FAIL_COND_V_MSG(result != decompressed_size, ERR_INVALID_DATA, "Error decompressing GDScript tokenizer buffer.");
	}

	int total_len = contents.size();
	buf = contents.ptr();
	uint32_t identifier_count = decode_uint32(&buf[0]);
	uint32_t constant_count = decode_uint32(&buf[4]);
	uint32_t token_line_count = decode_uint32(&buf[8]);
	uint32_t token_count = decode_uint32(&buf[12]);

	const uint8_t *b = &buf[16];
	total_len -= 16;

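	// Read the identifier table: each entry is a length-prefixed UTF-32 string whose bytes are XOR-obfuscated with 0xb6.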
	identifiers.resize(identifier_count);
	for (uint32_t i = 0; i < identifier_count; i++) {
		uint32_t len = decode_uint32(b);
		total_len -= 4;
		ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA);
		b += 4;
		Vector<uint32_t> cs;
		cs.resize(len);
		for (uint32_t j = 0; j < len; j++) {
			uint8_t tmp[4];
			for (uint32_t k = 0; k < 4; k++) {
				tmp[k] = b[j * 4 + k] ^ 0xb6;
			}
			cs.write[j] = decode_uint32(tmp);
		}

		String s = String::utf32(Span(reinterpret_cast<const char32_t *>(cs.ptr()), len));
		b += len * 4;
		total_len -= len * 4;
		identifiers.write[i] = s;
	}

	constants.resize(constant_count);
	for (uint32_t i = 0; i < constant_count; i++) {
		Variant v;
		int len;
		Error err = decode_variant(v, b, total_len, &len, false);
		if (err) {
			return err;
		}
		b += len;
		total_len -= len;
		constants.write[i] = v;
	}

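	// Read the sparse token line and column maps (token index -> source position).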
	for (uint32_t i = 0; i < token_line_count; i++) {
		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
		uint32_t token_index = decode_uint32(b);
		b += 4;
		uint32_t line = decode_uint32(b);
		b += 4;
		total_len -= 8;
		token_lines[token_index] = line;
	}
	for (uint32_t i = 0; i < token_line_count; i++) {
		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
		uint32_t token_index = decode_uint32(b);
		b += 4;
		uint32_t column = decode_uint32(b);
		b += 4;
		total_len -= 8;
		token_columns[token_index] = column;
	}

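	// Decode the token stream; the first byte's TOKEN_BYTE_MASK bit tells each token's width.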
	tokens.resize(token_count);
	for (uint32_t i = 0; i < token_count; i++) {
		int token_len = 5;
		if ((*b) & TOKEN_BYTE_MASK) {
			token_len = 8;
		}
		ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA);
		Token token = _binary_to_token(b);
		b += token_len;
		ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA);
		tokens.write[i] = token;
		total_len -= token_len;
	}

	ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA);

	return OK;
}
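
// Tokenizes `p_code` and serializes the whole stream into the binary format
// read by set_code_buffer(), optionally Zstandard-compressing the payload.
// A buffer produced here can be handed straight back to set_code_buffer();
// this is the format used for exported binary scripts.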
Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code, CompressMode p_compress_mode) {
	HashMap<StringName, uint32_t> identifier_map;
	HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
	Vector<uint8_t> token_buffer;
	HashMap<uint32_t, uint32_t> token_lines;
	HashMap<uint32_t, uint32_t> token_columns;

	GDScriptTokenizerText tokenizer;
	tokenizer.set_source_code(p_code);
	tokenizer.set_multiline_mode(true); // Ignore whitespace tokens.
	Token current = tokenizer.scan();
	int token_pos = 0;
	int last_token_line = 0;
	int token_counter = 0;

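	// Line/column info is only stored for the first token of each new line; scan() later reconstructs NEWLINE, INDENT, and DEDENT tokens from it.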
	while (current.type != Token::TK_EOF) {
		int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map);
		token_pos += token_len;
		if (token_counter > 0 && current.start_line > last_token_line) {
			token_lines[token_counter] = current.start_line;
			token_columns[token_counter] = current.start_column;
		}
		last_token_line = current.end_line;

		current = tokenizer.scan();
		token_counter++;
	}

	// Reverse maps.
	Vector<StringName> rev_identifier_map;
	rev_identifier_map.resize(identifier_map.size());
	for (const KeyValue<StringName, uint32_t> &E : identifier_map) {
		rev_identifier_map.write[E.value] = E.key;
	}
	Vector<Variant> rev_constant_map;
	rev_constant_map.resize(constant_map.size());
	for (const KeyValue<Variant, uint32_t> &E : constant_map) {
		rev_constant_map.write[E.value] = E.key;
	}
	HashMap<uint32_t, uint32_t> rev_token_lines;
	for (const KeyValue<uint32_t, uint32_t> &E : token_lines) {
		rev_token_lines[E.value] = E.key;
	}

	// Remove continuation lines from map.
	for (int line : tokenizer.get_continuation_lines()) {
		if (rev_token_lines.has(line)) {
			token_lines.erase(rev_token_lines[line]);
			token_columns.erase(rev_token_lines[line]);
		}
	}

	Vector<uint8_t> contents;
	contents.resize(16);
	encode_uint32(identifier_map.size(), &contents.write[0]);
	encode_uint32(constant_map.size(), &contents.write[4]);
	encode_uint32(token_lines.size(), &contents.write[8]);
	encode_uint32(token_counter, &contents.write[12]);

	int buf_pos = 16;

	// Save identifiers.
	for (const StringName &id : rev_identifier_map) {
		String s = id.operator String();
		int len = s.length();

		contents.resize(buf_pos + (len + 1) * 4);

		encode_uint32(len, &contents.write[buf_pos]);
		buf_pos += 4;

		for (int i = 0; i < len; i++) {
			uint8_t tmp[4];
			encode_uint32(s[i], tmp);

			for (int b = 0; b < 4; b++) {
				contents.write[buf_pos + b] = tmp[b] ^ 0xb6;
			}

			buf_pos += 4;
		}
	}

	// Save constants.
	for (const Variant &v : rev_constant_map) {
		int len;
		// Objects cannot be constant, never encode objects.
		Error err = encode_variant(v, nullptr, len, false);
		ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
		contents.resize(buf_pos + len);
		encode_variant(v, &contents.write[buf_pos], len, false);
		buf_pos += len;
	}

	// Save lines and columns.
	contents.resize(buf_pos + token_lines.size() * 16);
	for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}
	for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}

	// Store tokens.
	contents.append_array(token_buffer);

	Vector<uint8_t> buf;

	// Save header.
	buf.resize(12);
	buf.write[0] = 'G';
	buf.write[1] = 'D';
	buf.write[2] = 'S';
	buf.write[3] = 'C';
	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);

	switch (p_compress_mode) {
		case COMPRESS_NONE:
			encode_uint32(0u, &buf.write[8]);
			buf.append_array(contents);
			break;

		case COMPRESS_ZSTD: {
			encode_uint32(contents.size(), &buf.write[8]);
			Vector<uint8_t> compressed;
			const int64_t max_size = Compression::get_max_compressed_buffer_size(contents.size(), Compression::MODE_ZSTD);
			compressed.resize(max_size);

			const int64_t compressed_size = Compression::compress(compressed.ptrw(), contents.ptr(), contents.size(), Compression::MODE_ZSTD);
			ERR_FAIL_COND_V_MSG(compressed_size < 0, Vector<uint8_t>(), "Error compressing GDScript tokenizer buffer.");
			compressed.resize(compressed_size);

			buf.append_array(compressed);
		} break;
	}

	return buf;
}
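
// The buffer tokenizer replays pre-tokenized code, so cursor tracking (used
// by the text tokenizer for code completion) is a no-op here.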
int GDScriptTokenizerBuffer::get_cursor_line() const {
	return 0;
}

int GDScriptTokenizerBuffer::get_cursor_column() const {
	return 0;
}

void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) {
}

void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) {
	multiline_mode = p_state;
}

bool GDScriptTokenizerBuffer::is_past_cursor() const {
	return false;
}
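
// Blocks indented inside an expression (such as a lambda body) save and
// restore the whole indent stack, so the surrounding INDENT/DEDENT
// bookkeeping is unaffected.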
void GDScriptTokenizerBuffer::push_expression_indented_block() {
	indent_stack_stack.push_back(indent_stack);
}

void GDScriptTokenizerBuffer::pop_expression_indented_block() {
	ERR_FAIL_COND(indent_stack_stack.is_empty());
	indent_stack = indent_stack_stack.back()->get();
	indent_stack_stack.pop_back();
}
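
// Replays the decoded token stream. NEWLINE, INDENT, and DEDENT tokens are
// not stored in the buffer; they are synthesized here from the token
// line/column maps, mirroring what the text tokenizer would have produced.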
GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() {
	// Add final newline.
	if (current >= tokens.size() && !last_token_was_newline) {
		Token newline;
		newline.type = Token::NEWLINE;
		newline.start_line = current_line;
		newline.end_line = current_line;
		last_token_was_newline = true;
		return newline;
	}

	// Resolve pending indentation change.
	if (pending_indents > 0) {
		pending_indents--;
		Token indent;
		indent.type = Token::INDENT;
		indent.start_line = current_line;
		indent.end_line = current_line;
		return indent;
	} else if (pending_indents < 0) {
		pending_indents++;
		Token dedent;
		dedent.type = Token::DEDENT;
		dedent.start_line = current_line;
		dedent.end_line = current_line;
		return dedent;
	}

	if (current >= tokens.size()) {
		if (!indent_stack.is_empty()) {
			pending_indents -= indent_stack.size();
			indent_stack.clear();
			return scan();
		}
		Token eof;
		eof.type = Token::TK_EOF;
		return eof;
	}

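	// A token that begins a new line: emit a NEWLINE and, outside multiline mode, compare the stored column against the indent stack to queue INDENT/DEDENT tokens.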
	if (!last_token_was_newline && token_lines.has(current)) {
		current_line = token_lines[current];
		uint32_t current_column = token_columns[current];

		// Check if there's a need to indent/dedent.
		if (!multiline_mode) {
			uint32_t previous_indent = 0;
			if (!indent_stack.is_empty()) {
				previous_indent = indent_stack.back()->get();
			}
			if (current_column - 1 > previous_indent) {
				pending_indents++;
				indent_stack.push_back(current_column - 1);
			} else {
				while (current_column - 1 < previous_indent) {
					pending_indents--;
					indent_stack.pop_back();
					if (indent_stack.is_empty()) {
						break;
					}
					previous_indent = indent_stack.back()->get();
				}
			}

			Token newline;
			newline.type = Token::NEWLINE;
			newline.start_line = current_line;
			newline.end_line = current_line;
			last_token_was_newline = true;

			return newline;
		}
	}

	last_token_was_newline = false;

	Token token = tokens[current++];
	return token;
}