"use strict";

// Multibyte codec. In this scheme, a character is represented by 1 or more bytes.
// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences.
// To save memory and loading time, we read table files only when requested.

exports._dbcs = DBCSCodec;

// Marker values stored in the decode/encode tables next to real (non-negative) character codes.
var UNASSIGNED = -1,        // No mapping for this byte / character.
    GB18030_CODE = -2,      // End of a GB18030 4-byte sequence.
    SEQ_START = -10,        // Values <= SEQ_START are indexes into a sequence table.
    NODE_START = -1000,     // Values <= NODE_START are indexes of the next decode trie node.
    UNASSIGNED_NODE = new Array(0x100), // Template node; copied with .slice(0) to create new nodes/buckets.
    DEF_CHAR = -1;          // Key used in encodeTableSeq objects for the "end of sequence" code.

for (var i = 0; i < 0x100; i++)
    UNASSIGNED_NODE[i] = UNASSIGNED;


// Class DBCSCodec reads and initializes mapping tables.
//   codecOptions - codec description. Fields read here: `encodingName`, `table` (function returning the
//                  mapping table chunks) and the optional `encodeSkipVals`, `encodeAdd` and `gb18030`
//                  (function returning the GB18030 ranges).
//   iconv        - object providing `defaultCharUnicode` and `defaultCharSingleByte`.
// Throws an Error when codecOptions or codecOptions.table is missing.
function DBCSCodec(codecOptions, iconv) {
    // Validate first: reading codecOptions.encodingName before this check would raise a TypeError instead.
    if (!codecOptions)
        throw new Error("DBCS codec is called without the data.");
    this.encodingName = codecOptions.encodingName;
    if (!codecOptions.table)
        throw new Error("Encoding '" + this.encodingName + "' has no data.");

    // Load tables.
    var mappingTable = codecOptions.table();


    // Decode tables: MBCS -> Unicode.

    // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256.
    // Trie root is decodeTables[0].
    // Values: >=  0 -> unicode character code. can be > 0xFFFF
    //         == UNASSIGNED -> unknown/unassigned sequence.
    //         == GB18030_CODE -> this is the end of a GB18030 4-byte sequence.
    //         <= NODE_START -> index of the next node in our trie to process next byte.
    //         <= SEQ_START  -> index of the start of a character code sequence, in decodeTableSeq.
    this.decodeTables = [];
    this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

    // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
    this.decodeTableSeq = [];

    // Actual mapping tables consist of chunks. Use them to fill up decode tables.
    for (var i = 0; i < mappingTable.length; i++)
        this._addDecodeChunk(mappingTable[i]);

    this.defaultCharUnicode = iconv.defaultCharUnicode;


    // Encode tables: Unicode -> DBCS.

    // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
    // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null.
    // Values: >=  0 -> it is a normal char. Write the value (if < 0x100 then 1 byte, if < 0x10000 then 2 bytes, etc.).
    //         == UNASSIGNED -> no conversion found. Output a default char.
    //         <= SEQ_START  -> it's an index in encodeTableSeq, see below. The character starts a sequence.
    this.encodeTable = [];

    // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
    // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
    // means end of sequence (needed when one sequence is a strict subsequence of another).
    // Objects are kept separately from encodeTable to increase performance.
    this.encodeTableSeq = [];

    // Some chars can be decoded, but need not be encoded.
    // encodeSkipVals entries are either single numbers or {from, to} inclusive ranges of multibyte codes.
    var skipEncodeChars = {};
    if (codecOptions.encodeSkipVals)
        for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
            var val = codecOptions.encodeSkipVals[i];
            if (typeof val === 'number')
                skipEncodeChars[val] = true;
            else
                for (var j = val.from; j <= val.to; j++)
                    skipEncodeChars[j] = true;
        }

    // Use decode trie to recursively fill out encode tables.
    this._fillEncodeTable(0, 0, skipEncodeChars);

    // Add more encoding pairs when needed.
    if (codecOptions.encodeAdd) {
        for (var uChar in codecOptions.encodeAdd)
            if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
    }

    // Single-byte replacement written for unencodable characters: the encoding of iconv's default char,
    // else the encoding of '?', else the raw code of '?'.
    // The bucket is indexed by char code, so '?' must be looked up through charCodeAt(0).
    this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]["?".charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);


    // Load & create GB18030 tables when needed.
    if (typeof codecOptions.gb18030 === 'function') {
        this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.

        // Add GB18030 decode tables.
        var thirdByteNodeIdx = this.decodeTables.length;
        var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        var fourthByteNodeIdx = this.decodeTables.length;
        var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        // Byte 1 in 0x81..0xFE, byte 2 in 0x30..0x39, byte 3 in 0x81..0xFE, byte 4 in 0x30..0x39.
        // Assumes every first byte 0x81..0xFE already points to a second-byte node in the root.
        for (var i = 0x81; i <= 0xFE; i++) {
            var secondByteNodeIdx = NODE_START - this.decodeTables[0][i];
            var secondByteNode = this.decodeTables[secondByteNodeIdx];
            for (var j = 0x30; j <= 0x39; j++)
                secondByteNode[j] = NODE_START - thirdByteNodeIdx;
        }
        for (var i = 0x81; i <= 0xFE; i++)
            thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
        for (var i = 0x30; i <= 0x39; i++)
            fourthByteNode[i] = GB18030_CODE;
    }
}

// Constructors exposed on the codec for its encoder and decoder objects.
DBCSCodec.prototype.encoder = DBCSEncoder;
DBCSCodec.prototype.decoder = DBCSDecoder;

// Decoder helpers

// Returns the trie node in which the last byte of multibyte code `addr` is stored. Walks from the root
// (decodeTables[0]) through the leading bytes of addr, creating missing intermediate nodes on the way.
// Throws an Error if a leading byte is already mapped to something other than a trie node.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split addr into bytes, least significant first. A copy is shifted so that the original
    // address is still intact when the error message below is built.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie.
        var val = node[bytes[i]];

        if (val == UNASSIGNED) { // Create new node.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) { // Existing node.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
};


// Writes one mapping table chunk into the decode trie.
//   chunk[0]  - hex string with the multibyte code of the first character in the chunk.
//   chunk[1..] - strings (characters for consecutive codes) and/or numbers (count of further consecutive
//               unicode code points continuing from the previously written value).
// Throws an Error on a broken surrogate pair, an element of unsupported type, or a chunk that runs
// past the end of its 256-entry node.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    // First element of chunk is the hex mbcs code where we start.
    var curAddr = parseInt(chunk[0], 16);

    // Choose the decoding node where we'll write our chars.
    var writeTable = this._getDecodeTrieNode(curAddr);
    curAddr = curAddr & 0xFF;

    // Write all other elements of the chunk to the table.
    for (var k = 1; k < chunk.length; k++) {
        var part = chunk[k];
        if (typeof part === "string") { // String, write as-is.
            for (var l = 0; l < part.length;) {
                var code = part.charCodeAt(l++);
                if (0xD800 <= code && code < 0xDC00) { // Decode surrogate
                    var codeTrail = part.charCodeAt(l++);
                    if (0xDC00 <= codeTrail && codeTrail < 0xE000)
                        writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00);
                    else
                        throw new Error("Incorrect surrogate pair in " + this.encodingName + " at chunk " + chunk[0]);
                }
                else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used)
                    // The marker char encodes the sequence length: 0xFFF -> 2 chars, 0xFFE -> 3 chars, etc.
                    var len = 0xFFF - code + 2;
                    var seq = [];
                    for (var m = 0; m < len; m++)
                        seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq.

                    writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length;
                    this.decodeTableSeq.push(seq);
                }
                else
                    writeTable[curAddr++] = code; // Basic char
            }
        }
        else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
            var charCode = writeTable[curAddr - 1] + 1;
            for (var l = 0; l < part; l++)
                writeTable[curAddr++] = charCode++;
        }
        else
            throw new Error("Incorrect type '" + typeof part + "' given in " + this.encodingName + " at chunk " + chunk[0]);
    }
    // curAddr is the next free slot, so a chunk that legally ends on slot 0xFF leaves it at 0x100.
    if (curAddr > 0x100)
        throw new Error("Incorrect chunk in " + this.encodingName + " at addr " + chunk[0] + ": too long" + curAddr);
};

// Encoder helpers

// Returns the 256-entry encodeTable bucket that covers unicode code `uCode`, creating it
// (filled with UNASSIGNED) on first use.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var high = uCode >> 8; // This could be > 0xFF because of astral characters.
    if (this.encodeTable[high] === undefined)
        this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand.
    return this.encodeTable[high];
};

// Registers `dbcsCode` as the encoding of unicode code `uCode`.
// An existing plain mapping is kept (the first one set wins). If uCode already starts a sequence,
// dbcsCode is stored as that sequence's DEF_CHAR entry instead.
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;
    if (bucket[low] <= SEQ_START)
        this.encodeTableSeq[SEQ_START-bucket[low]][DEF_CHAR] = dbcsCode; // There's already a sequence, set a single-char subsequence of it.
    else if (bucket[low] == UNASSIGNED)
        bucket[low] = dbcsCode;
};

// Registers `dbcsCode` as the encoding of the unicode character sequence `seq` (array of char codes).
// The first character's encodeTable slot points (via a value <= SEQ_START) to a tree object in
// encodeTableSeq; the following characters are keys on the path and the last one maps to dbcsCode.
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

    // Get the root of character tree according to first character of the sequence.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // There's already a sequence with - use it.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // There was no sequence object - allocate a new one.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Traverse the character tree, allocating new nodes as needed.
    for (var j = 1; j < seq.length-1; j++) {
        uCode = seq[j]; // Each inner character of the sequence is the key of the next tree level.
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal; // A shorter sequence ended here - keep its code as the default.
        }
    }

    // Set the leaf to given dbcsCode.
    uCode = seq[seq.length-1];
    node[uCode] = dbcsCode;
};

// Walks decode trie node `nodeIdx` (recursing into child nodes) and registers the reverse,
// unicode -> multibyte, mapping for every entry.
//   prefix          - multibyte code of the bytes leading to this node, shifted left by 8 bits.
//   skipEncodeChars - object keyed by multibyte code; truthy entries are not added to the encode tables.
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var node = this.decodeTables[nodeIdx];
    for (var i = 0; i < 0x100; i++) {
        var uCode = node[i];
        var mbCode = prefix + i;
        if (skipEncodeChars[mbCode])
            continue;

        if (uCode >= 0)
            this._setEncodeChar(uCode, mbCode);
        else if (uCode <= NODE_START)
            this._fillEncodeTable(NODE_START - uCode, mbCode << 8, skipEncodeChars);
        else if (uCode <= SEQ_START)
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode);
        // UNASSIGNED and GB18030_CODE entries have no table-based reverse mapping.
    }
};



// == Encoder ==================================================================

// Streaming encoder (unicode string -> multibyte Buffer). `options` is not used here;
// `codec` supplies the prebuilt encode tables.
function DBCSEncoder(options, codec) {
    // Encoder state
    this.leadSurrogate = -1;    // Lead surrogate waiting for its trail, or -1.
    this.seqObj = undefined;    // Current node in the encodeTableSeq tree while inside a sequence.

    // Static data
    this.encodeTable = codec.encodeTable;
    this.encodeTableSeq = codec.encodeTableSeq;
    this.defaultCharSingleByte = codec.defCharSB;
    this.gb18030 = codec.gb18030;
}

// Encodes the next chunk of a string and returns the resulting bytes as a Buffer.
// An unpaired lead surrogate or an unfinished character sequence at the end of `str` is kept in
// this.leadSurrogate / this.seqObj and resolved by the next write() or by end().
// Characters without a mapping are written as this.defaultCharSingleByte.
DBCSEncoder.prototype.write = function(str) {
    // Up to 4 (GB18030) or 3 bytes are written per UTF-16 code unit. Two extra units of slack cover
    // bytes owed from the previous call: a pending sequence's default code and/or the replacement
    // for a pending lead surrogate. Without it such output could be silently cut off.
    var newBuf = Buffer.alloc((str.length + 2) * (this.gb18030 ? 4 : 3)),
        leadSurrogate = this.leadSurrogate,
        seqObj = this.seqObj, nextChar = -1,
        i = 0, j = 0,
        uCode, dbcsCode;

    while (true) {
        // 0. Get next character.
        if (nextChar === -1) {
            if (i == str.length) break;
            uCode = str.charCodeAt(i++);
        }
        else {
            uCode = nextChar;
            nextChar = -1;
        }

        // 1. Handle surrogates.
        if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates.
            if (uCode < 0xDC00) { // We've got lead surrogate.
                if (leadSurrogate === -1) {
                    leadSurrogate = uCode;
                    continue;
                } else {
                    leadSurrogate = uCode;
                    // Double lead surrogate found.
                    uCode = UNASSIGNED;
                }
            } else { // We've got trail surrogate.
                if (leadSurrogate !== -1) {
                    uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                    leadSurrogate = -1;
                } else {
                    // Incomplete surrogate pair - only trail surrogate found.
                    uCode = UNASSIGNED;
                }
            }
        }
        else if (leadSurrogate !== -1) {
            // Incomplete surrogate pair - only lead surrogate found.
            nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char.
            leadSurrogate = -1;
        }

        // 2. Convert uCode character.
        dbcsCode = UNASSIGNED;
        if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence
            var resCode = seqObj[uCode];
            if (typeof resCode === 'object') { // Sequence continues.
                seqObj = resCode;
                continue;

            } else if (typeof resCode == 'number') { // Sequence finished. Write it.
                dbcsCode = resCode;

            } else if (resCode == undefined) { // Current character is not part of the sequence.

                // Try default character for this sequence
                resCode = seqObj[DEF_CHAR];
                if (resCode !== undefined) {
                    dbcsCode = resCode; // Found. Write it.
                    nextChar = uCode; // Current character will be written too in the next iteration.

                } else {
                    // TODO: What if we have no default? (resCode == undefined)
                    // Then, we should write first char of the sequence as-is and try the rest recursively.
                    // Didn't do it for now because no encoding has this situation yet.
                    // Currently, just skip the sequence and write current char.
                }
            }
            seqObj = undefined;
        }
        else if (uCode >= 0) { // Regular character
            var subtable = this.encodeTable[uCode >> 8];
            if (subtable !== undefined)
                dbcsCode = subtable[uCode & 0xFF];

            if (dbcsCode <= SEQ_START) { // Sequence start
                seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                continue;
            }

            if (dbcsCode == UNASSIGNED && this.gb18030) {
                // Use GB18030 algorithm to find character(s) to write.
                var idx = findIdx(this.gb18030.uChars, uCode);
                if (idx != -1) {
                    // Linear pointer into the 4-byte space, split into its mixed-radix digits.
                    dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600;
                    newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260;
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10;
                    newBuf[j++] = 0x30 + dbcsCode;
                    continue;
                }
            }
        }

        // 3. Write dbcsCode character.
        if (dbcsCode === UNASSIGNED)
            dbcsCode = this.defaultCharSingleByte;

        if (dbcsCode < 0x100) {
            newBuf[j++] = dbcsCode;
        }
        else if (dbcsCode < 0x10000) {
            newBuf[j++] = dbcsCode >> 8;   // high byte
            newBuf[j++] = dbcsCode & 0xFF; // low byte
        }
        else {
            newBuf[j++] = dbcsCode >> 16;
            newBuf[j++] = (dbcsCode >> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        }
    }

    this.seqObj = seqObj;
    this.leadSurrogate = leadSurrogate;
    return newBuf.slice(0, j);
};

// Flushes encoder state at the end of input.
// Returns undefined when nothing is pending; otherwise a Buffer holding the default code of an
// unfinished sequence (if it has one) followed by the default char for an unpaired lead surrogate.
DBCSEncoder.prototype.end = function() {
    if (this.leadSurrogate === -1 && this.seqObj === undefined)
        return; // All clean. Most often case.

    var newBuf = Buffer.alloc(10), j = 0;

    if (this.seqObj) { // We're in the sequence.
        var dbcsCode = this.seqObj[DEF_CHAR];
        if (dbcsCode !== undefined) { // Write beginning of the sequence.
            // Same byte layout as in write(): 1, 2 or 3 bytes depending on the magnitude of the code.
            if (dbcsCode < 0x100) {
                newBuf[j++] = dbcsCode;
            }
            else if (dbcsCode < 0x10000) {
                newBuf[j++] = dbcsCode >> 8;   // high byte
                newBuf[j++] = dbcsCode & 0xFF; // low byte
            }
            else {
                newBuf[j++] = dbcsCode >> 16;
                newBuf[j++] = (dbcsCode >> 8) & 0xFF;
                newBuf[j++] = dbcsCode & 0xFF;
            }
        } else {
            // No default code for this sequence prefix: nothing is written (same open TODO as in write()).
        }
        this.seqObj = undefined;
    }

    if (this.leadSurrogate !== -1) {
        // Incomplete surrogate pair - only lead surrogate found.
        newBuf[j++] = this.defaultCharSingleByte;
        this.leadSurrogate = -1;
    }

    return newBuf.slice(0, j);
};

// Export for testing
DBCSEncoder.prototype.findIdx = findIdx;


// == Decoder ==================================================================

// Streaming decoder (multibyte Buffer -> unicode string). `options` is not used here;
// `codec` supplies the prebuilt decode tables.
function DBCSDecoder(options, codec) {
    // Decoder state
    this.nodeIdx = 0;               // Current decode trie node (0 is the root).
    this.prevBuf = Buffer.alloc(0); // Bytes of an unfinished character left over from the previous write().

    // Static data
    this.decodeTables = codec.decodeTables;
    this.decodeTableSeq = codec.decodeTableSeq;
    this.defaultCharUnicode = codec.defaultCharUnicode;
    this.gb18030 = codec.gb18030;
}

// Decodes the next chunk of bytes and returns the decoded string.
// Bytes of a character that is not complete at the end of `buf` are kept in this.prevBuf (together
// with the trie position in this.nodeIdx) and are continued by the next write() or flushed by end().
// A byte sequence without a mapping yields this.defaultCharUnicode for its first byte; decoding then
// restarts at the byte that follows.
DBCSDecoder.prototype.write = function(buf) {
    var nodeIdx = this.nodeIdx,
        prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length,
        seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence.
        uCode;

    // Output is UCS-2: budget 2 bytes per input byte, counting the bytes carried over from the previous
    // call as well. A character split across calls can otherwise overflow the buffer and get truncated.
    var newBuf = Buffer.alloc((buf.length + prevBufOffset) * 2);

    if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
        prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

    for (var i = 0, j = 0; i < buf.length; i++) {
        // Negative i addresses bytes that came from the previous call.
        var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

        // Lookup in current trie node.
        uCode = this.decodeTables[nodeIdx][curByte];

        if (uCode >= 0) {
            // Normal character, just use it.
        }
        else if (uCode === UNASSIGNED) { // Unknown char.
            // TODO: Callback with seq.
            //var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
            i = seqStart; // Try to parse again, after skipping first byte of the sequence ('i' will be incremented by 'for' cycle).
            uCode = this.defaultCharUnicode.charCodeAt(0);
        }
        else if (uCode === GB18030_CODE) {
            // Rebuild the linear pointer from the 4 bytes and map it through the GB18030 ranges.
            var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
            var ptr = (curSeq[0]-0x81)*12600 + (curSeq[1]-0x30)*1260 + (curSeq[2]-0x81)*10 + (curSeq[3]-0x30);
            var idx = findIdx(this.gb18030.gbChars, ptr);
            uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
        }
        else if (uCode <= NODE_START) { // Go to next trie node.
            nodeIdx = NODE_START - uCode;
            continue;
        }
        else if (uCode <= SEQ_START) { // Output a sequence of chars.
            var seq = this.decodeTableSeq[SEQ_START - uCode];
            for (var k = 0; k < seq.length - 1; k++) {
                uCode = seq[k];
                newBuf[j++] = uCode & 0xFF;
                newBuf[j++] = uCode >> 8;
            }
            uCode = seq[seq.length-1]; // Last char of the sequence is written by the common code below.
        }
        else
            throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

        // Write the character to buffer, handling higher planes using surrogate pair.
        if (uCode > 0xFFFF) {
            uCode -= 0x10000;
            var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
            newBuf[j++] = uCodeLead & 0xFF;
            newBuf[j++] = uCodeLead >> 8;

            uCode = 0xDC00 + uCode % 0x400;
        }
        newBuf[j++] = uCode & 0xFF;
        newBuf[j++] = uCode >> 8;

        // Reset trie node.
        nodeIdx = 0; seqStart = i+1;
    }

    this.nodeIdx = nodeIdx;
    this.prevBuf = (seqStart >= 0) ? buf.slice(seqStart) : prevBuf.slice(seqStart + prevBufOffset);
    return newBuf.slice(0, j).toString('ucs2');
};

// Flushes the decoder at the end of input and returns the remaining decoded text.
// Leftover bytes form an incomplete character: the first byte is replaced by the default char and
// the rest is decoded again from the trie root, repeating until nothing is left.
DBCSDecoder.prototype.end = function() {
    var ret = '';

    // Try to parse all remaining chars.
    while (this.prevBuf.length > 0) {
        // Skip 1 character in the buffer.
        ret += this.defaultCharUnicode;
        var buf = this.prevBuf.slice(1);

        // Parse remaining as usual.
        this.prevBuf = Buffer.alloc(0);
        this.nodeIdx = 0;
        if (buf.length > 0)
            ret += this.write(buf); // May leave a new incomplete tail in this.prevBuf.
    }

    this.nodeIdx = 0;
    return ret;
};

// Binary search for GB18030. Returns largest i such that table[i] <= val,
// or -1 when there is no such element (val is below table[0], or the table is empty).
// `table` must be sorted in ascending order.
function findIdx(table, val) {
    if (table.length === 0 || table[0] > val)
        return -1;

    var l = 0, r = table.length;
    while (l < r-1) { // always table[l] <= val < table[r]
        var mid = l + Math.floor((r-l+1)/2);
        if (table[mid] <= val)
            l = mid;
        else
            r = mid;
    }
    return l;
}