Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/BMPSet.java
41161 views
1
/*
2
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
/*
27
******************************************************************************
28
*
29
* Copyright (C) 2009-2014, International Business Machines
30
* Corporation and others. All Rights Reserved.
31
*
32
******************************************************************************
33
*/
34
35
package jdk.internal.icu.impl;
36
37
import jdk.internal.icu.text.UnicodeSet.SpanCondition;
38
import jdk.internal.icu.util.OutputInt;
39
40
/**
41
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
42
*
43
* Latin-1: Look up bytes.
44
* 2-byte characters: Bits organized vertically.
45
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
46
* Supplementary characters: Call contains() on the parent set.
47
*/
48
public final class BMPSet {
49
50
/**
51
* One boolean ('true' or 'false') per Latin-1 character.
52
*/
53
private boolean[] latin1Contains;
54
55
/**
56
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
57
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
58
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
59
*
60
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
61
* runtime.
62
*/
63
private int[] table7FF;
64
65
/**
66
* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
67
* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
68
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
69
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
70
* and set.contains(c) must be called.
71
*
72
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
73
* validity checking at runtime.
74
*/
75
private int[] bmpBlockBits;
76
77
/**
78
* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
79
* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
80
* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
81
*/
82
private int[] list4kStarts;
83
84
/**
85
* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
86
* supplementary code points. The list is terminated with list[listLength-1]=0x110000.
87
*/
88
private final int[] list;
89
private final int listLength; // length used; list may be longer to minimize reallocs
90
91
public BMPSet(final int[] parentList, int parentListLength) {
92
list = parentList;
93
listLength = parentListLength;
94
latin1Contains = new boolean[0x100];
95
table7FF = new int[64];
96
bmpBlockBits = new int[64];
97
list4kStarts = new int[18];
98
99
/*
100
* Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
101
* first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
102
* indexes is for finding supplementary code points.
103
*/
104
list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
105
int i;
106
for (i = 1; i <= 0x10; ++i) {
107
list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
108
}
109
list4kStarts[0x11] = listLength - 1;
110
111
initBits();
112
}
113
114
public boolean contains(int c) {
115
if (c <= 0xff) {
116
return (latin1Contains[c]);
117
} else if (c <= 0x7ff) {
118
return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
119
} else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
120
int lead = c >> 12;
121
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
122
if (twoBits <= 1) {
123
// All 64 code points with the same bits 15..6
124
// are either in the set or not.
125
return (0 != twoBits);
126
} else {
127
// Look up the code point in its 4k block of code points.
128
return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
129
}
130
} else if (c <= 0x10ffff) {
131
// surrogate or supplementary code point
132
return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
133
} else {
134
// Out-of-range code points get false, consistent with long-standing
135
// behavior of UnicodeSet.contains(c).
136
return false;
137
}
138
}
139
140
/**
141
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
142
* spanCondition==0 or 1.
143
*
144
* @param start The start index
145
* @param outCount If not null: Receives the number of code points in the span.
146
* @return the limit (exclusive end) of the span
147
*
148
* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
149
* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
150
* as usual in ICU.
151
*/
152
public final int span(CharSequence s, int start, SpanCondition spanCondition,
153
OutputInt outCount) {
154
char c, c2;
155
int i = start;
156
int limit = s.length();
157
int numSupplementary = 0;
158
if (SpanCondition.NOT_CONTAINED != spanCondition) {
159
// span
160
while (i < limit) {
161
c = s.charAt(i);
162
if (c <= 0xff) {
163
if (!latin1Contains[c]) {
164
break;
165
}
166
} else if (c <= 0x7ff) {
167
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
168
break;
169
}
170
} else if (c < 0xd800 ||
171
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
172
int lead = c >> 12;
173
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
174
if (twoBits <= 1) {
175
// All 64 code points with the same bits 15..6
176
// are either in the set or not.
177
if (twoBits == 0) {
178
break;
179
}
180
} else {
181
// Look up the code point in its 4k block of code points.
182
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
183
break;
184
}
185
}
186
} else {
187
// surrogate pair
188
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
189
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
190
break;
191
}
192
++numSupplementary;
193
++i;
194
}
195
++i;
196
}
197
} else {
198
// span not
199
while (i < limit) {
200
c = s.charAt(i);
201
if (c <= 0xff) {
202
if (latin1Contains[c]) {
203
break;
204
}
205
} else if (c <= 0x7ff) {
206
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
207
break;
208
}
209
} else if (c < 0xd800 ||
210
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
211
int lead = c >> 12;
212
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
213
if (twoBits <= 1) {
214
// All 64 code points with the same bits 15..6
215
// are either in the set or not.
216
if (twoBits != 0) {
217
break;
218
}
219
} else {
220
// Look up the code point in its 4k block of code points.
221
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
222
break;
223
}
224
}
225
} else {
226
// surrogate pair
227
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
228
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
229
break;
230
}
231
++numSupplementary;
232
++i;
233
}
234
++i;
235
}
236
}
237
if (outCount != null) {
238
int spanLength = i - start;
239
outCount.value = spanLength - numSupplementary; // number of code points
240
}
241
return i;
242
}
243
244
/**
245
* Symmetrical with span().
246
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
247
* limit and spanCondition==0 or 1.
248
*
249
* @return The string index which starts the span (i.e. inclusive).
250
*/
251
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
252
char c, c2;
253
254
if (SpanCondition.NOT_CONTAINED != spanCondition) {
255
// span
256
for (;;) {
257
c = s.charAt(--limit);
258
if (c <= 0xff) {
259
if (!latin1Contains[c]) {
260
break;
261
}
262
} else if (c <= 0x7ff) {
263
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
264
break;
265
}
266
} else if (c < 0xd800 ||
267
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
268
int lead = c >> 12;
269
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
270
if (twoBits <= 1) {
271
// All 64 code points with the same bits 15..6
272
// are either in the set or not.
273
if (twoBits == 0) {
274
break;
275
}
276
} else {
277
// Look up the code point in its 4k block of code points.
278
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
279
break;
280
}
281
}
282
} else {
283
// surrogate pair
284
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
285
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
286
break;
287
}
288
--limit;
289
}
290
if (0 == limit) {
291
return 0;
292
}
293
}
294
} else {
295
// span not
296
for (;;) {
297
c = s.charAt(--limit);
298
if (c <= 0xff) {
299
if (latin1Contains[c]) {
300
break;
301
}
302
} else if (c <= 0x7ff) {
303
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
304
break;
305
}
306
} else if (c < 0xd800 ||
307
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
308
int lead = c >> 12;
309
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
310
if (twoBits <= 1) {
311
// All 64 code points with the same bits 15..6
312
// are either in the set or not.
313
if (twoBits != 0) {
314
break;
315
}
316
} else {
317
// Look up the code point in its 4k block of code points.
318
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
319
break;
320
}
321
}
322
} else {
323
// surrogate pair
324
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
325
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
326
break;
327
}
328
--limit;
329
}
330
if (0 == limit) {
331
return 0;
332
}
333
}
334
}
335
return limit + 1;
336
}
337
338
/**
339
* Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
340
*/
341
private static void set32x64Bits(int[] table, int start, int limit) {
342
assert (64 == table.length);
343
int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
344
int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
345
346
// Set one bit indicating an all-one block.
347
int bits = 1 << lead;
348
if ((start + 1) == limit) { // Single-character shortcut.
349
table[trail] |= bits;
350
return;
351
}
352
353
int limitLead = limit >> 6;
354
int limitTrail = limit & 0x3f;
355
356
if (lead == limitLead) {
357
// Partial vertical bit column.
358
while (trail < limitTrail) {
359
table[trail++] |= bits;
360
}
361
} else {
362
// Partial vertical bit column,
363
// followed by a bit rectangle,
364
// followed by another partial vertical bit column.
365
if (trail > 0) {
366
do {
367
table[trail++] |= bits;
368
} while (trail < 64);
369
++lead;
370
}
371
if (lead < limitLead) {
372
bits = ~((1 << lead) - 1);
373
if (limitLead < 0x20) {
374
bits &= (1 << limitLead) - 1;
375
}
376
for (trail = 0; trail < 64; ++trail) {
377
table[trail] |= bits;
378
}
379
}
380
// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
381
// In that case, bits=1<<limitLead == 1<<0 == 1
382
// (because Java << uses only the lower 5 bits of the shift operand)
383
// but the bits value is not used because trail<limitTrail is already false.
384
bits = 1 << limitLead;
385
for (trail = 0; trail < limitTrail; ++trail) {
386
table[trail] |= bits;
387
}
388
}
389
}
390
391
private void initBits() {
392
int start, limit;
393
int listIndex = 0;
394
395
// Set latin1Contains[].
396
do {
397
start = list[listIndex++];
398
if (listIndex < listLength) {
399
limit = list[listIndex++];
400
} else {
401
limit = 0x110000;
402
}
403
if (start >= 0x100) {
404
break;
405
}
406
do {
407
latin1Contains[start++] = true;
408
} while (start < limit && start < 0x100);
409
} while (limit <= 0x100);
410
411
// Set table7FF[].
412
while (start < 0x800) {
413
set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
414
if (limit > 0x800) {
415
start = 0x800;
416
break;
417
}
418
419
start = list[listIndex++];
420
if (listIndex < listLength) {
421
limit = list[listIndex++];
422
} else {
423
limit = 0x110000;
424
}
425
}
426
427
// Set bmpBlockBits[].
428
int minStart = 0x800;
429
while (start < 0x10000) {
430
if (limit > 0x10000) {
431
limit = 0x10000;
432
}
433
434
if (start < minStart) {
435
start = minStart;
436
}
437
if (start < limit) { // Else: Another range entirely in a known mixed-value block.
438
if (0 != (start & 0x3f)) {
439
// Mixed-value block of 64 code points.
440
start >>= 6;
441
bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
442
start = (start + 1) << 6; // Round up to the next block boundary.
443
minStart = start; // Ignore further ranges in this block.
444
}
445
if (start < limit) {
446
if (start < (limit & ~0x3f)) {
447
// Multiple all-ones blocks of 64 code points each.
448
set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
449
}
450
451
if (0 != (limit & 0x3f)) {
452
// Mixed-value block of 64 code points.
453
limit >>= 6;
454
bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
455
limit = (limit + 1) << 6; // Round up to the next block boundary.
456
minStart = limit; // Ignore further ranges in this block.
457
}
458
}
459
}
460
461
if (limit == 0x10000) {
462
break;
463
}
464
465
start = list[listIndex++];
466
if (listIndex < listLength) {
467
limit = list[listIndex++];
468
} else {
469
limit = 0x110000;
470
}
471
}
472
}
473
474
/**
475
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
476
* points in a certain range.
477
*
478
* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
479
* hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
480
*
481
* @param c
482
* a character in a subrange of MIN_VALUE..MAX_VALUE
483
* @param lo
484
* The lowest index to be returned.
485
* @param hi
486
* The highest index to be returned.
487
* @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
488
*/
489
private int findCodePoint(int c, int lo, int hi) {
490
/* Examples:
491
findCodePoint(c)
492
set list[] c=0 1 3 4 7 8
493
=== ============== ===========
494
[] [110000] 0 0 0 0 0 0
495
[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
496
[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
497
[:Any:] [0, 110000] 1 1 1 1 1 1
498
*/
499
500
// Return the smallest i such that c < list[i]. Assume
501
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
502
if (c < list[lo])
503
return lo;
504
// High runner test. c is often after the last range, so an
505
// initial check for this condition pays off.
506
if (lo >= hi || c >= list[hi - 1])
507
return hi;
508
// invariant: c >= list[lo]
509
// invariant: c < list[hi]
510
for (;;) {
511
int i = (lo + hi) >>> 1;
512
if (i == lo) {
513
break; // Found!
514
} else if (c < list[i]) {
515
hi = i;
516
} else {
517
lo = i;
518
}
519
}
520
return hi;
521
}
522
523
private final boolean containsSlow(int c, int lo, int hi) {
524
return (0 != (findCodePoint(c, lo, hi) & 1));
525
}
526
}
527
528
529