Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/text/UTF16.java
41161 views
1
/*
2
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/**
26
*******************************************************************************
27
* Copyright (C) 1996-2014, International Business Machines Corporation and
28
* others. All Rights Reserved.
29
*******************************************************************************
30
*/
31
32
package jdk.internal.icu.text;
33
34
import jdk.internal.icu.impl.UCharacterProperty;
35
36
/**
37
* <p>Standalone utility class providing UTF16 character conversions and
38
* indexing conversions.
39
* <p>Code that uses strings alone rarely needs modification.
40
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
41
* operation. Similarly, concatenation is always safe. Substringing is safe if
42
* the start and end are both on UTF-32 boundaries. In normal code, the values
43
* for start and end are on those boundaries, since they arose from operations
44
* like searching. If not, the nearest UTF-32 boundaries can be determined
45
* using <code>bounds()</code>.
46
* <strong>Examples:</strong>
47
* <p>The following examples illustrate use of some of these methods.
48
* <pre>{@code
49
* // iteration forwards: Original
50
* for (int i = 0; i < s.length(); ++i) {
51
* char ch = s.charAt(i);
52
* doSomethingWith(ch);
53
* }
54
*
55
* // iteration forwards: Changes for UTF-32
56
* int ch;
57
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
58
* ch = UTF16.charAt(s, i);
59
* doSomethingWith(ch);
60
* }
61
*
62
* // iteration backwards: Original
63
* for (int i = s.length() - 1; i >= 0; --i) {
64
* char ch = s.charAt(i);
65
* doSomethingWith(ch);
66
* }
67
*
68
* // iteration backwards: Changes for UTF-32
69
* int ch;
70
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
71
* ch = UTF16.charAt(s, i);
72
* doSomethingWith(ch);
73
* }
74
* }</pre>
75
* <strong>Notes:</strong>
76
* <ul>
77
* <li>
78
* <strong>Naming:</strong> For clarity, High and Low surrogates are called
79
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
80
* sense of their ordering in a string. <code>offset16</code> and
81
* <code>offset32</code> are used to distinguish offsets to UTF-16
82
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
83
* used to contain UTF-32 characters, as opposed to <code>char16</code>,
84
* which is a UTF-16 code unit.
85
* </li>
86
* <li>
87
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
88
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
89
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
90
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
91
* </li>
92
* <li>
93
* <strong>Exceptions:</strong> The error checking will throw an exception
94
* if indices are out of bounds. Other than that, all methods will
95
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
96
* values are present. <code>UCharacter.isLegal()</code> can be used to check
97
* for validity if desired.
98
* </li>
99
* <li>
100
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
101
* surrogates, then these are counted as one UTF-32 value. This matches
102
* their iteration behavior, which is vital. It also matches common display
103
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
104
* </li>
105
* <li>
106
* <strong>Optimization:</strong> The method implementations may need
107
* optimization if the compiler doesn't fold static final methods. Since
108
* surrogate pairs will form an exceedingly small percentage of all the text
109
* in the world, the singleton case should always be optimized for.
110
* </li>
111
* </ul>
112
* @author Mark Davis, with help from Markus Scherer
113
* @stable ICU 2.1
114
*/
115
116
public final class UTF16 {

    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;

    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;

    /**
     * The minimum value for Supplementary code points.
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Trail surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Lead surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // private constants ---------------------------------------------------

    /** Mask selecting the bits that identify a lead surrogate. */
    private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;

    /** Mask selecting the bits that identify a trail surrogate. */
    private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;

    /** Mask selecting the bits that identify any surrogate. */
    private static final int SURROGATE_BITMASK = 0xFFFFF800;

    /** Masked value shared by every lead surrogate. */
    private static final int LEAD_SURROGATE_BITS = 0xD800;

    /** Masked value shared by every trail surrogate. */
    private static final int TRAIL_SURROGATE_BITS = 0xDC00;

    /** Masked value shared by every surrogate. */
    private static final int SURROGATE_BITS = 0xD800;

    /** Shift applied to a supplementary code point to obtain its lead part. */
    private static final int LEAD_SURROGATE_SHIFT = 10;

    /** Mask retrieving the bits of a code point stored in the trail surrogate. */
    private static final int TRAIL_SURROGATE_MASK = 0x3FF;

    /**
     * Value added to the shifted code point to produce its lead surrogate:
     * LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT).
     */
    private static final int LEAD_SURROGATE_OFFSET =
        LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT);

    // constructor --------------------------------------------------------

    // /CLOVER:OFF
    /**
     * Prevent instance from being created.
     */
    private UTF16() {
    }

    // /CLOVER:ON
    // public method ------------------------------------------------------

    /**
     * Extracts the UTF-32 value that contains the char at {@code offset16}.
     * Usable when iterating forwards or backwards (together with
     * {@link #getCharCount(int)}) as well as for random access.
     * If the char at {@code offset16} is one half of a surrogate pair, the
     * supplementary code point of that pair is returned, whichever half was
     * addressed. An unmatched surrogate is returned as is.
     *
     * @param source string of UTF-16 chars
     * @param offset16 UTF-16 offset of the char to inspect
     * @return the code point containing the char at {@code offset16}
     * @throws IndexOutOfBoundsException if {@code offset16} is out of bounds
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete code point.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return pairedCharAt(source, offset16, single);
    }

    /**
     * Extracts the UTF-32 value that contains the char at {@code offset16}.
     * Usable when iterating forwards or backwards (together with
     * {@link #getCharCount(int)}) as well as for random access.
     * If the char at {@code offset16} is one half of a surrogate pair, the
     * supplementary code point of that pair is returned, whichever half was
     * addressed. An unmatched surrogate is returned as is.
     *
     * @param source sequence of UTF-16 chars
     * @param offset16 UTF-16 offset of the char to inspect
     * @return the code point containing the char at {@code offset16}
     * @throws IndexOutOfBoundsException if {@code offset16} is out of bounds
     * @stable ICU 2.1
     */
    public static int charAt(CharSequence source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete code point.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return pairedCharAt(source, offset16, single);
    }

    /**
     * Slow path shared by both {@code charAt} overloads: resolves a char that
     * is at or above the surrogate range.
     *
     * @param source sequence the char was read from
     * @param offset16 offset {@code single} was read from
     * @param single the char at {@code offset16}, known to be
     *        {@code >= LEAD_SURROGATE_MIN_VALUE}
     * @return the supplementary code point if {@code single} has a matching
     *         neighbour, otherwise {@code single} itself
     */
    private static int pairedCharAt(CharSequence source, int offset16, char single) {
        if (single > TRAIL_SURROGATE_MAX_VALUE) {
            return single; // above the surrogate range
        }

        // For simplicity in usage, and because the frequency of pairs is
        // low, look in both directions for the other half.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            int next = offset16 + 1;
            // '<' rather than '!=' so that a sequence that shrank since the
            // first read cannot be indexed past its end.
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else {
            // single is a trail surrogate, so the lead (if any) precedes it
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
                    return Character.toCodePoint(lead, single);
                }
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extracts the UTF-32 value that contains the char at {@code offset16}
     * within the subarray {@code [start, limit)}. Usable when iterating
     * forwards or backwards (together with {@link #getCharCount(int)}) as
     * well as for random access. Surrogate pairs are only recognized when
     * both halves lie inside the subarray; otherwise the addressed surrogate
     * is returned as is.
     *
     * @param source array of UTF-16 chars
     * @param start offset of the first char of the subarray
     * @param limit offset just past the last char of the subarray
     * @param offset16 UTF-16 offset relative to {@code start}
     * @return the code point containing the char at {@code offset16}
     * @throws IndexOutOfBoundsException if {@code offset16} is not within
     *         the range of {@code start} and {@code limit}
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit, int offset16) {
        int index = offset16 + start;
        if (index < start || index >= limit) {
            throw new ArrayIndexOutOfBoundsException(index);
        }

        char single = source[index];
        if (!isSurrogate(single)) {
            return single;
        }

        // For simplicity in usage, and because the frequency of pairs is
        // low, look in both directions for the other half.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            int next = index + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (index > start) {
            // single is a trail surrogate with at least one char before it
            char lead = source[index - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed on {@code char32}.
     *
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32) {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     *
     * @param char16 the input character.
     * @return true if the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
    }

    /**
     * Determines whether the character is a trail surrogate.
     *
     * @param char16 the input character.
     * @return true if the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
    }

    /**
     * Determines whether the character is a lead surrogate.
     *
     * @param char16 the input character.
     * @return true if the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
    }

    /**
     * Returns the lead surrogate.
     * No validity check is performed on {@code char32}.
     *
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(ch) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 0;
        }
        return (char) (LEAD_SURROGATE_OFFSET + (char32 >> LEAD_SURROGATE_SHIFT));
    }

    /**
     * Returns the trail surrogate.
     * No validity check is performed on {@code char32}.
     *
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
     *         the character itself
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return (char) char32;
        }
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK));
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format.
     *
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @throws IllegalArgumentException if char32 lies outside
     *         {@code [CODEPOINT_MIN_VALUE, CODEPOINT_MAX_VALUE]}
     * @stable ICU 2.1
     */
    public static String valueOf(int char32) {
        requireCodePoint(char32);
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     *
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer
     * @throws IllegalArgumentException if char32 lies outside
     *         {@code [CODEPOINT_MIN_VALUE, CODEPOINT_MAX_VALUE]}
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        // Check for irregular values
        requireCodePoint(char32);

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A lead surrogate immediately followed by a trail surrogate (both inside
     * the subarray) counts as one codepoint; any other char counts as one
     * codepoint on its own.
     *
     * @param source char array
     * @param start offset of the first char of the subarray
     * @param limit offset just past the last char of the subarray
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift; negative values shift
     *        towards {@code start}
     * @return new shifted offset16 relative to start
     * @throws IndexOutOfBoundsException if the new offset16 is out of
     *         bounds with respect to the subarray or the subarray bounds
     *         are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char[] source, int start, int limit,
                                          int offset16, int shift32) {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Compare by subtraction: start + offset16 could overflow for a huge
        // offset16 and slip past a "start + offset16 > limit" test.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int result = start + offset16;
        int remaining;
        if (shift32 > 0) {
            // Overflow-safe form of "shift32 + result > size".
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            remaining = shift32;
            while (result < limit && remaining > 0) {
                boolean pairStartsHere = isLeadSurrogate(source[result])
                        && result + 1 < limit
                        && isTrailSurrogate(source[result + 1]);
                if (pairStartsHere) {
                    result++; // step over the trail as part of this codepoint
                }
                remaining--;
                result++;
            }
        } else {
            // shift32 <= 0 and result >= 0 here, so this sum cannot overflow.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (remaining = -shift32; remaining > 0; remaining--) {
                result--;
                if (result < start) {
                    break;
                }
                boolean pairEndsHere = isTrailSurrogate(source[result])
                        && result > start
                        && isLeadSurrogate(source[result - 1]);
                if (pairEndsHere) {
                    result--; // step back over the lead as part of this codepoint
                }
            }
        }
        // Ran out of subarray before consuming the requested number of codepoints.
        if (remaining != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private methods ------------------------------------------------------

    /**
     * Rejects values outside the Unicode code point range.
     *
     * @param char32 value to validate
     * @throws IllegalArgumentException if char32 lies outside
     *         {@code [CODEPOINT_MIN_VALUE, CODEPOINT_MAX_VALUE]}
     */
    private static void requireCodePoint(int char32) {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }
    }

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if an invalid codepoint is passed as
     * argument.
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.
     *
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        char[] pair = {getLeadSurrogate(ch), getTrailSurrogate(ch)};
        return String.valueOf(pair);
    }
}
619
620