Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/UCharacterProperty.java
41161 views
1
/*
2
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* Copyright (C) 1996-2014, International Business Machines Corporation and
28
* others. All Rights Reserved.
29
*******************************************************************************
30
*/
31
32
package jdk.internal.icu.impl;
33
34
import java.io.IOException;
35
import java.nio.ByteBuffer;
36
import java.util.Iterator;
37
import java.util.MissingResourceException;
38
39
import jdk.internal.icu.lang.UCharacter.HangulSyllableType;
40
import jdk.internal.icu.lang.UCharacter.NumericType;
41
import jdk.internal.icu.text.UTF16;
42
import jdk.internal.icu.text.UnicodeSet;
43
import jdk.internal.icu.util.VersionInfo;
44
45
/**
46
* <p>Internal class used for Unicode character property database.</p>
47
* <p>This class stores binary data read from uprops.icu.
48
* It does not have the capability to parse the data into more high-level
49
* information. It only returns bytes of information when required.</p>
50
* <p>Due to the form most commonly used for retrieval, array of char is used
51
* to store the binary data.</p>
52
* <p>UCharacterPropertyDB also contains information on accessing indexes to
53
* significant points in the binary data.</p>
54
* <p>Responsibility for molding the binary data into a more meaningful form lies on
55
* <a href=UCharacter.html>UCharacter</a>.</p>
56
* @author Syn Wee Quek
57
* @since release 2.1, february 1st 2002
58
*/
59
60
public final class UCharacterProperty
61
{
62
// public data members -----------------------------------------------
63
64
/*
65
* public singleton instance
66
*/
67
public static final UCharacterProperty INSTANCE;
68
69
/**
70
* Trie data
71
*/
72
public Trie2_16 m_trie_;
73
74
/**
75
* Unicode version
76
*/
77
public VersionInfo m_unicodeVersion_;
78
79
/**
80
* Character type mask
81
*/
82
public static final int TYPE_MASK = 0x1F;
83
84
// uprops.h enum UPropertySource --------------------------------------- ***
85
86
/** From uchar.c/uprops.icu main trie */
87
public static final int SRC_CHAR=1;
88
/** From uchar.c/uprops.icu properties vectors trie */
89
public static final int SRC_PROPSVEC=2;
90
/** From ubidi_props.c/ubidi.icu */
91
public static final int SRC_BIDI=5;
92
/** From normalizer2impl.cpp/nfc.nrm */
93
public static final int SRC_NFC=8;
94
/** From normalizer2impl.cpp/nfkc.nrm */
95
public static final int SRC_NFKC=9;
96
97
// public methods ----------------------------------------------------
98
99
/**
100
* Gets the main property value for code point ch.
101
* @param ch code point whose property value is to be retrieved
102
* @return property value of code point
103
*/
104
public final int getProperty(int ch)
105
{
106
return m_trie_.get(ch);
107
}
108
109
/**
110
* Gets the unicode additional properties.
111
* Java version of C u_getUnicodeProperties().
112
* @param codepoint codepoint whose additional properties is to be
113
* retrieved
114
* @param column The column index.
115
* @return unicode properties
116
*/
117
public int getAdditional(int codepoint, int column) {
118
assert column >= 0;
119
if (column >= m_additionalColumnsCount_) {
120
return 0;
121
}
122
return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
123
}
124
125
/**
126
* <p>Get the "age" of the code point.</p>
127
* <p>The "age" is the Unicode version when the code point was first
128
* designated (as a non-character or for Private Use) or assigned a
129
* character.</p>
130
* <p>This can be useful to avoid emitting code points to receiving
131
* processes that do not accept newer characters.</p>
132
* <p>The data is from the UCD file DerivedAge.txt.</p>
133
* <p>This API does not check the validity of the codepoint.</p>
134
* @param codepoint The code point.
135
* @return the Unicode version number
136
*/
137
public VersionInfo getAge(int codepoint)
138
{
139
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
140
return VersionInfo.getInstance(
141
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
142
version & LAST_NIBBLE_MASK_, 0, 0);
143
}
144
145
// int-value and enumerated properties --------------------------------- ***
146
147
public int getType(int c) {
148
return getProperty(c)&TYPE_MASK;
149
}
150
151
/*
152
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
153
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
154
*/
155
private static final int /* UHangulSyllableType */ gcbToHst[]={
156
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
157
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
158
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
159
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
160
HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
161
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
162
HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
163
HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
164
HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
165
HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
166
/*
167
* Omit GCB values beyond what we need for hst.
168
* The code below checks for the array length.
169
*/
170
};
171
172
private class IntProperty {
173
int column; // SRC_PROPSVEC column, or "source" if mask==0
174
int mask;
175
int shift;
176
177
IntProperty(int column, int mask, int shift) {
178
this.column=column;
179
this.mask=mask;
180
this.shift=shift;
181
}
182
183
IntProperty(int source) {
184
this.column=source;
185
this.mask=0;
186
}
187
188
int getValue(int c) {
189
// systematic, directly stored properties
190
return (getAdditional(c, column)&mask)>>>shift;
191
}
192
}
193
194
private class BiDiIntProperty extends IntProperty {
195
BiDiIntProperty() {
196
super(SRC_BIDI);
197
}
198
}
199
200
private class CombiningClassIntProperty extends IntProperty {
201
CombiningClassIntProperty(int source) {
202
super(source);
203
}
204
}
205
206
private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
207
int which;
208
int max;
209
210
NormQuickCheckIntProperty(int source, int which, int max) {
211
super(source);
212
this.which=which;
213
this.max=max;
214
}
215
}
216
217
private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
218
int getValue(int c) {
219
return UBiDiProps.INSTANCE.getPairedBracketType(c);
220
}
221
};
222
223
public int getIntPropertyValue(int c, int which) {
224
if (which == BIDI_PAIRED_BRACKET_TYPE) {
225
return intProp.getValue(c);
226
}
227
return 0; // undefined
228
}
229
230
/**
231
* Forms a supplementary code point from the argument character<br>
232
* Note this is for internal use hence no checks for the validity of the
233
* surrogate characters are done
234
* @param lead lead surrogate character
235
* @param trail trailing surrogate character
236
* @return code point of the supplementary character
237
*/
238
public static int getRawSupplementary(char lead, char trail)
239
{
240
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
241
}
242
243
/**
244
* Gets the type mask
245
* @param type character type
246
* @return mask
247
*/
248
public static final int getMask(int type)
249
{
250
return 1 << type;
251
}
252
253
/**
254
* Returns the digit values of characters like 'A' - 'Z', normal,
255
* half-width and full-width. This method assumes that the other digit
256
* characters are checked by the calling method.
257
* @param ch character to test
258
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
259
* its corresponding digit will be returned.
260
*/
261
public static int getEuropeanDigit(int ch) {
262
if ((ch > 0x7a && ch < 0xff21)
263
|| ch < 0x41 || (ch > 0x5a && ch < 0x61)
264
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
265
return -1;
266
}
267
if (ch <= 0x7a) {
268
// ch >= 0x41 or ch < 0x61
269
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
270
}
271
// ch >= 0xff21
272
if (ch <= 0xff3a) {
273
return ch + 10 - 0xff21;
274
}
275
// ch >= 0xff41 && ch <= 0xff5a
276
return ch + 10 - 0xff41;
277
}
278
279
public int digit(int c) {
280
int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
281
if(value<=9) {
282
return value;
283
} else {
284
return -1;
285
}
286
}
287
288
// protected variables -----------------------------------------------
289
290
/**
291
* Extra property trie
292
*/
293
Trie2_16 m_additionalTrie_;
294
/**
295
* Extra property vectors, 1st column for age and second for binary
296
* properties.
297
*/
298
int m_additionalVectors_[];
299
/**
300
* Number of additional columns
301
*/
302
int m_additionalColumnsCount_;
303
/**
304
* Maximum values for block, bits used as in vector word
305
* 0
306
*/
307
int m_maxBlockScriptValue_;
308
/**
309
* Maximum values for script, bits used as in vector word
310
* 0
311
*/
312
int m_maxJTGValue_;
313
/**
314
* Script_Extensions data
315
*/
316
public char[] m_scriptExtensions_;
317
318
// private variables -------------------------------------------------
319
320
/**
321
* Default name of the datafile
322
*/
323
@SuppressWarnings("deprecation")
324
private static final String DATA_FILE_NAME_ =
325
"/jdk/internal/icu/impl/data/icudt" +
326
VersionInfo.ICU_DATA_VERSION_PATH +
327
"/uprops.icu";
328
329
/**
330
* Shift value for lead surrogate to form a supplementary character.
331
*/
332
private static final int LEAD_SURROGATE_SHIFT_ = 10;
333
/**
334
* Offset to add to combined surrogate pair to avoid masking.
335
*/
336
private static final int SURROGATE_OFFSET_ =
337
UTF16.SUPPLEMENTARY_MIN_VALUE -
338
(UTF16.SURROGATE_MIN_VALUE <<
339
LEAD_SURROGATE_SHIFT_) -
340
UTF16.TRAIL_SURROGATE_MIN_VALUE;
341
342
343
// property data constants -------------------------------------------------
344
345
/**
346
* Numeric types and values in the main properties words.
347
*/
348
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
349
private static final int getNumericTypeValue(int props) {
350
return props >> NUMERIC_TYPE_VALUE_SHIFT_;
351
}
352
353
/* constants for the storage form of numeric types and values */
354
/** No numeric value. */
355
private static final int NTV_NONE_ = 0;
356
/** Decimal digits: nv=0..9 */
357
private static final int NTV_DECIMAL_START_ = 1;
358
/** Other digits: nv=0..9 */
359
private static final int NTV_DIGIT_START_ = 11;
360
/** Small integers: nv=0..154 */
361
private static final int NTV_NUMERIC_START_ = 21;
362
363
private static final int ntvGetType(int ntv) {
364
return
365
(ntv==NTV_NONE_) ? NumericType.NONE :
366
(ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :
367
(ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
368
NumericType.NUMERIC;
369
}
370
371
/*
372
* Properties in vector word 0
373
* Bits
374
* 31..24 DerivedAge version major/minor one nibble each
375
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
376
* 3: Script value from Script_Extensions
377
* 2: Script=Inherited
378
* 1: Script=Common
379
* 0: Script=bits 21..20 & 7..0
380
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
381
* 19..17 East Asian Width
382
* 16.. 8 UBlockCode
383
* 7.. 0 UScriptCode, or index to Script_Extensions
384
*/
385
386
/**
387
* Script_Extensions: mask includes Script
388
*/
389
public static final int SCRIPT_X_MASK = 0x00f000ff;
390
//private static final int SCRIPT_X_SHIFT = 22;
391
392
// The UScriptCode or Script_Extensions index is split across two bit fields.
393
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
394
// Shift the high bits right by 12 to assemble the full value.
395
public static final int SCRIPT_HIGH_MASK = 0x00300000;
396
public static final int SCRIPT_HIGH_SHIFT = 12;
397
public static final int MAX_SCRIPT = 0x3ff;
398
399
/**
400
* Integer properties mask and shift values for East Asian cell width.
401
* Equivalent to icu4c UPROPS_EA_MASK
402
*/
403
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
404
/**
405
* Integer properties mask and shift values for East Asian cell width.
406
* Equivalent to icu4c UPROPS_EA_SHIFT
407
*/
408
private static final int EAST_ASIAN_SHIFT_ = 17;
409
/**
410
* Integer properties mask and shift values for blocks.
411
* Equivalent to icu4c UPROPS_BLOCK_MASK
412
*/
413
private static final int BLOCK_MASK_ = 0x0001ff00;
414
/**
415
* Integer properties mask and shift values for blocks.
416
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
417
*/
418
private static final int BLOCK_SHIFT_ = 8;
419
/**
420
* Integer properties mask and shift values for scripts.
421
* Equivalent to icu4c UPROPS_SHIFT_LOW_MASK.
422
*/
423
public static final int SCRIPT_LOW_MASK = 0x000000ff;
424
425
public static final int mergeScriptCodeOrIndex(int scriptX) {
426
return
427
((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) |
428
(scriptX & SCRIPT_LOW_MASK);
429
}
430
431
/**
432
* Additional properties used in internal trie data
433
*/
434
/*
435
* Properties in vector word 1
436
* Each bit encodes one binary property.
437
* The following constants represent the bit number, use 1<<UPROPS_XYZ.
438
* UPROPS_BINARY_1_TOP<=32!
439
*
440
* Keep this list of property enums in sync with
441
* propListNames[] in icu/source/tools/genprops/props2.c!
442
*
443
* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
444
*/
445
private static final int WHITE_SPACE_PROPERTY_ = 0;
446
private static final int DASH_PROPERTY_ = 1;
447
private static final int HYPHEN_PROPERTY_ = 2;
448
private static final int QUOTATION_MARK_PROPERTY_ = 3;
449
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
450
private static final int MATH_PROPERTY_ = 5;
451
private static final int HEX_DIGIT_PROPERTY_ = 6;
452
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
453
private static final int ALPHABETIC_PROPERTY_ = 8;
454
private static final int IDEOGRAPHIC_PROPERTY_ = 9;
455
private static final int DIACRITIC_PROPERTY_ = 10;
456
private static final int EXTENDER_PROPERTY_ = 11;
457
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
458
private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
459
private static final int GRAPHEME_LINK_PROPERTY_ = 14;
460
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
461
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
462
private static final int RADICAL_PROPERTY_ = 17;
463
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
464
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
465
private static final int DEPRECATED_PROPERTY_ = 20;
466
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
467
private static final int XID_START_PROPERTY_ = 22;
468
private static final int XID_CONTINUE_PROPERTY_ = 23;
469
private static final int ID_START_PROPERTY_ = 24;
470
private static final int ID_CONTINUE_PROPERTY_ = 25;
471
private static final int GRAPHEME_BASE_PROPERTY_ = 26;
472
private static final int S_TERM_PROPERTY_ = 27;
473
private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
474
private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
475
private static final int PATTERN_WHITE_SPACE = 30;
476
477
/*
478
* Properties in vector word 2
479
* Bits
480
* 31..26 reserved
481
* 25..20 Line Break
482
* 19..15 Sentence Break
483
* 14..10 Word Break
484
* 9.. 5 Grapheme Cluster Break
485
* 4.. 0 Decomposition Type
486
*/
487
private static final int LB_MASK = 0x03f00000;
488
private static final int LB_SHIFT = 20;
489
490
private static final int SB_MASK = 0x000f8000;
491
private static final int SB_SHIFT = 15;
492
493
private static final int WB_MASK = 0x00007c00;
494
private static final int WB_SHIFT = 10;
495
496
private static final int GCB_MASK = 0x000003e0;
497
private static final int GCB_SHIFT = 5;
498
499
/**
500
* Integer properties mask for decomposition type.
501
* Equivalent to icu4c UPROPS_DT_MASK.
502
*/
503
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
504
505
/**
506
* First nibble shift
507
*/
508
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
509
/**
510
* Second nibble mask
511
*/
512
private static final int LAST_NIBBLE_MASK_ = 0xF;
513
/**
514
* Age value shift
515
*/
516
private static final int AGE_SHIFT_ = 24;
517
518
// private constructors --------------------------------------------------
519
520
/**
521
* Constructor
522
* @exception IOException thrown when data reading fails or data corrupted
523
*/
524
private UCharacterProperty() throws IOException
525
{
526
// jar access
527
ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
528
m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
529
// Read or skip the 16 indexes.
530
int propertyOffset = bytes.getInt();
531
/* exceptionOffset = */ bytes.getInt();
532
/* caseOffset = */ bytes.getInt();
533
int additionalOffset = bytes.getInt();
534
int additionalVectorsOffset = bytes.getInt();
535
m_additionalColumnsCount_ = bytes.getInt();
536
int scriptExtensionsOffset = bytes.getInt();
537
int reservedOffset7 = bytes.getInt();
538
/* reservedOffset8 = */ bytes.getInt();
539
/* dataTopOffset = */ bytes.getInt();
540
m_maxBlockScriptValue_ = bytes.getInt();
541
m_maxJTGValue_ = bytes.getInt();
542
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
543
544
// read the main properties trie
545
m_trie_ = Trie2_16.createFromSerialized(bytes);
546
int expectedTrieLength = (propertyOffset - 16) * 4;
547
int trieLength = m_trie_.getSerializedLength();
548
if(trieLength > expectedTrieLength) {
549
throw new IOException("uprops.icu: not enough bytes for main trie");
550
}
551
// skip padding after trie bytes
552
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
553
554
// skip unused intervening data structures
555
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
556
557
if(m_additionalColumnsCount_ > 0) {
558
// reads the additional property block
559
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
560
expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
561
trieLength = m_additionalTrie_.getSerializedLength();
562
if(trieLength > expectedTrieLength) {
563
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
564
}
565
// skip padding after trie bytes
566
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
567
568
// additional properties
569
int size = scriptExtensionsOffset - additionalVectorsOffset;
570
m_additionalVectors_ = new int[size];
571
for (int i = 0; i < size; i ++) {
572
m_additionalVectors_[i] = bytes.getInt();
573
}
574
}
575
576
// Script_Extensions
577
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
578
if(numChars > 0) {
579
m_scriptExtensions_ = new char[numChars];
580
for(int i = 0; i < numChars; ++i) {
581
m_scriptExtensions_[i] = bytes.getChar();
582
}
583
}
584
}
585
586
private static final class IsAcceptable implements ICUBinary.Authenticate {
587
// @Override when we switch to Java 6
588
public boolean isDataVersionAcceptable(byte version[]) {
589
return version[0] == 7;
590
}
591
}
592
593
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
594
595
public void upropsvec_addPropertyStarts(UnicodeSet set) {
596
/* add the start code point of each same-value range of the properties vectors trie */
597
if(m_additionalColumnsCount_>0) {
598
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
599
Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
600
Trie2.Range range;
601
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
602
set.add(range.startCodePoint);
603
}
604
}
605
}
606
607
// This static initializer block must be placed after
// other static member initialization.
// It creates the singleton; a failure to load uprops.icu is reported as a
// MissingResourceException naming the data file.
static {
    try {
        INSTANCE = new UCharacterProperty();
    } catch (IOException e) {
        // MissingResourceException has no public constructor that takes a
        // cause, so attach the IOException explicitly to keep its stack
        // trace available for diagnosis.
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), DATA_FILE_NAME_, "");
        missing.initCause(e);
        throw missing;
    }
}
617
618
619
// Moved from UProperty.java
620
/**
621
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
622
* Used in UAX #9: Unicode Bidirectional Algorithm
623
* (http://www.unicode.org/reports/tr9/)
624
* Returns UCharacter.BidiPairedBracketType values.
625
* @stable ICU 52
626
*/
627
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
628
629
}
630
631