CoCalc -- UCharacter.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/lang/UCharacter.java
⁴¹¹⁶¹ views
1
/*
2
 * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25

26
/**
27
*******************************************************************************
28
* Copyright (C) 1996-2014, International Business Machines Corporation and
29
* others. All Rights Reserved.
30
*******************************************************************************
31
*/
32

33
package jdk.internal.icu.lang;
34

35
import jdk.internal.icu.impl.UBiDiProps;
36
import jdk.internal.icu.impl.UCharacterProperty;
37
import jdk.internal.icu.text.Normalizer2;
38
import jdk.internal.icu.text.UTF16;
39
import jdk.internal.icu.util.VersionInfo;
40

41
/**
42
 * <p>The UCharacter class provides extensions to the
43
 * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
44
 * java.lang.Character</a> class. These extensions provide support for
45
 * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
46
 * class, provide support for supplementary characters (those with code
47
 * points above U+FFFF).
48
 * Each ICU release supports the latest version of Unicode available at that time.
49
 *
50
 * <p>Code points are represented in these API using ints. While it would be
51
 * more convenient in Java to have a separate primitive datatype for them,
52
 * ints suffice in the meantime.
53
 *
54
 * <p>To use this class please add the jar file name icu4j.jar to the
55
 * class path, since it contains data files which supply the information used
56
 * by this file.<br>
57
 * E.g. In Windows <br>
58
 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
59
 * Otherwise, another method would be to copy the files uprops.dat and
60
 * unames.icu from the icu4j source subdirectory
61
 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
62
 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
63
 *
64
 * <p>Aside from the additions for UTF-16 support, and the updated Unicode
65
 * properties, the main differences between UCharacter and Character are:
66
 * <ul>
67
 * <li> UCharacter is not designed to be a char wrapper and does not have
68
 *      APIs that involve management of that single char.<br>
69
 *      These include:
70
 *      <ul>
71
 *        <li> char charValue(),
72
 *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
73
 *      </ul>
74
 * <li> UCharacter does not include Character APIs that are deprecated, nor
75
 *      does it include the Java-specific character information, such as
76
 *      boolean isJavaIdentifierPart(char ch).
77
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
78
 *      values '10' - '35'. UCharacter also does this in digit and
79
 *      getNumericValue, to adhere to the java semantics of these
80
 *      methods.  New methods unicodeDigit, and
81
 *      getUnicodeNumericValue do not treat the above code points
82
 *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
83
 * </ul>
84
 * <p>
85
 * Further detail on differences can be determined using the program
86
 *        <a href=
87
 * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
88
 *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
89
 * </p>
90
 * <p>
91
 * In addition to Java compatibility functions, which calculate derived properties,
92
 * this API provides low-level access to the Unicode Character Database.
93
 * </p>
94
 * <p>
95
 * Unicode assigns each code point (not just assigned character) values for
96
 * many properties.
97
 * Most of them are simple boolean flags, or constants from a small enumerated list.
98
 * For some properties, values are strings or other relatively more complex types.
99
 * </p>
100
 * <p>
101
 * For more information see
102
 * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
103
 * (http://www.unicode.org/ucd/)
104
 * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
105
 * User Guide chapter on Properties</a>
106
 * (http://www.icu-project.org/userguide/properties.html).
107
 * </p>
108
 * <p>
109
 * There are also functions that provide easy migration from C/POSIX functions
110
 * like isblank(). Their use is generally discouraged because the C/POSIX
111
 * standards do not define their semantics beyond the ASCII range, which means
112
 * that different implementations exhibit very different behavior.
113
 * Instead, Unicode properties should be used directly.
114
 * </p>
115
 * <p>
116
 * There are also only a few, broad C/POSIX character classes, and they tend
117
 * to be used for conflicting purposes. For example, the "isalpha()" class
118
 * is sometimes used to determine word boundaries, while a more sophisticated
119
 * approach would at least distinguish initial letters from continuation
120
 * characters (the latter including combining marks).
121
 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
122
 * Another example: There is no "istitle()" class for titlecase characters.
123
 * </p>
124
 * <p>
125
 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
126
 * ICU implements them according to the Standard Recommendations in
127
 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
128
 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
129
 * </p>
130
 * <p>
131
 * API access for C/POSIX character classes is as follows:
132
 * <pre>{@code
133
 * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
134
 * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
135
 * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
136
 * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
137
 *               (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
138
 *               (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
139
 * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
140
 * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
141
 * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
142
 * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
143
 * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
144
 * - cntrl:     getType(c)==CONTROL
145
 * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
146
 * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
147
 * }</pre>
148
 * </p>
149
 * <p>
150
 * The C/POSIX character classes are also available in UnicodeSet patterns,
151
 * using patterns like [:graph:] or \p{graph}.
152
 * </p>
153
 *
154
 * There are several ICU (and Java) whitespace functions.
155
 * Comparison:<ul>
156
 * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
157
 *       most of general categories "Z" (separators) + most whitespace ISO controls
158
 *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
159
 * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
160
 * <li> isSpaceChar: just Z (including no-break spaces)</ul>
161
 * </p>
162
 * <p>
163
 * This class is not subclassable.
164
 * </p>
165
 * @author Syn Wee Quek
166
 * @stable ICU 2.1
167
 * @see com.ibm.icu.lang.UCharacterEnums
168
 */
169

170
public final class UCharacter
171
{
172

173
    /**
     * Joining Group constants.
     * <p>Member interfaces are implicitly static and their fields are
     * implicitly {@code public static final}, so those modifiers are omitted.
     * @see UProperty#JOINING_GROUP
     * @stable ICU 2.4
     */
    public interface JoiningGroup
    {
        /**
         * The "no joining group" value.
         * @stable ICU 2.4
         */
        int NO_JOINING_GROUP = 0;
    }
185

186
    /**
     * Numeric Type constants.
     * <p>Member interfaces are implicitly static and their fields are
     * implicitly {@code public static final}, so those modifiers are omitted.
     * @see UProperty#NUMERIC_TYPE
     * @stable ICU 2.4
     */
    public interface NumericType
    {
        /**
         * Numeric type "none".
         * @stable ICU 2.4
         */
        int NONE = 0;
        /**
         * Numeric type "decimal".
         * @stable ICU 2.4
         */
        int DECIMAL = 1;
        /**
         * Numeric type "digit".
         * @stable ICU 2.4
         */
        int DIGIT = 2;
        /**
         * Numeric type "numeric".
         * @stable ICU 2.4
         */
        int NUMERIC = 3;
        /**
         * One more than the highest numeric type value declared above.
         * @stable ICU 2.4
         */
        int COUNT = 4;
    }
214

215
    /**
     * Hangul Syllable Type constants.
     * <p>The bracketed tag in each constant's description is the short
     * label carried alongside the value in the original declaration.
     * Member interfaces are implicitly static and their fields are
     * implicitly {@code public static final}, so those modifiers are omitted.
     *
     * @see UProperty#HANGUL_SYLLABLE_TYPE
     * @stable ICU 2.6
     */
    public interface HangulSyllableType
    {
        /**
         * Not applicable [NA].
         * @stable ICU 2.6
         */
        int NOT_APPLICABLE = 0;   /*[NA]*/ /*See note !!*/
        /**
         * Leading jamo [L].
         * @stable ICU 2.6
         */
        int LEADING_JAMO   = 1;   /*[L]*/
        /**
         * Vowel jamo [V].
         * @stable ICU 2.6
         */
        int VOWEL_JAMO     = 2;   /*[V]*/
        /**
         * Trailing jamo [T].
         * @stable ICU 2.6
         */
        int TRAILING_JAMO  = 3;   /*[T]*/
        /**
         * LV syllable [LV].
         * @stable ICU 2.6
         */
        int LV_SYLLABLE    = 4;   /*[LV]*/
        /**
         * LVT syllable [LVT].
         * @stable ICU 2.6
         */
        int LVT_SYLLABLE   = 5;   /*[LVT]*/
        /**
         * One more than the highest syllable type value declared above.
         * @stable ICU 2.6
         */
        int COUNT          = 6;
    }
252

253
    // public data members -----------------------------------------------
254

255
    /**
256
     * The lowest Unicode code point value.
257
     * @stable ICU 2.1
258
     */
259
    public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
260

261
    /**
262
     * The highest Unicode code point value (scalar value) according to the
263
     * Unicode Standard.
264
     * This is a 21-bit value (21 bits, rounded up).<br>
265
     * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
266
     * @stable ICU 2.1
267
     */
268
    public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
269

270
    // public methods ----------------------------------------------------
271

272
    /**
273
     * Returns the numeric value of a decimal digit code point.
274
     * <br>This method observes the semantics of
275
     * <code>java.lang.Character.digit()</code>.  Note that this
276
     * will return positive values for code points for which isDigit
277
     * returns false, just like java.lang.Character.
278
     * <br><em>Semantic Change:</em> In release 1.3.1 and
279
     * prior, this did not treat the European letters as having a
280
     * digit value, and also treated numeric letters and other numbers as
281
     * digits.
282
     * This has been changed to conform to the java semantics.
283
     * <br>A code point is a valid digit if and only if:
284
     * <ul>
285
     *   <li>ch is a decimal digit or one of the european letters, and
286
     *   <li>the value of ch is less than the specified radix.
287
     * </ul>
288
     * @param ch the code point to query
289
     * @param radix the radix
290
     * @return the numeric value represented by the code point in the
291
     * specified radix, or -1 if the code point is not a decimal digit
292
     * or if its value is too large for the radix
293
     * @stable ICU 2.1
294
     */
295
    public static int digit(int ch, int radix)
296
    {
297
        if (2 <= radix && radix <= 36) {
298
            int value = digit(ch);
299
            if (value < 0) {
300
                // ch is not a decimal digit, try latin letters
301
                value = UCharacterProperty.getEuropeanDigit(ch);
302
            }
303
            return (value < radix) ? value : -1;
304
        } else {
305
            return -1;  // invalid radix
306
        }
307
    }
308

309
    /**
310
     * Returns the numeric value of a decimal digit code point.
311
     * <br>This is a convenience overload of <code>digit(int, int)</code>
312
     * that provides a decimal radix.
313
     * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
314
     * treated numeric letters and other numbers as digits.  This has
315
     * been changed to conform to the java semantics.
316
     * @param ch the code point to query
317
     * @return the numeric value represented by the code point,
318
     * or -1 if the code point is not a decimal digit or if its
319
     * value is too large for a decimal radix
320
     * @stable ICU 2.1
321
     */
322
    public static int digit(int ch)
323
    {
324
        return UCharacterProperty.INSTANCE.digit(ch);
325
    }
326

327
    /**
328
     * Returns a value indicating a code point's Unicode category.
329
     * Up-to-date Unicode implementation of java.lang.Character.getType()
330
     * except for the above mentioned code points that had their category
331
     * changed.<br>
332
     * Return results are constants from the interface
333
     * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
334
     * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
335
     * those returned by java.lang.Character.getType.  UCharacterCategory values
336
     * match the ones used in ICU4C, while java.lang.Character type
337
     * values, though similar, skip the value 17.</p>
338
     * @param ch code point whose type is to be determined
339
     * @return category which is a value of UCharacterCategory
340
     * @stable ICU 2.1
341
     */
342
    public static int getType(int ch)
343
    {
344
        return UCharacterProperty.INSTANCE.getType(ch);
345
    }
346

347
    /**
348
     * Returns the Bidirection property of a code point.
349
     * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
350
     * property.<br>
351
     * Result returned belongs to the interface
352
     * <a href=UCharacterDirection.html>UCharacterDirection</a>
353
     * @param ch the code point to be determined its direction
354
     * @return direction constant from UCharacterDirection.
355
     * @stable ICU 2.1
356
     */
357
    public static int getDirection(int ch)
358
    {
359
        return UBiDiProps.INSTANCE.getClass(ch);
360
    }
361

362
    /**
363
     * Maps the specified code point to a "mirror-image" code point.
364
     * For code points with the "mirrored" property, implementations sometimes
365
     * need a "poor man's" mapping to another code point such that the default
366
     * glyph may serve as the mirror-image of the default glyph of the
367
     * specified code point.<br>
368
     * This is useful for text conversion to and from codepages with visual
369
     * order, and for displays without glyph selection capabilities.
370
     * @param ch code point whose mirror is to be retrieved
371
     * @return another code point that may serve as a mirror-image substitute,
372
     *         or ch itself if there is no such mapping or ch does not have the
373
     *         "mirrored" property
374
     * @stable ICU 2.1
375
     */
376
    public static int getMirror(int ch)
377
    {
378
        return UBiDiProps.INSTANCE.getMirror(ch);
379
    }
380

381
    /**
382
     * Maps the specified character to its paired bracket character.
383
     * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
384
     * Otherwise c itself is returned.
385
     * See http://www.unicode.org/reports/tr9/
386
     *
387
     * @param c the code point to be mapped
388
     * @return the paired bracket code point,
389
     *         or c itself if there is no such mapping
390
     *         (Bidi_Paired_Bracket_Type=None)
391
     *
392
     * @see UProperty#BIDI_PAIRED_BRACKET
393
     * @see UProperty#BIDI_PAIRED_BRACKET_TYPE
394
     * @see #getMirror(int)
395
     * @stable ICU 52
396
     */
397
    public static int getBidiPairedBracket(int c) {
398
        return UBiDiProps.INSTANCE.getPairedBracket(c);
399
    }
400

401
    /**
402
     * Returns the combining class of the argument codepoint
403
     * @param ch code point whose combining is to be retrieved
404
     * @return the combining class of the codepoint
405
     * @stable ICU 2.1
406
     */
407
    public static int getCombiningClass(int ch)
408
    {
409
        return Normalizer2.getNFDInstance().getCombiningClass(ch);
410
    }
411

412
    /**
413
     * Returns the version of Unicode data used.
414
     * @return the unicode version number used
415
     * @stable ICU 2.1
416
     */
417
    public static VersionInfo getUnicodeVersion()
418
    {
419
        return UCharacterProperty.INSTANCE.m_unicodeVersion_;
420
    }
421

422
    /**
423
     * Returns a code point corresponding to the two UTF16 characters.
424
     * @param lead the lead char
425
     * @param trail the trail char
426
     * @return code point if surrogate characters are valid.
427
     * @exception IllegalArgumentException thrown when argument characters do
428
     *            not form a valid codepoint
429
     * @stable ICU 2.1
430
     */
431
    public static int getCodePoint(char lead, char trail)
432
    {
433
        if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
434
            return UCharacterProperty.getRawSupplementary(lead, trail);
435
        }
436
        throw new IllegalArgumentException("Illegal surrogate characters");
437
    }
438

439
    /**
440
     * Returns the "age" of the code point.</p>
441
     * <p>The "age" is the Unicode version when the code point was first
442
     * designated (as a non-character or for Private Use) or assigned a
443
     * character.
444
     * <p>This can be useful to avoid emitting code points to receiving
445
     * processes that do not accept newer characters.</p>
446
     * <p>The data is from the UCD file DerivedAge.txt.</p>
447
     * @param ch The code point.
448
     * @return the Unicode version number
449
     * @stable ICU 2.6
450
     */
451
    public static VersionInfo getAge(int ch)
452
    {
453
        if (ch < MIN_VALUE || ch > MAX_VALUE) {
454
            throw new IllegalArgumentException("Codepoint out of bounds");
455
        }
456
        return UCharacterProperty.INSTANCE.getAge(ch);
457
    }
458

459
    /**
460
     * Returns the property value for an Unicode property type of a code point.
461
     * Also returns binary and mask property values.</p>
462
     * <p>Unicode, especially in version 3.2, defines many more properties than
463
     * the original set in UnicodeData.txt.</p>
464
     * <p>The properties APIs are intended to reflect Unicode properties as
465
     * defined in the Unicode Character Database (UCD) and Unicode Technical
466
     * Reports (UTR). For details about the properties see
467
     * http://www.unicode.org/.</p>
468
     * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
469
     * </p>
470
     * <pre>
471
     * Sample usage:
472
     * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
473
     * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
474
     * boolean b = (ideo == 1) ? true : false;
475
     * </pre>
476
     * @param ch code point to test.
477
     * @param type UProperty selector constant, identifies which binary
478
     *        property to check. Must be
479
     *        UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
480
     *        UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
481
     *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
482
     * @return numeric value that is directly the property value or,
483
     *         for enumerated properties, corresponds to the numeric value of
484
     *         the enumerated constant of the respective property value
485
     *         enumeration type (cast to enum type if necessary).
486
     *         Returns 0 or 1 (for false / true) for binary Unicode properties.
487
     *         Returns a bit-mask for mask properties.
488
     *         Returns 0 if 'type' is out of bounds or if the Unicode version
489
     *         does not have data for the property at all, or not for this code
490
     *         point.
491
     * @see UProperty
492
     * @see #hasBinaryProperty
493
     * @see #getIntPropertyMinValue
494
     * @see #getIntPropertyMaxValue
495
     * @see #getUnicodeVersion
496
     * @stable ICU 2.4
497
     */
498
     // for BiDiBase.java
499
    public static int getIntPropertyValue(int ch, int type) {
500
        return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
501
    }
502

503
    // private constructor -----------------------------------------------
504

505
    /**
     * Private constructor to prevent instantiation; the class exposes
     * only static members.
     */
    private UCharacter() { }
509

510
      /*
511
       * Copied from UCharacterEnums.java
512
       */
513

514
        /**
515
         * Character type Mn
516
         * @stable ICU 2.1
517
         */
518
        public static final byte NON_SPACING_MARK        = 6;
519
        /**
520
         * Character type Me
521
         * @stable ICU 2.1
522
         */
523
        public static final byte ENCLOSING_MARK          = 7;
524
        /**
525
         * Character type Mc
526
         * @stable ICU 2.1
527
         */
528
        public static final byte COMBINING_SPACING_MARK  = 8;
529
        /**
530
         * Character type count
531
         * @stable ICU 2.1
532
         */
533
        public static final byte CHAR_CATEGORY_COUNT     = 30;
534

535
        /**
536
         * Directional type R
537
         * @stable ICU 2.1
538
         */
539
        public static final int RIGHT_TO_LEFT              = 1;
540
        /**
541
         * Directional type AL
542
         * @stable ICU 2.1
543
         */
544
        public static final int RIGHT_TO_LEFT_ARABIC       = 13;
545
}
546

547
Product

Resources

Company