// Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/UCharacterProperty.java
// (web-viewer residue: 41161 views)
/*1* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24/*25*******************************************************************************26* Copyright (C) 1996-2014, International Business Machines Corporation and27* others. 
All Rights Reserved.28*******************************************************************************29*/3031package jdk.internal.icu.impl;3233import java.io.IOException;34import java.nio.ByteBuffer;35import java.util.Iterator;36import java.util.MissingResourceException;3738import jdk.internal.icu.lang.UCharacter.HangulSyllableType;39import jdk.internal.icu.lang.UCharacter.NumericType;40import jdk.internal.icu.text.UTF16;41import jdk.internal.icu.text.UnicodeSet;42import jdk.internal.icu.util.VersionInfo;4344/**45* <p>Internal class used for Unicode character property database.</p>46* <p>This classes store binary data read from uprops.icu.47* It does not have the capability to parse the data into more high-level48* information. It only returns bytes of information when required.</p>49* <p>Due to the form most commonly used for retrieval, array of char is used50* to store the binary data.</p>51* <p>UCharacterPropertyDB also contains information on accessing indexes to52* significant points in the binary data.</p>53* <p>Responsibility for molding the binary data into more meaning form lies on54* <a href=UCharacter.html>UCharacter</a>.</p>55* @author Syn Wee Quek56* @since release 2.1, february 1st 200257*/5859public final class UCharacterProperty60{61// public data members -----------------------------------------------6263/*64* public singleton instance65*/66public static final UCharacterProperty INSTANCE;6768/**69* Trie data70*/71public Trie2_16 m_trie_;7273/**74* Unicode version75*/76public VersionInfo m_unicodeVersion_;7778/**79* Character type mask80*/81public static final int TYPE_MASK = 0x1F;8283// uprops.h enum UPropertySource --------------------------------------- ***8485/** From uchar.c/uprops.icu main trie */86public static final int SRC_CHAR=1;87/** From uchar.c/uprops.icu properties vectors trie */88public static final int SRC_PROPSVEC=2;89/** From ubidi_props.c/ubidi.icu */90public static final int SRC_BIDI=5;91/** From normalizer2impl.cpp/nfc.nrm 
*/92public static final int SRC_NFC=8;93/** From normalizer2impl.cpp/nfkc.nrm */94public static final int SRC_NFKC=9;9596// public methods ----------------------------------------------------9798/**99* Gets the main property value for code point ch.100* @param ch code point whose property value is to be retrieved101* @return property value of code point102*/103public final int getProperty(int ch)104{105return m_trie_.get(ch);106}107108/**109* Gets the unicode additional properties.110* Java version of C u_getUnicodeProperties().111* @param codepoint codepoint whose additional properties is to be112* retrieved113* @param column The column index.114* @return unicode properties115*/116public int getAdditional(int codepoint, int column) {117assert column >= 0;118if (column >= m_additionalColumnsCount_) {119return 0;120}121return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];122}123124/**125* <p>Get the "age" of the code point.</p>126* <p>The "age" is the Unicode version when the code point was first127* designated (as a non-character or for Private Use) or assigned a128* character.</p>129* <p>This can be useful to avoid emitting code points to receiving130* processes that do not accept newer characters.</p>131* <p>The data is from the UCD file DerivedAge.txt.</p>132* <p>This API does not check the validity of the codepoint.</p>133* @param codepoint The code point.134* @return the Unicode version number135*/136public VersionInfo getAge(int codepoint)137{138int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;139return VersionInfo.getInstance(140(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,141version & LAST_NIBBLE_MASK_, 0, 0);142}143144// int-value and enumerated properties --------------------------------- ***145146public int getType(int c) {147return getProperty(c)&TYPE_MASK;148}149150/*151* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.152* Hangul_Syllable_Type is fully redundant with a subset of 
Grapheme_Cluster_Break.153*/154private static final int /* UHangulSyllableType */ gcbToHst[]={155HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */156HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */157HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */158HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */159HangulSyllableType.LEADING_JAMO, /* U_GCB_L */160HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */161HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */162HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */163HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */164HangulSyllableType.VOWEL_JAMO /* U_GCB_V */165/*166* Omit GCB values beyond what we need for hst.167* The code below checks for the array length.168*/169};170171private class IntProperty {172int column; // SRC_PROPSVEC column, or "source" if mask==0173int mask;174int shift;175176IntProperty(int column, int mask, int shift) {177this.column=column;178this.mask=mask;179this.shift=shift;180}181182IntProperty(int source) {183this.column=source;184this.mask=0;185}186187int getValue(int c) {188// systematic, directly stored properties189return (getAdditional(c, column)&mask)>>>shift;190}191}192193private class BiDiIntProperty extends IntProperty {194BiDiIntProperty() {195super(SRC_BIDI);196}197}198199private class CombiningClassIntProperty extends IntProperty {200CombiningClassIntProperty(int source) {201super(source);202}203}204205private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties206int which;207int max;208209NormQuickCheckIntProperty(int source, int which, int max) {210super(source);211this.which=which;212this.max=max;213}214}215216private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE217int getValue(int c) {218return UBiDiProps.INSTANCE.getPairedBracketType(c);219}220};221222public int getIntPropertyValue(int c, int which) {223if (which == BIDI_PAIRED_BRACKET_TYPE) {224return intProp.getValue(c);225}226return 0; // 
undefined227}228229/**230* Forms a supplementary code point from the argument character<br>231* Note this is for internal use hence no checks for the validity of the232* surrogate characters are done233* @param lead lead surrogate character234* @param trail trailing surrogate character235* @return code point of the supplementary character236*/237public static int getRawSupplementary(char lead, char trail)238{239return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;240}241242/**243* Gets the type mask244* @param type character type245* @return mask246*/247public static final int getMask(int type)248{249return 1 << type;250}251252/**253* Returns the digit values of characters like 'A' - 'Z', normal,254* half-width and full-width. This method assumes that the other digit255* characters are checked by the calling method.256* @param ch character to test257* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise258* its corresponding digit will be returned.259*/260public static int getEuropeanDigit(int ch) {261if ((ch > 0x7a && ch < 0xff21)262|| ch < 0x41 || (ch > 0x5a && ch < 0x61)263|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {264return -1;265}266if (ch <= 0x7a) {267// ch >= 0x41 or ch < 0x61268return ch + 10 - ((ch <= 0x5a) ? 
0x41 : 0x61);269}270// ch >= 0xff21271if (ch <= 0xff3a) {272return ch + 10 - 0xff21;273}274// ch >= 0xff41 && ch <= 0xff5a275return ch + 10 - 0xff41;276}277278public int digit(int c) {279int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;280if(value<=9) {281return value;282} else {283return -1;284}285}286287// protected variables -----------------------------------------------288289/**290* Extra property trie291*/292Trie2_16 m_additionalTrie_;293/**294* Extra property vectors, 1st column for age and second for binary295* properties.296*/297int m_additionalVectors_[];298/**299* Number of additional columns300*/301int m_additionalColumnsCount_;302/**303* Maximum values for block, bits used as in vector word304* 0305*/306int m_maxBlockScriptValue_;307/**308* Maximum values for script, bits used as in vector word309* 0310*/311int m_maxJTGValue_;312/**313* Script_Extensions data314*/315public char[] m_scriptExtensions_;316317// private variables -------------------------------------------------318319/**320* Default name of the datafile321*/322@SuppressWarnings("deprecation")323private static final String DATA_FILE_NAME_ =324"/jdk/internal/icu/impl/data/icudt" +325VersionInfo.ICU_DATA_VERSION_PATH +326"/uprops.icu";327328/**329* Shift value for lead surrogate to form a supplementary character.330*/331private static final int LEAD_SURROGATE_SHIFT_ = 10;332/**333* Offset to add to combined surrogate pair to avoid masking.334*/335private static final int SURROGATE_OFFSET_ =336UTF16.SUPPLEMENTARY_MIN_VALUE -337(UTF16.SURROGATE_MIN_VALUE <<338LEAD_SURROGATE_SHIFT_) -339UTF16.TRAIL_SURROGATE_MIN_VALUE;340341342// property data constants -------------------------------------------------343344/**345* Numeric types and values in the main properties words.346*/347private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;348private static final int getNumericTypeValue(int props) {349return props >> NUMERIC_TYPE_VALUE_SHIFT_;350}351352/* constants for the storage form 
of numeric types and values */353/** No numeric value. */354private static final int NTV_NONE_ = 0;355/** Decimal digits: nv=0..9 */356private static final int NTV_DECIMAL_START_ = 1;357/** Other digits: nv=0..9 */358private static final int NTV_DIGIT_START_ = 11;359/** Small integers: nv=0..154 */360private static final int NTV_NUMERIC_START_ = 21;361362private static final int ntvGetType(int ntv) {363return364(ntv==NTV_NONE_) ? NumericType.NONE :365(ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :366(ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :367NumericType.NUMERIC;368}369370/*371* Properties in vector word 0372* Bits373* 31..24 DerivedAge version major/minor one nibble each374* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index375* 3: Script value from Script_Extensions376* 2: Script=Inherited377* 1: Script=Common378* 0: Script=bits 21..20 & 7..0379* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions380* 19..17 East Asian Width381* 16.. 8 UBlockCode382* 7.. 0 UScriptCode, or index to Script_Extensions383*/384385/**386* Script_Extensions: mask includes Script387*/388public static final int SCRIPT_X_MASK = 0x00f000ff;389//private static final int SCRIPT_X_SHIFT = 22;390391// The UScriptCode or Script_Extensions index is split across two bit fields.392// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)393// Shift the high bits right by 12 to assemble the full value.394public static final int SCRIPT_HIGH_MASK = 0x00300000;395public static final int SCRIPT_HIGH_SHIFT = 12;396public static final int MAX_SCRIPT = 0x3ff;397398/**399* Integer properties mask and shift values for East Asian cell width.400* Equivalent to icu4c UPROPS_EA_MASK401*/402private static final int EAST_ASIAN_MASK_ = 0x000e0000;403/**404* Integer properties mask and shift values for East Asian cell width.405* Equivalent to icu4c UPROPS_EA_SHIFT406*/407private static final int EAST_ASIAN_SHIFT_ = 17;408/**409* Integer properties mask and shift values 
for blocks.410* Equivalent to icu4c UPROPS_BLOCK_MASK411*/412private static final int BLOCK_MASK_ = 0x0001ff00;413/**414* Integer properties mask and shift values for blocks.415* Equivalent to icu4c UPROPS_BLOCK_SHIFT416*/417private static final int BLOCK_SHIFT_ = 8;418/**419* Integer properties mask and shift values for scripts.420* Equivalent to icu4c UPROPS_SHIFT_LOW_MASK.421*/422public static final int SCRIPT_LOW_MASK = 0x000000ff;423424public static final int mergeScriptCodeOrIndex(int scriptX) {425return426((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) |427(scriptX & SCRIPT_LOW_MASK);428}429430/**431* Additional properties used in internal trie data432*/433/*434* Properties in vector word 1435* Each bit encodes one binary property.436* The following constants represent the bit number, use 1<<UPROPS_XYZ.437* UPROPS_BINARY_1_TOP<=32!438*439* Keep this list of property enums in sync with440* propListNames[] in icu/source/tools/genprops/props2.c!441*442* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".443*/444private static final int WHITE_SPACE_PROPERTY_ = 0;445private static final int DASH_PROPERTY_ = 1;446private static final int HYPHEN_PROPERTY_ = 2;447private static final int QUOTATION_MARK_PROPERTY_ = 3;448private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;449private static final int MATH_PROPERTY_ = 5;450private static final int HEX_DIGIT_PROPERTY_ = 6;451private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;452private static final int ALPHABETIC_PROPERTY_ = 8;453private static final int IDEOGRAPHIC_PROPERTY_ = 9;454private static final int DIACRITIC_PROPERTY_ = 10;455private static final int EXTENDER_PROPERTY_ = 11;456private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;457private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;458private static final int GRAPHEME_LINK_PROPERTY_ = 14;459private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;460private static final int 
IDS_TRINARY_OPERATOR_PROPERTY_ = 16;461private static final int RADICAL_PROPERTY_ = 17;462private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;463private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;464private static final int DEPRECATED_PROPERTY_ = 20;465private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;466private static final int XID_START_PROPERTY_ = 22;467private static final int XID_CONTINUE_PROPERTY_ = 23;468private static final int ID_START_PROPERTY_ = 24;469private static final int ID_CONTINUE_PROPERTY_ = 25;470private static final int GRAPHEME_BASE_PROPERTY_ = 26;471private static final int S_TERM_PROPERTY_ = 27;472private static final int VARIATION_SELECTOR_PROPERTY_ = 28;473private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */474private static final int PATTERN_WHITE_SPACE = 30;475476/*477* Properties in vector word 2478* Bits479* 31..26 reserved480* 25..20 Line Break481* 19..15 Sentence Break482* 14..10 Word Break483* 9.. 5 Grapheme Cluster Break484* 4.. 
0 Decomposition Type485*/486private static final int LB_MASK = 0x03f00000;487private static final int LB_SHIFT = 20;488489private static final int SB_MASK = 0x000f8000;490private static final int SB_SHIFT = 15;491492private static final int WB_MASK = 0x00007c00;493private static final int WB_SHIFT = 10;494495private static final int GCB_MASK = 0x000003e0;496private static final int GCB_SHIFT = 5;497498/**499* Integer properties mask for decomposition type.500* Equivalent to icu4c UPROPS_DT_MASK.501*/502private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;503504/**505* First nibble shift506*/507private static final int FIRST_NIBBLE_SHIFT_ = 0x4;508/**509* Second nibble mask510*/511private static final int LAST_NIBBLE_MASK_ = 0xF;512/**513* Age value shift514*/515private static final int AGE_SHIFT_ = 24;516517// private constructors --------------------------------------------------518519/**520* Constructor521* @exception IOException thrown when data reading fails or data corrupted522*/523private UCharacterProperty() throws IOException524{525// jar access526ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);527m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());528// Read or skip the 16 indexes.529int propertyOffset = bytes.getInt();530/* exceptionOffset = */ bytes.getInt();531/* caseOffset = */ bytes.getInt();532int additionalOffset = bytes.getInt();533int additionalVectorsOffset = bytes.getInt();534m_additionalColumnsCount_ = bytes.getInt();535int scriptExtensionsOffset = bytes.getInt();536int reservedOffset7 = bytes.getInt();537/* reservedOffset8 = */ bytes.getInt();538/* dataTopOffset = */ bytes.getInt();539m_maxBlockScriptValue_ = bytes.getInt();540m_maxJTGValue_ = bytes.getInt();541ICUBinary.skipBytes(bytes, (16 - 12) << 2);542543// read the main properties trie544m_trie_ = Trie2_16.createFromSerialized(bytes);545int expectedTrieLength = (propertyOffset - 16) * 4;546int trieLength = 
m_trie_.getSerializedLength();547if(trieLength > expectedTrieLength) {548throw new IOException("uprops.icu: not enough bytes for main trie");549}550// skip padding after trie bytes551ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);552553// skip unused intervening data structures554ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);555556if(m_additionalColumnsCount_ > 0) {557// reads the additional property block558m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);559expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;560trieLength = m_additionalTrie_.getSerializedLength();561if(trieLength > expectedTrieLength) {562throw new IOException("uprops.icu: not enough bytes for additional-properties trie");563}564// skip padding after trie bytes565ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);566567// additional properties568int size = scriptExtensionsOffset - additionalVectorsOffset;569m_additionalVectors_ = new int[size];570for (int i = 0; i < size; i ++) {571m_additionalVectors_[i] = bytes.getInt();572}573}574575// Script_Extensions576int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;577if(numChars > 0) {578m_scriptExtensions_ = new char[numChars];579for(int i = 0; i < numChars; ++i) {580m_scriptExtensions_[i] = bytes.getChar();581}582}583}584585private static final class IsAcceptable implements ICUBinary.Authenticate {586// @Override when we switch to Java 6587public boolean isDataVersionAcceptable(byte version[]) {588return version[0] == 7;589}590}591592private static final int DATA_FORMAT = 0x5550726F; // "UPro"593594public void upropsvec_addPropertyStarts(UnicodeSet set) {595/* add the start code point of each same-value range of the properties vectors trie */596if(m_additionalColumnsCount_>0) {597/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */598Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();599Trie2.Range 
range;600while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {601set.add(range.startCodePoint);602}603}604}605606// This static initializer block must be placed after607// other static member initialization608static {609try {610INSTANCE = new UCharacterProperty();611}612catch (IOException e) {613throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");614}615}616617618// Moved from UProperty.java619/**620* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).621* Used in UAX #9: Unicode Bidirectional Algorithm622* (http://www.unicode.org/reports/tr9/)623* Returns UCharacter.BidiPairedBracketType values.624* @stable ICU 52625*/626public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;627628}629630631