Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/Trie.java
41161 views
/*1* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26******************************************************************************27* Copyright (C) 1996-2014, International Business Machines Corporation and28* others. All Rights Reserved.29******************************************************************************30*/3132package jdk.internal.icu.impl;3334import jdk.internal.icu.lang.UCharacter;35import jdk.internal.icu.text.UTF16;3637import java.io.DataInputStream;38import java.io.InputStream;39import java.io.IOException;4041/**42* <p>A trie is a kind of compressed, serializable table of values43* associated with Unicode code points (0..0x10ffff).</p>44* <p>This class defines the basic structure of a trie and provides methods45* to <b>retrieve the offsets to the actual data</b>.</p>46* <p>Data will be the form of an array of basic types, char or int.</p>47* <p>The actual data format will have to be specified by the user in the48* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>49* <p>This trie implementation is optimized for getting offset while walking50* forward through a UTF-16 string.51* Therefore, the simplest and fastest access macros are the52* fromLead() and fromOffsetTrail() methods.53* The fromBMP() method are a little more complicated; they get offsets even54* for lead surrogate codepoints, while the fromLead() method get special55* "folded" offsets for lead surrogate code units if there is relevant data56* associated with them.57* From such a folded offsets, an offset needs to be extracted to supply58* to the fromOffsetTrail() methods.59* To handle such supplementary codepoints, some offset information are kept60* in the data.</p>61* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve62* that offset from the folded value for the lead surrogate unit.</p>63* <p>For examples of use, see com.ibm.icu.impl.CharTrie or64* com.ibm.icu.impl.IntTrie.</p>65* @author synwee66* @see com.ibm.icu.impl.CharTrie67* @see com.ibm.icu.impl.IntTrie68* @since release 2.1, Jan 01 200269*/70public abstract class Trie71{72// public class declaration ----------------------------------------7374/**75* Character data in com.ibm.impl.Trie have different user-specified format76* for different purposes.77* This interface specifies methods to be implemented in order for78* com.ibm.impl.Trie, to surrogate offset information encapsulated within79* the data.80*/81public static interface DataManipulate82{83/**84* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's85* data86* the index array offset of the indexes for that lead surrogate.87* @param value data value for a surrogate from the trie, including the88* folding offset89* @return data offset or 0 if there is no data for the lead surrogate90*/91public int getFoldingOffset(int value);92}9394// default implementation95private static class DefaultGetFoldingOffset implements DataManipulate {96public int getFoldingOffset(int value) {97return value;98}99}100101// protected constructor -------------------------------------------102103/**104* Trie constructor for CharTrie use.105* @param inputStream ICU data file input stream which contains the106* trie107* @param dataManipulate object containing the information to parse the108* trie data109* @throws IOException thrown when input stream does not have the110* right header.111*/112protected Trie(InputStream inputStream,113DataManipulate dataManipulate) throws IOException114{115DataInputStream input = new DataInputStream(inputStream);116// Magic number to authenticate the data.117int signature = input.readInt();118m_options_ = input.readInt();119120if (!checkHeader(signature)) {121throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");122}123124if(dataManipulate != null) {125m_dataManipulate_ = dataManipulate;126} else {127m_dataManipulate_ = new DefaultGetFoldingOffset();128}129m_isLatin1Linear_ = (m_options_ &130HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;131m_dataOffset_ = input.readInt();132m_dataLength_ = input.readInt();133unserialize(inputStream);134}135136// protected data members ------------------------------------------137138/**139* Lead surrogate code points' index displacement in the index array.140* <pre>{@code141* 0x10000-0xd800=0x2800142* 0x2800 >> INDEX_STAGE_1_SHIFT_143* }</pre>144*/145protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;146/**147* Shift size for shifting right the input index. 1..9148*/149protected static final int INDEX_STAGE_1_SHIFT_ = 5;150/**151* Shift size for shifting left the index array values.152* Increases possible data size with 16-bit index values at the cost153* of compactability.154* This requires blocks of stage 2 data to be aligned by155* DATA_GRANULARITY.156* 0..INDEX_STAGE_1_SHIFT157*/158protected static final int INDEX_STAGE_2_SHIFT_ = 2;159/**160* Number of data values in a stage 2 (data array) block.161*/162protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;163/**164* Mask for getting the lower bits from the input index.165* DATA_BLOCK_LENGTH - 1.166*/167protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;168/**169* Surrogate mask to use when shifting offset to retrieve supplementary170* values171*/172protected static final int SURROGATE_MASK_ = 0x3FF;173/**174* Index or UTF16 characters175*/176protected char m_index_[];177/**178* Internal TrieValue which handles the parsing of the data value.179* This class is to be implemented by the user180*/181protected DataManipulate m_dataManipulate_;182/**183* Start index of the data portion of the trie. CharTrie combines184* index and data into a char array, so this is used to indicate the185* initial offset to the data portion.186* Note this index always points to the initial value.187*/188protected int m_dataOffset_;189/**190* Length of the data array191*/192protected int m_dataLength_;193194// protected methods -----------------------------------------------195196/**197* Gets the offset to the data which the surrogate pair points to.198* @param lead lead surrogate199* @param trail trailing surrogate200* @return offset to data201*/202protected abstract int getSurrogateOffset(char lead, char trail);203204/**205* Gets the offset to the data which the index ch after variable offset206* points to.207* Note for locating a non-supplementary character data offset, calling208* <p>209* getRawOffset(0, ch);210* </p>211* will do. Otherwise if it is a supplementary character formed by212* surrogates lead and trail. Then we would have to call getRawOffset()213* with getFoldingIndexOffset(). See getSurrogateOffset().214* @param offset index offset which ch is to start from215* @param ch index to be used after offset216* @return offset to the data217*/218protected final int getRawOffset(int offset, char ch)219{220return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]221<< INDEX_STAGE_2_SHIFT_)222+ (ch & INDEX_STAGE_3_MASK_);223}224225/**226* Gets the offset to data which the BMP character points to227* Treats a lead surrogate as a normal code point.228* @param ch BMP character229* @return offset to data230*/231protected final int getBMPOffset(char ch)232{233return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE234&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)235? getRawOffset(LEAD_INDEX_OFFSET_, ch)236: getRawOffset(0, ch);237// using a getRawOffset(ch) makes no diff238}239240/**241* Gets the offset to the data which this lead surrogate character points242* to.243* Data at the returned offset may contain folding offset information for244* the next trailing surrogate character.245* @param ch lead surrogate character246* @return offset to data247*/248protected final int getLeadOffset(char ch)249{250return getRawOffset(0, ch);251}252253/**254* Internal trie getter from a code point.255* Could be faster(?) but longer with256* {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}257* Gets the offset to data which the codepoint points to258* @param ch codepoint259* @return offset to data260*/261protected final int getCodePointOffset(int ch)262{263// if ((ch >> 16) == 0) slower264if (ch < 0) {265return -1;266} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {267// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works268return getRawOffset(0, (char)ch);269} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {270// BMP codepoint271return getBMPOffset((char)ch);272} else if (ch <= UCharacter.MAX_VALUE) {273// look at the construction of supplementary characters274// trail forms the ends of it.275return getSurrogateOffset(UTF16.getLeadSurrogate(ch),276(char)(ch & SURROGATE_MASK_));277} else {278// return -1 if there is an error, in this case we return279return -1;280}281}282283/**284* <p>Parses the inputstream and creates the trie index with it.</p>285* <p>This is overwritten by the child classes.286* @param inputStream input stream containing the trie information287* @exception IOException thrown when data reading fails.288*/289protected void unserialize(InputStream inputStream) throws IOException290{291//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_292m_index_ = new char[m_dataOffset_];293DataInputStream input = new DataInputStream(inputStream);294for (int i = 0; i < m_dataOffset_; i ++) {295m_index_[i] = input.readChar();296}297}298299/**300* Determines if this is a 16 bit trie301* @return true if this is a 16 bit trie302*/303protected final boolean isCharTrie()304{305return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;306}307308// private data members --------------------------------------------309310/**311* Latin 1 option mask312*/313protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;314/**315* Constant number to authenticate the byte block316*/317protected static final int HEADER_SIGNATURE_ = 0x54726965;318/**319* Header option formatting320*/321private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;322protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;323protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;324325/**326* Flag indicator for Latin quick access data block327*/328private boolean m_isLatin1Linear_;329330/**331* <p>Trie options field.</p>332* <p>options bit field:<br>333* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>334* 8 0 = 16-bit data, 1=32-bit data<br>335* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>336* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>337*/338private int m_options_;339340// private methods ---------------------------------------------------341342/**343* Authenticates raw data header.344* Checking the header information, signature and options.345* @param signature This contains the options and type of a Trie346* @return true if the header is authenticated valid347*/348private final boolean checkHeader(int signature)349{350// check the signature351// Trie in big-endian US-ASCII (0x54726965).352// Magic number to authenticate the data.353if (signature != HEADER_SIGNATURE_) {354return false;355}356357if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=358INDEX_STAGE_1_SHIFT_ ||359((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &360HEADER_OPTIONS_SHIFT_MASK_)361!= INDEX_STAGE_2_SHIFT_) {362return false;363}364return true;365}366}367368369