Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/Trie.java
41161 views
1
/*
2
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
/*
27
******************************************************************************
28
* Copyright (C) 1996-2014, International Business Machines Corporation and
29
* others. All Rights Reserved.
30
******************************************************************************
31
*/
32
33
package jdk.internal.icu.impl;
34
35
import jdk.internal.icu.lang.UCharacter;
36
import jdk.internal.icu.text.UTF16;
37
38
import java.io.DataInputStream;
39
import java.io.InputStream;
40
import java.io.IOException;
41
42
/**
43
* <p>A trie is a kind of compressed, serializable table of values
44
* associated with Unicode code points (0..0x10ffff).</p>
45
* <p>This class defines the basic structure of a trie and provides methods
46
* to <b>retrieve the offsets to the actual data</b>.</p>
47
* <p>Data will be the form of an array of basic types, char or int.</p>
48
* <p>The actual data format will have to be specified by the user in the
49
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
50
* <p>This trie implementation is optimized for getting offset while walking
51
* forward through a UTF-16 string.
52
* Therefore, the simplest and fastest access macros are the
53
* fromLead() and fromOffsetTrail() methods.
54
* The fromBMP() method are a little more complicated; they get offsets even
55
* for lead surrogate codepoints, while the fromLead() method get special
56
* "folded" offsets for lead surrogate code units if there is relevant data
57
* associated with them.
58
* From such a folded offsets, an offset needs to be extracted to supply
59
* to the fromOffsetTrail() methods.
60
* To handle such supplementary codepoints, some offset information are kept
61
* in the data.</p>
62
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
63
* that offset from the folded value for the lead surrogate unit.</p>
64
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
65
* com.ibm.icu.impl.IntTrie.</p>
66
* @author synwee
67
* @see com.ibm.icu.impl.CharTrie
68
* @see com.ibm.icu.impl.IntTrie
69
* @since release 2.1, Jan 01 2002
70
*/
71
public abstract class Trie
72
{
73
// public class declaration ----------------------------------------
74
75
/**
76
* Character data in com.ibm.impl.Trie have different user-specified format
77
* for different purposes.
78
* This interface specifies methods to be implemented in order for
79
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
80
* the data.
81
*/
82
public static interface DataManipulate
83
{
84
/**
85
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
86
* data
87
* the index array offset of the indexes for that lead surrogate.
88
* @param value data value for a surrogate from the trie, including the
89
* folding offset
90
* @return data offset or 0 if there is no data for the lead surrogate
91
*/
92
public int getFoldingOffset(int value);
93
}
94
95
// default implementation
96
private static class DefaultGetFoldingOffset implements DataManipulate {
97
public int getFoldingOffset(int value) {
98
return value;
99
}
100
}
101
102
// protected constructor -------------------------------------------
103
104
/**
105
* Trie constructor for CharTrie use.
106
* @param inputStream ICU data file input stream which contains the
107
* trie
108
* @param dataManipulate object containing the information to parse the
109
* trie data
110
* @throws IOException thrown when input stream does not have the
111
* right header.
112
*/
113
protected Trie(InputStream inputStream,
114
DataManipulate dataManipulate) throws IOException
115
{
116
DataInputStream input = new DataInputStream(inputStream);
117
// Magic number to authenticate the data.
118
int signature = input.readInt();
119
m_options_ = input.readInt();
120
121
if (!checkHeader(signature)) {
122
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
123
}
124
125
if(dataManipulate != null) {
126
m_dataManipulate_ = dataManipulate;
127
} else {
128
m_dataManipulate_ = new DefaultGetFoldingOffset();
129
}
130
m_isLatin1Linear_ = (m_options_ &
131
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
132
m_dataOffset_ = input.readInt();
133
m_dataLength_ = input.readInt();
134
unserialize(inputStream);
135
}
136
137
// protected data members ------------------------------------------
138
139
/**
140
* Lead surrogate code points' index displacement in the index array.
141
* <pre>{@code
142
* 0x10000-0xd800=0x2800
143
* 0x2800 >> INDEX_STAGE_1_SHIFT_
144
* }</pre>
145
*/
146
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
147
/**
148
* Shift size for shifting right the input index. 1..9
149
*/
150
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
151
/**
152
* Shift size for shifting left the index array values.
153
* Increases possible data size with 16-bit index values at the cost
154
* of compactability.
155
* This requires blocks of stage 2 data to be aligned by
156
* DATA_GRANULARITY.
157
* 0..INDEX_STAGE_1_SHIFT
158
*/
159
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
160
/**
161
* Number of data values in a stage 2 (data array) block.
162
*/
163
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
164
/**
165
* Mask for getting the lower bits from the input index.
166
* DATA_BLOCK_LENGTH - 1.
167
*/
168
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
169
/**
170
* Surrogate mask to use when shifting offset to retrieve supplementary
171
* values
172
*/
173
protected static final int SURROGATE_MASK_ = 0x3FF;
174
/**
175
* Index or UTF16 characters
176
*/
177
protected char m_index_[];
178
/**
179
* Internal TrieValue which handles the parsing of the data value.
180
* This class is to be implemented by the user
181
*/
182
protected DataManipulate m_dataManipulate_;
183
/**
184
* Start index of the data portion of the trie. CharTrie combines
185
* index and data into a char array, so this is used to indicate the
186
* initial offset to the data portion.
187
* Note this index always points to the initial value.
188
*/
189
protected int m_dataOffset_;
190
/**
191
* Length of the data array
192
*/
193
protected int m_dataLength_;
194
195
// protected methods -----------------------------------------------
196
197
/**
198
* Gets the offset to the data which the surrogate pair points to.
199
* @param lead lead surrogate
200
* @param trail trailing surrogate
201
* @return offset to data
202
*/
203
protected abstract int getSurrogateOffset(char lead, char trail);
204
205
/**
206
* Gets the offset to the data which the index ch after variable offset
207
* points to.
208
* Note for locating a non-supplementary character data offset, calling
209
* <p>
210
* getRawOffset(0, ch);
211
* </p>
212
* will do. Otherwise if it is a supplementary character formed by
213
* surrogates lead and trail. Then we would have to call getRawOffset()
214
* with getFoldingIndexOffset(). See getSurrogateOffset().
215
* @param offset index offset which ch is to start from
216
* @param ch index to be used after offset
217
* @return offset to the data
218
*/
219
protected final int getRawOffset(int offset, char ch)
220
{
221
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
222
<< INDEX_STAGE_2_SHIFT_)
223
+ (ch & INDEX_STAGE_3_MASK_);
224
}
225
226
/**
227
* Gets the offset to data which the BMP character points to
228
* Treats a lead surrogate as a normal code point.
229
* @param ch BMP character
230
* @return offset to data
231
*/
232
protected final int getBMPOffset(char ch)
233
{
234
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
235
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
236
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
237
: getRawOffset(0, ch);
238
// using a getRawOffset(ch) makes no diff
239
}
240
241
/**
242
* Gets the offset to the data which this lead surrogate character points
243
* to.
244
* Data at the returned offset may contain folding offset information for
245
* the next trailing surrogate character.
246
* @param ch lead surrogate character
247
* @return offset to data
248
*/
249
protected final int getLeadOffset(char ch)
250
{
251
return getRawOffset(0, ch);
252
}
253
254
/**
255
* Internal trie getter from a code point.
256
* Could be faster(?) but longer with
257
* {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
258
* Gets the offset to data which the codepoint points to
259
* @param ch codepoint
260
* @return offset to data
261
*/
262
protected final int getCodePointOffset(int ch)
263
{
264
// if ((ch >> 16) == 0) slower
265
if (ch < 0) {
266
return -1;
267
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
268
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
269
return getRawOffset(0, (char)ch);
270
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
271
// BMP codepoint
272
return getBMPOffset((char)ch);
273
} else if (ch <= UCharacter.MAX_VALUE) {
274
// look at the construction of supplementary characters
275
// trail forms the ends of it.
276
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
277
(char)(ch & SURROGATE_MASK_));
278
} else {
279
// return -1 if there is an error, in this case we return
280
return -1;
281
}
282
}
283
284
/**
285
* <p>Parses the inputstream and creates the trie index with it.</p>
286
* <p>This is overwritten by the child classes.
287
* @param inputStream input stream containing the trie information
288
* @exception IOException thrown when data reading fails.
289
*/
290
protected void unserialize(InputStream inputStream) throws IOException
291
{
292
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
293
m_index_ = new char[m_dataOffset_];
294
DataInputStream input = new DataInputStream(inputStream);
295
for (int i = 0; i < m_dataOffset_; i ++) {
296
m_index_[i] = input.readChar();
297
}
298
}
299
300
/**
301
* Determines if this is a 16 bit trie
302
* @return true if this is a 16 bit trie
303
*/
304
protected final boolean isCharTrie()
305
{
306
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
307
}
308
309
// private data members --------------------------------------------
310
311
/**
312
* Latin 1 option mask
313
*/
314
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
315
/**
316
* Constant number to authenticate the byte block
317
*/
318
protected static final int HEADER_SIGNATURE_ = 0x54726965;
319
/**
320
* Header option formatting
321
*/
322
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
323
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
324
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
325
326
/**
327
* Flag indicator for Latin quick access data block
328
*/
329
private boolean m_isLatin1Linear_;
330
331
/**
332
* <p>Trie options field.</p>
333
* <p>options bit field:<br>
334
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
335
* 8 0 = 16-bit data, 1=32-bit data<br>
336
* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
337
* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
338
*/
339
private int m_options_;
340
341
// private methods ---------------------------------------------------
342
343
/**
344
* Authenticates raw data header.
345
* Checking the header information, signature and options.
346
* @param signature This contains the options and type of a Trie
347
* @return true if the header is authenticated valid
348
*/
349
private final boolean checkHeader(int signature)
350
{
351
// check the signature
352
// Trie in big-endian US-ASCII (0x54726965).
353
// Magic number to authenticate the data.
354
if (signature != HEADER_SIGNATURE_) {
355
return false;
356
}
357
358
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
359
INDEX_STAGE_1_SHIFT_ ||
360
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
361
HEADER_OPTIONS_SHIFT_MASK_)
362
!= INDEX_STAGE_2_SHIFT_) {
363
return false;
364
}
365
return true;
366
}
367
}
368
369