Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/text/StringPrep.java
41161 views
1
/*
2
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
/*
27
*******************************************************************************
28
* Copyright (C) 2003-2004, International Business Machines Corporation and *
29
* others. All Rights Reserved. *
30
*******************************************************************************
31
*/
32
//
33
// CHANGELOG
34
// 2005-05-19 Edward Wang
35
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
36
// - move from package com.ibm.icu.text to package sun.net.idn
37
// - use ParseException instead of StringPrepParseException
38
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
39
// - remove all @deprecated tag to make compiler happy
40
// 2007-08-14 Martin Buchholz
41
// - remove redundant casts
42
//
43
package jdk.internal.icu.text;
44
45
import java.io.BufferedInputStream;
46
import java.io.ByteArrayInputStream;
47
import java.io.IOException;
48
import java.io.InputStream;
49
import java.text.ParseException;
50
51
import sun.text.Normalizer;
52
import jdk.internal.icu.impl.CharTrie;
53
import jdk.internal.icu.impl.StringPrepDataReader;
54
import jdk.internal.icu.impl.Trie;
55
import jdk.internal.icu.lang.UCharacter;
56
import jdk.internal.icu.lang.UCharacterDirection;
57
import jdk.internal.icu.util.VersionInfo;
58
59
/**
60
* StringPrep API implements the StringPrep framework as described by
61
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
62
* StringPrep prepares Unicode strings for use in network protocols.
63
* Profiles of StringPrep are sets of rules and data according to which the
64
* Unicode Strings are prepared. Each profile contains tables which describe
65
* how a code point should be treated. The tables are broadly classified into
66
* <ul>
67
* <li> Unassigned Table: Contains code points that are unassigned
68
* in the Unicode Version supported by StringPrep. Currently
69
* RFC 3454 supports Unicode 3.2. </li>
70
* <li> Prohibited Table: Contains code points that are prohibited from
71
* the output of the StringPrep processing function. </li>
72
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
73
* </ul>
74
*
75
* The procedure for preparing Unicode strings:
76
* <ol>
77
* <li> Map: For each character in the input, check if it has a mapping
78
* and, if so, replace it with its mapping. </li>
79
* <li> Normalize: Possibly normalize the result of step 1 using Unicode
80
* normalization. </li>
81
* <li> Prohibit: Check for any characters that are not allowed in the
82
* output. If any are found, return an error.</li>
83
* <li> Check bidi: Possibly check for right-to-left characters, and if
84
* any are found, make sure that the whole string satisfies the
85
* requirements for bidirectional strings. If the string does not
86
* satisfy the requirements for bidirectional strings, return an
87
* error. </li>
88
* </ol>
89
* @author Ram Viswanadha
90
* @draft ICU 2.8
91
*/
92
public final class StringPrep {
93
/**
94
* Option to prohibit processing of unassigned code points in the input
95
*
96
* @see #prepare
97
* @draft ICU 2.8
98
*/
99
public static final int DEFAULT = 0x0000;
100
101
/**
102
* Option to allow processing of unassigned code points in the input
103
*
104
* @see #prepare
105
* @draft ICU 2.8
106
*/
107
public static final int ALLOW_UNASSIGNED = 0x0001;
108
109
private static final int UNASSIGNED = 0x0000;
110
private static final int MAP = 0x0001;
111
private static final int PROHIBITED = 0x0002;
112
private static final int DELETE = 0x0003;
113
private static final int TYPE_LIMIT = 0x0004;
114
115
private static final int NORMALIZATION_ON = 0x0001;
116
private static final int CHECK_BIDI_ON = 0x0002;
117
118
private static final int TYPE_THRESHOLD = 0xFFF0;
119
private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
120
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
121
122
/* indexes[] value names */
123
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
124
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
125
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
126
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
127
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
128
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
129
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
130
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
131
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
132
133
134
/**
135
* Default buffer size of datafile
136
*/
137
private static final int DATA_BUFFER_SIZE = 25000;
138
139
/* Wrappers for Trie implementations */
140
private static final class StringPrepTrieImpl implements Trie.DataManipulate{
141
private CharTrie sprepTrie = null;
142
/**
143
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
144
* data the index array offset of the indexes for that lead surrogate.
145
* @param property data value for a surrogate from the trie, including
146
* the folding offset
147
* @return data offset or 0 if there is no data for the lead surrogate
148
*/
149
public int getFoldingOffset(int value){
150
return value;
151
}
152
}
153
154
// CharTrie implementation for reading the trie data
155
private StringPrepTrieImpl sprepTrieImpl;
156
// Indexes read from the data file
157
private int[] indexes;
158
// mapping data read from the data file
159
private char[] mappingData;
160
// format version of the data file
161
private byte[] formatVersion;
162
// the version of Unicode supported by the data file
163
private VersionInfo sprepUniVer;
164
// the Unicode version of last entry in the
165
// NormalizationCorrections.txt file if normalization
166
// is turned on
167
private VersionInfo normCorrVer;
168
// Option to turn on Normalization
169
private boolean doNFKC;
170
// Option to turn on checking for BiDi rules
171
private boolean checkBiDi;
172
173
174
private char getCodePointValue(int ch){
175
return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
176
}
177
178
private static VersionInfo getVersionInfo(int comp){
179
int micro = comp & 0xFF;
180
int milli =(comp >> 8) & 0xFF;
181
int minor =(comp >> 16) & 0xFF;
182
int major =(comp >> 24) & 0xFF;
183
return VersionInfo.getInstance(major,minor,milli,micro);
184
}
185
private static VersionInfo getVersionInfo(byte[] version){
186
if(version.length != 4){
187
return null;
188
}
189
return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
190
}
191
/**
192
* Creates an StringPrep object after reading the input stream.
193
* The object does not hold a reference to the input steam, so the stream can be
194
* closed after the method returns.
195
*
196
* @param inputStream The stream for reading the StringPrep profile binarySun
197
* @throws IOException
198
* @draft ICU 2.8
199
*/
200
public StringPrep(InputStream inputStream) throws IOException{
201
202
BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
203
204
StringPrepDataReader reader = new StringPrepDataReader(b);
205
206
// read the indexes
207
indexes = reader.readIndexes(INDEX_TOP);
208
209
byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
210
211
212
//indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
213
mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
214
// load the rest of the data and initialize the data members
215
reader.read(sprepBytes,mappingData);
216
217
sprepTrieImpl = new StringPrepTrieImpl();
218
sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );
219
220
// get the data format version
221
formatVersion = reader.getDataFormatVersion();
222
223
// get the options
224
doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
225
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
226
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
227
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
228
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
229
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
230
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
231
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
232
){
233
throw new IOException("Normalization Correction version not supported");
234
}
235
b.close();
236
}
237
238
private static final class Values{
239
boolean isIndex;
240
int value;
241
int type;
242
public void reset(){
243
isIndex = false;
244
value = 0;
245
type = -1;
246
}
247
}
248
249
private static final void getValues(char trieWord,Values values){
250
values.reset();
251
if(trieWord == 0){
252
/*
253
* Initial value stored in the mapping table
254
* just return TYPE_LIMIT .. so that
255
* the source codepoint is copied to the destination
256
*/
257
values.type = TYPE_LIMIT;
258
}else if(trieWord >= TYPE_THRESHOLD){
259
values.type = (trieWord - TYPE_THRESHOLD);
260
}else{
261
/* get the type */
262
values.type = MAP;
263
/* ascertain if the value is index or delta */
264
if((trieWord & 0x02)>0){
265
values.isIndex = true;
266
values.value = trieWord >> 2; //mask off the lower 2 bits and shift
267
268
}else{
269
values.isIndex = false;
270
values.value = (trieWord<<16)>>16;
271
values.value = (values.value >> 2);
272
273
}
274
275
if((trieWord>>2) == MAX_INDEX_VALUE){
276
values.type = DELETE;
277
values.isIndex = false;
278
values.value = 0;
279
}
280
}
281
}
282
283
284
285
private StringBuffer map( UCharacterIterator iter, int options)
286
throws ParseException {
287
288
Values val = new Values();
289
char result = 0;
290
int ch = UCharacterIterator.DONE;
291
StringBuffer dest = new StringBuffer();
292
boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
293
294
while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
295
296
result = getCodePointValue(ch);
297
getValues(result,val);
298
299
// check if the source codepoint is unassigned
300
if(val.type == UNASSIGNED && allowUnassigned == false){
301
throw new ParseException("An unassigned code point was found in the input " +
302
iter.getText(), iter.getIndex());
303
}else if((val.type == MAP)){
304
int index, length;
305
306
if(val.isIndex){
307
index = val.value;
308
if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
309
index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
310
length = 1;
311
}else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
312
index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
313
length = 2;
314
}else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
315
index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
316
length = 3;
317
}else{
318
length = mappingData[index++];
319
}
320
/* copy mapping to destination */
321
dest.append(mappingData,index,length);
322
continue;
323
324
}else{
325
ch -= val.value;
326
}
327
}else if(val.type == DELETE){
328
// just consume the codepoint and contine
329
continue;
330
}
331
//copy the source into destination
332
UTF16.append(dest,ch);
333
}
334
335
return dest;
336
}
337
338
339
private StringBuffer normalize(StringBuffer src){
340
/*
341
* Option UNORM_BEFORE_PRI_29:
342
*
343
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
344
* requires strict adherence to Unicode 3.2 normalization,
345
* including buggy composition from before fixing Public Review Issue #29.
346
* Note that this results in some valid but nonsensical text to be
347
* either corrupted or rejected, depending on the text.
348
* See http://www.unicode.org/review/resolved-pri.html#pri29
349
* See unorm.cpp and cnormtst.c
350
*/
351
return new StringBuffer(
352
Normalizer.normalize(
353
src.toString(),
354
java.text.Normalizer.Form.NFKC,
355
Normalizer.UNICODE_3_2));
356
}
357
/*
358
boolean isLabelSeparator(int ch){
359
int result = getCodePointValue(ch);
360
if( (result & 0x07) == LABEL_SEPARATOR){
361
return true;
362
}
363
return false;
364
}
365
*/
366
/*
367
1) Map -- For each character in the input, check if it has a mapping
368
and, if so, replace it with its mapping.
369
370
2) Normalize -- Possibly normalize the result of step 1 using Unicode
371
normalization.
372
373
3) Prohibit -- Check for any characters that are not allowed in the
374
output. If any are found, return an error.
375
376
4) Check bidi -- Possibly check for right-to-left characters, and if
377
any are found, make sure that the whole string satisfies the
378
requirements for bidirectional strings. If the string does not
379
satisfy the requirements for bidirectional strings, return an
380
error.
381
[Unicode3.2] defines several bidirectional categories; each character
382
has one bidirectional category assigned to it. For the purposes of
383
the requirements below, an "RandALCat character" is a character that
384
has Unicode bidirectional categories "R" or "AL"; an "LCat character"
385
is a character that has Unicode bidirectional category "L". Note
386
387
388
that there are many characters which fall in neither of the above
389
definitions; Latin digits (<U+0030> through <U+0039>) are examples of
390
this because they have bidirectional category "EN".
391
392
In any profile that specifies bidirectional character handling, all
393
three of the following requirements MUST be met:
394
395
1) The characters in section 5.8 MUST be prohibited.
396
397
2) If a string contains any RandALCat character, the string MUST NOT
398
contain any LCat character.
399
400
3) If a string contains any RandALCat character, a RandALCat
401
character MUST be the first character of the string, and a
402
RandALCat character MUST be the last character of the string.
403
*/
404
/**
405
* Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
406
* checks for prohited and BiDi characters in the order defined by RFC 3454
407
* depending on the options specified in the profile.
408
*
409
* @param src A UCharacterIterator object containing the source string
410
* @param options A bit set of options:
411
*
412
* - StringPrep.NONE Prohibit processing of unassigned code points in the input
413
*
414
* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input
415
* as normal Unicode code points.
416
*
417
* @return StringBuffer A StringBuffer containing the output
418
* @throws ParseException
419
* @draft ICU 2.8
420
*/
421
public StringBuffer prepare(UCharacterIterator src, int options)
422
throws ParseException{
423
424
// map
425
StringBuffer mapOut = map(src,options);
426
StringBuffer normOut = mapOut;// initialize
427
428
if(doNFKC){
429
// normalize
430
normOut = normalize(mapOut);
431
}
432
433
int ch;
434
char result;
435
UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
436
Values val = new Values();
437
int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
438
firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
439
int rtlPos=-1, ltrPos=-1;
440
boolean rightToLeft=false, leftToRight=false;
441
442
while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
443
result = getCodePointValue(ch);
444
getValues(result,val);
445
446
if(val.type == PROHIBITED ){
447
throw new ParseException("A prohibited code point was found in the input" +
448
iter.getText(), val.value);
449
}
450
451
direction = UCharacter.getDirection(ch);
452
if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
453
firstCharDir = direction;
454
}
455
if(direction == UCharacterDirection.LEFT_TO_RIGHT){
456
leftToRight = true;
457
ltrPos = iter.getIndex()-1;
458
}
459
if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
460
rightToLeft = true;
461
rtlPos = iter.getIndex()-1;
462
}
463
}
464
if(checkBiDi == true){
465
// satisfy 2
466
if( leftToRight == true && rightToLeft == true){
467
throw new ParseException("The input does not conform to the rules for BiDi code points." +
468
iter.getText(),
469
(rtlPos>ltrPos) ? rtlPos : ltrPos);
470
}
471
472
//satisfy 3
473
if( rightToLeft == true &&
474
!((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
475
(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
476
){
477
throw new ParseException("The input does not conform to the rules for BiDi code points." +
478
iter.getText(),
479
(rtlPos>ltrPos) ? rtlPos : ltrPos);
480
}
481
}
482
return normOut;
483
484
}
485
}
486
487