Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/test/jdk/java/text/Normalizer/ICUBasicTest.java
41149 views
1
/*
2
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation.
8
*
9
* This code is distributed in the hope that it will be useful, but WITHOUT
10
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12
* version 2 for more details (a copy is included in the LICENSE file that
13
* accompanied this code).
14
*
15
* You should have received a copy of the GNU General Public License version
16
* 2 along with this work; if not, write to the Free Software Foundation,
17
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18
*
19
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
* or visit www.oracle.com if you need additional information or have any
21
* questions.
22
*/
23
/*
24
* @test
25
* @bug 4221795 8032446 8174270
26
* @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
27
* src/com/ibm/icu/dev/test and modified.
28
* @modules java.base/sun.text java.base/jdk.internal.icu.text
29
* @library /java/text/testlib
30
* @compile -XDignore.symbol.file ICUBasicTest.java
31
* @run main/timeout=30 ICUBasicTest
32
*/
33
34
/*
35
*******************************************************************************
36
* Copyright (C) 1996-2004, International Business Machines Corporation and *
37
* others. All Rights Reserved. *
38
*******************************************************************************
39
*/
40
41
import sun.text.Normalizer;
42
import jdk.internal.icu.text.NormalizerBase;
43
44
import static java.text.Normalizer.Form.*;
45
46
public class ICUBasicTest extends IntlTest {
47
48
public static void main(String[] args) throws Exception {
49
new ICUBasicTest().run(args);
50
}
51
52
    /*
     * Normalization modes
     */
    // Internal NormalizerBase.Mode handles for the four standard normalization
    // forms plus the identity ("no normalization") mode.
    // NOTE(review): these constants appear unused within this file (the tests
    // use java.text.Normalizer.Form instead) — confirm before removing.
    private static final NormalizerBase.Mode NFCmode = NormalizerBase.NFC;
    private static final NormalizerBase.Mode NFDmode = NormalizerBase.NFD;
    private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
    private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
    private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;
60
61
    /*
     * Normalization options
     */

    /* Normal Unicode versions */
    // UNICODE_3_2_0 pins the normalization data to Unicode 3.2.0;
    // UNICODE_LATEST selects the JDK's current Unicode normalization data.
    private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;
    private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;
68
69
/*
70
* Special cases for UAX #15 bug
71
* see Unicode Public Review Issue #29
72
* at http://www.unicode.org/review/resolved-pri.html#pri29
73
*
74
* Note:
75
* PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
76
* different for earlier Unicode versions.
77
*/
78
public void TestComposition() {
79
80
final TestCompositionCase cases[] = new TestCompositionCase[] {
81
new TestCompositionCase(NFC, UNICODE_3_2_0,
82
"\u1100\u0300\u1161\u0327",
83
"\u1100\u0300\u1161\u0327"),
84
new TestCompositionCase(NFC, UNICODE_LATEST,
85
"\u1100\u0300\u1161\u0327",
86
"\u1100\u0300\u1161\u0327"),
87
88
new TestCompositionCase(NFC, UNICODE_3_2_0,
89
"\u1100\u0300\u1161\u0327\u11a8",
90
"\u1100\u0300\u1161\u0327\u11a8"),
91
new TestCompositionCase(NFC, UNICODE_LATEST,
92
"\u1100\u0300\u1161\u0327\u11a8",
93
"\u1100\u0300\u1161\u0327\u11a8"),
94
95
new TestCompositionCase(NFC, UNICODE_3_2_0,
96
"\uac00\u0300\u0327\u11a8",
97
"\uac00\u0327\u0300\u11a8"),
98
new TestCompositionCase(NFC, UNICODE_LATEST,
99
"\uac00\u0300\u0327\u11a8",
100
"\uac00\u0327\u0300\u11a8"),
101
102
new TestCompositionCase(NFC, UNICODE_3_2_0,
103
"\u0b47\u0300\u0b3e",
104
"\u0b47\u0300\u0b3e"),
105
new TestCompositionCase(NFC, UNICODE_LATEST,
106
"\u0b47\u0300\u0b3e",
107
"\u0b47\u0300\u0b3e"),
108
};
109
110
String output;
111
int i, length;
112
113
for (i=0; i<cases.length; ++i) {
114
output = Normalizer.normalize(cases[i].input,
115
cases[i].form, cases[i].options);
116
if (!output.equals(cases[i].expect)) {
117
errln("unexpected result for case " + i + ". Expected="
118
+ cases[i].expect + ", Actual=" + output);
119
} else if (verbose) {
120
logln("expected result for case " + i + ". Expected="
121
+ cases[i].expect + ", Actual=" + output);
122
}
123
}
124
}
125
126
private final static class TestCompositionCase {
127
public java.text.Normalizer.Form form;
128
public int options;
129
public String input, expect;
130
131
TestCompositionCase(java.text.Normalizer.Form form,
132
int options,
133
String input,
134
String expect) {
135
this.form = form;
136
this.options = options;
137
this.input = input;
138
this.expect = expect;
139
}
140
}
141
142
/*
143
* Added in order to detect a regression.
144
*/
145
public void TestCombiningMarks() {
146
String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
147
String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
148
String result = NormalizerBase.normalize(src, NFD);
149
150
if (!expected.equals(result)) {
151
errln("Reordering of combining marks failed. Expected: " +
152
toHexString(expected) + " Got: "+ toHexString(result));
153
}
154
}
155
156
/*
157
* Added in order to detect a regression.
158
*/
159
public void TestBengali() throws Exception {
160
String input = "\u09bc\u09be\u09cd\u09be";
161
String output=NormalizerBase.normalize(input, NFC);
162
163
if (!input.equals(output)) {
164
errln("ERROR in NFC of string");
165
}
166
return;
167
}
168
169
170
/*
171
* Added in order to detect a regression.
172
*/
173
/**
174
* Test for a problem found by Verisign. Problem is that
175
* characters at the start of a string are not put in canonical
176
* order correctly by compose() if there is no starter.
177
*/
178
public void TestVerisign() throws Exception {
179
String[] inputs = {
180
"\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
181
"\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
182
};
183
String[] outputs = {
184
"\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
185
"\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
186
};
187
188
for (int i = 0; i < inputs.length; ++i) {
189
String input = inputs[i];
190
String output = outputs[i];
191
192
String result = NormalizerBase.normalize(input, NFD);
193
if (!result.equals(output)) {
194
errln("FAIL input: " + toHexString(input) + "\n" +
195
" decompose: " + toHexString(result) + "\n" +
196
" expected: " + toHexString(output));
197
}
198
199
result = NormalizerBase.normalize(input, NFC);
200
if (!result.equals(output)) {
201
errln("FAIL input: " + toHexString(input) + "\n" +
202
" compose: " + toHexString(result) + "\n" +
203
" expected: " + toHexString(output));
204
}
205
}
206
}
207
208
/**
209
* Test for a problem that showed up just before ICU 1.6 release
210
* having to do with combining characters with an index of zero.
211
* Such characters do not participate in any canonical
212
* decompositions. However, having an index of zero means that
213
* they all share one typeMask[] entry, that is, they all have to
214
* map to the same canonical class, which is not the case, in
215
* reality.
216
*/
217
public void TestZeroIndex() throws Exception {
218
String[] DATA = {
219
// Expect col1 x COMPOSE_COMPAT => col2
220
// Expect col2 x DECOMP => col3
221
"A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
222
"A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
223
"A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
224
"c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
225
"c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
226
};
227
228
for (int i=0; i<DATA.length; i+=3) {
229
String a = DATA[i];
230
String b = NormalizerBase.normalize(a, NFKC);
231
String exp = DATA[i+1];
232
233
if (b.equals(exp)) {
234
logln("Ok: " + toHexString(a) + " x COMPOSE_COMPAT => " +
235
toHexString(b));
236
} else {
237
errln("FAIL: " + toHexString(a) + " x COMPOSE_COMPAT => " +
238
toHexString(b) + ", expect " + toHexString(exp));
239
}
240
241
a = NormalizerBase.normalize(b, NFD);
242
exp = DATA[i+2];
243
if (a.equals(exp)) {
244
logln("Ok: " + toHexString(b) + " x DECOMP => " +
245
toHexString(a));
246
} else {
247
errln("FAIL: " + toHexString(b) + " x DECOMP => " +
248
toHexString(a) + ", expect " + toHexString(exp));
249
}
250
}
251
}
252
253
/**
254
* Make sure characters in the CompositionExclusion.txt list do not get
255
* composed to.
256
*/
257
public void TestCompositionExclusion() throws Exception {
258
// This list is generated from CompositionExclusion.txt.
259
// Update whenever the normalizer tables are updated. Note
260
// that we test all characters listed, even those that can be
261
// derived from the Unicode DB and are therefore commented
262
// out.
263
264
/*
265
* kyuka's note:
266
* Original data seemed to be based on Unicode 3.0.0(the initial
267
* Composition Exclusions list) and seemed to have some mistakes.
268
* Updated in order to correct mistakes and to support Unicode 4.0.0.
269
* And, this table can be used also for Unicode 3.2.0.
270
*/
271
String[][] EXCLUDED_UNICODE_3_2_0 = {
272
{"\u0340"},
273
{"\u0341"},
274
{"\u0343"},
275
{"\u0344"},
276
{"\u0374"},
277
{"\u037E"},
278
{"\u0387"},
279
{"\u0958"},
280
{"\u0959", "\u095F"},
281
{"\u09DC"},
282
{"\u09DD"},
283
{"\u09DF"},
284
{"\u0A33"},
285
{"\u0A36"},
286
{"\u0A59", "\u0A5B"},
287
{"\u0A5E"},
288
{"\u0B5C"},
289
{"\u0B5D"},
290
{"\u0F43"},
291
{"\u0F4D"},
292
{"\u0F52"},
293
{"\u0F57"},
294
{"\u0F5C"},
295
{"\u0F69"},
296
{"\u0F73"},
297
{"\u0F75"},
298
{"\u0F76"},
299
{"\u0F78"},
300
{"\u0F81"},
301
{"\u0F93"},
302
{"\u0F9D"},
303
{"\u0FA2"},
304
{"\u0FA7"},
305
{"\u0FAC"},
306
{"\u0FB9"},
307
{"\u1F71"},
308
{"\u1F73"},
309
{"\u1F75"},
310
{"\u1F77"},
311
{"\u1F79"},
312
{"\u1F7B"},
313
{"\u1F7D"},
314
{"\u1FBB"},
315
{"\u1FBE"},
316
{"\u1FC9"},
317
{"\u1FCB"},
318
{"\u1FD3"},
319
{"\u1FDB"},
320
{"\u1FE3"},
321
{"\u1FEB"},
322
{"\u1FEE"},
323
{"\u1FEF"},
324
{"\u1FF9"},
325
{"\u1FFB"},
326
{"\u1FFD"},
327
{"\u2000"},
328
{"\u2001"},
329
{"\u2126"},
330
{"\u212A"},
331
{"\u212B"},
332
{"\u2329"},
333
{"\u232A"},
334
{"\u2ADC"},
335
{"\uF900", "\uFA0D"},
336
{"\uFA10"},
337
{"\uFA12"},
338
{"\uFA15", "\uFA1E"},
339
{"\uFA20"},
340
{"\uFA22"},
341
{"\uFA25"},
342
{"\uFA26"},
343
{"\uFA2A", "\uFA2D"},
344
{"\uFA30", "\uFA6A"},
345
{"\uFB1D"},
346
{"\uFB1F"},
347
{"\uFB2A", "\uFB36"},
348
{"\uFB38", "\uFB3C"},
349
{"\uFB3E"},
350
{"\uFB40"},
351
{"\uFB41"},
352
{"\uFB43"},
353
{"\uFB44"},
354
{"\uFB46", "\uFB4E"},
355
{"\uD834\uDD5E", "\uD834\uDD64"},
356
{"\uD834\uDDBB", "\uD834\uDDC0"},
357
{"\uD87E\uDC00", "\uD87E\uDE1D"}
358
};
359
360
String[][] EXCLUDED_LATEST = {
361
362
};
363
364
for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
365
if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
366
checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
367
} else {
368
int from, to;
369
from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
370
to = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);
371
372
for (int j = from; j <= to; j++) {
373
checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
374
}
375
}
376
}
377
}
378
379
private void checkCompositionExclusion_320(String s) throws Exception {
380
String a = String.valueOf(s);
381
String b = NormalizerBase.normalize(a, NFKD);
382
String c = NormalizerBase.normalize(b, NFC);
383
384
if (c.equals(a)) {
385
errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
386
toHexString(b) + " x COMPOSE => " +
387
toHexString(c) + " for the latest Unicode");
388
} else if (verbose) {
389
logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
390
toHexString(b) + " x COMPOSE => " +
391
toHexString(c) + " for the latest Unicode");
392
}
393
394
b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
395
c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
396
if (c.equals(a)) {
397
errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
398
toHexString(b) + " x COMPOSE => " +
399
toHexString(c) + " for Unicode 3.2.0");
400
} else if (verbose) {
401
logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
402
toHexString(b) + " x COMPOSE => " +
403
toHexString(c) + " for Unicode 3.2.0");
404
}
405
}
406
407
public void TestTibetan() throws Exception {
408
String[][] decomp = {
409
{ "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
410
};
411
String[][] compose = {
412
{ "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
413
};
414
415
staticTest(NFD, decomp, 1);
416
staticTest(NFKD,decomp, 2);
417
staticTest(NFC, compose, 1);
418
staticTest(NFKC,compose, 2);
419
}
420
421
public void TestExplodingBase() throws Exception{
422
// \u017f - Latin small letter long s
423
// \u0307 - combining dot above
424
// \u1e61 - Latin small letter s with dot above
425
// \u1e9b - Latin small letter long s with dot above
426
String[][] canon = {
427
// Input Decomposed Composed
428
{ "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" },
429
{ "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" },
430
};
431
String[][] compat = {
432
// Input Decomposed Composed
433
{ "\u017f", "s", "s" },
434
{ "\u1e9b", "s\u0307", "\u1e61" },
435
};
436
437
staticTest(NFD, canon, 1);
438
staticTest(NFC, canon, 2);
439
staticTest(NFKD, compat, 1);
440
staticTest(NFKC, compat, 2);
441
}
442
443
    // Canonical (NFD/NFC) test vectors. Each row is
    // { input, expected NFD (column 1), expected NFC (column 2) };
    // driven by staticTest() with the matching outCol.
    private String[][] canonTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },
        { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },

        // D-dot_above
        { "\u1e0a",             "D\u0307",              "\u1e0a"            },

        // D dot_above
        { "D\u0307",            "D\u0307",              "\u1e0a"            },

        // D-dot_below dot_above
        { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D-dot_above dot_below
        { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below dot_above
        { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below cedilla dot_above
        { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"},

        // D dot_above ogonek dot_below
        { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"},

        // E-macron-grave
        { "\u1E14",             "E\u0304\u0300",        "\u1E14"            },

        // E-macron + grave
        { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            },

        // E-grave + macron
        { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      },

        // angstrom_sign
        { "\u212b",             "A\u030a",              "\u00c5"            },

        // A-ring
        { "\u00c5",             "A\u030a",              "\u00c5"            },
        { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },
        { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },

        //updated with 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },
        { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
        { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      },
        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      },

        // Canonical reordering of combining marks below/above.
        { "A\u0300\u0316",      "A\u0316\u0300",        "\u00C0\u0316"      },

        // Supplementary characters (musical symbols as surrogate pairs).
        { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
    };
514
515
    // Compatibility (NFKD/NFKC) test vectors. Each row is
    // { input, expected NFKD (column 1), expected NFKC (column 2) };
    // driven by staticTest() with the matching outCol.
    private String[][] compatTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },

        // Alef-Lamed vs. Alef, Lamed
        { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     },

        { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },

        // ffi ligature -> f + f + i
        { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        },

        //updated for 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },

        // ffi ligature -> f + f + i
        { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry IV",             "Henry IV"          },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            },

        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
        // ka(Hankaku-Katakana) + ten(Hankaku)
        { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            },
    };
553
554
    /** Canonical decomposition: column 1 of canonTests. */
    public void TestNFD() throws Exception{
        staticTest(NFD, canonTests, 1);
    }

    /** Canonical composition: column 2 of canonTests. */
    public void TestNFC() throws Exception{
        staticTest(NFC, canonTests, 2);
    }

    /** Compatibility decomposition: column 1 of compatTests. */
    public void TestNFKD() throws Exception{
        staticTest(NFKD, compatTests, 1);
    }

    /** Compatibility composition: column 2 of compatTests. */
    public void TestNFKC() throws Exception{
        staticTest(NFKC, compatTests, 2);
    }
569
570
private void staticTest(java.text.Normalizer.Form form,
571
String[][] tests,
572
int outCol) throws Exception {
573
for (int i = 0; i < tests.length; i++) {
574
String input = tests[i][0];
575
logln("Normalizing '" + input + "' (" + toHexString(input) + ")" );
576
577
String expect =tests[i][outCol];
578
String output = java.text.Normalizer.normalize(input, form);
579
580
if (!output.equals(expect)) {
581
errln("FAIL: case " + i
582
+ " expected '" + expect + "' (" + toHexString(expect) + ")"
583
+ " but got '" + output + "' (" + toHexString(output) + ")"
584
);
585
}
586
}
587
}
588
589
    // With Canonical decomposition, Hangul syllables should get decomposed
    // into Jamo, but Jamo characters should not be decomposed into
    // conjoining Jamo
    // Rows: { input, expected NFD (column 1), expected NFC (column 2) }.
    private String[][] hangulCanon = {
        // Input                Decomposed              Composed
        { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"        },
        { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"        },
    };
597
598
    /** Hangul Jamo must compose into syllables under NFC (column 2). */
    public void TestHangulCompose() throws Exception{
        logln("Canonical composition...");
        staticTest(NFC, hangulCanon, 2);
    }

    /** Hangul syllables must decompose into Jamo under NFD (column 1). */
    public void TestHangulDecomp() throws Exception{
        logln("Canonical decomposition...");
        staticTest(NFD, hangulCanon, 1);
    }
607
608
}
609
610