Path: blob/master/test/jdk/java/text/Normalizer/ICUBasicTest.java
41149 views
/*
 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 4221795 8032446 8174270
 * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
 * src/com/ibm/icu/dev/test and modified.
 * @modules java.base/sun.text java.base/jdk.internal.icu.text
 * @library /java/text/testlib
 * @compile -XDignore.symbol.file ICUBasicTest.java
 * @run main/timeout=30 ICUBasicTest
 */

/*
 *******************************************************************************
 * Copyright (C) 1996-2004, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 *******************************************************************************
 */

import sun.text.Normalizer;
import jdk.internal.icu.text.NormalizerBase;

import static java.text.Normalizer.Form.*;

/**
 * Basic Unicode-normalization conformance tests imported from ICU4J 3.2
 * (src/com/ibm/icu/dev/test) and adapted to the JDK's internal normalizer.
 *
 * NOTE(review): the public {@code Test*} methods are presumably discovered
 * and invoked reflectively by the {@code IntlTest} harness from
 * /java/text/testlib (see {@code run(args)} in {@code main}) — confirm
 * against the testlib source. Renaming any {@code Test*} method would
 * silently drop it from the run.
 */
public class ICUBasicTest extends IntlTest {

    public static void main(String[] args) throws Exception {
        new ICUBasicTest().run(args);
    }

    /*
     * Normalization modes
     */
    private static final NormalizerBase.Mode NFCmode  = NormalizerBase.NFC;
    private static final NormalizerBase.Mode NFDmode  = NormalizerBase.NFD;
    private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
    private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
    private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;

    /*
     * Normalization options
     */

    /* Normal Unicode versions */
    private static final int UNICODE_3_2_0  = Normalizer.UNICODE_3_2;
    private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;

    /*
     * Special cases for UAX #15 bug
     * see Unicode Public Review Issue #29
     * at http://www.unicode.org/review/resolved-pri.html#pri29
     *
     * Note:
     * PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
     * different for earlier Unicode versions.
     */
    public void TestComposition() {

        // Each case: (form, Unicode version option, input, expected output).
        // Input and expected strings are identical here: these PRI #29
        // sequences must NOT compose under either Unicode version option.
        final TestCompositionCase cases[] = new TestCompositionCase[] {
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
        };

        String output;
        int i, length;  // 'length' is unused; retained from the ICU4J original

        for (i = 0; i < cases.length; ++i) {
            output = Normalizer.normalize(cases[i].input,
                                          cases[i].form, cases[i].options);
            if (!output.equals(cases[i].expect)) {
                errln("unexpected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            } else if (verbose) {
                logln("expected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            }
        }
    }

    /**
     * One composition test vector: normalize {@code input} with
     * {@code form}/{@code options} and expect {@code expect}.
     */
    private final static class TestCompositionCase {
        public java.text.Normalizer.Form form;
        public int options;
        public String input, expect;

        TestCompositionCase(java.text.Normalizer.Form form,
                            int options,
                            String input,
                            String expect) {
            this.form    = form;
            this.options = options;
            this.input   = input;
            this.expect  = expect;
        }
    }

    /*
     * Added in order to detect a regression.
     */
    public void TestCombiningMarks() {
        // Tibetan vowel signs: NFD must decompose U+0F73/U+0F75 and put the
        // combining marks into canonical order.
        String src      = "\u0f71\u0f72\u0f73\u0f74\u0f75";
        String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
        String result   = NormalizerBase.normalize(src, NFD);

        if (!expected.equals(result)) {
            errln("Reordering of combining marks failed. Expected: " +
                  toHexString(expected) + " Got: " + toHexString(result));
        }
    }

    /*
     * Added in order to detect a regression.
     */
    public void TestBengali() throws Exception {
        // The input is already in NFC; normalization must leave it unchanged.
        String input  = "\u09bc\u09be\u09cd\u09be";
        String output = NormalizerBase.normalize(input, NFC);

        if (!input.equals(output)) {
            errln("ERROR in NFC of string");
        }
        return;
    }


    /*
     * Added in order to detect a regression.
     */
    /**
     * Test for a problem found by Verisign. Problem is that
     * characters at the start of a string are not put in canonical
     * order correctly by compose() if there is no starter.
     */
    public void TestVerisign() throws Exception {
        String[] inputs = {
            "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
            "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
        };
        // Expected result for BOTH NFD and NFC: these Hebrew-point sequences
        // have no composites, so composing equals decomposing (reordering only).
        String[] outputs = {
            "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
            "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
        };

        for (int i = 0; i < inputs.length; ++i) {
            String input  = inputs[i];
            String output = outputs[i];

            String result = NormalizerBase.normalize(input, NFD);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      " decompose: " + toHexString(result) + "\n" +
                      "  expected: " + toHexString(output));
            }

            result = NormalizerBase.normalize(input, NFC);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      "   compose: " + toHexString(result) + "\n" +
                      "  expected: " + toHexString(output));
            }
        }
    }

    /**
     * Test for a problem that showed up just before ICU 1.6 release
     * having to do with combining characters with an index of zero.
     * Such characters do not participate in any canonical
     * decompositions. However, having an index of zero means that
     * they all share one typeMask[] entry, that is, they all have to
     * map to the same canonical class, which is not the case, in
     * reality.
     */
    public void TestZeroIndex() throws Exception {
        String[] DATA = {
            // Expect col1 x COMPOSE_COMPAT => col2
            // Expect col2 x DECOMP => col3
            "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
            "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
            "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
        };

        for (int i = 0; i < DATA.length; i += 3) {
            String a = DATA[i];
            String b = NormalizerBase.normalize(a, NFKC);
            String exp = DATA[i+1];

            if (b.equals(exp)) {
                logln("Ok: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b));
            } else {
                errln("FAIL: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b) + ", expect " + toHexString(exp));
            }

            a = NormalizerBase.normalize(b, NFD);
            exp = DATA[i+2];
            if (a.equals(exp)) {
                logln("Ok: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a));
            } else {
                errln("FAIL: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a) + ", expect " + toHexString(exp));
            }
        }
    }

    /**
     * Make sure characters in the CompositionExclusion.txt list do not get
     * composed to.
     */
    public void TestCompositionExclusion() throws Exception {
        // This list is generated from CompositionExclusion.txt.
        // Update whenever the normalizer tables are updated. Note
        // that we test all characters listed, even those that can be
        // derived from the Unicode DB and are therefore commented
        // out.

        /*
         * kyuka's note:
         * Original data seemed to be based on Unicode 3.0.0(the initial
         * Composition Exclusions list) and seemed to have some mistakes.
         * Updated in order to correct mistakes and to support Unicode 4.0.0.
         * And, this table can be used also for Unicode 3.2.0.
         */
        // Each entry is either {single code point} or {rangeStart, rangeEnd}
        // (inclusive), given as UTF-16 strings (surrogate pairs for
        // supplementary characters).
        String[][] EXCLUDED_UNICODE_3_2_0 = {
            {"\u0340"},
            {"\u0341"},
            {"\u0343"},
            {"\u0344"},
            {"\u0374"},
            {"\u037E"},
            {"\u0387"},
            {"\u0958"},
            {"\u0959", "\u095F"},
            {"\u09DC"},
            {"\u09DD"},
            {"\u09DF"},
            {"\u0A33"},
            {"\u0A36"},
            {"\u0A59", "\u0A5B"},
            {"\u0A5E"},
            {"\u0B5C"},
            {"\u0B5D"},
            {"\u0F43"},
            {"\u0F4D"},
            {"\u0F52"},
            {"\u0F57"},
            {"\u0F5C"},
            {"\u0F69"},
            {"\u0F73"},
            {"\u0F75"},
            {"\u0F76"},
            {"\u0F78"},
            {"\u0F81"},
            {"\u0F93"},
            {"\u0F9D"},
            {"\u0FA2"},
            {"\u0FA7"},
            {"\u0FAC"},
            {"\u0FB9"},
            {"\u1F71"},
            {"\u1F73"},
            {"\u1F75"},
            {"\u1F77"},
            {"\u1F79"},
            {"\u1F7B"},
            {"\u1F7D"},
            {"\u1FBB"},
            {"\u1FBE"},
            {"\u1FC9"},
            {"\u1FCB"},
            {"\u1FD3"},
            {"\u1FDB"},
            {"\u1FE3"},
            {"\u1FEB"},
            {"\u1FEE"},
            {"\u1FEF"},
            {"\u1FF9"},
            {"\u1FFB"},
            {"\u1FFD"},
            {"\u2000"},
            {"\u2001"},
            {"\u2126"},
            {"\u212A"},
            {"\u212B"},
            {"\u2329"},
            {"\u232A"},
            {"\u2ADC"},
            {"\uF900", "\uFA0D"},
            {"\uFA10"},
            {"\uFA12"},
            {"\uFA15", "\uFA1E"},
            {"\uFA20"},
            {"\uFA22"},
            {"\uFA25"},
            {"\uFA26"},
            {"\uFA2A", "\uFA2D"},
            {"\uFA30", "\uFA6A"},
            {"\uFB1D"},
            {"\uFB1F"},
            {"\uFB2A", "\uFB36"},
            {"\uFB38", "\uFB3C"},
            {"\uFB3E"},
            {"\uFB40"},
            {"\uFB41"},
            {"\uFB43"},
            {"\uFB44"},
            {"\uFB46", "\uFB4E"},
            {"\uD834\uDD5E", "\uD834\uDD64"},
            {"\uD834\uDDBB", "\uD834\uDDC0"},
            {"\uD87E\uDC00", "\uD87E\uDE1D"}
        };

        // NOTE(review): declared but never read below — presumably a
        // placeholder for additional latest-Unicode exclusions; confirm
        // whether it was intended to be iterated as well.
        String[][] EXCLUDED_LATEST = {

        };

        for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
            if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
                checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
            } else {
                // Two-element entry: expand the inclusive code-point range.
                int from, to;
                from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
                to   = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);

                for (int j = from; j <= to; j++) {
                    checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
                }
            }
        }
    }

    /**
     * Checks one excluded character: after NFKD then NFC, the result must
     * NOT round-trip back to the original character (it is excluded from
     * composition), under both the latest Unicode tables and Unicode 3.2.0.
     */
    private void checkCompositionExclusion_320(String s) throws Exception {
        String a = String.valueOf(s);
        String b = NormalizerBase.normalize(a, NFKD);
        String c = NormalizerBase.normalize(b, NFC);

        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        }

        b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
        c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        }
    }

    public void TestTibetan() throws Exception {
        // U+0F77 is excluded from composition but decomposes canonically.
        String[][] decomp = {
            { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
        };
        String[][] compose = {
            { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
        };

        staticTest(NFD,  decomp,  1);
        staticTest(NFKD, decomp,  2);
        staticTest(NFC,  compose, 1);
        staticTest(NFKC, compose, 2);
    }

    public void TestExplodingBase() throws Exception {
        // \u017f - Latin small letter long s
        // \u0307 - combining dot above
        // \u1e61 - Latin small letter s with dot above
        // \u1e9b - Latin small letter long s with dot above
        String[][] canon = {
            // Input            Decomposed          Composed
            { "Tschu\u017f",    "Tschu\u017f",      "Tschu\u017f"  },
            { "Tschu\u1e9b",    "Tschu\u017f\u0307","Tschu\u1e9b"  },
        };
        String[][] compat = {
            // Input            Decomposed          Composed
            { "\u017f",         "s",                "s"            },
            { "\u1e9b",         "s\u0307",          "\u1e61"       },
        };

        staticTest(NFD,  canon,  1);
        staticTest(NFC,  canon,  2);
        staticTest(NFKD, compat, 1);
        staticTest(NFKC, compat, 2);
    }

    // Canonical normalization test vectors; column 1 is the NFD expectation,
    // column 2 the NFC expectation (see staticTest's outCol parameter).
    private String[][] canonTests = {
        // Input                Decomposed          Composed

        { "cat",                "cat",              "cat"               },
        { "\u00e0ardvark",      "a\u0300ardvark",   "\u00e0ardvark",    },

        // D-dot_above
        { "\u1e0a",             "D\u0307",          "\u1e0a"            },

        // D dot_above
        { "D\u0307",            "D\u0307",          "\u1e0a"            },

        // D-dot_below dot_above
        { "\u1e0c\u0307",       "D\u0323\u0307",    "\u1e0c\u0307"      },

        // D-dot_above dot_below
        { "\u1e0a\u0323",       "D\u0323\u0307",    "\u1e0c\u0307"      },

        // D dot_below dot_above
        { "D\u0307\u0323",      "D\u0323\u0307",    "\u1e0c\u0307"      },

        // D dot_below cedilla dot_above
        { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"},

        // D dot_above ogonek dot_below
        { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"},

        // E-macron-grave
        { "\u1E14",             "E\u0304\u0300",    "\u1E14"            },

        // E-macron + grave
        { "\u0112\u0300",       "E\u0304\u0300",    "\u1E14"            },

        // E-grave + macron
        { "\u00c8\u0304",       "E\u0300\u0304",    "\u00c8\u0304"      },

        // angstrom_sign
        { "\u212b",             "A\u030a",          "\u00c5"            },

        // A-ring
        { "\u00c5",             "A\u030a",          "\u00c5"            },
        { "\u00c4ffin",         "A\u0308ffin",      "\u00c4ffin"        },
        { "\u00c4\uFB03n",      "A\u0308\uFB03n",   "\u00c4\uFB03n"     },

        //updated with 3.0
        { "\u00fdffin",         "y\u0301ffin",      "\u00fdffin"        },
        { "\u00fd\uFB03n",      "y\u0301\uFB03n",   "\u00fd\uFB03n"     },

        { "Henry IV",           "Henry IV",         "Henry IV"          },
        { "Henry \u2163",       "Henry \u2163",     "Henry \u2163"      },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",     "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",     "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
        { "\uFF76\uFF9E",       "\uFF76\uFF9E",     "\uFF76\uFF9E"      },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\uFF9E",     "\u30AB\uFF9E"      },
        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\uFF76\u3099",     "\uFF76\u3099"      },

        { "A\u0300\u0316",      "A\u0316\u0300",    "\u00C0\u0316"      },

        { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
    };

    // Compatibility normalization test vectors; column 1 is the NFKD
    // expectation, column 2 the NFKC expectation.
    private String[][] compatTests = {
        // Input                Decomposed          Composed

        { "cat",                "cat",              "cat"               },

        // Alef-Lamed vs. Alef, Lamed
        { "\uFB4f",             "\u05D0\u05DC",     "\u05D0\u05DC",     },

        { "\u00C4ffin",         "A\u0308ffin",      "\u00C4ffin"        },

        // ffi ligature -> f + f + i
        { "\u00C4\uFB03n",      "A\u0308ffin",      "\u00C4ffin"        },

        //updated for 3.0
        { "\u00fdffin",         "y\u0301ffin",      "\u00fdffin"        },

        // ffi ligature -> f + f + i
        { "\u00fd\uFB03n",      "y\u0301ffin",      "\u00fdffin"        },

        { "Henry IV",           "Henry IV",         "Henry IV"          },
        { "Henry \u2163",       "Henry IV",         "Henry IV"          },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",     "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",     "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\u30AB\u3099",     "\u30AC"            },

        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
        // ka(Hankaku-Katakana) + ten(Hankaku)
        { "\uFF76\uFF9E",       "\u30AB\u3099",     "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\u3099",     "\u30AC"            },
    };

    public void TestNFD() throws Exception {
        staticTest(NFD, canonTests, 1);
    }

    public void TestNFC() throws Exception {
        staticTest(NFC, canonTests, 2);
    }

    public void TestNFKD() throws Exception {
        staticTest(NFKD, compatTests, 1);
    }

    public void TestNFKC() throws Exception {
        staticTest(NFKC, compatTests, 2);
    }

    /**
     * Normalizes column 0 of each row with {@code form} via the public
     * java.text.Normalizer API and compares against column {@code outCol}
     * (1 = decomposed expectation, 2 = composed expectation).
     */
    private void staticTest(java.text.Normalizer.Form form,
                            String[][] tests,
                            int outCol) throws Exception {
        for (int i = 0; i < tests.length; i++) {
            String input = tests[i][0];
            logln("Normalizing '" + input + "' (" + toHexString(input) + ")" );

            String expect = tests[i][outCol];
            String output = java.text.Normalizer.normalize(input, form);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + toHexString(expect) + ")"
                    + " but got '" + output + "' (" + toHexString(output) + ")"
                );
            }
        }
    }

    // With Canonical decomposition, Hangul syllables should get decomposed
    // into Jamo, but Jamo characters should not be decomposed into
    // conjoining Jamo
    private String[][] hangulCanon = {
        // Input                Decomposed          Composed
        { "\ud4db",             "\u1111\u1171\u11b6", "\ud4db"          },
        { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db"          },
    };

    public void TestHangulCompose() throws Exception {
        logln("Canonical composition...");
        staticTest(NFC, hangulCanon, 2);
    }

    public void TestHangulDecomp() throws Exception {
        logln("Canonical decomposition...");
        staticTest(NFD, hangulCanon, 1);
    }

}