Path: blob/master/test/jdk/java/text/Normalizer/DataValidationTest.java
41149 views
/*1* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation.7*8* This code is distributed in the hope that it will be useful, but WITHOUT9* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or10* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License11* version 2 for more details (a copy is included in the LICENSE file that12* accompanied this code).13*14* You should have received a copy of the GNU General Public License version15* 2 along with this work; if not, write to the Free Software Foundation,16* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.17*18* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA19* or visit www.oracle.com if you need additional information or have any20* questions.21*/22/*23* test24* bug 422179525* summary Confirm *.icu data using ICU4J Normalizer26*/2728import java.io.BufferedReader;29import java.io.FileInputStream;30import java.io.InputStreamReader;31import java.nio.charset.Charset;32import java.nio.charset.CharsetDecoder;33import java.util.BitSet;34import java.util.StringTokenizer;3536import com.ibm.icu.text.Normalizer;37import com.ibm.icu.impl.NormalizerImpl;3839/**40* This is not a test program but a data validation utility.41* Two datafiles for Normalizer, unorm.icu and uprops.icu under42* sun/text/resouces, are generated using generators in ICU4C 3.2 on a43* BIG-ENDIAN machine. Before using them with java.text.Normalizer and44* sun.text.Normalizer, you may want to check these test datafile's validation.45* You can test datafiles using Normalizer in ICU4J 3.2. Download ICU4J 3.2 and46* run this test program with -cp <ICU4J 3.2>.47*/48public class DataValidationTest {4950//51// Options to be used with com.ibm.icu.text.Normalizer52//5354/*55* Default Unicode 3.2.0 normalization.56*57* - With Corrigendum 4 fix58* (Different from Mustang's Normalizer.)59* - With Public Review Issue #29 fix60* (Different from Mustang's Normalizer.)61*/62private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;6364/*65* *Incomplete* Unicode 3.2.0 normalization for IDNA/StringPrep.66*67* - With Corrigendum 4 fix68* - Without Public Review Issue #29 fix69*70* ICU4J's Normalizer itself doesn't support normalization for Unicode 3.2.071* without Corrigendum 4 fix, which is necessary for IDNA/StringPrep. It is72* done in StringPrep. Therefore, we don't test the normlaization in this73* test program. We merely test normalization for Unicode 3.2.0 without74* Public Review Issue #29 fix with this test program.75*/76private static final int UNICODE_3_2_0_BEFORE_PRI_29 =77Normalizer.UNICODE_3_2 |78NormalizerImpl.BEFORE_PRI_29;7980/*81* Default normalization.82*83* - Unicode 4.0.184* (Different from Mustang's Normalizer.)85* - With Corrigendum 4 fix86* - With Public Review Issue #29 fix87* (Different from Mustang's Normalizer.)88*89* Because Public Review Issue #29 is fixed in Unicode 4.1.0. I think that90* IUC4J 3.2 should not support it. But it actually supports PRI #29 fix91* as default....92*/93private static final int UNICODE_LATEST = 0x00;9495/*96* Normalization without Public Review Issue #29 fix.97*98* - Unicode 4.0.199* - Without Corrigendum 4 fix100* - Without Public Review Issue #29 fix101*/102static final int UNICODE_LATEST_BEFORE_PRI_29 =103NormalizerImpl.BEFORE_PRI_29;104105//106// Conformance test datafiles107//108109/*110* Conformance test datafile for normalization for Unicode 3.2.0 with111* Corrigendum 4 corrections. This is NOT an original Conformace test112* data. Some inconvenient test cases are commented out.113* About corrigendum 4, please refer114* http://www.unicode.org/versions/corrigendum4.html115*116* ICU4J 3.2's Normalizer itself doesn't support normalization for Unicode117* 3.2.0 without Corrigendum 4 corrections. StringPrep helps it. So, we118* don't test the normalization with this test program.119*/120static final String DATA_3_2_0 = "NormalizationTest-3.2.0.Corrigendum4.txt";121122/*123* Conformance test datafile for the latest Unicode which is supported124* by J2SE.125*/126static final String DATA_LATEST = "NormalizationTest-Latest.txt";127128/*129* Decorder130*/131static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();132133/*134* List to pick up characters which are not listed in Part1135*/136static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1);137138/*139* Shortcuts140*/141static final Normalizer.Mode NFC = com.ibm.icu.text.Normalizer.NFC;142static final Normalizer.Mode NFD = com.ibm.icu.text.Normalizer.NFD;143static final Normalizer.Mode NFKC = com.ibm.icu.text.Normalizer.NFKC;144static final Normalizer.Mode NFKD = com.ibm.icu.text.Normalizer.NFKD;145static final Normalizer.Mode[] modes = {NFC, NFD, NFKC, NFKD};146147148public static void main(String[] args) throws Exception {149test(DATA_3_2_0, UNICODE_3_2_0);150test(DATA_3_2_0, UNICODE_3_2_0_BEFORE_PRI_29);151test(DATA_LATEST, UNICODE_LATEST);152// This test started failing since ICU4J 3.6.153// test(DATA_LATEST, UNICODE_LATEST_BEFORE_PRI_29);154155/* Unconformity test */156// test(DATA_3_2_0, UNICODE_LATEST);157// test(DATA_LATEST, UNICODE_3_2);158}159160private static void test(String filename, int unicodeVer) throws Exception {161162FileInputStream fis = new FileInputStream(filename);163BufferedReader in =164new BufferedReader(new InputStreamReader(fis, decoder));165166System.out.println("\nStart testing with " + filename +167" for options: " +168(((unicodeVer & Normalizer.UNICODE_3_2) != 0) ?169"Unicode 3.2.0" : "the latest Unicode") + ", " +170(((unicodeVer & NormalizerImpl.BEFORE_PRI_29) != 0) ?171"with" : "without") + " PRI #29 fix");172173int lineNo = 0;174String text;175String[] columns = new String[6];176boolean part1test = false;177178while ((text = in.readLine()) != null) {179lineNo ++;180181char c = text.charAt(0);182if (c == '#') {183continue;184} else if (c == '@') {185if (text.startsWith("@Part")) {186System.out.println("# Testing data in " + text);187188if (text.startsWith("@Part1 ")) {189part1test = true;190} else {191part1test = false;192}193194continue;195}196}197198prepareColumns(columns, text, filename, lineNo, part1test);199200testNFC(columns, unicodeVer, filename, lineNo);201testNFD(columns, unicodeVer, filename, lineNo);202testNFKC(columns, unicodeVer, filename, lineNo);203testNFKD(columns, unicodeVer, filename, lineNo);204}205206in.close();207fis.close();208209if (unicodeVer == UNICODE_LATEST) {210System.out.println("# Testing characters which are not listed in Part1");211testRemainingChars(filename, unicodeVer);212}213}214215/*216* Test for NFC217*218* c2 == NFC(c1) == NFC(c2) == NFC(c3)219* c4 == NFC(c4) == NFC(c5)220*/221private static void testNFC(String[] c, int unicodeVer,222String file, int line) throws Exception {223test(2, c, 1, 3, NFC, unicodeVer, file, line);224test(4, c, 4, 5, NFC, unicodeVer, file, line);225}226227/*228* Test for NFD229*230* c3 == NFD(c1) == NFD(c2) == NFD(c3)231* c5 == NFD(c4) == NFD(c5)232*/233private static void testNFD(String[] c, int unicodeVer,234String file, int line) throws Exception {235test(3, c, 1, 3, NFD, unicodeVer, file, line);236test(5, c, 4, 5, NFD, unicodeVer, file, line);237}238239/*240* Test for NFKC241*242* c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)243*/244private static void testNFKC(String[] c, int unicodeVer,245String file, int line) throws Exception {246test(4, c, 1, 5, NFKC, unicodeVer, file, line);247}248249/*250* Test for NFKD251*252* c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)253*/254private static void testNFKD(String[] c, int unicodeVer,255String file, int line) throws Exception {256test(5, c, 1, 5, NFKD, unicodeVer, file, line);257}258259/*260* Test for characters which aren't listed in Part1261*262* X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)263*/264private static void testRemainingChars(String file,265int unicodeVer) throws Exception {266for (int i = Character.MIN_CODE_POINT;267i <= Character.MAX_CODE_POINT;268i++) {269if (!charList.get(i)) {270String from = String.valueOf(Character.toChars(i));271String to;272273for (int j = 0; j < modes.length; j++) {274Normalizer.Mode mode = modes[j];275276to = Normalizer.normalize(from, mode, unicodeVer);277if (!from.equals(to)) {278error(mode, from, from, to, file, -1);279// } else {280// okay(mode, from, from, to, file, -1);281}282283if (!Normalizer.isNormalized(from, mode, unicodeVer)) {284error(mode, from, file, -1);285// } else {286// okay(mode, from, file, -1);287}288}289}290}291}292293/*294* Test normalize() and isNormalized()295*/296private static void test(int col, String[] c,297int FROM, int TO,298Normalizer.Mode mode, int unicodeVer,299String file, int line) throws Exception {300for (int i = FROM; i <= TO; i++) {301String got = Normalizer.normalize(c[i], mode, unicodeVer);302if (!c[col].equals(got)) {303error(mode, c[i], c[col], got, file, line);304// } else {305// okay(mode, c[i], c[col], got, file, line);306}307308/*309* If the original String equals its normalized String, it means310* that the original String is normalizerd. Thus, isNormalized()311* should return true. And, vice versa!312*/313if (c[col].equals(c[i])) {314if (!Normalizer.isNormalized(c[i], mode, unicodeVer)) {315error(mode, c[i], file, line);316// } else {317// okay(mode, c[i], file, line);318}319} else {320if (Normalizer.isNormalized(c[i], mode, unicodeVer)) {321error(mode, c[i], file, line);322// } else {323// okay(mode, c[i], file, line);324}325}326}327}328329/*330* Generate an array of String from a line of conformance datafile.331*/332private static void prepareColumns(String[] col, String text,333String file, int line,334boolean part1test) throws Exception {335int index = text.indexOf('#');336if (index != -1) {337text = text.substring(0, index);338}339340StringTokenizer st = new StringTokenizer(text, ";");341int tokenCount = st.countTokens();342if (tokenCount < 5) {343throw new RuntimeException("# of tokens in datafile should be 6, but got: " + tokenCount + " at line " + line + " in " + file);344}345346StringBuffer sb = new StringBuffer();347for (int i = 1; i <= 5; i++) {348StringTokenizer tst = new StringTokenizer(st.nextToken(), " ");349350while (tst.hasMoreTokens()) {351int code = Integer.parseInt(tst.nextToken(), 16);352sb.append(Character.toChars(code));353}354355col[i] = sb.toString();356sb.setLength(0);357}358359if (part1test) {360charList.set(col[1].codePointAt(0));361}362}363364/*365* Show an error message when normalize() didn't return the expected value.366* (An exception is sometimes convenient. Therefore, it is commented out367* for the moment.)368*/369private static void error(Normalizer.Mode mode,370String from, String to, String got,371String file, int line) throws Exception {372System.err.println("\t" + toString(mode) + ": normalize(" +373toHexString(from) + ") doesn't equal <" + toHexString(to) +374"> at line " + line + " in " + file + ". Got <" +375toHexString(got) + ">.");376// throw new RuntimeException("Normalization(" + toString(mode) + ") failed");377}378379/*380* Show an error message when isNormalize() didn't return the expected value.381* (An exception is sometimes convenient. Therefore, it is commented out382* for the moment.)383*/384private static void error(Normalizer.Mode mode, String orig,385String file, int line) throws Exception {386System.err.println("\t" + toString(mode) + ": isNormalized(" +387toHexString(orig) + ") returned the wrong value at line " + line +388" in " + file + ".");389// throw new RuntimeException("Normalization(" + toString(mode) +") failed");390}391392/*393* (For debugging)394* Shows a message when normalize() returned the expected value.395*/396private static void okay(Normalizer.Mode mode,397String from, String to, String got,398String file, int line) {399System.out.println("\t" + toString(mode) + ": normalize(" +400toHexString(from) + ") equals <" + toHexString(to) +401"> at line " + line + " in " + file + ". Got <" +402toHexString(got) + ">.");403}404405/*406* (For debugging)407* Shows a message when isNormalized() returned the expected value.408*/409private static void okay(Normalizer.Mode mode, String orig,410String file, int line) {411System.out.println("\t" + toString(mode) + ": isNormalized(" +412toHexString(orig) + ") returned the correct value at line " +413line + " in " + file + ".");414}415416/*417* Returns a spece-delimited hex String418*/419private static String toHexString(String s) {420StringBuffer sb = new StringBuffer(" ");421422for (int i = 0; i < s.length(); i++) {423sb.append(Integer.toHexString(s.charAt(i)));424sb.append(' ');425}426427return sb.toString();428}429430/*431* Returns the name of Normalizer.Mode432*/433private static String toString(Normalizer.Mode mode) {434if (mode == Normalizer.NFC) {435return "NFC";436} else if (mode == Normalizer.NFD) {437return "NFD";438} else if (mode == Normalizer.NFKC) {439return "NFKC";440} else if (mode == Normalizer.NFKD) {441return "NFKD";442}443444return "unknown";445}446}447448449