Path: blob/master/src/java.base/share/classes/java/text/RBTableBuilder.java
41152 views
/*1* Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved27* (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved28*29* The original version of this source code and documentation is copyrighted30* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These31* materials are provided under terms of a License Agreement between Taligent32* and Sun. This technology is protected by multiple US and International33* patents. This notice and attribution to Taligent may not be removed.34* Taligent is a registered trademark of Taligent, Inc.35*36*/3738package java.text;3940import java.util.Vector;41import sun.text.UCompactIntArray;42import sun.text.IntHashtable;43import sun.text.ComposedCharIter;44import jdk.internal.icu.impl.NormalizerImpl;4546/**47* This class contains all the code to parse a RuleBasedCollator pattern48* and build a RBCollationTables object from it. A particular instance49* of tis class exists only during the actual build process-- once an50* RBCollationTables object has been built, the RBTableBuilder object51* goes away. This object carries all of the state which is only needed52* during the build process, plus a "shadow" copy of all of the state53* that will go into the tables object itself. This object communicates54* with RBCollationTables through a separate class, RBCollationTables.BuildAPI,55* this is an inner class of RBCollationTables and provides a separate56* private API for communication with RBTableBuilder.57* This class isn't just an inner class of RBCollationTables itself because58* of its large size. For source-code readability, it seemed better for the59* builder to have its own source file.60*/61final class RBTableBuilder {6263public RBTableBuilder(RBCollationTables.BuildAPI tables) {64this.tables = tables;65}6667/**68* Create a table-based collation object with the given rules.69* This is the main function that actually builds the tables and70* stores them back in the RBCollationTables object. It is called71* ONLY by the RBCollationTables constructor.72* @see RuleBasedCollator#RuleBasedCollator73* @throws ParseException If the rules format is incorrect.74*/7576public void build(String pattern, int decmp) throws ParseException {77String expChars;78String groupChars;79if (pattern.isEmpty())80throw new ParseException("Build rules empty.", 0);8182// This array maps Unicode characters to their collation ordering83mapping = new UCompactIntArray(RBCollationTables.UNMAPPED);84// Normalize the build rules. Find occurances of all decomposed characters85// and normalize the rules before feeding into the builder. By "normalize",86// we mean that all precomposed Unicode characters must be converted into87// a base character and one or more combining characters (such as accents).88// When there are multiple combining characters attached to a base character,89// the combining characters must be in their canonical order90//91// sherman/Note:92//(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose93//hangual syllables to jamos, so we can actually just call decompose with94//normalizer's IGNORE_HANGUL option turned on95//96//(2)just call the "special version" in NormalizerImpl directly97//pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);98//99//Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);100//pattern = Normalizer.normalize(pattern, mode, 0, true);101102pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);103104// Build the merged collation entries105// Since rules can be specified in any order in the string106// (e.g. "c , C < d , D < e , E .... C < CH")107// this splits all of the rules in the string out into separate108// objects and then sorts them. In the above example, it merges the109// "C < CH" rule in just before the "C < D" rule.110//111112mPattern = new MergeCollation(pattern);113114int order = 0;115116// Now walk though each entry and add it to my own tables117for (int i = 0; i < mPattern.getCount(); ++i) {118PatternEntry entry = mPattern.getItemAt(i);119if (entry != null) {120groupChars = entry.getChars();121if (groupChars.length() > 1) {122switch(groupChars.charAt(groupChars.length()-1)) {123case '@':124frenchSec = true;125groupChars = groupChars.substring(0, groupChars.length()-1);126break;127case '!':128seAsianSwapping = true;129groupChars = groupChars.substring(0, groupChars.length()-1);130break;131}132}133134order = increment(entry.getStrength(), order);135expChars = entry.getExtension();136137if (!expChars.isEmpty()) {138addExpandOrder(groupChars, expChars, order);139} else if (groupChars.length() > 1) {140char ch = groupChars.charAt(0);141if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {142addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);143} else {144addContractOrder(groupChars, order);145}146} else {147char ch = groupChars.charAt(0);148addOrder(ch, order);149}150}151}152addComposedChars();153154commit();155mapping.compact();156/*157System.out.println("mappingSize=" + mapping.getKSize());158for (int j = 0; j < 0xffff; j++) {159int value = mapping.elementAt(j);160if (value != RBCollationTables.UNMAPPED)161System.out.println("index=" + Integer.toString(j, 16)162+ ", value=" + Integer.toString(value, 16));163}164*/165tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,166contractFlags, maxSecOrder, maxTerOrder);167}168169/** Add expanding entries for pre-composed unicode characters so that this170* collator can be used reasonably well with decomposition turned off.171*/172private void addComposedChars() throws ParseException {173// Iterate through all of the pre-composed characters in Unicode174ComposedCharIter iter = new ComposedCharIter();175int c;176while ((c = iter.next()) != ComposedCharIter.DONE) {177if (getCharOrder(c) == RBCollationTables.UNMAPPED) {178//179// We don't already have an ordering for this pre-composed character.180//181// First, see if the decomposed string is already in our182// tables as a single contracting-string ordering.183// If so, just map the precomposed character to that order.184//185// TODO: What we should really be doing here is trying to find the186// longest initial substring of the decomposition that is present187// in the tables as a contracting character sequence, and find its188// ordering. Then do this recursively with the remaining chars189// so that we build a list of orderings, and add that list to190// the expansion table.191// That would be more correct but also significantly slower, so192// I'm not totally sure it's worth doing.193//194String s = iter.decomposition();195196//sherman/Note: if this is 1 character decomposed string, the197//only thing need to do is to check if this decomposed character198//has an entry in our order table, this order is not necessary199//to be a contraction order, if it does have one, add an entry200//for the precomposed character by using the same order, the201//previous impl unnecessarily adds a single character expansion202//entry.203if (s.length() == 1) {204int order = getCharOrder(s.charAt(0));205if (order != RBCollationTables.UNMAPPED) {206addOrder(c, order);207}208continue;209} else if (s.length() == 2) {210char ch0 = s.charAt(0);211if (Character.isHighSurrogate(ch0)) {212int order = getCharOrder(s.codePointAt(0));213if (order != RBCollationTables.UNMAPPED) {214addOrder(c, order);215}216continue;217}218}219int contractOrder = getContractOrder(s);220if (contractOrder != RBCollationTables.UNMAPPED) {221addOrder(c, contractOrder);222} else {223//224// We don't have a contracting ordering for the entire string225// that results from the decomposition, but if we have orders226// for each individual character, we can add an expanding227// table entry for the pre-composed character228//229boolean allThere = true;230for (int i = 0; i < s.length(); i++) {231if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {232allThere = false;233break;234}235}236if (allThere) {237addExpandOrder(c, s, RBCollationTables.UNMAPPED);238}239}240}241}242}243244/**245* Look up for unmapped values in the expanded character table.246*247* When the expanding character tables are built by addExpandOrder,248* it doesn't know what the final ordering of each character249* in the expansion will be. Instead, it just puts the raw character250* code into the table, adding CHARINDEX as a flag. Now that we've251* finished building the mapping table, we can go back and look up252* that character to see what its real collation order is and253* stick that into the expansion table. That lets us avoid doing254* a two-stage lookup later.255*/256private final void commit()257{258if (expandTable != null) {259for (int i = 0; i < expandTable.size(); i++) {260int[] valueList = expandTable.elementAt(i);261for (int j = 0; j < valueList.length; j++) {262int order = valueList[j];263if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {264// found a expanding character that isn't filled in yet265int ch = order - CHARINDEX;266267// Get the real values for the non-filled entry268int realValue = getCharOrder(ch);269270if (realValue == RBCollationTables.UNMAPPED) {271// The real value is still unmapped, maybe it's ignorable272valueList[j] = IGNORABLEMASK & ch;273} else {274// just fill in the value275valueList[j] = realValue;276}277}278}279}280}281}282/**283* Increment of the last order based on the comparison level.284*/285private final int increment(int aStrength, int lastValue)286{287switch(aStrength)288{289case Collator.PRIMARY:290// increment priamry order and mask off secondary and tertiary difference291lastValue += PRIMARYORDERINCREMENT;292lastValue &= RBCollationTables.PRIMARYORDERMASK;293isOverIgnore = true;294break;295case Collator.SECONDARY:296// increment secondary order and mask off tertiary difference297lastValue += SECONDARYORDERINCREMENT;298lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;299// record max # of ignorable chars with secondary difference300if (!isOverIgnore)301maxSecOrder++;302break;303case Collator.TERTIARY:304// increment tertiary order305lastValue += TERTIARYORDERINCREMENT;306// record max # of ignorable chars with tertiary difference307if (!isOverIgnore)308maxTerOrder++;309break;310}311return lastValue;312}313314/**315* Adds a character and its designated order into the collation table.316*/317private final void addOrder(int ch, int anOrder)318{319// See if the char already has an order in the mapping table320int order = mapping.elementAt(ch);321322if (order >= RBCollationTables.CONTRACTCHARINDEX) {323// There's already an entry for this character that points to a contracting324// character table. Instead of adding the character directly to the mapping325// table, we must add it to the contract table instead.326int length = 1;327if (Character.isSupplementaryCodePoint(ch)) {328length = Character.toChars(ch, keyBuf, 0);329} else {330keyBuf[0] = (char)ch;331}332addContractOrder(new String(keyBuf, 0, length), anOrder);333} else {334// add the entry to the mapping table,335// the same later entry replaces the previous one336mapping.setElementAt(ch, anOrder);337}338}339340private final void addContractOrder(String groupChars, int anOrder) {341addContractOrder(groupChars, anOrder, true);342}343344/**345* Adds the contracting string into the collation table.346*/347private final void addContractOrder(String groupChars, int anOrder,348boolean fwd)349{350if (contractTable == null) {351contractTable = new Vector<>(INITIALTABLESIZE);352}353354//initial character355int ch = groupChars.codePointAt(0);356/*357char ch0 = groupChars.charAt(0);358int ch = Character.isHighSurrogate(ch0)?359Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;360*/361// See if the initial character of the string already has a contract table.362int entry = mapping.elementAt(ch);363Vector<EntryPair> entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);364365if (entryTable == null) {366// We need to create a new table of contract entries for this base char367int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();368entryTable = new Vector<>(INITIALTABLESIZE);369contractTable.addElement(entryTable);370371// Add the initial character's current ordering first. then372// update its mapping to point to this contract table373entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));374mapping.setElementAt(ch, tableIndex);375}376377// Now add (or replace) this string in the table378int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);379if (index != RBCollationTables.UNMAPPED) {380EntryPair pair = entryTable.elementAt(index);381pair.value = anOrder;382} else {383EntryPair pair = entryTable.lastElement();384385// NOTE: This little bit of logic is here to speed CollationElementIterator386// .nextContractChar(). This code ensures that the longest sequence in387// this list is always the _last_ one in the list. This keeps388// nextContractChar() from having to search the entire list for the longest389// sequence.390if (groupChars.length() > pair.entryName.length()) {391entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));392} else {393entryTable.insertElementAt(new EntryPair(groupChars, anOrder,394fwd), entryTable.size() - 1);395}396}397398// If this was a forward mapping for a contracting string, also add a399// reverse mapping for it, so that CollationElementIterator.previous400// can work right401if (fwd && groupChars.length() > 1) {402addContractFlags(groupChars);403addContractOrder(new StringBuffer(groupChars).reverse().toString(),404anOrder, false);405}406}407408/**409* If the given string has been specified as a contracting string410* in this collation table, return its ordering.411* Otherwise return UNMAPPED.412*/413private int getContractOrder(String groupChars)414{415int result = RBCollationTables.UNMAPPED;416if (contractTable != null) {417int ch = groupChars.codePointAt(0);418/*419char ch0 = groupChars.charAt(0);420int ch = Character.isHighSurrogate(ch0)?421Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;422*/423Vector<EntryPair> entryTable = getContractValues(ch);424if (entryTable != null) {425int index = RBCollationTables.getEntry(entryTable, groupChars, true);426if (index != RBCollationTables.UNMAPPED) {427EntryPair pair = entryTable.elementAt(index);428result = pair.value;429}430}431}432return result;433}434435private final int getCharOrder(int ch) {436int order = mapping.elementAt(ch);437438if (order >= RBCollationTables.CONTRACTCHARINDEX) {439Vector<EntryPair> groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);440EntryPair pair = groupList.firstElement();441order = pair.value;442}443return order;444}445446/**447* Get the entry of hash table of the contracting string in the collation448* table.449* @param ch the starting character of the contracting string450*/451private Vector<EntryPair> getContractValues(int ch)452{453int index = mapping.elementAt(ch);454return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);455}456457private Vector<EntryPair> getContractValuesImpl(int index)458{459if (index >= 0)460{461return contractTable.elementAt(index);462}463else // not found464{465return null;466}467}468469/**470* Adds the expanding string into the collation table.471*/472private final void addExpandOrder(String contractChars,473String expandChars,474int anOrder) throws ParseException475{476// Create an expansion table entry477int tableIndex = addExpansion(anOrder, expandChars);478479// And add its index into the main mapping table480if (contractChars.length() > 1) {481char ch = contractChars.charAt(0);482if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {483char ch2 = contractChars.charAt(1);484if (Character.isLowSurrogate(ch2)) {485//only add into table when it is a legal surrogate486addOrder(Character.toCodePoint(ch, ch2), tableIndex);487}488} else {489addContractOrder(contractChars, tableIndex);490}491} else {492addOrder(contractChars.charAt(0), tableIndex);493}494}495496private final void addExpandOrder(int ch, String expandChars, int anOrder)497throws ParseException498{499int tableIndex = addExpansion(anOrder, expandChars);500addOrder(ch, tableIndex);501}502503/**504* Create a new entry in the expansion table that contains the orderings505* for the given characers. If anOrder is valid, it is added to the506* beginning of the expanded list of orders.507*/508private int addExpansion(int anOrder, String expandChars) {509if (expandTable == null) {510expandTable = new Vector<>(INITIALTABLESIZE);511}512513// If anOrder is valid, we want to add it at the beginning of the list514int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;515516int[] valueList = new int[expandChars.length() + offset];517if (offset == 1) {518valueList[0] = anOrder;519}520521int j = offset;522for (int i = 0; i < expandChars.length(); i++) {523char ch0 = expandChars.charAt(i);524char ch1;525int ch;526if (Character.isHighSurrogate(ch0)) {527if (++i == expandChars.length() ||528!Character.isLowSurrogate(ch1=expandChars.charAt(i))) {529//ether we are missing the low surrogate or the next char530//is not a legal low surrogate, so stop loop531break;532}533ch = Character.toCodePoint(ch0, ch1);534535} else {536ch = ch0;537}538539int mapValue = getCharOrder(ch);540541if (mapValue != RBCollationTables.UNMAPPED) {542valueList[j++] = mapValue;543} else {544// can't find it in the table, will be filled in by commit().545valueList[j++] = CHARINDEX + ch;546}547}548if (j < valueList.length) {549//we had at least one supplementary character, the size of valueList550//is bigger than it really needs...551int[] tmpBuf = new int[j];552while (--j >= 0) {553tmpBuf[j] = valueList[j];554}555valueList = tmpBuf;556}557// Add the expanding char list into the expansion table.558int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();559expandTable.addElement(valueList);560561return tableIndex;562}563564private void addContractFlags(String chars) {565char c0;566int c;567int len = chars.length();568for (int i = 0; i < len; i++) {569c0 = chars.charAt(i);570c = Character.isHighSurrogate(c0)571?Character.toCodePoint(c0, chars.charAt(++i))572:c0;573contractFlags.put(c, 1);574}575}576577// ==============================================================578// constants579// ==============================================================580static final int CHARINDEX = 0x70000000; // need look up in .commit()581582private static final int IGNORABLEMASK = 0x0000ffff;583private static final int PRIMARYORDERINCREMENT = 0x00010000;584private static final int SECONDARYORDERINCREMENT = 0x00000100;585private static final int TERTIARYORDERINCREMENT = 0x00000001;586private static final int INITIALTABLESIZE = 20;587private static final int MAXKEYSIZE = 5;588589// ==============================================================590// instance variables591// ==============================================================592593// variables used by the build process594private RBCollationTables.BuildAPI tables = null;595private MergeCollation mPattern = null;596private boolean isOverIgnore = false;597private char[] keyBuf = new char[MAXKEYSIZE];598private IntHashtable contractFlags = new IntHashtable(100);599600// "shadow" copies of the instance variables in RBCollationTables601// (the values in these variables are copied back into RBCollationTables602// at the end of the build process)603private boolean frenchSec = false;604private boolean seAsianSwapping = false;605606private UCompactIntArray mapping = null;607private Vector<Vector<EntryPair>> contractTable = null;608private Vector<int[]> expandTable = null;609610private short maxSecOrder = 0;611private short maxTerOrder = 0;612}613614615