Path: blob/master/src/java.base/share/classes/java/lang/ConditionalSpecialCasing.java
41152 views
/*1* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package java.lang;2627import java.text.BreakIterator;28import java.util.HashSet;29import java.util.Hashtable;30import java.util.Iterator;31import java.util.Locale;32import sun.text.Normalizer;333435/**36* This is a utility class for {@code String.toLowerCase()} and37* {@code String.toUpperCase()}, that handles special casing with38* conditions. In other words, it handles the mappings with conditions39* that are defined in40* <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special41* Casing Properties</a> file.42* <p>43* Note that the unconditional case mappings (including 1:M mappings)44* are handled in {@code Character.toLower/UpperCase()}.45*/46final class ConditionalSpecialCasing {4748// context conditions.49static final int FINAL_CASED = 1;50static final int AFTER_SOFT_DOTTED = 2;51static final int MORE_ABOVE = 3;52static final int AFTER_I = 4;53static final int NOT_BEFORE_DOT = 5;5455// combining class definitions56static final int COMBINING_CLASS_ABOVE = 230;5758// Special case mapping entries59static Entry[] entry = {60//# ================================================================================61//# Conditional mappings62//# ================================================================================63new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA64new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE6566//# ================================================================================67//# Locale-sensitive mappings68//# ================================================================================69//# Lithuanian70new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE71new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I72new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J73new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK74new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE75new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE76new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE7778//# ================================================================================79//# Turkish and Azeri80new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE81new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE82new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE83new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE84new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I85new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I86new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I87new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I88};8990// A hash table that contains the above entries91static Hashtable<Integer, HashSet<Entry>> entryTable = new Hashtable<>();92static {93// create hashtable from the entry94for (Entry cur : entry) {95Integer cp = cur.getCodePoint();96HashSet<Entry> set = entryTable.get(cp);97if (set == null) {98set = new HashSet<>();99entryTable.put(cp, set);100}101set.add(cur);102}103}104105static int toLowerCaseEx(String src, int index, Locale locale) {106char[] result = lookUpTable(src, index, locale, true);107108if (result != null) {109if (result.length == 1) {110return result[0];111} else {112return Character.ERROR;113}114} else {115// default to Character class' one116return Character.toLowerCase(src.codePointAt(index));117}118}119120static int toUpperCaseEx(String src, int index, Locale locale) {121char[] result = lookUpTable(src, index, locale, false);122123if (result != null) {124if (result.length == 1) {125return result[0];126} else {127return Character.ERROR;128}129} else {130// default to Character class' one131return Character.toUpperCaseEx(src.codePointAt(index));132}133}134135static char[] toLowerCaseCharArray(String src, int index, Locale locale) {136return lookUpTable(src, index, locale, true);137}138139static char[] toUpperCaseCharArray(String src, int index, Locale locale) {140char[] result = lookUpTable(src, index, locale, false);141if (result != null) {142return result;143} else {144return Character.toUpperCaseCharArray(src.codePointAt(index));145}146}147148private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {149HashSet<Entry> set = entryTable.get(src.codePointAt(index));150char[] ret = null;151152if (set != null) {153Iterator<Entry> iter = set.iterator();154String currentLang = locale.getLanguage();155while (iter.hasNext()) {156Entry entry = iter.next();157String conditionLang = entry.getLanguage();158if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&159isConditionMet(src, index, locale, entry.getCondition())) {160ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase();161if (conditionLang != null) {162break;163}164}165}166}167168return ret;169}170171private static boolean isConditionMet(String src, int index, Locale locale, int condition) {172switch (condition) {173case FINAL_CASED:174return isFinalCased(src, index, locale);175176case AFTER_SOFT_DOTTED:177return isAfterSoftDotted(src, index);178179case MORE_ABOVE:180return isMoreAbove(src, index);181182case AFTER_I:183return isAfterI(src, index);184185case NOT_BEFORE_DOT:186return !isBeforeDot(src, index);187188default:189return true;190}191}192193/**194* Implements the "Final_Cased" condition195*196* Specification: Within the closest word boundaries containing C, there is a cased197* letter before C, and there is no cased letter after C.198*199* Regular Expression:200* Before C: [{cased==true}][{wordBoundary!=true}]*201* After C: !([{wordBoundary!=true}]*[{cased}])202*/203private static boolean isFinalCased(String src, int index, Locale locale) {204BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);205wordBoundary.setText(src);206int ch;207208// Look for a preceding 'cased' letter209for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);210i -= Character.charCount(ch)) {211212ch = src.codePointBefore(i);213if (isCased(ch)) {214215int len = src.length();216// Check that there is no 'cased' letter after the index217for (i = index + Character.charCount(src.codePointAt(index));218(i < len) && !wordBoundary.isBoundary(i);219i += Character.charCount(ch)) {220221ch = src.codePointAt(i);222if (isCased(ch)) {223return false;224}225}226227return true;228}229}230231return false;232}233234/**235* Implements the "After_I" condition236*237* Specification: The last preceding base character was an uppercase I,238* and there is no intervening combining character class 230 (ABOVE).239*240* Regular Expression:241* Before C: [I]([{cc!=230}&{cc!=0}])*242*/243private static boolean isAfterI(String src, int index) {244int ch;245int cc;246247// Look for the last preceding base character248for (int i = index; i > 0; i -= Character.charCount(ch)) {249250ch = src.codePointBefore(i);251252if (ch == 'I') {253return true;254} else {255cc = Normalizer.getCombiningClass(ch);256if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {257return false;258}259}260}261262return false;263}264265/**266* Implements the "After_Soft_Dotted" condition267*268* Specification: The last preceding character with combining class269* of zero before C was Soft_Dotted, and there is no intervening270* combining character class 230 (ABOVE).271*272* Regular Expression:273* Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*274*/275private static boolean isAfterSoftDotted(String src, int index) {276int ch;277int cc;278279// Look for the last preceding character280for (int i = index; i > 0; i -= Character.charCount(ch)) {281282ch = src.codePointBefore(i);283284if (isSoftDotted(ch)) {285return true;286} else {287cc = Normalizer.getCombiningClass(ch);288if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {289return false;290}291}292}293294return false;295}296297/**298* Implements the "More_Above" condition299*300* Specification: C is followed by one or more characters of combining301* class 230 (ABOVE) in the combining character sequence.302*303* Regular Expression:304* After C: [{cc!=0}]*[{cc==230}]305*/306private static boolean isMoreAbove(String src, int index) {307int ch;308int cc;309int len = src.length();310311// Look for a following ABOVE combining class character312for (int i = index + Character.charCount(src.codePointAt(index));313i < len; i += Character.charCount(ch)) {314315ch = src.codePointAt(i);316cc = Normalizer.getCombiningClass(ch);317318if (cc == COMBINING_CLASS_ABOVE) {319return true;320} else if (cc == 0) {321return false;322}323}324325return false;326}327328/**329* Implements the "Before_Dot" condition330*331* Specification: C is followed by {@code U+0307 COMBINING DOT ABOVE}.332* Any sequence of characters with a combining class that is333* neither 0 nor 230 may intervene between the current character334* and the combining dot above.335*336* Regular Expression:337* After C: ([{cc!=230}&{cc!=0}])*[\u0307]338*/339private static boolean isBeforeDot(String src, int index) {340int ch;341int cc;342int len = src.length();343344// Look for a following COMBINING DOT ABOVE345for (int i = index + Character.charCount(src.codePointAt(index));346i < len; i += Character.charCount(ch)) {347348ch = src.codePointAt(i);349350if (ch == '\u0307') {351return true;352} else {353cc = Normalizer.getCombiningClass(ch);354if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {355return false;356}357}358}359360return false;361}362363/**364* Examines whether a character is 'cased'.365*366* A character C is defined to be 'cased' if and only if at least one of367* following are true for C: uppercase==true, or lowercase==true, or368* general_category==titlecase_letter.369*370* The uppercase and lowercase property values are specified in the data371* file DerivedCoreProperties.txt in the Unicode Character Database.372*/373private static boolean isCased(int ch) {374int type = Character.getType(ch);375if (type == Character.LOWERCASE_LETTER ||376type == Character.UPPERCASE_LETTER ||377type == Character.TITLECASE_LETTER) {378return true;379} else {380// Check for Other_Lowercase and Other_Uppercase381//382if ((ch >= 0x02B0) && (ch <= 0x02B8)) {383// MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y384return true;385} else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {386// MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP387return true;388} else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {389// MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP390return true;391} else if (ch == 0x0345) {392// COMBINING GREEK YPOGEGRAMMENI393return true;394} else if (ch == 0x037A) {395// GREEK YPOGEGRAMMENI396return true;397} else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {398// MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI399return true;400} else if ((ch >= 0x2160) && (ch <= 0x217F)) {401// ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND402// SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND403return true;404} else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {405// CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z406// CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z407return true;408} else {409return false;410}411}412}413414private static boolean isSoftDotted(int ch) {415switch (ch) {416case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I417case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J418case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK419case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE420case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I421case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE422case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I423case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW424case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW425case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I426return true;427default:428return false;429}430}431432/**433* An internal class that represents an entry in the Special Casing Properties.434*/435static class Entry {436int ch;437char [] lower;438char [] upper;439String lang;440int condition;441442Entry(int ch, char[] lower, char[] upper, String lang, int condition) {443this.ch = ch;444this.lower = lower;445this.upper = upper;446this.lang = lang;447this.condition = condition;448}449450int getCodePoint() {451return ch;452}453454char[] getLowerCase() {455return lower;456}457458char[] getUpperCase() {459return upper;460}461462String getLanguage() {463return lang;464}465466int getCondition() {467return condition;468}469}470}471472473