Path: blob/master/src/java.base/share/classes/jdk/internal/icu/text/StringPrep.java
41161 views
/*1* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24/*25/*26*******************************************************************************27* Copyright (C) 2003-2004, International Business Machines Corporation and *28* others. All Rights Reserved. *29*******************************************************************************30*/31//32// CHANGELOG33// 2005-05-19 Edward Wang34// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java35// - move from package com.ibm.icu.text to package sun.net.idn36// - use ParseException instead of StringPrepParseException37// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'38// - remove all @deprecated tag to make compiler happy39// 2007-08-14 Martin Buchholz40// - remove redundant casts41//42package jdk.internal.icu.text;4344import java.io.BufferedInputStream;45import java.io.ByteArrayInputStream;46import java.io.IOException;47import java.io.InputStream;48import java.text.ParseException;4950import sun.text.Normalizer;51import jdk.internal.icu.impl.CharTrie;52import jdk.internal.icu.impl.StringPrepDataReader;53import jdk.internal.icu.impl.Trie;54import jdk.internal.icu.lang.UCharacter;55import jdk.internal.icu.lang.UCharacterDirection;56import jdk.internal.icu.util.VersionInfo;5758/**59* StringPrep API implements the StingPrep framework as described by60* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.61* StringPrep prepares Unicode strings for use in network protocols.62* Profiles of StingPrep are set of rules and data according to which the63* Unicode Strings are prepared. Each profiles contains tables which describe64* how a code point should be treated. The tables are broadly classied into65* <ul>66* <li> Unassigned Table: Contains code points that are unassigned67* in the Unicode Version supported by StringPrep. Currently68* RFC 3454 supports Unicode 3.2. </li>69* <li> Prohibited Table: Contains code points that are prohibted from70* the output of the StringPrep processing function. </li>71* <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>72* </ul>73*74* The procedure for preparing Unicode strings:75* <ol>76* <li> Map: For each character in the input, check if it has a mapping77* and, if so, replace it with its mapping. </li>78* <li> Normalize: Possibly normalize the result of step 1 using Unicode79* normalization. </li>80* <li> Prohibit: Check for any characters that are not allowed in the81* output. If any are found, return an error.</li>82* <li> Check bidi: Possibly check for right-to-left characters, and if83* any are found, make sure that the whole string satisfies the84* requirements for bidirectional strings. If the string does not85* satisfy the requirements for bidirectional strings, return an86* error. </li>87* </ol>88* @author Ram Viswanadha89* @draft ICU 2.890*/91public final class StringPrep {92/**93* Option to prohibit processing of unassigned code points in the input94*95* @see #prepare96* @draft ICU 2.897*/98public static final int DEFAULT = 0x0000;99100/**101* Option to allow processing of unassigned code points in the input102*103* @see #prepare104* @draft ICU 2.8105*/106public static final int ALLOW_UNASSIGNED = 0x0001;107108private static final int UNASSIGNED = 0x0000;109private static final int MAP = 0x0001;110private static final int PROHIBITED = 0x0002;111private static final int DELETE = 0x0003;112private static final int TYPE_LIMIT = 0x0004;113114private static final int NORMALIZATION_ON = 0x0001;115private static final int CHECK_BIDI_ON = 0x0002;116117private static final int TYPE_THRESHOLD = 0xFFF0;118private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/119private static final int MAX_INDEX_TOP_LENGTH = 0x0003;120121/* indexes[] value names */122private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */123private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */124private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */125private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */126private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */127private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;128private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;129private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */130private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */131132133/**134* Default buffer size of datafile135*/136private static final int DATA_BUFFER_SIZE = 25000;137138/* Wrappers for Trie implementations */139private static final class StringPrepTrieImpl implements Trie.DataManipulate{140private CharTrie sprepTrie = null;141/**142* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's143* data the index array offset of the indexes for that lead surrogate.144* @param property data value for a surrogate from the trie, including145* the folding offset146* @return data offset or 0 if there is no data for the lead surrogate147*/148public int getFoldingOffset(int value){149return value;150}151}152153// CharTrie implementation for reading the trie data154private StringPrepTrieImpl sprepTrieImpl;155// Indexes read from the data file156private int[] indexes;157// mapping data read from the data file158private char[] mappingData;159// format version of the data file160private byte[] formatVersion;161// the version of Unicode supported by the data file162private VersionInfo sprepUniVer;163// the Unicode version of last entry in the164// NormalizationCorrections.txt file if normalization165// is turned on166private VersionInfo normCorrVer;167// Option to turn on Normalization168private boolean doNFKC;169// Option to turn on checking for BiDi rules170private boolean checkBiDi;171172173private char getCodePointValue(int ch){174return sprepTrieImpl.sprepTrie.getCodePointValue(ch);175}176177private static VersionInfo getVersionInfo(int comp){178int micro = comp & 0xFF;179int milli =(comp >> 8) & 0xFF;180int minor =(comp >> 16) & 0xFF;181int major =(comp >> 24) & 0xFF;182return VersionInfo.getInstance(major,minor,milli,micro);183}184private static VersionInfo getVersionInfo(byte[] version){185if(version.length != 4){186return null;187}188return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);189}190/**191* Creates an StringPrep object after reading the input stream.192* The object does not hold a reference to the input steam, so the stream can be193* closed after the method returns.194*195* @param inputStream The stream for reading the StringPrep profile binarySun196* @throws IOException197* @draft ICU 2.8198*/199public StringPrep(InputStream inputStream) throws IOException{200201BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);202203StringPrepDataReader reader = new StringPrepDataReader(b);204205// read the indexes206indexes = reader.readIndexes(INDEX_TOP);207208byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];209210211//indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes212mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];213// load the rest of the data and initialize the data members214reader.read(sprepBytes,mappingData);215216sprepTrieImpl = new StringPrepTrieImpl();217sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );218219// get the data format version220formatVersion = reader.getDataFormatVersion();221222// get the options223doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);224checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);225sprepUniVer = getVersionInfo(reader.getUnicodeVersion());226normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);227VersionInfo normUniVer = UCharacter.getUnicodeVersion();228if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */229normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */230((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/231){232throw new IOException("Normalization Correction version not supported");233}234b.close();235}236237private static final class Values{238boolean isIndex;239int value;240int type;241public void reset(){242isIndex = false;243value = 0;244type = -1;245}246}247248private static final void getValues(char trieWord,Values values){249values.reset();250if(trieWord == 0){251/*252* Initial value stored in the mapping table253* just return TYPE_LIMIT .. so that254* the source codepoint is copied to the destination255*/256values.type = TYPE_LIMIT;257}else if(trieWord >= TYPE_THRESHOLD){258values.type = (trieWord - TYPE_THRESHOLD);259}else{260/* get the type */261values.type = MAP;262/* ascertain if the value is index or delta */263if((trieWord & 0x02)>0){264values.isIndex = true;265values.value = trieWord >> 2; //mask off the lower 2 bits and shift266267}else{268values.isIndex = false;269values.value = (trieWord<<16)>>16;270values.value = (values.value >> 2);271272}273274if((trieWord>>2) == MAX_INDEX_VALUE){275values.type = DELETE;276values.isIndex = false;277values.value = 0;278}279}280}281282283284private StringBuffer map( UCharacterIterator iter, int options)285throws ParseException {286287Values val = new Values();288char result = 0;289int ch = UCharacterIterator.DONE;290StringBuffer dest = new StringBuffer();291boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);292293while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){294295result = getCodePointValue(ch);296getValues(result,val);297298// check if the source codepoint is unassigned299if(val.type == UNASSIGNED && allowUnassigned == false){300throw new ParseException("An unassigned code point was found in the input " +301iter.getText(), iter.getIndex());302}else if((val.type == MAP)){303int index, length;304305if(val.isIndex){306index = val.value;307if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&308index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){309length = 1;310}else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&311index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){312length = 2;313}else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&314index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){315length = 3;316}else{317length = mappingData[index++];318}319/* copy mapping to destination */320dest.append(mappingData,index,length);321continue;322323}else{324ch -= val.value;325}326}else if(val.type == DELETE){327// just consume the codepoint and contine328continue;329}330//copy the source into destination331UTF16.append(dest,ch);332}333334return dest;335}336337338private StringBuffer normalize(StringBuffer src){339/*340* Option UNORM_BEFORE_PRI_29:341*342* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)343* requires strict adherence to Unicode 3.2 normalization,344* including buggy composition from before fixing Public Review Issue #29.345* Note that this results in some valid but nonsensical text to be346* either corrupted or rejected, depending on the text.347* See http://www.unicode.org/review/resolved-pri.html#pri29348* See unorm.cpp and cnormtst.c349*/350return new StringBuffer(351Normalizer.normalize(352src.toString(),353java.text.Normalizer.Form.NFKC,354Normalizer.UNICODE_3_2));355}356/*357boolean isLabelSeparator(int ch){358int result = getCodePointValue(ch);359if( (result & 0x07) == LABEL_SEPARATOR){360return true;361}362return false;363}364*/365/*3661) Map -- For each character in the input, check if it has a mapping367and, if so, replace it with its mapping.3683692) Normalize -- Possibly normalize the result of step 1 using Unicode370normalization.3713723) Prohibit -- Check for any characters that are not allowed in the373output. If any are found, return an error.3743754) Check bidi -- Possibly check for right-to-left characters, and if376any are found, make sure that the whole string satisfies the377requirements for bidirectional strings. If the string does not378satisfy the requirements for bidirectional strings, return an379error.380[Unicode3.2] defines several bidirectional categories; each character381has one bidirectional category assigned to it. For the purposes of382the requirements below, an "RandALCat character" is a character that383has Unicode bidirectional categories "R" or "AL"; an "LCat character"384is a character that has Unicode bidirectional category "L". Note385386387that there are many characters which fall in neither of the above388definitions; Latin digits (<U+0030> through <U+0039>) are examples of389this because they have bidirectional category "EN".390391In any profile that specifies bidirectional character handling, all392three of the following requirements MUST be met:3933941) The characters in section 5.8 MUST be prohibited.3953962) If a string contains any RandALCat character, the string MUST NOT397contain any LCat character.3983993) If a string contains any RandALCat character, a RandALCat400character MUST be the first character of the string, and a401RandALCat character MUST be the last character of the string.402*/403/**404* Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),405* checks for prohited and BiDi characters in the order defined by RFC 3454406* depending on the options specified in the profile.407*408* @param src A UCharacterIterator object containing the source string409* @param options A bit set of options:410*411* - StringPrep.NONE Prohibit processing of unassigned code points in the input412*413* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input414* as normal Unicode code points.415*416* @return StringBuffer A StringBuffer containing the output417* @throws ParseException418* @draft ICU 2.8419*/420public StringBuffer prepare(UCharacterIterator src, int options)421throws ParseException{422423// map424StringBuffer mapOut = map(src,options);425StringBuffer normOut = mapOut;// initialize426427if(doNFKC){428// normalize429normOut = normalize(mapOut);430}431432int ch;433char result;434UCharacterIterator iter = UCharacterIterator.getInstance(normOut);435Values val = new Values();436int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,437firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;438int rtlPos=-1, ltrPos=-1;439boolean rightToLeft=false, leftToRight=false;440441while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){442result = getCodePointValue(ch);443getValues(result,val);444445if(val.type == PROHIBITED ){446throw new ParseException("A prohibited code point was found in the input" +447iter.getText(), val.value);448}449450direction = UCharacter.getDirection(ch);451if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){452firstCharDir = direction;453}454if(direction == UCharacterDirection.LEFT_TO_RIGHT){455leftToRight = true;456ltrPos = iter.getIndex()-1;457}458if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){459rightToLeft = true;460rtlPos = iter.getIndex()-1;461}462}463if(checkBiDi == true){464// satisfy 2465if( leftToRight == true && rightToLeft == true){466throw new ParseException("The input does not conform to the rules for BiDi code points." +467iter.getText(),468(rtlPos>ltrPos) ? rtlPos : ltrPos);469}470471//satisfy 3472if( rightToLeft == true &&473!((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&474(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))475){476throw new ParseException("The input does not conform to the rules for BiDi code points." +477iter.getText(),478(rtlPos>ltrPos) ? rtlPos : ltrPos);479}480}481return normOut;482483}484}485486487