Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/Punycode.java
41161 views
/*1* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24/*25*******************************************************************************26* Copyright (C) 2003-2004, International Business Machines Corporation and *27* others. All Rights Reserved. *28*******************************************************************************29*/30//31// CHANGELOG32// 2005-05-19 Edward Wang33// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java34// - move from package com.ibm.icu.text to package sun.net.idn35// - use ParseException instead of StringPrepParseException36// 2007-08-14 Martin Buchholz37// - remove redundant casts38//39package jdk.internal.icu.impl;4041import java.text.ParseException;42import jdk.internal.icu.lang.UCharacter;43import jdk.internal.icu.text.UTF16;4445/**46* Ported code from ICU punycode.c47* @author ram48*/4950/* Package Private class */51public final class Punycode {5253/* Punycode parameters for Bootstring */54private static final int BASE = 36;55private static final int TMIN = 1;56private static final int TMAX = 26;57private static final int SKEW = 38;58private static final int DAMP = 700;59private static final int INITIAL_BIAS = 72;60private static final int INITIAL_N = 0x80;6162/* "Basic" Unicode/ASCII code points */63private static final int HYPHEN = 0x2d;64private static final int DELIMITER = HYPHEN;6566private static final int ZERO = 0x30;67private static final int NINE = 0x39;6869private static final int SMALL_A = 0x61;70private static final int SMALL_Z = 0x7a;7172private static final int CAPITAL_A = 0x41;73private static final int CAPITAL_Z = 0x5a;7475// TODO: eliminate the 256 limitation76private static final int MAX_CP_COUNT = 256;7778private static final int UINT_MAGIC = 0x80000000;79private static final long ULONG_MAGIC = 0x8000000000000000L;8081private static int adaptBias(int delta, int length, boolean firstTime){82if(firstTime){83delta /=DAMP;84}else{85delta /= 2;86}87delta += delta/length;8889int count=0;90for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {91delta/=(BASE-TMIN);92}9394return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));95}9697/**98* basicToDigit[] contains the numeric value of a basic code99* point (for use in representing integers) in the range 0 to100* BASE-1, or -1 if b is does not represent a value.101*/102static final int[] basicToDigit= new int[]{103-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,104-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,105106-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,10726, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,108109-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,11015, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,111112-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,11315, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,114115-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,116-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,117118-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,119-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,120121-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,122-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,123124-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,125-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1126};127128private static char asciiCaseMap(char b, boolean uppercase) {129if(uppercase) {130if(SMALL_A<=b && b<=SMALL_Z) {131b-=(SMALL_A-CAPITAL_A);132}133} else {134if(CAPITAL_A<=b && b<=CAPITAL_Z) {135b+=(SMALL_A-CAPITAL_A);136}137}138return b;139}140141/**142* digitToBasic() returns the basic code point whose value143* (when used for representing integers) is d, which must be in the144* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is145* nonzero, in which case the uppercase form is used.146*/147private static char digitToBasic(int digit, boolean uppercase) {148/* 0..25 map to ASCII a..z or A..Z */149/* 26..35 map to ASCII 0..9 */150if(digit<26) {151if(uppercase) {152return (char)(CAPITAL_A+digit);153} else {154return (char)(SMALL_A+digit);155}156} else {157return (char)((ZERO-26)+digit);158}159}160/**161* Converts Unicode to Punycode.162* The input string must not contain single, unpaired surrogates.163* The output will be represented as an array of ASCII code points.164*165* @param src166* @param caseFlags167* @return168* @throws ParseException169*/170public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{171172int[] cpBuffer = new int[MAX_CP_COUNT];173int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;174char c, c2;175int srcLength = src.length();176int destCapacity = MAX_CP_COUNT;177char[] dest = new char[destCapacity];178StringBuffer result = new StringBuffer();179/*180* Handle the basic code points and181* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):182*/183srcCPCount=destLength=0;184185for(j=0; j<srcLength; ++j) {186if(srcCPCount==MAX_CP_COUNT) {187/* too many input code points */188throw new ParseException("Too many input code points", -1);189}190c=src.charAt(j);191if(isBasic(c)) {192if(destLength<destCapacity) {193cpBuffer[srcCPCount++]=0;194dest[destLength]=195caseFlags!=null ?196asciiCaseMap(c, caseFlags[j]) :197c;198}199++destLength;200} else {201n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;202if(!UTF16.isSurrogate(c)) {203n|=c;204} else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {205++j;206207n|=UCharacter.getCodePoint(c, c2);208} else {209/* error: unmatched surrogate */210throw new ParseException("Illegal char found", -1);211}212cpBuffer[srcCPCount++]=n;213}214}215216/* Finish the basic string - if it is not empty - with a delimiter. */217basicLength=destLength;218if(basicLength>0) {219if(destLength<destCapacity) {220dest[destLength]=DELIMITER;221}222++destLength;223}224225/*226* handledCPCount is the number of code points that have been handled227* basicLength is the number of basic code points228* destLength is the number of chars that have been output229*/230231/* Initialize the state: */232n=INITIAL_N;233delta=0;234bias=INITIAL_BIAS;235236/* Main encoding loop: */237for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {238/*239* All non-basic code points < n have been handled already.240* Find the next larger one:241*/242for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {243q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */244if(n<=q && q<m) {245m=q;246}247}248249/*250* Increase delta enough to advance the decoder's251* <n,i> state to <m,0>, but guard against overflow:252*/253if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {254throw new RuntimeException("Internal program error");255}256delta+=(m-n)*(handledCPCount+1);257n=m;258259/* Encode a sequence of same code points n */260for(j=0; j<srcCPCount; ++j) {261q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */262if(q<n) {263++delta;264} else if(q==n) {265/* Represent delta as a generalized variable-length integer: */266for(q=delta, k=BASE; /* no condition */; k+=BASE) {267268/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt269270t=k-bias;271if(t<TMIN) {272t=TMIN;273} else if(t>TMAX) {274t=TMAX;275}276*/277278t=k-bias;279if(t<TMIN) {280t=TMIN;281} else if(k>=(bias+TMAX)) {282t=TMAX;283}284285if(q<t) {286break;287}288289if(destLength<destCapacity) {290dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);291}292q=(q-t)/(BASE-t);293}294295if(destLength<destCapacity) {296dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));297}298bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));299delta=0;300++handledCPCount;301}302}303304++delta;305++n;306}307308return result.append(dest, 0, destLength);309}310311private static boolean isBasic(int ch){312return (ch < INITIAL_N);313}314315private static boolean isBasicUpperCase(int ch){316return( CAPITAL_A <= ch && ch <= CAPITAL_Z);317}318319private static boolean isSurrogate(int ch){320return (((ch)&0xfffff800)==0xd800);321}322/**323* Converts Punycode to Unicode.324* The Unicode string will be at most as long as the Punycode string.325*326* @param src327* @param caseFlags328* @return329* @throws ParseException330*/331public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)332throws ParseException{333int srcLength = src.length();334StringBuffer result = new StringBuffer();335int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,336destCPCount, firstSupplementaryIndex, cpLength;337char b;338int destCapacity = MAX_CP_COUNT;339char[] dest = new char[destCapacity];340341/*342* Handle the basic code points:343* Let basicLength be the number of input code points344* before the last delimiter, or 0 if there is none,345* then copy the first basicLength code points to the output.346*347* The two following loops iterate backward.348*/349for(j=srcLength; j>0;) {350if(src.charAt(--j)==DELIMITER) {351break;352}353}354destLength=basicLength=destCPCount=j;355356while(j>0) {357b=src.charAt(--j);358if(!isBasic(b)) {359throw new ParseException("Illegal char found", -1);360}361362if(j<destCapacity) {363dest[j]= b;364365if(caseFlags!=null) {366caseFlags[j]=isBasicUpperCase(b);367}368}369}370371/* Initialize the state: */372n=INITIAL_N;373i=0;374bias=INITIAL_BIAS;375firstSupplementaryIndex=1000000000;376377/*378* Main decoding loop:379* Start just after the last delimiter if any380* basic code points were copied; start at the beginning otherwise.381*/382for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {383/*384* in is the index of the next character to be consumed, and385* destCPCount is the number of code points in the output array.386*387* Decode a generalized variable-length integer into delta,388* which gets added to i. The overflow checking is easier389* if we increase i as we go, then subtract off its starting390* value at the end to obtain delta.391*/392for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {393if(in>=srcLength) {394throw new ParseException("Illegal char found", -1);395}396397digit=basicToDigit[(byte)src.charAt(in++)];398if(digit<0) {399throw new ParseException("Invalid char found", -1);400}401if(digit>(0x7fffffff-i)/w) {402/* integer overflow */403throw new ParseException("Illegal char found", -1);404}405406i+=digit*w;407t=k-bias;408if(t<TMIN) {409t=TMIN;410} else if(k>=(bias+TMAX)) {411t=TMAX;412}413if(digit<t) {414break;415}416417if(w>0x7fffffff/(BASE-t)) {418/* integer overflow */419throw new ParseException("Illegal char found", -1);420}421w*=BASE-t;422}423424/*425* Modification from sample code:426* Increments destCPCount here,427* where needed instead of in for() loop tail.428*/429++destCPCount;430bias=adaptBias(i-oldi, destCPCount, (oldi==0));431432/*433* i was supposed to wrap around from (incremented) destCPCount to 0,434* incrementing n each time, so we'll fix that now:435*/436if(i/destCPCount>(0x7fffffff-n)) {437/* integer overflow */438throw new ParseException("Illegal char found", -1);439}440441n+=i/destCPCount;442i%=destCPCount;443/* not needed for Punycode: */444/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */445446if(n>0x10ffff || isSurrogate(n)) {447/* Unicode code point overflow */448throw new ParseException("Illegal char found", -1);449}450451/* Insert n at position i of the output: */452cpLength=UTF16.getCharCount(n);453if((destLength+cpLength)<destCapacity) {454int codeUnitIndex;455456/*457* Handle indexes when supplementary code points are present.458*459* In almost all cases, there will be only BMP code points before i460* and even in the entire string.461* This is handled with the same efficiency as with UTF-32.462*463* Only the rare cases with supplementary code points are handled464* more slowly - but not too bad since this is an insertion anyway.465*/466if(i<=firstSupplementaryIndex) {467codeUnitIndex=i;468if(cpLength>1) {469firstSupplementaryIndex=codeUnitIndex;470} else {471++firstSupplementaryIndex;472}473} else {474codeUnitIndex=firstSupplementaryIndex;475codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);476}477478/* use the UChar index codeUnitIndex instead of the code point index i */479if(codeUnitIndex<destLength) {480System.arraycopy(dest, codeUnitIndex,481dest, codeUnitIndex+cpLength,482(destLength-codeUnitIndex));483if(caseFlags!=null) {484System.arraycopy(caseFlags, codeUnitIndex,485caseFlags, codeUnitIndex+cpLength,486destLength-codeUnitIndex);487}488}489if(cpLength==1) {490/* BMP, insert one code unit */491dest[codeUnitIndex]=(char)n;492} else {493/* supplementary character, insert two code units */494dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);495dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);496}497if(caseFlags!=null) {498/* Case of last character determines uppercase flag: */499caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));500if(cpLength==2) {501caseFlags[codeUnitIndex+1]=false;502}503}504}505destLength+=cpLength;506++i;507}508result.append(dest, 0, destLength);509return result;510}511}512513514