Path: blob/master/src/java.base/share/classes/java/net/IDN.java
41152 views
/*1* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24package java.net;2526import java.io.InputStream;27import java.io.IOException;28import java.security.AccessController;29import java.security.PrivilegedAction;3031import jdk.internal.icu.impl.Punycode;32import jdk.internal.icu.text.StringPrep;33import jdk.internal.icu.text.UCharacterIterator;3435/**36* Provides methods to convert internationalized domain names (IDNs) between37* a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.38* Internationalized domain names can use characters from the entire range of39* Unicode, while traditional domain names are restricted to ASCII characters.40* ACE is an encoding of Unicode strings that uses only ASCII characters and41* can be used with software (such as the Domain Name System) that only42* understands traditional domain names.43*44* <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.45* RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ46* <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a47* profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and48* <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert49* domain name string back and forth.50*51* <p>The behavior of aforementioned conversion process can be adjusted by various flags:52* <ul>53* <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted54* can contain code points that are unassigned in Unicode 3.2, which is the55* Unicode version on which IDN conversion is based. If the flag is not used,56* the presence of such unassigned code points is treated as an error.57* <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.58* It is an error if they don't meet the requirements.59* </ul>60* These flags can be logically OR'ed together.61*62* <p>The security consideration is important with respect to internationalization63* domain name support. For example, English domain names may be <i>homographed</i>64* - maliciously misspelled by substitution of non-Latin letters.65* <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>66* discusses security issues of IDN support as well as possible solutions.67* Applications are responsible for taking adequate security measures when using68* international domain names.69*70* @author Edward Wang71* @since 1.672*73*/74@SuppressWarnings("removal")75public final class IDN {76/**77* Flag to allow processing of unassigned code points78*/79public static final int ALLOW_UNASSIGNED = 0x01;8081/**82* Flag to turn on the check against STD-3 ASCII rules83*/84public static final int USE_STD3_ASCII_RULES = 0x02;858687/**88* Translates a string from Unicode to ASCII Compatible Encoding (ACE),89* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.90*91* <p>ToASCII operation can fail. ToASCII fails if any step of it fails.92* If ToASCII operation fails, an IllegalArgumentException will be thrown.93* In this case, the input string should not be used in an internationalized domain name.94*95* <p> A label is an individual part of a domain name. The original ToASCII operation,96* as defined in RFC 3490, only operates on a single label. This method can handle97* both label and entire domain name, by assuming that labels in a domain name are98* always separated by dots. The following characters are recognized as dots:99* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),100* and \uFF61 (halfwidth ideographic full stop). if dots are101* used as label separators, this method also changes all of them to \u002E (full stop)102* in output translated string.103*104* @param input the string to be processed105* @param flag process flag; can be 0 or any logical OR of possible flags106*107* @return the translated {@code String}108*109* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification110*/111public static String toASCII(String input, int flag)112{113int p = 0, q = 0;114StringBuilder out = new StringBuilder();115116if (isRootLabel(input)) {117return ".";118}119120while (p < input.length()) {121q = searchDots(input, p);122out.append(toASCIIInternal(input.substring(p, q), flag));123if (q != (input.length())) {124// has more labels, or keep the trailing dot as at present125out.append('.');126}127p = q + 1;128}129130return out.toString();131}132133134/**135* Translates a string from Unicode to ASCII Compatible Encoding (ACE),136* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.137*138* <p> This convenience method works as if by invoking the139* two-argument counterpart as follows:140* <blockquote>141* {@link #toASCII(String, int) toASCII}(input, 0);142* </blockquote>143*144* @param input the string to be processed145*146* @return the translated {@code String}147*148* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification149*/150public static String toASCII(String input) {151return toASCII(input, 0);152}153154155/**156* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,157* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.158*159* <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.160*161* <p> A label is an individual part of a domain name. The original ToUnicode operation,162* as defined in RFC 3490, only operates on a single label. This method can handle163* both label and entire domain name, by assuming that labels in a domain name are164* always separated by dots. The following characters are recognized as dots:165* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),166* and \uFF61 (halfwidth ideographic full stop).167*168* @param input the string to be processed169* @param flag process flag; can be 0 or any logical OR of possible flags170*171* @return the translated {@code String}172*/173public static String toUnicode(String input, int flag) {174int p = 0, q = 0;175StringBuilder out = new StringBuilder();176177if (isRootLabel(input)) {178return ".";179}180181while (p < input.length()) {182q = searchDots(input, p);183out.append(toUnicodeInternal(input.substring(p, q), flag));184if (q != (input.length())) {185// has more labels, or keep the trailing dot as at present186out.append('.');187}188p = q + 1;189}190191return out.toString();192}193194195/**196* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,197* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.198*199* <p> This convenience method works as if by invoking the200* two-argument counterpart as follows:201* <blockquote>202* {@link #toUnicode(String, int) toUnicode}(input, 0);203* </blockquote>204*205* @param input the string to be processed206*207* @return the translated {@code String}208*/209public static String toUnicode(String input) {210return toUnicode(input, 0);211}212213214/* ---------------- Private members -------------- */215216// ACE Prefix is "xn--"217private static final String ACE_PREFIX = "xn--";218private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();219220private static final int MAX_LABEL_LENGTH = 63;221222// single instance of nameprep223private static StringPrep namePrep = null;224225static {226InputStream stream = null;227228try {229final String IDN_PROFILE = "/sun/net/idn/uidna.spp";230if (System.getSecurityManager() != null) {231stream = AccessController.doPrivileged(new PrivilegedAction<>() {232public InputStream run() {233return StringPrep.class.getResourceAsStream(IDN_PROFILE);234}235});236} else {237stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);238}239240namePrep = new StringPrep(stream);241stream.close();242} catch (IOException e) {243// should never reach here244assert false;245}246}247248249/* ---------------- Private operations -------------- */250251252//253// to suppress the default zero-argument constructor254//255private IDN() {}256257//258// toASCII operation; should only apply to a single label259//260private static String toASCIIInternal(String label, int flag)261{262// step 1263// Check if the string contains code points outside the ASCII range 0..0x7c.264boolean isASCII = isAllASCII(label);265StringBuffer dest;266267// step 2268// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here269if (!isASCII) {270UCharacterIterator iter = UCharacterIterator.getInstance(label);271try {272dest = namePrep.prepare(iter, flag);273} catch (java.text.ParseException e) {274throw new IllegalArgumentException(e);275}276} else {277dest = new StringBuffer(label);278}279280// step 8, move forward to check the smallest number of the code points281// the length must be inside 1..63282if (dest.length() == 0) {283throw new IllegalArgumentException(284"Empty label is not a legal name");285}286287// step 3288// Verify the absence of non-LDH ASCII code points289// 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f290// Verify the absence of leading and trailing hyphen291boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);292if (useSTD3ASCIIRules) {293for (int i = 0; i < dest.length(); i++) {294int c = dest.charAt(i);295if (isNonLDHAsciiCodePoint(c)) {296throw new IllegalArgumentException(297"Contains non-LDH ASCII characters");298}299}300301if (dest.charAt(0) == '-' ||302dest.charAt(dest.length() - 1) == '-') {303304throw new IllegalArgumentException(305"Has leading or trailing hyphen");306}307}308309if (!isASCII) {310// step 4311// If all code points are inside 0..0x7f, skip to step 8312if (!isAllASCII(dest.toString())) {313// step 5314// verify the sequence does not begin with ACE prefix315if(!startsWithACEPrefix(dest)){316317// step 6318// encode the sequence with punycode319try {320dest = Punycode.encode(dest, null);321} catch (java.text.ParseException e) {322throw new IllegalArgumentException(e);323}324325dest = toASCIILower(dest);326327// step 7328// prepend the ACE prefix329dest.insert(0, ACE_PREFIX);330} else {331throw new IllegalArgumentException("The input starts with the ACE Prefix");332}333334}335}336337// step 8338// the length must be inside 1..63339if (dest.length() > MAX_LABEL_LENGTH) {340throw new IllegalArgumentException("The label in the input is too long");341}342343return dest.toString();344}345346//347// toUnicode operation; should only apply to a single label348//349private static String toUnicodeInternal(String label, int flag) {350boolean[] caseFlags = null;351StringBuffer dest;352353// step 1354// find out if all the codepoints in input are ASCII355boolean isASCII = isAllASCII(label);356357if(!isASCII){358// step 2359// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here360try {361UCharacterIterator iter = UCharacterIterator.getInstance(label);362dest = namePrep.prepare(iter, flag);363} catch (Exception e) {364// toUnicode never fails; if any step fails, return the input string365return label;366}367} else {368dest = new StringBuffer(label);369}370371// step 3372// verify ACE Prefix373if(startsWithACEPrefix(dest)) {374375// step 4376// Remove the ACE Prefix377String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());378379try {380// step 5381// Decode using punycode382StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);383384// step 6385// Apply toASCII386String toASCIIOut = toASCII(decodeOut.toString(), flag);387388// step 7389// verify390if (toASCIIOut.equalsIgnoreCase(dest.toString())) {391// step 8392// return output of step 5393return decodeOut.toString();394}395} catch (Exception ignored) {396// no-op397}398}399400// just return the input401return label;402}403404405//406// LDH stands for "letter/digit/hyphen", with characters restricted to the407// 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen408// <->.409// Non LDH refers to characters in the ASCII range, but which are not410// letters, digits or the hyphen.411//412// non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F413//414private static boolean isNonLDHAsciiCodePoint(int ch){415return (0x0000 <= ch && ch <= 0x002C) ||416(0x002E <= ch && ch <= 0x002F) ||417(0x003A <= ch && ch <= 0x0040) ||418(0x005B <= ch && ch <= 0x0060) ||419(0x007B <= ch && ch <= 0x007F);420}421422//423// search dots in a string and return the index of that character;424// or if there is no dots, return the length of input string425// dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),426// and \uFF61 (halfwidth ideographic full stop).427//428private static int searchDots(String s, int start) {429int i;430for (i = start; i < s.length(); i++) {431if (isLabelSeparator(s.charAt(i))) {432break;433}434}435436return i;437}438439//440// to check if a string is a root label, ".".441//442private static boolean isRootLabel(String s) {443return (s.length() == 1 && isLabelSeparator(s.charAt(0)));444}445446//447// to check if a character is a label separator, i.e. a dot character.448//449private static boolean isLabelSeparator(char c) {450return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');451}452453//454// to check if a string only contains US-ASCII code point455//456private static boolean isAllASCII(String input) {457boolean isASCII = true;458for (int i = 0; i < input.length(); i++) {459int c = input.charAt(i);460if (c > 0x7F) {461isASCII = false;462break;463}464}465return isASCII;466}467468//469// to check if a string starts with ACE-prefix470//471private static boolean startsWithACEPrefix(StringBuffer input){472boolean startsWithPrefix = true;473474if(input.length() < ACE_PREFIX_LENGTH){475return false;476}477for(int i = 0; i < ACE_PREFIX_LENGTH; i++){478if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){479startsWithPrefix = false;480}481}482return startsWithPrefix;483}484485private static char toASCIILower(char ch){486if('A' <= ch && ch <= 'Z'){487return (char)(ch + 'a' - 'A');488}489return ch;490}491492private static StringBuffer toASCIILower(StringBuffer input){493StringBuffer dest = new StringBuffer();494for(int i = 0; i < input.length();i++){495dest.append(toASCIILower(input.charAt(i)));496}497return dest;498}499}500501502