Path: blob/master/src/java.base/share/classes/sun/net/www/ParseUtil.java
41159 views
/*1* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package sun.net.www;2627import java.io.File;28import java.net.MalformedURLException;29import java.net.URI;30import java.net.URISyntaxException;31import java.net.URL;32import java.nio.ByteBuffer;33import java.nio.CharBuffer;34import java.nio.charset.CharacterCodingException;35import java.nio.charset.CharsetDecoder;36import java.nio.charset.CharsetEncoder;37import java.nio.charset.CoderResult;38import java.nio.charset.CodingErrorAction;39import java.util.HexFormat;4041import sun.nio.cs.UTF_8;4243/**44* A class that contains useful routines common to sun.net.www45* @author Mike McCloskey46*/4748public final class ParseUtil {4950private static final HexFormat HEX_UPPERCASE = HexFormat.of().withUpperCase();5152private ParseUtil() {}5354/**55* Constructs an encoded version of the specified path string suitable56* for use in the construction of a URL.57*58* A path separator is replaced by a forward slash. The string is UTF859* encoded. The % escape sequence is used for characters that are above60* 0x7F or those defined in RFC2396 as reserved or excluded in the path61* component of a URL.62*/63public static String encodePath(String path) {64return encodePath(path, true);65}66/*67* flag indicates whether path uses platform dependent68* File.separatorChar or not. True indicates path uses platform69* dependent File.separatorChar.70*/71public static String encodePath(String path, boolean flag) {72if (flag && File.separatorChar != '/') {73return encodePath(path, 0, File.separatorChar);74} else {75int index = firstEncodeIndex(path);76if (index > -1) {77return encodePath(path, index, '/');78} else {79return path;80}81}82}8384private static int firstEncodeIndex(String path) {85int len = path.length();86for (int i = 0; i < len; i++) {87char c = path.charAt(i);88// Ordering in the following test is performance sensitive,89// and typically paths have most chars in the a-z range, then90// in the symbol range '&'-':' (includes '.', '/' and '0'-'9')91// and more rarely in the A-Z range.92if (c >= 'a' && c <= 'z' ||93c >= '&' && c <= ':' ||94c >= 'A' && c <= 'Z') {95continue;96} else if (c > 0x007F || match(c, L_ENCODED, H_ENCODED)) {97return i;98}99}100return -1;101}102103private static String encodePath(String path, int index, char sep) {104char[] pathCC = path.toCharArray();105char[] retCC = new char[pathCC.length * 2 + 16 - index];106if (index > 0) {107System.arraycopy(pathCC, 0, retCC, 0, index);108}109int retLen = index;110111for (int i = index; i < pathCC.length; i++) {112char c = pathCC[i];113if (c == sep)114retCC[retLen++] = '/';115else {116if (c <= 0x007F) {117if (c >= 'a' && c <= 'z' ||118c >= 'A' && c <= 'Z' ||119c >= '0' && c <= '9') {120retCC[retLen++] = c;121} else if (match(c, L_ENCODED, H_ENCODED)) {122retLen = escape(retCC, c, retLen);123} else {124retCC[retLen++] = c;125}126} else if (c > 0x07FF) {127retLen = escape(retCC, (char)(0xE0 | ((c >> 12) & 0x0F)), retLen);128retLen = escape(retCC, (char)(0x80 | ((c >> 6) & 0x3F)), retLen);129retLen = escape(retCC, (char)(0x80 | ((c >> 0) & 0x3F)), retLen);130} else {131retLen = escape(retCC, (char)(0xC0 | ((c >> 6) & 0x1F)), retLen);132retLen = escape(retCC, (char)(0x80 | ((c >> 0) & 0x3F)), retLen);133}134}135//worst case scenario for character [0x7ff-] every single136//character will be encoded into 9 characters.137if (retLen + 9 > retCC.length) {138int newLen = retCC.length * 2 + 16;139if (newLen < 0) {140newLen = Integer.MAX_VALUE;141}142char[] buf = new char[newLen];143System.arraycopy(retCC, 0, buf, 0, retLen);144retCC = buf;145}146}147return new String(retCC, 0, retLen);148}149150/**151* Appends the URL escape sequence for the specified char to the152* specified character array.153*/154private static int escape(char[] cc, char c, int index) {155cc[index++] = '%';156cc[index++] = Character.forDigit((c >> 4) & 0xF, 16);157cc[index++] = Character.forDigit(c & 0xF, 16);158return index;159}160161/**162* Un-escape and return the character at position i in string s.163*/164private static byte unescape(String s, int i) {165return (byte) Integer.parseInt(s, i + 1, i + 3, 16);166}167168169/**170* Returns a new String constructed from the specified String by replacing171* the URL escape sequences and UTF8 encoding with the characters they172* represent.173*/174public static String decode(String s) {175int n = s.length();176if ((n == 0) || (s.indexOf('%') < 0))177return s;178179StringBuilder sb = new StringBuilder(n);180ByteBuffer bb = ByteBuffer.allocate(n);181CharBuffer cb = CharBuffer.allocate(n);182CharsetDecoder dec = UTF_8.INSTANCE.newDecoder()183.onMalformedInput(CodingErrorAction.REPORT)184.onUnmappableCharacter(CodingErrorAction.REPORT);185186char c = s.charAt(0);187for (int i = 0; i < n;) {188assert c == s.charAt(i);189if (c != '%') {190sb.append(c);191if (++i >= n)192break;193c = s.charAt(i);194continue;195}196bb.clear();197int ui = i;198for (;;) {199assert (n - i >= 2);200try {201bb.put(unescape(s, i));202} catch (NumberFormatException e) {203throw new IllegalArgumentException();204}205i += 3;206if (i >= n)207break;208c = s.charAt(i);209if (c != '%')210break;211}212bb.flip();213cb.clear();214dec.reset();215CoderResult cr = dec.decode(bb, cb, true);216if (cr.isError())217throw new IllegalArgumentException("Error decoding percent encoded characters");218cr = dec.flush(cb);219if (cr.isError())220throw new IllegalArgumentException("Error decoding percent encoded characters");221sb.append(cb.flip().toString());222}223224return sb.toString();225}226227public static URL fileToEncodedURL(File file)228throws MalformedURLException229{230String path = file.getAbsolutePath();231path = ParseUtil.encodePath(path);232if (!path.startsWith("/")) {233path = "/" + path;234}235if (!path.endsWith("/") && file.isDirectory()) {236path = path + "/";237}238return new URL("file", "", path);239}240241public static java.net.URI toURI(URL url) {242String protocol = url.getProtocol();243String auth = url.getAuthority();244String path = url.getPath();245String query = url.getQuery();246String ref = url.getRef();247if (path != null && !(path.startsWith("/")))248path = "/" + path;249250//251// In java.net.URI class, a port number of -1 implies the default252// port number. So get it stripped off before creating URI instance.253//254if (auth != null && auth.endsWith(":-1"))255auth = auth.substring(0, auth.length() - 3);256257java.net.URI uri;258try {259uri = createURI(protocol, auth, path, query, ref);260} catch (java.net.URISyntaxException e) {261uri = null;262}263return uri;264}265266//267// createURI() and its auxiliary code are cloned from java.net.URI.268// Most of the code are just copy and paste, except that quote()269// has been modified to avoid double-escape.270//271// Usually it is unacceptable, but we're forced to do it because272// otherwise we need to change public API, namely java.net.URI's273// multi-argument constructors. It turns out that the changes cause274// incompatibilities so can't be done.275//276private static URI createURI(String scheme,277String authority,278String path,279String query,280String fragment) throws URISyntaxException281{282String s = toString(scheme, null,283authority, null, null, -1,284path, query, fragment);285checkPath(s, scheme, path);286return new URI(s);287}288289private static String toString(String scheme,290String opaquePart,291String authority,292String userInfo,293String host,294int port,295String path,296String query,297String fragment)298{299StringBuilder sb = new StringBuilder();300if (scheme != null) {301sb.append(scheme);302sb.append(':');303}304appendSchemeSpecificPart(sb, opaquePart,305authority, userInfo, host, port,306path, query);307appendFragment(sb, fragment);308return sb.toString();309}310311private static void appendSchemeSpecificPart(StringBuilder sb,312String opaquePart,313String authority,314String userInfo,315String host,316int port,317String path,318String query)319{320if (opaquePart != null) {321/* check if SSP begins with an IPv6 address322* because we must not quote a literal IPv6 address323*/324if (opaquePart.startsWith("//[")) {325int end = opaquePart.indexOf(']');326if (end != -1 && opaquePart.indexOf(':')!=-1) {327String doquote, dontquote;328if (end == opaquePart.length()) {329dontquote = opaquePart;330doquote = "";331} else {332dontquote = opaquePart.substring(0,end+1);333doquote = opaquePart.substring(end+1);334}335sb.append (dontquote);336sb.append(quote(doquote, L_URIC, H_URIC));337}338} else {339sb.append(quote(opaquePart, L_URIC, H_URIC));340}341} else {342appendAuthority(sb, authority, userInfo, host, port);343if (path != null)344sb.append(quote(path, L_PATH, H_PATH));345if (query != null) {346sb.append('?');347sb.append(quote(query, L_URIC, H_URIC));348}349}350}351352private static void appendAuthority(StringBuilder sb,353String authority,354String userInfo,355String host,356int port)357{358if (host != null) {359sb.append("//");360if (userInfo != null) {361sb.append(quote(userInfo, L_USERINFO, H_USERINFO));362sb.append('@');363}364boolean needBrackets = ((host.indexOf(':') >= 0)365&& !host.startsWith("[")366&& !host.endsWith("]"));367if (needBrackets) sb.append('[');368sb.append(host);369if (needBrackets) sb.append(']');370if (port != -1) {371sb.append(':');372sb.append(port);373}374} else if (authority != null) {375sb.append("//");376if (authority.startsWith("[")) {377int end = authority.indexOf(']');378if (end != -1 && authority.indexOf(':')!=-1) {379String doquote, dontquote;380if (end == authority.length()) {381dontquote = authority;382doquote = "";383} else {384dontquote = authority.substring(0,end+1);385doquote = authority.substring(end+1);386}387sb.append (dontquote);388sb.append(quote(doquote,389L_REG_NAME | L_SERVER,390H_REG_NAME | H_SERVER));391}392} else {393sb.append(quote(authority,394L_REG_NAME | L_SERVER,395H_REG_NAME | H_SERVER));396}397}398}399400private static void appendFragment(StringBuilder sb, String fragment) {401if (fragment != null) {402sb.append('#');403sb.append(quote(fragment, L_URIC, H_URIC));404}405}406407// Quote any characters in s that are not permitted408// by the given mask pair409//410private static String quote(String s, long lowMask, long highMask) {411int n = s.length();412StringBuilder sb = null;413CharsetEncoder encoder = null;414boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);415for (int i = 0; i < s.length(); i++) {416char c = s.charAt(i);417if (c < '\u0080') {418if (!match(c, lowMask, highMask) && !isEscaped(s, i)) {419if (sb == null) {420sb = new StringBuilder();421sb.append(s, 0, i);422}423appendEscape(sb, (byte)c);424} else {425if (sb != null)426sb.append(c);427}428} else if (allowNonASCII429&& (Character.isSpaceChar(c)430|| Character.isISOControl(c))) {431if (encoder == null) {432encoder = UTF_8.INSTANCE.newEncoder();433}434if (sb == null) {435sb = new StringBuilder();436sb.append(s, 0, i);437}438appendEncoded(encoder, sb, c);439} else {440if (sb != null)441sb.append(c);442}443}444return (sb == null) ? s : sb.toString();445}446447//448// To check if the given string has an escaped triplet449// at the given position450//451private static boolean isEscaped(String s, int pos) {452if (s == null || (s.length() <= (pos + 2)))453return false;454455return s.charAt(pos) == '%'456&& match(s.charAt(pos + 1), L_HEX, H_HEX)457&& match(s.charAt(pos + 2), L_HEX, H_HEX);458}459460private static void appendEncoded(CharsetEncoder encoder,461StringBuilder sb, char c) {462ByteBuffer bb = null;463try {464bb = encoder.encode(CharBuffer.wrap("" + c));465} catch (CharacterCodingException x) {466assert false;467}468while (bb.hasRemaining()) {469int b = bb.get() & 0xff;470if (b >= 0x80)471appendEscape(sb, (byte)b);472else473sb.append((char)b);474}475}476477private static void appendEscape(StringBuilder sb, byte b) {478sb.append('%');479HEX_UPPERCASE.toHexDigits(sb, b);480}481482// Tell whether the given character is permitted by the given mask pair483private static boolean match(char c, long lowMask, long highMask) {484if (c < 64)485return ((1L << c) & lowMask) != 0;486if (c < 128)487return ((1L << (c - 64)) & highMask) != 0;488return false;489}490491// If a scheme is given then the path, if given, must be absolute492//493private static void checkPath(String s, String scheme, String path)494throws URISyntaxException495{496if (scheme != null) {497if (path != null && !path.isEmpty() && path.charAt(0) != '/')498throw new URISyntaxException(s,499"Relative path in absolute URI");500}501}502503504// -- Character classes for parsing --505506// To save startup time, we manually calculate the low-/highMask constants.507// For reference, the following methods were used to calculate the values:508509// Compute a low-order mask for the characters510// between first and last, inclusive511// private static long lowMask(char first, char last) {512// long m = 0;513// int f = Math.max(Math.min(first, 63), 0);514// int l = Math.max(Math.min(last, 63), 0);515// for (int i = f; i <= l; i++)516// m |= 1L << i;517// return m;518// }519520// Compute the low-order mask for the characters in the given string521// private static long lowMask(String chars) {522// int n = chars.length();523// long m = 0;524// for (int i = 0; i < n; i++) {525// char c = chars.charAt(i);526// if (c < 64)527// m |= (1L << c);528// }529// return m;530// }531532// Compute a high-order mask for the characters533// between first and last, inclusive534// private static long highMask(char first, char last) {535// long m = 0;536// int f = Math.max(Math.min(first, 127), 64) - 64;537// int l = Math.max(Math.min(last, 127), 64) - 64;538// for (int i = f; i <= l; i++)539// m |= 1L << i;540// return m;541// }542543// Compute the high-order mask for the characters in the given string544// private static long highMask(String chars) {545// int n = chars.length();546// long m = 0;547// for (int i = 0; i < n; i++) {548// char c = chars.charAt(i);549// if ((c >= 64) && (c < 128))550// m |= (1L << (c - 64));551// }552// return m;553// }554555556// Character-class masks557558// digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |559// "8" | "9"560private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');561private static final long H_DIGIT = 0L;562563// hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |564// "a" | "b" | "c" | "d" | "e" | "f"565private static final long L_HEX = L_DIGIT;566private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');567568// upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |569// "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |570// "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"571private static final long L_UPALPHA = 0L;572private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');573574// lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |575// "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |576// "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"577private static final long L_LOWALPHA = 0L;578private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');579580// alpha = lowalpha | upalpha581private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;582private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;583584// alphanum = alpha | digit585private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;586private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;587588// mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |589// "(" | ")"590private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");591private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");592593// unreserved = alphanum | mark594private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;595private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;596597// reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |598// "$" | "," | "[" | "]"599// Added per RFC2732: "[", "]"600private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");601private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");602603// The zero'th bit is used to indicate that escape pairs and non-US-ASCII604// characters are allowed; this is handled by the scanEscape method below.605private static final long L_ESCAPED = 1L;606private static final long H_ESCAPED = 0L;607608// uric = reserved | unreserved | escaped609private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;610private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;611612// pchar = unreserved | escaped |613// ":" | "@" | "&" | "=" | "+" | "$" | ","614private static final long L_PCHAR615= L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");616private static final long H_PCHAR617= H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");618619// All valid path characters620private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");621private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;622623// Dash, for use in domainlabel and toplabel624private static final long L_DASH = 0x200000000000L; // lowMask("-");625private static final long H_DASH = 0x0L; // highMask("-");626627// userinfo = *( unreserved | escaped |628// ";" | ":" | "&" | "=" | "+" | "$" | "," )629private static final long L_USERINFO630= L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");631private static final long H_USERINFO632= H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;633634// reg_name = 1*( unreserved | escaped | "$" | "," |635// ";" | ":" | "@" | "&" | "=" | "+" )636private static final long L_REG_NAME637= L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");638private static final long H_REG_NAME639= H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");640641// All valid characters for server-based authorities642private static final long L_SERVER643= L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");644private static final long H_SERVER645= H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");646647// Characters that are encoded in the path component of a URI.648//649// These characters are reserved in the path segment as described in650// RFC2396 section 3.3:651// "=" | ";" | "?" | "/"652//653// These characters are defined as excluded in RFC2396 section 2.4.3654// and must be escaped if they occur in the data part of a URI:655// "#" | " " | "<" | ">" | "%" | "\"" | "{" | "}" | "|" | "\\" | "^" |656// "[" | "]" | "`"657//658// Also US ASCII control characters 00-1F and 7F.659660// lowMask((char)0, (char)31) | lowMask("=;?/# <>%\"{}|\\^[]`");661private static final long L_ENCODED = 0xF800802DFFFFFFFFL;662663// highMask((char)0x7F, (char)0x7F) | highMask("=;?/# <>%\"{}|\\^[]`");664private static final long H_ENCODED = 0xB800000178000000L;665666}667668669