Path: blob/master/src/java.base/share/classes/sun/invoke/util/BytecodeName.java
41159 views
/*1* Copyright (c) 2007, 2011, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package sun.invoke.util;2627/**28* Utility routines for dealing with bytecode-level names.29* Includes universal mangling rules for the JVM.30*31* <h3>Avoiding Dangerous Characters </h3>32*33* <p>34* The JVM defines a very small set of characters which are illegal35* in name spellings. We will slightly extend and regularize this set36* into a group of <cite>dangerous characters</cite>.37* These characters will then be replaced, in mangled names, by escape sequences.38* In addition, accidental escape sequences must be further escaped.39* Finally, a special prefix will be applied if and only if40* the mangling would otherwise fail to begin with the escape character.41* This happens to cover the corner case of the null string,42* and also clearly marks symbols which need demangling.43* </p>44* <p>45* Dangerous characters are the union of all characters forbidden46* or otherwise restricted by the JVM specification,47* plus their mates, if they are brackets48* (<code><big><b>[</b></big></code> and <code><big><b>]</b></big></code>,49* <code><big><b><</b></big></code> and <code><big><b>></b></big></code>),50* plus, arbitrarily, the colon character <code><big><b>:</b></big></code>.51* There is no distinction between type, method, and field names.52* This makes it easier to convert between mangled names of different53* types, since they do not need to be decoded (demangled).54* </p>55* <p>56* The escape character is backslash <code><big><b>\</b></big></code>57* (also known as reverse solidus).58* This character is, until now, unheard of in bytecode names,59* but traditional in the proposed role.60*61* </p>62* <h3> Replacement Characters </h3>63*64*65* <p>66* Every escape sequence is two characters67* (in fact, two UTF8 bytes) beginning with68* the escape character and followed by a69* <cite>replacement character</cite>.70* (Since the replacement character is never a backslash,71* iterated manglings do not double in size.)72* </p>73* <p>74* Each dangerous character has some rough visual similarity75* to its corresponding replacement character.76* This makes mangled symbols easier to recognize by sight.77* </p>78* <p>79* The dangerous characters are80* <code><big><b>/</b></big></code> (forward slash, used to delimit package components),81* <code><big><b>.</b></big></code> (dot, also a package delimiter),82* <code><big><b>;</b></big></code> (semicolon, used in signatures),83* <code><big><b>$</b></big></code> (dollar, used in inner classes and synthetic members),84* <code><big><b><</b></big></code> (left angle),85* <code><big><b>></b></big></code> (right angle),86* <code><big><b>[</b></big></code> (left square bracket, used in array types),87* <code><big><b>]</b></big></code> (right square bracket, reserved in this scheme for language use),88* and <code><big><b>:</b></big></code> (colon, reserved in this scheme for language use).89* Their replacements are, respectively,90* <code><big><b>|</b></big></code> (vertical bar),91* <code><big><b>,</b></big></code> (comma),92* <code><big><b>?</b></big></code> (question mark),93* <code><big><b>%</b></big></code> (percent),94* <code><big><b>^</b></big></code> (caret),95* <code><big><b>_</b></big></code> (underscore), and96* <code><big><b>{</b></big></code> (left curly bracket),97* <code><big><b>}</b></big></code> (right curly bracket),98* <code><big><b>!</b></big></code> (exclamation mark).99* In addition, the replacement character for the escape character itself is100* <code><big><b>-</b></big></code> (hyphen),101* and the replacement character for the null prefix is102* <code><big><b>=</b></big></code> (equal sign).103* </p>104* <p>105* An escape character <code><big><b>\</b></big></code>106* followed by any of these replacement characters107* is an escape sequence, and there are no other escape sequences.108* An equal sign is only part of an escape sequence109* if it is the second character in the whole string, following a backslash.110* Two consecutive backslashes do <em>not</em> form an escape sequence.111* </p>112* <p>113* Each escape sequence replaces a so-called <cite>original character</cite>114* which is either one of the dangerous characters or the escape character.115* A null prefix replaces an initial null string, not a character.116* </p>117* <p>118* All this implies that escape sequences cannot overlap and may be119* determined all at once for a whole string. Note that a spelling120* string can contain <cite>accidental escapes</cite>, apparent escape121* sequences which must not be interpreted as manglings.122* These are disabled by replacing their leading backslash with an123* escape sequence (<code><big><b>\-</b></big></code>). To mangle a string, three logical steps124* are required, though they may be carried out in one pass:125* </p>126* <ol>127* <li>In each accidental escape, replace the backslash with an escape sequence128* (<code><big><b>\-</b></big></code>).</li>129* <li>Replace each dangerous character with an escape sequence130* (<code><big><b>\|</b></big></code> for <code><big><b>/</b></big></code>, etc.).</li>131* <li>If the first two steps introduced any change, <em>and</em>132* if the string does not already begin with a backslash, prepend a null prefix (<code><big><b>\=</b></big></code>).</li>133* </ol>134*135* To demangle a mangled string that begins with an escape,136* remove any null prefix, and then replace (in parallel)137* each escape sequence by its original character.138* <p>Spelling strings which contain accidental139* escapes <em>must</em> have them replaced, even if those140* strings do not contain dangerous characters.141* This restriction means that mangling a string always142* requires a scan of the string for escapes.143* But then, a scan would be required anyway,144* to check for dangerous characters.145*146* </p>147* <h3> Nice Properties </h3>148*149* <p>150* If a bytecode name does not contain any escape sequence,151* demangling is a no-op: The string demangles to itself.152* Such a string is called <cite>self-mangling</cite>.153* Almost all strings are self-mangling.154* In practice, to demangle almost any name “found in nature”,155* simply verify that it does not begin with a backslash.156* </p>157* <p>158* Mangling is a one-to-one function, while demangling159* is a many-to-one function.160* A mangled string is defined as <cite>validly mangled</cite> if161* it is in fact the unique mangling of its spelling string.162* Three examples of invalidly mangled strings are <code><big><b>\=foo</b></big></code>,163* <code><big><b>\-bar</b></big></code>, and <code><big><b>baz\!</b></big></code>, which demangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and164* <code><big><b>baz\!</b></big></code>, but then remangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and <code><big><b>\=baz\-!</b></big></code>.165* If a language back-end or runtime is using mangled names,166* it should never present an invalidly mangled bytecode167* name to the JVM. If the runtime encounters one,168* it should also report an error, since such an occurrence169* probably indicates a bug in name encoding which170* will lead to errors in linkage.171* However, this note does not propose that the JVM verifier172* detect invalidly mangled names.173* </p>174* <p>175* As a result of these rules, it is a simple matter to176* compute validly mangled substrings and concatenations177* of validly mangled strings, and (with a little care)178* these correspond to corresponding operations on their179* spelling strings.180* </p>181* <ul>182* <li>Any prefix of a validly mangled string is also validly mangled,183* although a null prefix may need to be removed.</li>184* <li>Any suffix of a validly mangled string is also validly mangled,185* although a null prefix may need to be added.</li>186* <li>Two validly mangled strings, when concatenated,187* are also validly mangled, although any null prefix188* must be removed from the second string,189* and a trailing backslash on the first string may need escaping,190* if it would participate in an accidental escape when followed191* by the first character of the second string.</li>192* </ul>193* <p>If languages that include non-Java symbol spellings use this194* mangling convention, they will enjoy the following advantages:195* </p>196* <ul>197* <li>They can interoperate via symbols they share in common.</li>198* <li>Low-level tools, such as backtrace printers, will have readable displays.</li>199* <li>Future JVM and language extensions can safely use the dangerous characters200* for structuring symbols, but will never interfere with valid spellings.</li>201* <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>202* <li>Occasional transliterations and name composition will be simple and regular,203* for classes, methods, and fields.</li>204* <li>Bytecode names will continue to be compact.205* When mangled, spellings will at most double in length, either in206* UTF8 or UTF16 format, and most will not change at all.</li>207* </ul>208*209*210* <h3> Suggestions for Human Readable Presentations </h3>211*212*213* <p>214* For human readable displays of symbols,215* it will be better to present a string-like quoted216* representation of the spelling, because JVM users217* are generally familiar with such tokens.218* We suggest using single or double quotes before and after219* mangled symbols which are not valid Java identifiers,220* with quotes, backslashes, and non-printing characters221* escaped as if for literals in the Java language.222* </p>223* <p>224* For example, an HTML-like spelling225* <code><big><b><pre></b></big></code> mangles to226* <code><big><b>\^pre\_</b></big></code> and could227* display more cleanly as228* <code><big><b>'<pre>'</b></big></code>,229* with the quotes included.230* Such string-like conventions are <em>not</em> suitable231* for mangled bytecode names, in part because232* dangerous characters must be eliminated, rather233* than just quoted. Otherwise internally structured234* strings like package prefixes and method signatures235* could not be reliably parsed.236* </p>237* <p>238* In such human-readable displays, invalidly mangled239* names should <em>not</em> be demangled and quoted,240* for this would be misleading. Likewise, JVM symbols241* which contain dangerous characters (like dots in field242* names or brackets in method names) should not be243* simply quoted. The bytecode names244* <code><big><b>\=phase\,1</b></big></code> and245* <code><big><b>phase.1</b></big></code> are distinct,246* and in demangled displays they should be presented as247* <code><big><b>'phase.1'</b></big></code> and something like248* <code><big><b>'phase'.1</b></big></code>, respectively.249* </p>250*251* @author John Rose252* @version 1.2, 02/06/2008253* @see http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm254*/255public class BytecodeName {256private BytecodeName() { } // static only class257258/** Given a source name, produce the corresponding bytecode name.259* The source name should not be qualified, because any syntactic260* markers (dots, slashes, dollar signs, colons, etc.) will be mangled.261* @param s the source name262* @return a valid bytecode name which represents the source name263*/264public static String toBytecodeName(String s) {265String bn = mangle(s);266assert((Object)bn == s || looksMangled(bn)) : bn;267assert(s.equals(toSourceName(bn))) : s;268return bn;269}270271/** Given an unqualified bytecode name, produce the corresponding source name.272* The bytecode name must not contain dangerous characters.273* In particular, it must not be qualified or segmented by colon {@code ':'}.274* @param s the bytecode name275* @return the source name, which may possibly have unsafe characters276* @throws IllegalArgumentException if the bytecode name is not {@link #isSafeBytecodeName safe}277* @see #isSafeBytecodeName(java.lang.String)278*/279public static String toSourceName(String s) {280checkSafeBytecodeName(s);281String sn = s;282if (looksMangled(s)) {283sn = demangle(s);284assert(s.equals(mangle(sn))) : s+" => "+sn+" => "+mangle(sn);285}286return sn;287}288289/**290* Given a bytecode name from a classfile, separate it into291* components delimited by dangerous characters.292* Each resulting array element will be either a dangerous character,293* or else a safe bytecode name.294* (The safe name might possibly be mangled to hide further dangerous characters.)295* For example, the qualified class name {@code java/lang/String}296* will be parsed into the array {@code {"java", '/', "lang", '/', "String"}}.297* The name {@code <init>} will be parsed into {@code {'<', "init", '>'}}.298* The name {@code foo/bar$:baz} will be parsed into299* {@code {"foo", '/', "bar", '$', ':', "baz"}}.300* The name {@code ::\=:foo:\=bar\!baz} will be parsed into301* {@code {':', ':', "", ':', "foo", ':', "bar:baz"}}.302*/303public static Object[] parseBytecodeName(String s) {304int slen = s.length();305Object[] res = null;306for (int pass = 0; pass <= 1; pass++) {307int fillp = 0;308int lasti = 0;309for (int i = 0; i <= slen; i++) {310int whichDC = -1;311if (i < slen) {312whichDC = DANGEROUS_CHARS.indexOf(s.charAt(i));313if (whichDC < DANGEROUS_CHAR_FIRST_INDEX) continue;314}315// got to end of string or next dangerous char316if (lasti < i) {317// normal component318if (pass != 0)319res[fillp] = toSourceName(s.substring(lasti, i));320fillp++;321lasti = i+1;322}323if (whichDC >= DANGEROUS_CHAR_FIRST_INDEX) {324if (pass != 0)325res[fillp] = DANGEROUS_CHARS_CA[whichDC];326fillp++;327lasti = i+1;328}329}330if (pass != 0) break;331// between passes, build the result array332res = new Object[fillp];333if (fillp <= 1 && lasti == 0) {334if (fillp != 0) res[0] = toSourceName(s);335break;336}337}338return res;339}340341/**342* Given a series of components, create a bytecode name for a classfile.343* This is the inverse of {@link #parseBytecodeName(java.lang.String)}.344* Each component must either be an interned one-character string of345* a dangerous character, or else a safe bytecode name.346* @param components a series of name components347* @return the concatenation of all components348* @throws IllegalArgumentException if any component contains an unsafe349* character, and is not an interned one-character string350* @throws NullPointerException if any component is null351*/352public static String unparseBytecodeName(Object[] components) {353Object[] components0 = components;354for (int i = 0; i < components.length; i++) {355Object c = components[i];356if (c instanceof String) {357String mc = toBytecodeName((String) c);358if (i == 0 && components.length == 1)359return mc; // usual case360if ((Object)mc != c) {361if (components == components0)362components = components.clone();363components[i] = c = mc;364}365}366}367return appendAll(components);368}369private static String appendAll(Object[] components) {370if (components.length <= 1) {371if (components.length == 1) {372return String.valueOf(components[0]);373}374return "";375}376int slen = 0;377for (Object c : components) {378if (c instanceof String)379slen += String.valueOf(c).length();380else381slen += 1;382}383StringBuilder sb = new StringBuilder(slen);384for (Object c : components) {385sb.append(c);386}387return sb.toString();388}389390/**391* Given a bytecode name, produce the corresponding display name.392* This is the source name, plus quotes if needed.393* If the bytecode name contains dangerous characters,394* assume that they are being used as punctuation,395* and pass them through unchanged.396* Non-empty runs of non-dangerous characters are demangled397* if necessary, and the resulting names are quoted if398* they are not already valid Java identifiers, or if399* they contain a dangerous character (i.e., dollar sign "$").400* Single quotes are used when quoting.401* Within quoted names, embedded single quotes and backslashes402* are further escaped by prepended backslashes.403*404* @param s the original bytecode name (which may be qualified)405* @return a human-readable presentation406*/407public static String toDisplayName(String s) {408Object[] components = parseBytecodeName(s);409for (int i = 0; i < components.length; i++) {410if (!(components[i] instanceof String))411continue;412String sn = (String) components[i];413// note that the name is already demangled!414//sn = toSourceName(sn);415if (!isJavaIdent(sn) || sn.indexOf('$') >=0 ) {416components[i] = quoteDisplay(sn);417}418}419return appendAll(components);420}421private static boolean isJavaIdent(String s) {422int slen = s.length();423if (slen == 0) return false;424if (!Character.isJavaIdentifierStart(s.charAt(0)))425return false;426for (int i = 1; i < slen; i++) {427if (!Character.isJavaIdentifierPart(s.charAt(i)))428return false;429}430return true;431}432private static String quoteDisplay(String s) {433// TO DO: Replace wierd characters in s by C-style escapes.434return "'"+s.replaceAll("['\\\\]", "\\\\$0")+"'";435}436437private static void checkSafeBytecodeName(String s)438throws IllegalArgumentException {439if (!isSafeBytecodeName(s)) {440throw new IllegalArgumentException(s);441}442}443444/**445* Report whether a simple name is safe as a bytecode name.446* Such names are acceptable in class files as class, method, and field names.447* Additionally, they are free of "dangerous" characters, even if those448* characters are legal in some (or all) names in class files.449* @param s the proposed bytecode name450* @return true if the name is non-empty and all of its characters are safe451*/452public static boolean isSafeBytecodeName(String s) {453if (s.isEmpty()) return false;454// check occurrences of each DANGEROUS char455for (char xc : DANGEROUS_CHARS_A) {456if (xc == ESCAPE_C) continue; // not really that dangerous457if (s.indexOf(xc) >= 0) return false;458}459return true;460}461462/**463* Report whether a character is safe in a bytecode name.464* This is true of any unicode character except the following465* <em>dangerous characters</em>: {@code ".;:$[]<>/"}.466* @param c the proposed character467* @return true if the character is safe to use in classfiles468*/469public static boolean isSafeBytecodeChar(char c) {470return DANGEROUS_CHARS.indexOf(c) < DANGEROUS_CHAR_FIRST_INDEX;471}472473private static boolean looksMangled(String s) {474return s.charAt(0) == ESCAPE_C;475}476477private static String mangle(String s) {478if (s.isEmpty())479return NULL_ESCAPE;480481// build this lazily, when we first need an escape:482StringBuilder sb = null;483484for (int i = 0, slen = s.length(); i < slen; i++) {485char c = s.charAt(i);486487boolean needEscape = false;488if (c == ESCAPE_C) {489if (i+1 < slen) {490char c1 = s.charAt(i+1);491if ((i == 0 && c1 == NULL_ESCAPE_C)492|| c1 != originalOfReplacement(c1)) {493// an accidental escape494needEscape = true;495}496}497} else {498needEscape = isDangerous(c);499}500501if (!needEscape) {502if (sb != null) sb.append(c);503continue;504}505506// build sb if this is the first escape507if (sb == null) {508sb = new StringBuilder(s.length()+10);509// mangled names must begin with a backslash:510if (s.charAt(0) != ESCAPE_C && i > 0)511sb.append(NULL_ESCAPE);512// append the string so far, which is unremarkable:513sb.append(s, 0, i);514}515516// rewrite \ to \-, / to \|, etc.517sb.append(ESCAPE_C);518sb.append(replacementOf(c));519}520521if (sb != null) return sb.toString();522523return s;524}525526private static String demangle(String s) {527// build this lazily, when we first meet an escape:528StringBuilder sb = null;529530int stringStart = 0;531if (s.startsWith(NULL_ESCAPE))532stringStart = 2;533534for (int i = stringStart, slen = s.length(); i < slen; i++) {535char c = s.charAt(i);536537if (c == ESCAPE_C && i+1 < slen) {538// might be an escape sequence539char rc = s.charAt(i+1);540char oc = originalOfReplacement(rc);541if (oc != rc) {542// build sb if this is the first escape543if (sb == null) {544sb = new StringBuilder(s.length());545// append the string so far, which is unremarkable:546sb.append(s, stringStart, i);547}548++i; // skip both characters549c = oc;550}551}552553if (sb != null)554sb.append(c);555}556557if (sb != null) return sb.toString();558559return s.substring(stringStart);560}561562static char ESCAPE_C = '\\';563// empty escape sequence to avoid a null name or illegal prefix564static char NULL_ESCAPE_C = '=';565static String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;566567static final String DANGEROUS_CHARS = "\\/.;:$[]<>"; // \\ must be first568static final String REPLACEMENT_CHARS = "-|,?!%{}^_";569static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\570static char[] DANGEROUS_CHARS_A = DANGEROUS_CHARS.toCharArray();571static char[] REPLACEMENT_CHARS_A = REPLACEMENT_CHARS.toCharArray();572static final Character[] DANGEROUS_CHARS_CA;573static {574Character[] dcca = new Character[DANGEROUS_CHARS.length()];575for (int i = 0; i < dcca.length; i++)576dcca[i] = Character.valueOf(DANGEROUS_CHARS.charAt(i));577DANGEROUS_CHARS_CA = dcca;578}579580static final long[] SPECIAL_BITMAP = new long[2]; // 128 bits581static {582String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS;583//System.out.println("SPECIAL = "+SPECIAL);584for (char c : SPECIAL.toCharArray()) {585SPECIAL_BITMAP[c >>> 6] |= 1L << c;586}587}588static boolean isSpecial(char c) {589if ((c >>> 6) < SPECIAL_BITMAP.length)590return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;591else592return false;593}594static char replacementOf(char c) {595if (!isSpecial(c)) return c;596int i = DANGEROUS_CHARS.indexOf(c);597if (i < 0) return c;598return REPLACEMENT_CHARS.charAt(i);599}600static char originalOfReplacement(char c) {601if (!isSpecial(c)) return c;602int i = REPLACEMENT_CHARS.indexOf(c);603if (i < 0) return c;604return DANGEROUS_CHARS.charAt(i);605}606static boolean isDangerous(char c) {607if (!isSpecial(c)) return false;608return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX);609}610static int indexOfDangerousChar(String s, int from) {611for (int i = from, slen = s.length(); i < slen; i++) {612if (isDangerous(s.charAt(i)))613return i;614}615return -1;616}617static int lastIndexOfDangerousChar(String s, int from) {618for (int i = Math.min(from, s.length()-1); i >= 0; i--) {619if (isDangerous(s.charAt(i)))620return i;621}622return -1;623}624625626}627628629