// Path: blob/master/src/java.base/share/classes/sun/nio/cs/UTF_8.java
// 41159 views
/*1* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package sun.nio.cs;2627import jdk.internal.access.JavaLangAccess;28import jdk.internal.access.SharedSecrets;2930import java.nio.Buffer;31import java.nio.ByteBuffer;32import java.nio.CharBuffer;33import java.nio.charset.Charset;34import java.nio.charset.CharsetDecoder;35import java.nio.charset.CharsetEncoder;36import java.nio.charset.CoderResult;37import java.nio.charset.CodingErrorAction;3839/* Legal UTF-8 Byte Sequences40*41* # Code Points Bits Bit/Byte pattern42* 1 7 0xxxxxxx43* U+0000..U+007F 00..7F44*45* 2 11 110xxxxx 10xxxxxx46* U+0080..U+07FF C2..DF 80..BF47*48* 3 16 1110xxxx 10xxxxxx 10xxxxxx49* U+0800..U+0FFF E0 A0..BF 80..BF50* U+1000..U+FFFF E1..EF 80..BF 80..BF51*52* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx53* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF54* U+40000..U+FFFFF 
F1..F3 80..BF 80..BF 80..BF55* U+100000..U10FFFF F4 80..8F 80..BF 80..BF56*57*/5859public final class UTF_8 extends Unicode {6061public static final UTF_8 INSTANCE = new UTF_8();6263public UTF_8() {64super("UTF-8", StandardCharsets.aliases_UTF_8());65}6667public String historicalName() {68return "UTF8";69}7071public CharsetDecoder newDecoder() {72return new Decoder(this);73}7475public CharsetEncoder newEncoder() {76return new Encoder(this);77}7879static final void updatePositions(Buffer src, int sp,80Buffer dst, int dp) {81src.position(sp - src.arrayOffset());82dst.position(dp - dst.arrayOffset());83}8485private static class Decoder extends CharsetDecoder {8687private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();8889private Decoder(Charset cs) {90super(cs, 1.0f, 1.0f);91}9293private static boolean isNotContinuation(int b) {94return (b & 0xc0) != 0x80;95}9697// [E0] [A0..BF] [80..BF]98// [E1..EF] [80..BF] [80..BF]99private static boolean isMalformed3(int b1, int b2, int b3) {100return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||101(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;102}103104// only used when there is only one byte left in src buffer105private static boolean isMalformed3_2(int b1, int b2) {106return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||107(b2 & 0xc0) != 0x80;108}109110// [F0] [90..BF] [80..BF] [80..BF]111// [F1..F3] [80..BF] [80..BF] [80..BF]112// [F4] [80..8F] [80..BF] [80..BF]113// only check 80-be range here, the [0xf0,0x80...] 
and [0xf4,0x90-...]114// will be checked by Character.isSupplementaryCodePoint(uc)115private static boolean isMalformed4(int b2, int b3, int b4) {116return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||117(b4 & 0xc0) != 0x80;118}119120// only used when there is less than 4 bytes left in src buffer.121// both b1 and b2 should be "& 0xff" before passed in.122private static boolean isMalformed4_2(int b1, int b2) {123return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||124(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||125(b2 & 0xc0) != 0x80;126}127128// tests if b1 and b2 are malformed as the first 2 bytes of a129// legal`4-byte utf-8 byte sequence.130// only used when there is less than 4 bytes left in src buffer,131// after isMalformed4_2 has been invoked.132private static boolean isMalformed4_3(int b3) {133return (b3 & 0xc0) != 0x80;134}135136private static CoderResult malformedN(ByteBuffer src, int nb) {137switch (nb) {138case 1:139case 2: // always 1140return CoderResult.malformedForLength(1);141case 3:142int b1 = src.get();143int b2 = src.get(); // no need to lookup b3144return CoderResult.malformedForLength(145((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||146isNotContinuation(b2)) ? 
1 : 2);147case 4: // we don't care the speed here148b1 = src.get() & 0xff;149b2 = src.get() & 0xff;150if (b1 > 0xf4 ||151(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||152(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||153isNotContinuation(b2))154return CoderResult.malformedForLength(1);155if (isNotContinuation(src.get()))156return CoderResult.malformedForLength(2);157return CoderResult.malformedForLength(3);158default:159assert false;160return null;161}162}163164private static CoderResult malformed(ByteBuffer src, int sp,165CharBuffer dst, int dp,166int nb)167{168src.position(sp - src.arrayOffset());169CoderResult cr = malformedN(src, nb);170updatePositions(src, sp, dst, dp);171return cr;172}173174175private static CoderResult malformed(ByteBuffer src,176int mark, int nb)177{178src.position(mark);179CoderResult cr = malformedN(src, nb);180src.position(mark);181return cr;182}183184private static CoderResult malformedForLength(ByteBuffer src,185int sp,186CharBuffer dst,187int dp,188int malformedNB)189{190updatePositions(src, sp, dst, dp);191return CoderResult.malformedForLength(malformedNB);192}193194private static CoderResult malformedForLength(ByteBuffer src,195int mark,196int malformedNB)197{198src.position(mark);199return CoderResult.malformedForLength(malformedNB);200}201202203private static CoderResult xflow(Buffer src, int sp, int sl,204Buffer dst, int dp, int nb) {205updatePositions(src, sp, dst, dp);206return (nb == 0 || sl - sp < nb)207? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;208}209210private static CoderResult xflow(Buffer src, int mark, int nb) {211src.position(mark);212return (nb == 0 || src.remaining() < nb)213? 
CoderResult.UNDERFLOW : CoderResult.OVERFLOW;214}215216private CoderResult decodeArrayLoop(ByteBuffer src,217CharBuffer dst)218{219// This method is optimized for ASCII input.220byte[] sa = src.array();221int soff = src.arrayOffset();222int sp = soff + src.position();223int sl = soff + src.limit();224225char[] da = dst.array();226int doff = dst.arrayOffset();227int dp = doff + dst.position();228int dl = doff + dst.limit();229230int n = JLA.decodeASCII(sa, sp, da, dp, Math.min(sl - sp, dl - dp));231sp += n;232dp += n;233234while (sp < sl) {235int b1 = sa[sp];236if (b1 >= 0) {237// 1 byte, 7 bits: 0xxxxxxx238if (dp >= dl)239return xflow(src, sp, sl, dst, dp, 1);240da[dp++] = (char) b1;241sp++;242} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {243// 2 bytes, 11 bits: 110xxxxx 10xxxxxx244// [C2..DF] [80..BF]245if (sl - sp < 2 || dp >= dl)246return xflow(src, sp, sl, dst, dp, 2);247int b2 = sa[sp + 1];248// Now we check the first byte of 2-byte sequence as249// if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0)250// no longer need to check b1 against c1 & c0 for251// malformed as we did in previous version252// (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;253// only need to check the second byte b2.254if (isNotContinuation(b2))255return malformedForLength(src, sp, dst, dp, 1);256da[dp++] = (char) (((b1 << 6) ^ b2)257^258(((byte) 0xC0 << 6) ^259((byte) 0x80 << 0)));260sp += 2;261} else if ((b1 >> 4) == -2) {262// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx263int srcRemaining = sl - sp;264if (srcRemaining < 3 || dp >= dl) {265if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))266return malformedForLength(src, sp, dst, dp, 1);267return xflow(src, sp, sl, dst, dp, 3);268}269int b2 = sa[sp + 1];270int b3 = sa[sp + 2];271if (isMalformed3(b1, b2, b3))272return malformed(src, sp, dst, dp, 3);273char c = (char)274((b1 << 12) ^275(b2 << 6) ^276(b3 ^277(((byte) 0xE0 << 12) ^278((byte) 0x80 << 6) ^279((byte) 0x80 << 0))));280if (Character.isSurrogate(c))281return malformedForLength(src, 
sp, dst, dp, 3);282da[dp++] = c;283sp += 3;284} else if ((b1 >> 3) == -2) {285// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx286int srcRemaining = sl - sp;287if (srcRemaining < 4 || dl - dp < 2) {288b1 &= 0xff;289if (b1 > 0xf4 ||290srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))291return malformedForLength(src, sp, dst, dp, 1);292if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))293return malformedForLength(src, sp, dst, dp, 2);294return xflow(src, sp, sl, dst, dp, 4);295}296int b2 = sa[sp + 1];297int b3 = sa[sp + 2];298int b4 = sa[sp + 3];299int uc = ((b1 << 18) ^300(b2 << 12) ^301(b3 << 6) ^302(b4 ^303(((byte) 0xF0 << 18) ^304((byte) 0x80 << 12) ^305((byte) 0x80 << 6) ^306((byte) 0x80 << 0))));307if (isMalformed4(b2, b3, b4) ||308// shortest form check309!Character.isSupplementaryCodePoint(uc)) {310return malformed(src, sp, dst, dp, 4);311}312da[dp++] = Character.highSurrogate(uc);313da[dp++] = Character.lowSurrogate(uc);314sp += 4;315} else316return malformed(src, sp, dst, dp, 1);317}318return xflow(src, sp, sl, dst, dp, 0);319}320321private CoderResult decodeBufferLoop(ByteBuffer src,322CharBuffer dst)323{324int mark = src.position();325int limit = src.limit();326while (mark < limit) {327int b1 = src.get();328if (b1 >= 0) {329// 1 byte, 7 bits: 0xxxxxxx330if (dst.remaining() < 1)331return xflow(src, mark, 1); // overflow332dst.put((char) b1);333mark++;334} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {335// 2 bytes, 11 bits: 110xxxxx 10xxxxxx336if (limit - mark < 2|| dst.remaining() < 1)337return xflow(src, mark, 2);338int b2 = src.get();339if (isNotContinuation(b2))340return malformedForLength(src, mark, 1);341dst.put((char) (((b1 << 6) ^ b2)342^343(((byte) 0xC0 << 6) ^344((byte) 0x80 << 0))));345mark += 2;346} else if ((b1 >> 4) == -2) {347// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx348int srcRemaining = limit - mark;349if (srcRemaining < 3 || dst.remaining() < 1) {350if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))351return 
malformedForLength(src, mark, 1);352return xflow(src, mark, 3);353}354int b2 = src.get();355int b3 = src.get();356if (isMalformed3(b1, b2, b3))357return malformed(src, mark, 3);358char c = (char)359((b1 << 12) ^360(b2 << 6) ^361(b3 ^362(((byte) 0xE0 << 12) ^363((byte) 0x80 << 6) ^364((byte) 0x80 << 0))));365if (Character.isSurrogate(c))366return malformedForLength(src, mark, 3);367dst.put(c);368mark += 3;369} else if ((b1 >> 3) == -2) {370// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx371int srcRemaining = limit - mark;372if (srcRemaining < 4 || dst.remaining() < 2) {373b1 &= 0xff;374if (b1 > 0xf4 ||375srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))376return malformedForLength(src, mark, 1);377if (srcRemaining > 2 && isMalformed4_3(src.get()))378return malformedForLength(src, mark, 2);379return xflow(src, mark, 4);380}381int b2 = src.get();382int b3 = src.get();383int b4 = src.get();384int uc = ((b1 << 18) ^385(b2 << 12) ^386(b3 << 6) ^387(b4 ^388(((byte) 0xF0 << 18) ^389((byte) 0x80 << 12) ^390((byte) 0x80 << 6) ^391((byte) 0x80 << 0))));392if (isMalformed4(b2, b3, b4) ||393// shortest form check394!Character.isSupplementaryCodePoint(uc)) {395return malformed(src, mark, 4);396}397dst.put(Character.highSurrogate(uc));398dst.put(Character.lowSurrogate(uc));399mark += 4;400} else {401return malformed(src, mark, 1);402}403}404return xflow(src, mark, 0);405}406407protected CoderResult decodeLoop(ByteBuffer src,408CharBuffer dst)409{410if (src.hasArray() && dst.hasArray())411return decodeArrayLoop(src, dst);412else413return decodeBufferLoop(src, dst);414}415}416417private static final class Encoder extends CharsetEncoder {418419private Encoder(Charset cs) {420super(cs, 1.1f, 3.0f);421}422423public boolean canEncode(char c) {424return !Character.isSurrogate(c);425}426427public boolean isLegalReplacement(byte[] repl) {428return ((repl.length == 1 && repl[0] >= 0) ||429super.isLegalReplacement(repl));430}431432private static CoderResult 
overflow(CharBuffer src, int sp,433ByteBuffer dst, int dp) {434updatePositions(src, sp, dst, dp);435return CoderResult.OVERFLOW;436}437438private static CoderResult overflow(CharBuffer src, int mark) {439src.position(mark);440return CoderResult.OVERFLOW;441}442443private Surrogate.Parser sgp;444private CoderResult encodeArrayLoop(CharBuffer src,445ByteBuffer dst)446{447char[] sa = src.array();448int sp = src.arrayOffset() + src.position();449int sl = src.arrayOffset() + src.limit();450451byte[] da = dst.array();452int dp = dst.arrayOffset() + dst.position();453int dl = dst.arrayOffset() + dst.limit();454int dlASCII = dp + Math.min(sl - sp, dl - dp);455456// ASCII only loop457while (dp < dlASCII && sa[sp] < '\u0080')458da[dp++] = (byte) sa[sp++];459while (sp < sl) {460char c = sa[sp];461if (c < 0x80) {462// Have at most seven bits463if (dp >= dl)464return overflow(src, sp, dst, dp);465da[dp++] = (byte)c;466} else if (c < 0x800) {467// 2 bytes, 11 bits468if (dl - dp < 2)469return overflow(src, sp, dst, dp);470da[dp++] = (byte)(0xc0 | (c >> 6));471da[dp++] = (byte)(0x80 | (c & 0x3f));472} else if (Character.isSurrogate(c)) {473// Have a surrogate pair474if (sgp == null)475sgp = new Surrogate.Parser();476int uc = sgp.parse(c, sa, sp, sl);477if (uc < 0) {478updatePositions(src, sp, dst, dp);479return sgp.error();480}481if (dl - dp < 4)482return overflow(src, sp, dst, dp);483da[dp++] = (byte)(0xf0 | ((uc >> 18)));484da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));485da[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));486da[dp++] = (byte)(0x80 | (uc & 0x3f));487sp++; // 2 chars488} else {489// 3 bytes, 16 bits490if (dl - dp < 3)491return overflow(src, sp, dst, dp);492da[dp++] = (byte)(0xe0 | ((c >> 12)));493da[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));494da[dp++] = (byte)(0x80 | (c & 0x3f));495}496sp++;497}498updatePositions(src, sp, dst, dp);499return CoderResult.UNDERFLOW;500}501502private CoderResult encodeBufferLoop(CharBuffer src,503ByteBuffer dst)504{505int mark = 
src.position();506while (src.hasRemaining()) {507char c = src.get();508if (c < 0x80) {509// Have at most seven bits510if (!dst.hasRemaining())511return overflow(src, mark);512dst.put((byte)c);513} else if (c < 0x800) {514// 2 bytes, 11 bits515if (dst.remaining() < 2)516return overflow(src, mark);517dst.put((byte)(0xc0 | (c >> 6)));518dst.put((byte)(0x80 | (c & 0x3f)));519} else if (Character.isSurrogate(c)) {520// Have a surrogate pair521if (sgp == null)522sgp = new Surrogate.Parser();523int uc = sgp.parse(c, src);524if (uc < 0) {525src.position(mark);526return sgp.error();527}528if (dst.remaining() < 4)529return overflow(src, mark);530dst.put((byte)(0xf0 | ((uc >> 18))));531dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));532dst.put((byte)(0x80 | ((uc >> 6) & 0x3f)));533dst.put((byte)(0x80 | (uc & 0x3f)));534mark++; // 2 chars535} else {536// 3 bytes, 16 bits537if (dst.remaining() < 3)538return overflow(src, mark);539dst.put((byte)(0xe0 | ((c >> 12))));540dst.put((byte)(0x80 | ((c >> 6) & 0x3f)));541dst.put((byte)(0x80 | (c & 0x3f)));542}543mark++;544}545src.position(mark);546return CoderResult.UNDERFLOW;547}548549protected final CoderResult encodeLoop(CharBuffer src,550ByteBuffer dst)551{552if (src.hasArray() && dst.hasArray())553return encodeArrayLoop(src, dst);554else555return encodeBufferLoop(src, dst);556}557558}559}560561562