Path: blob/master/src/java.base/share/classes/sun/nio/cs/CESU_8.java
41159 views
/*1* Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package sun.nio.cs;2627import jdk.internal.access.JavaLangAccess;28import jdk.internal.access.SharedSecrets;2930import java.nio.Buffer;31import java.nio.ByteBuffer;32import java.nio.CharBuffer;33import java.nio.charset.Charset;34import java.nio.charset.CharsetDecoder;35import java.nio.charset.CharsetEncoder;36import java.nio.charset.CoderResult;37import java.nio.charset.CodingErrorAction;3839/* Legal CESU-8 Byte Sequences40*41* # Code Points Bits Bit/Byte pattern42* 1 7 0xxxxxxx43* U+0000..U+007F 00..7F44*45* 2 11 110xxxxx 10xxxxxx46* U+0080..U+07FF C2..DF 80..BF47*48* 3 16 1110xxxx 10xxxxxx 10xxxxxx49* U+0800..U+0FFF E0 A0..BF 80..BF50* U+1000..U+FFFF E1..EF 80..BF 80..BF51*52*/5354class CESU_8 extends Unicode55{56public CESU_8() {57super("CESU-8", StandardCharsets.aliases_CESU_8());58}5960public String historicalName() {61return "CESU8";62}6364public CharsetDecoder newDecoder() {65return new Decoder(this);66}6768public CharsetEncoder newEncoder() {69return new Encoder(this);70}7172private static final void updatePositions(Buffer src, int sp,73Buffer dst, int dp) {74src.position(sp - src.arrayOffset());75dst.position(dp - dst.arrayOffset());76}7778private static class Decoder extends CharsetDecoder79implements ArrayDecoder {8081private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();8283private Decoder(Charset cs) {84super(cs, 1.0f, 1.0f);85}8687private static boolean isNotContinuation(int b) {88return (b & 0xc0) != 0x80;89}9091// [E0] [A0..BF] [80..BF]92// [E1..EF] [80..BF] [80..BF]93private static boolean isMalformed3(int b1, int b2, int b3) {94return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||95(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;96}9798// only used when there is only one byte left in src buffer99private static boolean isMalformed3_2(int b1, int b2) {100return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||101(b2 & 0xc0) != 0x80;102}103104private static CoderResult malformedN(ByteBuffer src, int nb) {105switch (nb) {106case 1:107case 2: // always 1108return CoderResult.malformedForLength(1);109case 3:110int b1 = src.get();111int b2 = src.get(); // no need to lookup b3112return CoderResult.malformedForLength(113((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||114isNotContinuation(b2)) ? 1 : 2);115case 4: // we don't care the speed here116b1 = src.get() & 0xff;117b2 = src.get() & 0xff;118if (b1 > 0xf4 ||119(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||120(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||121isNotContinuation(b2))122return CoderResult.malformedForLength(1);123if (isNotContinuation(src.get()))124return CoderResult.malformedForLength(2);125return CoderResult.malformedForLength(3);126default:127assert false;128return null;129}130}131132private static CoderResult malformed(ByteBuffer src, int sp,133CharBuffer dst, int dp,134int nb)135{136src.position(sp - src.arrayOffset());137CoderResult cr = malformedN(src, nb);138updatePositions(src, sp, dst, dp);139return cr;140}141142143private static CoderResult malformed(ByteBuffer src,144int mark, int nb)145{146src.position(mark);147CoderResult cr = malformedN(src, nb);148src.position(mark);149return cr;150}151152private static CoderResult malformedForLength(ByteBuffer src,153int sp,154CharBuffer dst,155int dp,156int malformedNB)157{158updatePositions(src, sp, dst, dp);159return CoderResult.malformedForLength(malformedNB);160}161162private static CoderResult malformedForLength(ByteBuffer src,163int mark,164int malformedNB)165{166src.position(mark);167return CoderResult.malformedForLength(malformedNB);168}169170171private static CoderResult xflow(Buffer src, int sp, int sl,172Buffer dst, int dp, int nb) {173updatePositions(src, sp, dst, dp);174return (nb == 0 || sl - sp < nb)175? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;176}177178private static CoderResult xflow(Buffer src, int mark, int nb) {179src.position(mark);180return (nb == 0 || src.remaining() < nb)181? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;182}183184private CoderResult decodeArrayLoop(ByteBuffer src,185CharBuffer dst)186{187// This method is optimized for ASCII input.188byte[] sa = src.array();189int soff = src.arrayOffset();190int sp = soff + src.position();191int sl = soff + src.limit();192193char[] da = dst.array();194int doff = dst.arrayOffset();195int dp = doff + dst.position();196int dl = doff + dst.limit();197198int n = JLA.decodeASCII(sa, sp, da, dp, Math.min(sl - sp, dl - dp));199sp += n;200dp += n;201202while (sp < sl) {203int b1 = sa[sp];204if (b1 >= 0) {205// 1 byte, 7 bits: 0xxxxxxx206if (dp >= dl)207return xflow(src, sp, sl, dst, dp, 1);208da[dp++] = (char) b1;209sp++;210} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {211// 2 bytes, 11 bits: 110xxxxx 10xxxxxx212if (sl - sp < 2 || dp >= dl)213return xflow(src, sp, sl, dst, dp, 2);214int b2 = sa[sp + 1];215if (isNotContinuation(b2))216return malformedForLength(src, sp, dst, dp, 1);217da[dp++] = (char) (((b1 << 6) ^ b2)218^219(((byte) 0xC0 << 6) ^220((byte) 0x80 << 0)));221sp += 2;222} else if ((b1 >> 4) == -2) {223// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx224int srcRemaining = sl - sp;225if (srcRemaining < 3 || dp >= dl) {226if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))227return malformedForLength(src, sp, dst, dp, 1);228return xflow(src, sp, sl, dst, dp, 3);229}230int b2 = sa[sp + 1];231int b3 = sa[sp + 2];232if (isMalformed3(b1, b2, b3))233return malformed(src, sp, dst, dp, 3);234da[dp++] = (char)235((b1 << 12) ^236(b2 << 6) ^237(b3 ^238(((byte) 0xE0 << 12) ^239((byte) 0x80 << 6) ^240((byte) 0x80 << 0))));241sp += 3;242} else {243return malformed(src, sp, dst, dp, 1);244}245}246return xflow(src, sp, sl, dst, dp, 0);247}248249private CoderResult decodeBufferLoop(ByteBuffer src,250CharBuffer dst)251{252int mark = src.position();253int limit = src.limit();254while (mark < limit) {255int b1 = src.get();256if (b1 >= 0) {257// 1 byte, 7 bits: 0xxxxxxx258if (dst.remaining() < 1)259return xflow(src, mark, 1); // overflow260dst.put((char) b1);261mark++;262} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {263// 2 bytes, 11 bits: 110xxxxx 10xxxxxx264if (limit - mark < 2|| dst.remaining() < 1)265return xflow(src, mark, 2);266int b2 = src.get();267if (isNotContinuation(b2))268return malformedForLength(src, mark, 1);269dst.put((char) (((b1 << 6) ^ b2)270^271(((byte) 0xC0 << 6) ^272((byte) 0x80 << 0))));273mark += 2;274} else if ((b1 >> 4) == -2) {275// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx276int srcRemaining = limit - mark;277if (srcRemaining < 3 || dst.remaining() < 1) {278if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))279return malformedForLength(src, mark, 1);280return xflow(src, mark, 3);281}282int b2 = src.get();283int b3 = src.get();284if (isMalformed3(b1, b2, b3))285return malformed(src, mark, 3);286dst.put((char)287((b1 << 12) ^288(b2 << 6) ^289(b3 ^290(((byte) 0xE0 << 12) ^291((byte) 0x80 << 6) ^292((byte) 0x80 << 0)))));293mark += 3;294} else {295return malformed(src, mark, 1);296}297}298return xflow(src, mark, 0);299}300301protected CoderResult decodeLoop(ByteBuffer src,302CharBuffer dst)303{304if (src.hasArray() && dst.hasArray())305return decodeArrayLoop(src, dst);306else307return decodeBufferLoop(src, dst);308}309310private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)311{312if (bb == null)313bb = ByteBuffer.wrap(ba);314bb.position(sp);315return bb;316}317318// returns -1 if there is/are malformed byte(s) and the319// "action" for malformed input is not REPLACE.320public int decode(byte[] sa, int sp, int len, char[] da) {321final int sl = sp + len;322int dp = 0;323int dlASCII = Math.min(len, da.length);324ByteBuffer bb = null; // only necessary if malformed325326// ASCII only optimized loop327while (dp < dlASCII && sa[sp] >= 0)328da[dp++] = (char) sa[sp++];329330while (sp < sl) {331int b1 = sa[sp++];332if (b1 >= 0) {333// 1 byte, 7 bits: 0xxxxxxx334da[dp++] = (char) b1;335} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {336// 2 bytes, 11 bits: 110xxxxx 10xxxxxx337if (sp < sl) {338int b2 = sa[sp++];339if (isNotContinuation(b2)) {340if (malformedInputAction() != CodingErrorAction.REPLACE)341return -1;342da[dp++] = replacement().charAt(0);343sp--; // malformedN(bb, 2) always returns 1344} else {345da[dp++] = (char) (((b1 << 6) ^ b2)^346(((byte) 0xC0 << 6) ^347((byte) 0x80 << 0)));348}349continue;350}351if (malformedInputAction() != CodingErrorAction.REPLACE)352return -1;353da[dp++] = replacement().charAt(0);354return dp;355} else if ((b1 >> 4) == -2) {356// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx357if (sp + 1 < sl) {358int b2 = sa[sp++];359int b3 = sa[sp++];360if (isMalformed3(b1, b2, b3)) {361if (malformedInputAction() != CodingErrorAction.REPLACE)362return -1;363da[dp++] = replacement().charAt(0);364sp -=3;365bb = getByteBuffer(bb, sa, sp);366sp += malformedN(bb, 3).length();367} else {368da[dp++] = (char)((b1 << 12) ^369(b2 << 6) ^370(b3 ^371(((byte) 0xE0 << 12) ^372((byte) 0x80 << 6) ^373((byte) 0x80 << 0))));374}375continue;376}377if (malformedInputAction() != CodingErrorAction.REPLACE)378return -1;379if (sp < sl && isMalformed3_2(b1, sa[sp])) {380da[dp++] = replacement().charAt(0);381continue;382383}384da[dp++] = replacement().charAt(0);385return dp;386} else {387if (malformedInputAction() != CodingErrorAction.REPLACE)388return -1;389da[dp++] = replacement().charAt(0);390}391}392return dp;393}394}395396private static class Encoder extends CharsetEncoder397implements ArrayEncoder {398399private Encoder(Charset cs) {400super(cs, 1.1f, 3.0f);401}402403public boolean canEncode(char c) {404return !Character.isSurrogate(c);405}406407public boolean isLegalReplacement(byte[] repl) {408return ((repl.length == 1 && repl[0] >= 0) ||409super.isLegalReplacement(repl));410}411412private static CoderResult overflow(CharBuffer src, int sp,413ByteBuffer dst, int dp) {414updatePositions(src, sp, dst, dp);415return CoderResult.OVERFLOW;416}417418private static CoderResult overflow(CharBuffer src, int mark) {419src.position(mark);420return CoderResult.OVERFLOW;421}422423private static void to3Bytes(byte[] da, int dp, char c) {424da[dp] = (byte)(0xe0 | ((c >> 12)));425da[dp + 1] = (byte)(0x80 | ((c >> 6) & 0x3f));426da[dp + 2] = (byte)(0x80 | (c & 0x3f));427}428429private static void to3Bytes(ByteBuffer dst, char c) {430dst.put((byte)(0xe0 | ((c >> 12))));431dst.put((byte)(0x80 | ((c >> 6) & 0x3f)));432dst.put((byte)(0x80 | (c & 0x3f)));433}434435private Surrogate.Parser sgp;436private char[] c2;437private CoderResult encodeArrayLoop(CharBuffer src,438ByteBuffer dst)439{440char[] sa = src.array();441int sp = src.arrayOffset() + src.position();442int sl = src.arrayOffset() + src.limit();443444byte[] da = dst.array();445int dp = dst.arrayOffset() + dst.position();446int dl = dst.arrayOffset() + dst.limit();447int dlASCII = dp + Math.min(sl - sp, dl - dp);448449// ASCII only loop450while (dp < dlASCII && sa[sp] < '\u0080')451da[dp++] = (byte) sa[sp++];452while (sp < sl) {453char c = sa[sp];454if (c < 0x80) {455// Have at most seven bits456if (dp >= dl)457return overflow(src, sp, dst, dp);458da[dp++] = (byte)c;459} else if (c < 0x800) {460// 2 bytes, 11 bits461if (dl - dp < 2)462return overflow(src, sp, dst, dp);463da[dp++] = (byte)(0xc0 | (c >> 6));464da[dp++] = (byte)(0x80 | (c & 0x3f));465} else if (Character.isSurrogate(c)) {466// Have a surrogate pair467if (sgp == null)468sgp = new Surrogate.Parser();469int uc = sgp.parse(c, sa, sp, sl);470if (uc < 0) {471updatePositions(src, sp, dst, dp);472return sgp.error();473}474if (dl - dp < 6)475return overflow(src, sp, dst, dp);476to3Bytes(da, dp, Character.highSurrogate(uc));477dp += 3;478to3Bytes(da, dp, Character.lowSurrogate(uc));479dp += 3;480sp++; // 2 chars481} else {482// 3 bytes, 16 bits483if (dl - dp < 3)484return overflow(src, sp, dst, dp);485to3Bytes(da, dp, c);486dp += 3;487}488sp++;489}490updatePositions(src, sp, dst, dp);491return CoderResult.UNDERFLOW;492}493494private CoderResult encodeBufferLoop(CharBuffer src,495ByteBuffer dst)496{497int mark = src.position();498while (src.hasRemaining()) {499char c = src.get();500if (c < 0x80) {501// Have at most seven bits502if (!dst.hasRemaining())503return overflow(src, mark);504dst.put((byte)c);505} else if (c < 0x800) {506// 2 bytes, 11 bits507if (dst.remaining() < 2)508return overflow(src, mark);509dst.put((byte)(0xc0 | (c >> 6)));510dst.put((byte)(0x80 | (c & 0x3f)));511} else if (Character.isSurrogate(c)) {512// Have a surrogate pair513if (sgp == null)514sgp = new Surrogate.Parser();515int uc = sgp.parse(c, src);516if (uc < 0) {517src.position(mark);518return sgp.error();519}520if (dst.remaining() < 6)521return overflow(src, mark);522to3Bytes(dst, Character.highSurrogate(uc));523to3Bytes(dst, Character.lowSurrogate(uc));524mark++; // 2 chars525} else {526// 3 bytes, 16 bits527if (dst.remaining() < 3)528return overflow(src, mark);529to3Bytes(dst, c);530}531mark++;532}533src.position(mark);534return CoderResult.UNDERFLOW;535}536537protected final CoderResult encodeLoop(CharBuffer src,538ByteBuffer dst)539{540if (src.hasArray() && dst.hasArray())541return encodeArrayLoop(src, dst);542else543return encodeBufferLoop(src, dst);544}545546// returns -1 if there is malformed char(s) and the547// "action" for malformed input is not REPLACE.548public int encode(char[] sa, int sp, int len, byte[] da) {549int sl = sp + len;550int dp = 0;551int dlASCII = dp + Math.min(len, da.length);552553// ASCII only optimized loop554while (dp < dlASCII && sa[sp] < '\u0080')555da[dp++] = (byte) sa[sp++];556557while (sp < sl) {558char c = sa[sp++];559if (c < 0x80) {560// Have at most seven bits561da[dp++] = (byte)c;562} else if (c < 0x800) {563// 2 bytes, 11 bits564da[dp++] = (byte)(0xc0 | (c >> 6));565da[dp++] = (byte)(0x80 | (c & 0x3f));566} else if (Character.isSurrogate(c)) {567if (sgp == null)568sgp = new Surrogate.Parser();569int uc = sgp.parse(c, sa, sp - 1, sl);570if (uc < 0) {571if (malformedInputAction() != CodingErrorAction.REPLACE)572return -1;573da[dp++] = replacement()[0];574} else {575to3Bytes(da, dp, Character.highSurrogate(uc));576dp += 3;577to3Bytes(da, dp, Character.lowSurrogate(uc));578dp += 3;579sp++; // 2 chars580}581} else {582// 3 bytes, 16 bits583to3Bytes(da, dp, c);584dp += 3;585}586}587return dp;588}589}590}591592593