Path: blob/master/src/jdk.charsets/share/classes/sun/nio/cs/ext/JISAutoDetect.java
41161 views
/*1* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package sun.nio.cs.ext;2627import java.nio.ByteBuffer;28import java.nio.CharBuffer;29import java.nio.charset.Charset;30import java.nio.charset.CharsetDecoder;31import java.nio.charset.CharsetEncoder;32import java.nio.charset.CoderResult;33import java.nio.charset.CharacterCodingException;34import java.nio.charset.MalformedInputException;35import sun.nio.cs.DelegatableDecoder;36import sun.nio.cs.HistoricallyNamedCharset;37import java.security.AccessController;38import java.security.PrivilegedAction;39import sun.nio.cs.*;40import static java.lang.Character.UnicodeBlock;414243public class JISAutoDetect44extends Charset45implements HistoricallyNamedCharset46{4748private static final int EUCJP_MASK = 0x01;49private static final int SJIS2B_MASK = 0x02;50private static final int SJIS1B_MASK = 0x04;51private static final int EUCJP_KANA1_MASK = 0x08;52private static final int EUCJP_KANA2_MASK = 0x10;5354public JISAutoDetect() {55super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));56}5758public boolean contains(Charset cs) {59return ((cs.name().equals("US-ASCII"))60|| (cs instanceof SJIS)61|| (cs instanceof EUC_JP)62|| (cs instanceof ISO2022_JP));63}6465public boolean canEncode() {66return false;67}6869public CharsetDecoder newDecoder() {70return new Decoder(this);71}7273public String historicalName() {74return "JISAutoDetect";75}7677public CharsetEncoder newEncoder() {78throw new UnsupportedOperationException();79}8081// A heuristic algorithm for guessing if EUC-decoded text really82// might be Japanese text. Better heuristics are possible...83private static boolean looksLikeJapanese(CharBuffer cb) {84int hiragana = 0; // Fullwidth Hiragana85int katakana = 0; // Halfwidth Katakana86while (cb.hasRemaining()) {87char c = cb.get();88if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true;89if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true;90}91return false;92}9394private static class Decoder extends CharsetDecoder {95@SuppressWarnings("removal")96private static final String osName = AccessController.doPrivileged(97(PrivilegedAction<String>) () -> System.getProperty("os.name"));9899private static final String SJISName = getSJISName();100private static final String EUCJPName = "EUC_JP";101private DelegatableDecoder detectedDecoder = null;102103public Decoder(Charset cs) {104super(cs, 0.5f, 1.0f);105}106107private static boolean isPlainASCII(byte b) {108return b >= 0 && b != 0x1b;109}110111private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {112int start = src.position();113int limit = start + Math.min(src.remaining(), dst.remaining());114int p;115byte b;116for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++)117dst.put((char)(b & 0xff));118src.position(p);119}120121private CoderResult decodeLoop(DelegatableDecoder decoder,122ByteBuffer src, CharBuffer dst) {123((CharsetDecoder)decoder).reset();124detectedDecoder = decoder;125return detectedDecoder.decodeLoop(src, dst);126}127128protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {129if (detectedDecoder == null) {130copyLeadingASCII(src, dst);131132// All ASCII?133if (! src.hasRemaining())134return CoderResult.UNDERFLOW;135// Overflow only if there is still ascii but no out buffer.136if (!dst.hasRemaining() &&137isPlainASCII(src.get(src.position())))138return CoderResult.OVERFLOW;139140// We need to perform double, not float, arithmetic; otherwise141// we lose low order bits when src is larger than 2**24.142int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte());143CharBuffer sandbox = CharBuffer.allocate(cbufsiz);144145// First try ISO-2022-JP, since there is no ambiguity146Charset cs2022 = Charset.forName("ISO-2022-JP");147DelegatableDecoder dd2022148= (DelegatableDecoder) cs2022.newDecoder();149ByteBuffer src2022 = src.asReadOnlyBuffer();150CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox);151if (! res2022.isError())152return decodeLoop(dd2022, src, dst);153154// We must choose between EUC and SJIS155Charset csEUCJ = Charset.forName(EUCJPName);156Charset csSJIS = Charset.forName(SJISName);157158DelegatableDecoder ddEUCJ159= (DelegatableDecoder) csEUCJ.newDecoder();160DelegatableDecoder ddSJIS161= (DelegatableDecoder) csSJIS.newDecoder();162163ByteBuffer srcEUCJ = src.asReadOnlyBuffer();164sandbox.clear();165CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox);166// If EUC decoding fails, must be SJIS167if (resEUCJ.isError())168return decodeLoop(ddSJIS, src, dst);169ByteBuffer srcSJIS = src.asReadOnlyBuffer();170CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz);171CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS);172// If SJIS decoding fails, must be EUC173if (resSJIS.isError())174return decodeLoop(ddEUCJ, src, dst);175176// From here on, we have some ambiguity, and must guess.177178// We prefer input that does not appear to end mid-character.179if (srcEUCJ.position() > srcSJIS.position())180return decodeLoop(ddEUCJ, src, dst);181182if (srcEUCJ.position() < srcSJIS.position())183return decodeLoop(ddSJIS, src, dst);184185// end-of-input is after the first byte of the first char?186if (src.position() == srcEUCJ.position())187return CoderResult.UNDERFLOW;188189// Use heuristic knowledge of typical Japanese text190sandbox.flip();191return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS,192src, dst);193}194195return detectedDecoder.decodeLoop(src, dst);196}197198protected void implReset() {199detectedDecoder = null;200}201202protected CoderResult implFlush(CharBuffer out) {203if (detectedDecoder != null)204return detectedDecoder.implFlush(out);205else206return super.implFlush(out);207}208209public boolean isAutoDetecting() {210return true;211}212213public boolean isCharsetDetected() {214return detectedDecoder != null;215}216217public Charset detectedCharset() {218if (detectedDecoder == null)219throw new IllegalStateException("charset not yet detected");220return ((CharsetDecoder) detectedDecoder).charset();221}222223224/**225* Returned Shift_JIS Charset name is OS dependent226*/227private static String getSJISName() {228if (osName.startsWith("Windows"))229return("windows-31J");230else231return("Shift_JIS");232}233234}235}236237238