// Path: blob/master/src/java.base/share/native/libtinyiconv/iconv.cpp
// 41149 views
/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef __ANDROID__

#include <ctype.h>
#include <endian.h>
#include <errno.h>
#include <iconv.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>
#include <wchar.h>

__BEGIN_DECLS

/*
 * These return values are specified by POSIX for multibyte conversion
 * functions.
 */

#ifdef __cplusplus
#define __MB_ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
#define __MB_ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
#else
#define __MB_ERR_ILLEGAL_SEQUENCE (size_t)(-1)
#define __MB_ERR_INCOMPLETE_SEQUENCE (size_t)(-2)
#endif // __cplusplus

// True if `rv` is one of the two multibyte-conversion error sentinels above.
#define __MB_IS_ERR(rv) ((rv) == __MB_ERR_ILLEGAL_SEQUENCE || \
                         (rv) == __MB_ERR_INCOMPLETE_SEQUENCE)

// Returns how many bytes of a partial sequence are stored in `ps` (0..3),
// judged by the highest non-zero slot of ps->__seq.
static inline __wur size_t mbstate_bytes_so_far(const mbstate_t* ps) {
  return
      (ps->__seq[2] != 0) ? 3 :
      (ps->__seq[1] != 0) ? 2 :
      (ps->__seq[0] != 0) ? 1 : 0;
}

// Stores `byte` in slot `i` of the partial sequence held in `ps`.
static inline void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
  ps->__seq[i] = (uint8_t)(byte);
}

// Returns the byte stored in slot `n` of the partial sequence held in `ps`.
static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
  return ps->__seq[n];
}

// Sets errno to `_errno`, clears the first four bytes of ps->__seq and returns
// the "illegal sequence" sentinel.
static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
  errno = _errno;
#ifdef __cplusplus
  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
#else
  *(uint32_t*)(ps->__seq) = 0;
#endif // __cplusplus
  return __MB_ERR_ILLEGAL_SEQUENCE;
}

// Clears the first four bytes of ps->__seq and returns `_return`.
static inline __wur size_t mbstate_reset_and_return(int _return, mbstate_t* ps) {
#ifdef __cplusplus
  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
#else
  *(uint32_t*)(ps->__seq) = 0;
#endif // __cplusplus
  return _return;
}

// Sentinel handle returned by iconv_open() on failure.
#ifdef __cplusplus
# define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
#else // !__cplusplus
# define INVALID_ICONV_T (iconv_t)(-1)
#endif // __cplusplus

// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.

// The encodings this converter understands.
#ifdef __cplusplus
enum Encoding
#else
typedef enum
#endif // __cplusplus
{
  US_ASCII,
  UTF_8,
  UTF_16_LE,
  UTF_16_BE,
  UTF_32_LE,
  UTF_32_BE,
  WCHAR_T,
#ifdef __cplusplus
};
#else
} Encoding;
#endif // __cplusplus

// What to do with input that cannot be decoded or cannot be represented in the
// destination encoding: fail, skip it, or substitute '?'.
#ifdef __cplusplus
enum Mode
#else
typedef enum
#endif // __cplusplus
{
  ERROR,
  IGNORE,
  TRANSLIT,
#ifdef __cplusplus
};
#else
} Mode;
#endif // __cplusplus

// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//
// Returns true if the user-supplied name `lhs` matches the canonical name `rhs`
// (lower-case alphanumerics only). A trailing GNU "//..." suffix on `lhs` is
// treated as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  while (*lhs && *rhs) {
    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
    // Also implement the "delete each 0 that is not preceded by a digit" rule.
    // The <ctype.h> functions require an unsigned char value (or EOF), so cast
    // before calling them; plain char may be negative.
    for (; *lhs; ++lhs) {
      if (isalnum((unsigned char)*lhs) &&
          (*lhs != '0' || !isdigit((unsigned char)*(lhs + 1)))) {
        break;
      }
    }
    // Case doesn't matter either.
    if (tolower((unsigned char)*lhs) != tolower((unsigned char)*rhs)) break;
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
  return false;
}

// Parses an encoding name such as "UTF-8" or "ascii//TRANSLIT".
//
// On success stores the encoding in *encoding and, if a "//IGNORE" or
// "//TRANSLIT" suffix is present, the corresponding mode in *mode.
// Returns false for an unknown encoding, an unknown suffix, or any suffix when
// `mode` is null (i.e. suffixes are not accepted for that name).
static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
  const char* suffix = strstr(s, "//");
  if (suffix) {
    if (!mode) return false;
    if (strcmp(suffix, "//IGNORE") == 0) {
      *mode = IGNORE;
    } else if (strcmp(suffix, "//TRANSLIT") == 0) {
      *mode = TRANSLIT;
    } else {
      return false;
    }
  }
  if (__match_encoding(s, "utf8")) {
    *encoding = UTF_8;
  } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
    *encoding = US_ASCII;
  } else if (__match_encoding(s, "utf16le")) {
    *encoding = UTF_16_LE;
  } else if (__match_encoding(s, "utf16be")) {
    *encoding = UTF_16_BE;
  } else if (__match_encoding(s, "utf32le")) {
    *encoding = UTF_32_LE;
  } else if (__match_encoding(s, "utf32be")) {
    *encoding = UTF_32_BE;
  } else if (__match_encoding(s, "wchart")) {
    *encoding = WCHAR_T;
  } else {
    return false;
  }
  return true;
}

// Conversion descriptor: the object behind an iconv_t handle.
struct __iconv_t {
  Encoding src_encoding;
  Encoding dst_encoding;
  Mode mode;

  // Deliberately no constructor: iconv_open() allocates instances with calloc()
  // and sets `mode` itself, and iconv_close() releases them with free().

  // Converts as much of the source buffer as possible, one code point at a time.
  //
  // The four arguments are the in/out cursor pointers passed to iconv(); they
  // are advanced past every code point that was consumed/produced.
  // Returns -1 with errno set on failure (EILSEQ, EINVAL or E2BIG), otherwise
  // the value computed by Done().
  int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
    // Reset state.
    wc = 0;
    memset(&ps, 0, sizeof(ps));
    replacement_count = 0;
    ignored = false;
    src_buf = src_buf0;
    src_bytes_left = src_bytes_left0;
    dst_buf = dst_buf0;
    dst_bytes_left = dst_bytes_left0;
    while (*src_bytes_left > 0) {
      if (!GetNext()) return -1;
      // GetNext() only consumes input itself when it skips undecodable bytes in
      // IGNORE mode; if that used up the rest of the input there is no code
      // point to convert.
      if (*src_bytes_left == 0) break;
      if (!Convert()) return -1;
    }
    return Done();
  }

 private:
  char32_t wc;               // Code point currently being converted.
  char buf[16];              // Encoded form of `wc` in the destination encoding.
  size_t src_bytes_used;     // Source bytes occupied by `wc`.
  size_t dst_bytes_used;     // Valid bytes in `buf`.
  mbstate_t ps;              // State for mbrtoc32()/c32rtomb().
  size_t replacement_count;  // Number of '?' substitutions (TRANSLIT).
  bool ignored;              // Whether any input was skipped (IGNORE).
  char** src_buf;
  size_t* src_bytes_left;
  char** dst_buf;
  size_t* dst_bytes_left;

  // Decodes the next code point from the source buffer into `wc` and records
  // its length in `src_bytes_used`. The input is not consumed here (Emit() does
  // that), except for undecodable bytes skipped in IGNORE mode.
  //
  // Returns false with errno set to EILSEQ (undecodable input in ERROR mode, or
  // a malformed UTF-16 surrogate pair) or EINVAL (input ends mid-character).
  // May return true with *src_bytes_left == 0 if IGNORE mode skipped everything
  // that was left.
  bool GetNext() {
    // Iterative rather than recursive so that a long run of skipped bytes in
    // IGNORE mode cannot exhaust the stack.
    for (;;) {
      errno = 0;
      switch (src_encoding) {
        case US_ASCII:
          // Go through unsigned char so the value is the byte value whether or
          // not plain char is signed.
          wc = static_cast<unsigned char>(**src_buf);
          src_bytes_used = 1;
          if (wc > 0x7f) errno = EILSEQ;
          break;

        case UTF_8:
          src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
          if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
            // The return value is an error sentinel, not a length: treat the
            // offending lead byte as a single bad byte so IGNORE/TRANSLIT step
            // forward over it, and discard whatever state the failed call left.
            src_bytes_used = 1;
            memset(&ps, 0, sizeof(ps));
            errno = EILSEQ;
            break;
          } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
            errno = EINVAL;
            return false;
          } else if (src_bytes_used == 0) {
            // mbrtoc32() reports a decoded NUL as 0, but it still occupies one
            // source byte; without this the input would never advance.
            src_bytes_used = 1;
          }
          break;

        case UTF_16_BE:
        case UTF_16_LE: {
          if (*src_bytes_left < 2) {
            errno = EINVAL;
            return false;
          }
          bool swap = (src_encoding == UTF_16_BE);
          wc = In16(*src_buf, swap);
          // 0xd800-0xdbff: high surrogates
          // 0xdc00-0xdfff: low surrogates
          if (wc >= 0xd800 && wc <= 0xdfff) {
            if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
              errno = EILSEQ;
              return false;
            }
            if (*src_bytes_left < 4) {
              errno = EINVAL;
              return false;
            }
            uint16_t hi = wc;
            uint16_t lo = In16(*src_buf + 2, swap);
            if (lo < 0xdc00 || lo > 0xdfff) {  // High surrogate without a low surrogate.
              errno = EILSEQ;
              return false;
            }
            wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
            src_bytes_used = 4;
          }
          break;
        }

        case UTF_32_BE:
        case UTF_32_LE:
        case WCHAR_T:
          // WCHAR_T is read as 4-byte little-endian, like UTF_32_LE.
          if (*src_bytes_left < 4) {
            errno = EINVAL;
            return false;
          }
          wc = In32(*src_buf, (src_encoding == UTF_32_BE));
          break;
      }

      if (errno != EILSEQ) return true;

      switch (mode) {
        case ERROR:
          return false;
        case TRANSLIT:
          wc = '?';
          ++replacement_count;
          return true;
        case IGNORE:
          *src_buf += src_bytes_used;
          *src_bytes_left -= src_bytes_used;
          ignored = true;
          // Nothing left to decode; the caller checks for this.
          if (*src_bytes_left == 0) return true;
          break;  // Leave the switch and decode the next character.
        default:
          return true;
      }
    }
  }

  // Encodes `wc` in the destination encoding into `buf` and emits it.
  //
  // If `wc` cannot be represented: ERROR mode fails with EILSEQ, IGNORE mode
  // drops the character (consuming its source bytes), TRANSLIT mode emits '?'
  // instead. Returns false with errno set on failure.
  bool Convert() {
    errno = 0;
    switch (dst_encoding) {
      case US_ASCII:
        buf[0] = wc;
        dst_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        dst_bytes_used = c32rtomb(buf, wc, &ps);
        if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          break;  // EILSEQ already set.
        } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        bool swap = (dst_encoding == UTF_16_BE);
        if (wc < 0x10000) {  // BMP.
          Out16(buf, wc, swap);
        } else {  // Supplementary plane; output surrogate pair.
          wc -= 0x10000;
          char16_t hi = 0xd800 | (wc >> 10);
          char16_t lo = 0xdc00 | (wc & 0x3ff);
          Out16(buf + 0, hi, swap);
          Out16(buf + 2, lo, swap);
          dst_bytes_used = 4;  // Out16() set 2; a pair is 4 bytes.
        }
      } break;

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        Out32(wc, (dst_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      if (mode == IGNORE) {
        *src_buf += src_bytes_used;
        *src_bytes_left -= src_bytes_used;
        ignored = true;
        return true;
      } else if (mode == TRANSLIT) {
        wc = '?';
        ++replacement_count;
        return Convert();
      }
      return false;
    }

    return Emit();
  }

  // Reads a 16-bit little-endian value from `buf`, byte-swapping it if `swap`
  // is set, and records that 2 source bytes were used.
  uint16_t In16(const char* buf, bool swap) {
#ifdef __cplusplus
    const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
#else // !__cplusplus
    const uint8_t* src = (const uint8_t*)(buf);
#endif // __cplusplus
    uint16_t value = (src[0]) | (src[1] << 8);
    if (swap) value = __swap16(value);
    src_bytes_used = 2;
    return value;
  }

  // Reads a 32-bit little-endian value from `buf`, byte-swapping it if `swap`
  // is set, and records that 4 source bytes were used.
  uint32_t In32(const char* buf, bool swap) {
#ifdef __cplusplus
    const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
#else // !__cplusplus
    const uint8_t* src = (const uint8_t*)(buf);
#endif // __cplusplus
    // Widen each byte before shifting: a uint8_t promotes to int, and shifting
    // a value >= 0x80 left by 24 would overflow a signed int.
    uint32_t value = (uint32_t)src[0] |
                     ((uint32_t)src[1] << 8) |
                     ((uint32_t)src[2] << 16) |
                     ((uint32_t)src[3] << 24);
    if (swap) value = __swap32(value);
    src_bytes_used = 4;
    return value;
  }

  // Writes `ch` to `dst` as 2 little-endian bytes (after byte-swapping it if
  // `swap` is set) and records that 2 destination bytes are pending.
  void Out16(char* dst, char16_t ch, bool swap) {
    if (swap) ch = __swap16(ch);
    dst[0] = ch;
    dst[1] = ch >> 8;
    dst_bytes_used = 2;
  }

  // Writes `ch` to `buf` as 4 little-endian bytes (after byte-swapping it if
  // `swap` is set) and records that 4 destination bytes are pending.
  void Out32(char32_t ch, bool swap) {
    if (swap) ch = __swap32(ch);
    buf[0] = ch;
    buf[1] = ch >> 8;
    buf[2] = ch >> 16;
    buf[3] = ch >> 24;
    dst_bytes_used = 4;
  }

  // Copies the pending bytes in `buf` to the destination and advances both the
  // source and destination cursors. Fails with E2BIG, leaving both cursors
  // untouched, if the destination has too little room.
  bool Emit() {
    if (dst_bytes_used > *dst_bytes_left) {
      errno = E2BIG;
      return false;
    }
    memcpy(*dst_buf, buf, dst_bytes_used);
    *src_buf += src_bytes_used;
    *src_bytes_left -= src_bytes_used;
    *dst_buf += dst_bytes_used;
    *dst_bytes_left -= dst_bytes_used;
    return true;
  }

  // Computes the result once all input has been processed: the number of '?'
  // substitutions in TRANSLIT mode; -1 with errno == EILSEQ if anything was
  // skipped in IGNORE mode; otherwise 0.
  int Done() {
    if (mode == TRANSLIT) return replacement_count;
    if (ignored) {
      errno = EILSEQ;
      return -1;
    }
    return 0;
  }
};

// Creates a converter from `__src_encoding` to `__dst_encoding`.
//
// Only the destination name may carry a "//IGNORE" or "//TRANSLIT" suffix.
// Returns INVALID_ICONV_T with errno set to EINVAL if either name is not
// recognised, or ENOMEM if the descriptor cannot be allocated.
// The result must be released with iconv_close().
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
  // calloc() pairs with the free() below and in iconv_close().
  iconv_t result = static_cast<iconv_t>(calloc(1, sizeof(__iconv_t)));
  if (result == NULL) {
    errno = ENOMEM;
    return INVALID_ICONV_T;
  }
  result->mode = ERROR;
  if (!__parse_encoding(__src_encoding, &result->src_encoding, 0 /* nullptr */) ||
      !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
    free(result);
    errno = EINVAL;
    return INVALID_ICONV_T;
  }
  return result;
}

// Converts text from *__src_buf to *__dst_buf, advancing both cursors and
// decreasing both byte counts as it goes.
//
// Returns (size_t)-1 with errno set on failure (EBADF for an invalid
// converter; otherwise whatever the conversion reported).
size_t iconv(iconv_t __converter,
             char** __src_buf, size_t* __src_bytes_left,
             char** __dst_buf, size_t* __dst_bytes_left) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }

  // A null source is the POSIX request to reset the conversion state. All
  // per-call state is reinitialised at the start of every conversion, so
  // there is nothing to flush.
  if (__src_buf == NULL || *__src_buf == NULL) return 0;

  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
}

// Releases a converter created by iconv_open().
// Returns 0 on success, or -1 with errno == EBADF for an invalid converter.
int iconv_close(iconv_t __converter) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }
  free(__converter);
  return 0;
}

__END_DECLS

#endif // __ANDROID__