Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/native/libtinyiconv/iconv.cpp
41149 views
1
/*
2
* Copyright (C) 2017 The Android Open Source Project
3
* All rights reserved.
4
*
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
7
* are met:
8
* * Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* * Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions and the following disclaimer in
12
* the documentation and/or other materials provided with the
13
* distribution.
14
*
15
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
* SUCH DAMAGE.
27
*/
28
29
#ifdef __ANDROID__
30
31
#include <ctype.h>
32
#include <endian.h>
33
#include <errno.h>
34
#include <iconv.h>
35
#include <stdbool.h>
36
#include <stdint.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <uchar.h>
40
#include <wchar.h>
41
42
__BEGIN_DECLS
43
44
/*
45
* These return values are specified by POSIX for multibyte conversion
46
* functions.
47
*/
48
49
// Error results of the multibyte conversion functions (mbrtoc32(), c32rtomb(), ...),
// expressed as size_t values:
//   __MB_ERR_ILLEGAL_SEQUENCE     (size_t)-1: the input is not a valid sequence.
//   __MB_ERR_INCOMPLETE_SEQUENCE  (size_t)-2: the input ends in the middle of a sequence.
#ifdef __cplusplus
#define __MB_ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
#define __MB_ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
#else
#define __MB_ERR_ILLEGAL_SEQUENCE ((size_t)(-1))
#define __MB_ERR_INCOMPLETE_SEQUENCE ((size_t)(-2))
#endif // __cplusplus

// True when `rv` is one of the two error results above. The argument is parenthesized so
// that any expression can be passed; note that it is still evaluated twice.
#define __MB_IS_ERR(rv) ((rv) == __MB_ERR_ILLEGAL_SEQUENCE || \
                         (rv) == __MB_ERR_INCOMPLETE_SEQUENCE)
58
static inline __wur size_t mbstate_bytes_so_far(const mbstate_t* ps) {
59
return
60
(ps->__seq[2] != 0) ? 3 :
61
(ps->__seq[1] != 0) ? 2 :
62
(ps->__seq[0] != 0) ? 1 : 0;
63
}
64
static inline void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
65
ps->__seq[i] = (uint8_t)(byte);
66
}
67
static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
68
return ps->__seq[n];
69
}
70
static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
71
errno = _errno;
72
#ifdef __cplusplus
73
*(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
74
#else
75
*(uint32_t*)(ps->__seq) = 0;
76
#endif // __cplusplus
77
return __MB_ERR_ILLEGAL_SEQUENCE;
78
}
79
static inline __wur size_t mbstate_reset_and_return(int _return, mbstate_t* ps) {
80
#ifdef __cplusplus
81
*(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
82
#else
83
*(uint32_t*)(ps->__seq) = 0;
84
#endif // __cplusplus
85
return _return;
86
}
87
88
// Sentinel handle: the value -1 converted to iconv_t. iconv_open() returns it on failure,
// and iconv() / iconv_close() reject it with EBADF.
#if defined(__cplusplus)
# define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
#else
# define INVALID_ICONV_T ((iconv_t)(-1))
#endif // __cplusplus
93
94
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
95
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
96
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
97
98
// Encodings the converter understands. A single tagged typedef is valid in both C and C++,
// so no per-language spelling is needed.
typedef enum Encoding {
  US_ASCII,   // 7-bit ASCII; anything above 0x7f is an illegal sequence.
  UTF_8,
  UTF_16_LE,  // UTF-16, little-endian.
  UTF_16_BE,  // UTF-16, big-endian.
  UTF_32_LE,  // UTF-32, little-endian.
  UTF_32_BE,  // UTF-32, big-endian.
  WCHAR_T,    // Read and written through the same 4-byte path as UTF_32_LE.
} Encoding;
116
117
// What to do with a character that cannot be decoded from the source or represented in the
// destination. A single tagged typedef is valid in both C and C++.
typedef enum Mode {
  ERROR,     // Stop the conversion and report the failure (no "//" suffix).
  IGNORE,    // "//IGNORE": skip the offending input.
  TRANSLIT,  // "//TRANSLIT": substitute '?' and count the replacement.
} Mode;
131
132
// This matching is strange but true.
133
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
134
// Returns true when the user-supplied encoding name `lhs` is an alias of `rhs`.
//
// `rhs` must already be in normalized form (lowercase letters and digits only, e.g. "utf8").
// `lhs` is normalized on the fly, following the UTR #22 alias-matching rules:
//   * characters other than letters and digits are skipped, so "UTF-8", "UTF_8", "UTF8"
//     and "UTF 8" are all equivalent;
//   * letter case is ignored;
//   * a '0' that is not preceded by a digit and is followed by a digit (a leading zero,
//     as in "UTF-08") is dropped.
// A GNU-style "//" suffix ("//IGNORE", "//TRANSLIT") on `lhs` is treated as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the previous character kept from lhs was a digit. Without this, a zero in the
  // middle of a number would be dropped and e.g. "UTF-106LE" would match "utf16le".
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Advance lhs to the next character that takes part in the comparison.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions require an unsigned char value (or EOF); plain char may be
      // negative for bytes >= 0x80.
      const unsigned char c = *lhs;
      const unsigned char next = lhs[1];  // At worst the terminator, since *lhs != '\0'.
      if (!isalnum(c)) {
        after_digit = false;
        continue;
      }
      if (c == '0' && !after_digit && isdigit(next)) continue;  // Leading zero.
      break;
    }
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    // Case doesn't matter either. (If lhs ran out above, l is '\0' and this fails.)
    if (tolower(l) != tolower(r)) break;
    after_digit = isdigit(l) != 0;
    ++lhs;
    ++rhs;
  }
  // Both names must be exhausted; as a special case the GNU "//" extensions count as the
  // end of lhs.
  if ((*lhs == '\0' || strncmp(lhs, "//", 2) == 0) && *rhs == '\0') return true;
  return false;
}
150
151
static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
152
const char* suffix = strstr(s, "//");
153
if (suffix) {
154
if (!mode) return false;
155
if (strcmp(suffix, "//IGNORE") == 0) {
156
*mode = IGNORE;
157
} else if (strcmp(suffix, "//TRANSLIT") == 0) {
158
*mode = TRANSLIT;
159
} else {
160
return false;
161
}
162
}
163
if (__match_encoding(s, "utf8")) {
164
*encoding = UTF_8;
165
} else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
166
*encoding = US_ASCII;
167
} else if (__match_encoding(s, "utf16le")) {
168
*encoding = UTF_16_LE;
169
} else if (__match_encoding(s, "utf16be")) {
170
*encoding = UTF_16_BE;
171
} else if (__match_encoding(s, "utf32le")) {
172
*encoding = UTF_32_LE;
173
} else if (__match_encoding(s, "utf32be")) {
174
*encoding = UTF_32_BE;
175
} else if (__match_encoding(s, "wchart")) {
176
*encoding = WCHAR_T;
177
} else {
178
return false;
179
}
180
return true;
181
}
182
183
struct __iconv_t {
184
Encoding src_encoding;
185
Encoding dst_encoding;
186
Mode mode;
187
/*
188
__iconv_t() : mode(ERROR) {
189
}
190
*/
191
int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
192
// Reset state.
193
wc = 0;
194
memset(&ps, 0, sizeof(ps));
195
replacement_count = 0;
196
ignored = false;
197
src_buf = src_buf0;
198
src_bytes_left = src_bytes_left0;
199
dst_buf = dst_buf0;
200
dst_bytes_left = dst_bytes_left0;
201
while (*src_bytes_left > 0) {
202
if (!GetNext() || !Convert()) return -1;
203
}
204
return Done();
205
}
206
private:
207
char32_t wc;
208
char buf[16];
209
size_t src_bytes_used;
210
size_t dst_bytes_used;
211
mbstate_t ps;
212
size_t replacement_count;
213
bool ignored;
214
char** src_buf;
215
size_t* src_bytes_left;
216
char** dst_buf;
217
size_t* dst_bytes_left;
218
bool GetNext() {
219
errno = 0;
220
switch (src_encoding) {
221
case US_ASCII:
222
wc = **src_buf;
223
src_bytes_used = 1;
224
if (wc > 0x7f) errno = EILSEQ;
225
break;
226
case UTF_8:
227
src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
228
if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
229
break; // EILSEQ already set.
230
} else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
231
errno = EINVAL;
232
return false;
233
}
234
break;
235
case UTF_16_BE:
236
case UTF_16_LE: {
237
if (*src_bytes_left < 2) {
238
errno = EINVAL;
239
return false;
240
}
241
bool swap = (src_encoding == UTF_16_BE);
242
wc = In16(*src_buf, swap);
243
// 0xd800-0xdbff: high surrogates
244
// 0xdc00-0xdfff: low surrogates
245
if (wc >= 0xd800 && wc <= 0xdfff) {
246
if (wc >= 0xdc00) { // Low surrogate before high surrogate.
247
errno = EILSEQ;
248
return false;
249
}
250
if (*src_bytes_left < 4) {
251
errno = EINVAL;
252
return false;
253
}
254
uint16_t hi = wc;
255
uint16_t lo = In16(*src_buf + 2, swap);
256
wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
257
src_bytes_used = 4;
258
}
259
break;
260
}
261
case UTF_32_BE:
262
case UTF_32_LE:
263
case WCHAR_T:
264
if (*src_bytes_left < 4) {
265
errno = EINVAL;
266
return false;
267
}
268
wc = In32(*src_buf, (src_encoding == UTF_32_BE));
269
break;
270
}
271
if (errno == EILSEQ) {
272
switch (mode) {
273
case ERROR:
274
return false;
275
case IGNORE:
276
*src_buf += src_bytes_used;
277
*src_bytes_left -= src_bytes_used;
278
ignored = true;
279
return GetNext();
280
case TRANSLIT:
281
wc = '?';
282
++replacement_count;
283
return true;
284
}
285
}
286
return true;
287
}
288
289
bool Convert() {
290
errno = 0;
291
switch (dst_encoding) {
292
case US_ASCII:
293
buf[0] = wc;
294
dst_bytes_used = 1;
295
if (wc > 0x7f) errno = EILSEQ;
296
break;
297
case UTF_8:
298
dst_bytes_used = c32rtomb(buf, wc, &ps);
299
if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
300
break; // EILSEQ already set.
301
} else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
302
errno = EINVAL;
303
return false;
304
}
305
break;
306
case UTF_16_BE:
307
case UTF_16_LE: {
308
bool swap = (dst_encoding == UTF_16_BE);
309
if (wc < 0x10000) { // BMP.
310
Out16(buf, wc, swap);
311
} else { // Supplementary plane; output surrogate pair.
312
wc -= 0x10000;
313
char16_t hi = 0xd800 | (wc >> 10);
314
char16_t lo = 0xdc00 | (wc & 0x3ff);
315
Out16(buf + 0, hi, swap);
316
Out16(buf + 2, lo, swap);
317
dst_bytes_used = 4;
318
}
319
} break;
320
case UTF_32_BE:
321
case UTF_32_LE:
322
case WCHAR_T:
323
Out32(wc, (dst_encoding == UTF_32_BE));
324
break;
325
}
326
if (errno == EILSEQ) {
327
if (mode == IGNORE) {
328
*src_buf += src_bytes_used;
329
*src_bytes_left -= src_bytes_used;
330
ignored = true;
331
return true;
332
} else if (mode == TRANSLIT) {
333
wc = '?';
334
++replacement_count;
335
return Convert();
336
}
337
return false;
338
}
339
return Emit();
340
}
341
342
uint16_t In16(const char* buf, bool swap) {
343
#ifdef __cplusplus
344
const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
345
#else // !__cplusplus
346
const uint8_t* src = (const uint8_t*)(buf);
347
#endif // __cplusplus
348
uint16_t wc = (src[0]) | (src[1] << 8);
349
if (swap) wc = __swap16(wc);
350
src_bytes_used = 2;
351
return wc;
352
}
353
354
uint32_t In32(const char* buf, bool swap) {
355
#ifdef __cplusplus
356
const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
357
#else // !__cplusplus
358
const uint8_t* src = (const uint8_t*)(buf);
359
#endif // __cplusplus
360
uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
361
if (swap) wc = __swap32(wc);
362
src_bytes_used = 4;
363
return wc;
364
}
365
366
void Out16(char* dst, char16_t ch, bool swap) {
367
if (swap) ch = __swap16(ch);
368
dst[0] = ch;
369
dst[1] = ch >> 8;
370
dst_bytes_used = 2;
371
}
372
373
void Out32(char32_t ch, bool swap) {
374
if (swap) ch = __swap32(ch);
375
buf[0] = ch;
376
buf[1] = ch >> 8;
377
buf[2] = ch >> 16;
378
buf[3] = ch >> 24;
379
dst_bytes_used = 4;
380
}
381
382
bool Emit() {
383
if (dst_bytes_used > *dst_bytes_left) {
384
errno = E2BIG;
385
return false;
386
}
387
memcpy(*dst_buf, buf, dst_bytes_used);
388
*src_buf += src_bytes_used;
389
*src_bytes_left -= src_bytes_used;
390
*dst_buf += dst_bytes_used;
391
*dst_bytes_left -= dst_bytes_used;
392
return true;
393
}
394
395
int Done() {
396
if (mode == TRANSLIT) return replacement_count;
397
if (ignored) {
398
errno = EILSEQ;
399
return -1;
400
}
401
return 0;
402
}
403
};
404
405
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
406
iconv_t result = iconv_t();
407
result->mode = ERROR;
408
if (!__parse_encoding(__src_encoding, &result->src_encoding, 0 /* nullptr */) ||
409
!__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
410
free(result);
411
errno = EINVAL;
412
return INVALID_ICONV_T;
413
}
414
return result;
415
}
416
417
size_t iconv(iconv_t __converter,
418
char** __src_buf, size_t* __src_bytes_left,
419
char** __dst_buf, size_t* __dst_bytes_left) {
420
if (__converter == INVALID_ICONV_T) {
421
errno = EBADF;
422
return -1;
423
}
424
return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
425
}
426
427
int iconv_close(iconv_t __converter) {
428
if (__converter == INVALID_ICONV_T) {
429
errno = EBADF;
430
return -1;
431
}
432
free(__converter);
433
return 0;
434
}
435
436
__END_DECLS
437
438
#endif // __ANDROID__
439
440