Path: blob/master/src/java.instrument/share/native/libinstrument/EncodingSupport.c
41149 views
/*
 * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <limits.h>

/*
 * Returns non-zero when each of the `count` bytes starting at `bytes`
 * has the UTF-8 continuation form 10xxxxxx. A count of 0 is trivially
 * true and reads nothing.
 */
static int
allContinuationBytes(const char *bytes, int count) {
    int k;

    for ( k = 0 ; k < count ; k++ ) {
        if ( ((unsigned char)bytes[k] & 0xC0) != 0x80 ) {
            return 0;
        }
    }
    return 1;
}

/**
 * Determine length of this Standard UTF-8 in Modified UTF-8.
 *    Validation is done of the basic UTF encoding rules, returns
 *    length (no change) when errors are detected in the UTF encoding.
 *
 *    The same "return length" error path is taken when length is
 *    negative or when the Modified UTF-8 length would not fit in an int.
 *
 *    Note: Accepts Modified UTF-8 also, no verification on the
 *          correctness of Standard UTF-8 is done. e.g. 0xC080 input is ok.
 *
 * @param string  the bytes to measure (need not be NUL terminated)
 * @param length  number of bytes in string
 * @return number of bytes the Modified UTF-8 form needs (terminator not
 *         counted), or length itself when an error was detected
 */
int
modifiedUtf8LengthOfUtf8(char* string, int length) {
    int new_length;
    int i;

    new_length = 0;
    for ( i = 0 ; i < length ; i++ ) {
        unsigned byte;
        int      trailing; /* continuation bytes that must follow */
        int      encoded;  /* bytes this character takes in Modified UTF-8 */

        byte = (unsigned char)string[i];
        if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
            trailing = 0;
            /* We gain one byte in length on NULL bytes */
            encoded  = ( byte == 0 ) ? 2 : 1;
        } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
            trailing = 1;
            encoded  = 2;
        } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
            trailing = 2;
            encoded  = 3;
        } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
            trailing = 3;
            encoded  = 6; /* 4byte encoding turns into 2 3byte ones */
        } else {
            break; /* Error condition */
        }

        /*
         * Check encoding of following bytes. "trailing >= length - i" is
         * "(i + trailing) >= length" written so that it cannot overflow.
         */
        if ( trailing >= length - i
                || !allContinuationBytes(string + i + 1, trailing) ) {
            break; /* Error condition */
        }

        /*
         * Refuse to overflow new_length (signed overflow is undefined);
         * a string that big is handled as an error after the cycle.
         */
        if ( encoded > INT_MAX - new_length ) {
            break; /* Error condition */
        }

        i += trailing; /* Skip the continuation bytes */
        new_length += encoded;
    }
    if ( i != length ) {
        /* Error in finding new length, return old length so no conversion */
        /* FIXUP: ERROR_MESSAGE? */
        return length;
    }
    return new_length;
}

/*
 * Convert Standard UTF-8 to Modified UTF-8.
 *    Assumes the UTF-8 encoding was validated by
 *    modifiedUtf8LengthOfUtf8() above.
 *
 *    new_string must have room for new_length bytes plus the terminating
 *    0 byte that is always written. Conversion stops early (the output is
 *    still terminated) if the next character would not fit in new_length
 *    bytes, or if a multi-byte sequence is cut off by the end of the input.
 *    Neither happens when new_length came from modifiedUtf8LengthOfUtf8()
 *    on valid input.
 *
 *    Note: Accepts Modified UTF-8 also, no verification on the
 *          correctness of Standard UTF-8 is done. e.g. 0xC080 input is ok.
 *
 * @param string      the Standard UTF-8 bytes to convert
 * @param length      number of bytes in string
 * @param new_string  output buffer, at least new_length + 1 bytes
 * @param new_length  output capacity in bytes, terminator not counted
 */
void
convertUtf8ToModifiedUtf8(char *string, int length, char *new_string, int new_length)
{
    int i;
    int j;

    j = 0;
    for ( i = 0 ; i < length ; i++ ) {
        unsigned byte1;
        int      in_left;  /* input bytes available from i (always >= 1) */
        int      out_left; /* output bytes still available before new_length */

        byte1    = (unsigned char)string[i];
        in_left  = length - i;
        out_left = new_length - j;

        /* NULL bytes and bytes starting with 11110xxx are special */
        if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
            if ( byte1 == 0 ) {
                if ( out_left < 2 ) {
                    break;
                }
                /* Bits out: 11000000 10000000 */
                new_string[j++] = (char)0xC0;
                new_string[j++] = (char)0x80;
            } else {
                if ( out_left < 1 ) {
                    break;
                }
                /* Single byte */
                new_string[j++] = (char)byte1;
            }
        } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
            if ( in_left < 2 || out_left < 2 ) {
                break;
            }
            new_string[j++] = (char)byte1;
            new_string[j++] = string[++i];
        } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
            if ( in_left < 3 || out_left < 3 ) {
                break;
            }
            new_string[j++] = (char)byte1;
            new_string[j++] = string[++i];
            new_string[j++] = string[++i];
        } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
            /* Beginning of 4byte encoding, turn into 2 3byte encodings */
            unsigned byte2, byte3, byte4, u21;

            if ( in_left < 4 || out_left < 6 ) {
                break;
            }
            /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            byte2 = (unsigned char)string[++i];
            byte3 = (unsigned char)string[++i];
            byte4 = (unsigned char)string[++i];
            /* Reconstruct full 21bit value */
            u21  = (byte1 & 0x07) << 18;
            u21 += (byte2 & 0x3F) << 12;
            u21 += (byte3 & 0x3F) << 6;
            u21 += (byte4 & 0x3F);
            /* Bits out: 11101101 1010xxxx 10xxxxxx */
            new_string[j++] = (char)0xED;
            new_string[j++] = (char)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
            new_string[j++] = (char)(0x80 + ((u21 >> 10) & 0x3F));
            /* Bits out: 11101101 1011xxxx 10xxxxxx */
            new_string[j++] = (char)0xED;
            new_string[j++] = (char)(0xB0 + ((u21 >> 6) & 0x0F));
            new_string[j++] = (char)byte4;
        }
        /* Any other lead byte is not valid UTF-8 and produces no output. */
    }
    new_string[j] = 0;
}