CoCalc -- StringPrep.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/text/StringPrep.java
⁴¹¹⁶¹ views
1
/*
2
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25
/*
26
/*
27
 *******************************************************************************
28
 * Copyright (C) 2003-2004, International Business Machines Corporation and         *
29
 * others. All Rights Reserved.                                                *
30
 *******************************************************************************
31
 */
32
//
33
// CHANGELOG
34
//      2005-05-19 Edward Wang
35
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
36
//          - move from package com.ibm.icu.text to package sun.net.idn
37
//          - use ParseException instead of StringPrepParseException
38
//          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
39
//          - remove all @deprecated tag to make compiler happy
40
//      2007-08-14 Martin Buchholz
41
//          - remove redundant casts
42
//
43
package jdk.internal.icu.text;
44

45
import java.io.BufferedInputStream;
46
import java.io.ByteArrayInputStream;
47
import java.io.IOException;
48
import java.io.InputStream;
49
import java.text.ParseException;
50

51
import sun.text.Normalizer;
52
import jdk.internal.icu.impl.CharTrie;
53
import jdk.internal.icu.impl.StringPrepDataReader;
54
import jdk.internal.icu.impl.Trie;
55
import jdk.internal.icu.lang.UCharacter;
56
import jdk.internal.icu.lang.UCharacterDirection;
57
import jdk.internal.icu.util.VersionInfo;
58

59
/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
 *
 * The procedure for preparing Unicode strings:
 * <ol>
 *      <li> Map: For each character in the input, check if it has a mapping
 *           and, if so, replace it with its mapping. </li>
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *           normalization. </li>
 *      <li> Prohibit: Check for any characters that are not allowed in the
 *           output.  If any are found, return an error.</li>
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
 *           any are found, make sure that the whole string satisfies the
 *           requirements for bidirectional strings.  If the string does not
 *           satisfy the requirements for bidirectional strings, return an
 *           error.  </li>
 * </ol>
 * @author Ram Viswanadha
 * @draft ICU 2.8
 */
92
public final class StringPrep {
93
    /**
94
     * Option to prohibit processing of unassigned code points in the input
95
     *
96
     * @see   #prepare
97
     * @draft ICU 2.8
98
     */
99
    public static final int DEFAULT = 0x0000;
100

101
    /**
102
     * Option to allow processing of unassigned code points in the input
103
     *
104
     * @see   #prepare
105
     * @draft ICU 2.8
106
     */
107
    public static final int ALLOW_UNASSIGNED = 0x0001;
108

109
    private static final int UNASSIGNED        = 0x0000;
110
    private static final int MAP               = 0x0001;
111
    private static final int PROHIBITED        = 0x0002;
112
    private static final int DELETE            = 0x0003;
113
    private static final int TYPE_LIMIT        = 0x0004;
114

115
    private static final int NORMALIZATION_ON  = 0x0001;
116
    private static final int CHECK_BIDI_ON     = 0x0002;
117

118
    private static final int TYPE_THRESHOLD       = 0xFFF0;
119
    private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
120
    private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
121

122
    /* indexes[] value names */
123
    private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
124
    private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
125
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
126
    private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
127
    private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
128
    private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
129
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
130
    private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
131
    private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
132

133

134
    /**
135
     * Default buffer size of datafile
136
     */
137
    private static final int DATA_BUFFER_SIZE = 25000;
138

139
    /* Wrappers for Trie implementations */
140
    private static final class StringPrepTrieImpl implements Trie.DataManipulate{
141
        private CharTrie sprepTrie = null;
142
       /**
143
        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
144
        * data the index array offset of the indexes for that lead surrogate.
145
        * @param property data value for a surrogate from the trie, including
146
        *        the folding offset
147
        * @return data offset or 0 if there is no data for the lead surrogate
148
        */
149
         public int getFoldingOffset(int value){
150
            return value;
151
        }
152
    }
153

154
    // CharTrie implementation for reading the trie data
155
    private StringPrepTrieImpl sprepTrieImpl;
156
    // Indexes read from the data file
157
    private int[] indexes;
158
    // mapping data read from the data file
159
    private char[] mappingData;
160
    // format version of the data file
161
    private byte[] formatVersion;
162
    // the version of Unicode supported by the data file
163
    private VersionInfo sprepUniVer;
164
    // the Unicode version of last entry in the
165
    // NormalizationCorrections.txt file if normalization
166
    // is turned on
167
    private VersionInfo normCorrVer;
168
    // Option to turn on Normalization
169
    private boolean doNFKC;
170
    // Option to turn on checking for BiDi rules
171
    private boolean checkBiDi;
172

173

174
    private char getCodePointValue(int ch){
175
        return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
176
    }
177

178
    private static VersionInfo getVersionInfo(int comp){
179
        int micro = comp & 0xFF;
180
        int milli =(comp >> 8)  & 0xFF;
181
        int minor =(comp >> 16) & 0xFF;
182
        int major =(comp >> 24) & 0xFF;
183
        return VersionInfo.getInstance(major,minor,milli,micro);
184
    }
185
    private static VersionInfo getVersionInfo(byte[] version){
186
        if(version.length != 4){
187
            return null;
188
        }
189
        return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
190
    }
191
    /**
192
     * Creates an StringPrep object after reading the input stream.
193
     * The object does not hold a reference to the input steam, so the stream can be
194
     * closed after the method returns.
195
     *
196
     * @param inputStream The stream for reading the StringPrep profile binarySun
197
     * @throws IOException
198
     * @draft ICU 2.8
199
     */
200
    public StringPrep(InputStream inputStream) throws IOException{
201

202
        BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
203

204
        StringPrepDataReader reader = new StringPrepDataReader(b);
205

206
        // read the indexes
207
        indexes = reader.readIndexes(INDEX_TOP);
208

209
        byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
210

211

212
        //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
213
        mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
214
        // load the rest of the data and initialize the data members
215
        reader.read(sprepBytes,mappingData);
216

217
        sprepTrieImpl           = new StringPrepTrieImpl();
218
        sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
219

220
        // get the data format version
221
        formatVersion = reader.getDataFormatVersion();
222

223
        // get the options
224
        doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
225
        checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
226
        sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
227
        normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
228
        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
229
        if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
230
           normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
231
           ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
232
           ){
233
            throw new IOException("Normalization Correction version not supported");
234
        }
235
        b.close();
236
    }
237

238
    private static final class Values{
239
        boolean isIndex;
240
        int value;
241
        int type;
242
        public void reset(){
243
            isIndex = false;
244
            value = 0;
245
            type = -1;
246
        }
247
    }
248

249
    private static final void getValues(char trieWord,Values values){
250
        values.reset();
251
        if(trieWord == 0){
252
            /*
253
             * Initial value stored in the mapping table
254
             * just return TYPE_LIMIT .. so that
255
             * the source codepoint is copied to the destination
256
             */
257
            values.type = TYPE_LIMIT;
258
        }else if(trieWord >= TYPE_THRESHOLD){
259
            values.type = (trieWord - TYPE_THRESHOLD);
260
        }else{
261
            /* get the type */
262
            values.type = MAP;
263
            /* ascertain if the value is index or delta */
264
            if((trieWord & 0x02)>0){
265
                values.isIndex = true;
266
                values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
267

268
            }else{
269
                values.isIndex = false;
270
                values.value = (trieWord<<16)>>16;
271
                values.value =  (values.value >> 2);
272

273
            }
274

275
            if((trieWord>>2) == MAX_INDEX_VALUE){
276
                values.type = DELETE;
277
                values.isIndex = false;
278
                values.value = 0;
279
            }
280
        }
281
    }
282

283

284

285
    private StringBuffer map( UCharacterIterator iter, int options)
286
                            throws ParseException {
287

288
        Values val = new Values();
289
        char result = 0;
290
        int ch  = UCharacterIterator.DONE;
291
        StringBuffer dest = new StringBuffer();
292
        boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
293

294
        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
295

296
            result = getCodePointValue(ch);
297
            getValues(result,val);
298

299
            // check if the source codepoint is unassigned
300
            if(val.type == UNASSIGNED && allowUnassigned == false){
301
                 throw new ParseException("An unassigned code point was found in the input " +
302
                                          iter.getText(), iter.getIndex());
303
            }else if((val.type == MAP)){
304
                int index, length;
305

306
                if(val.isIndex){
307
                    index = val.value;
308
                    if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
309
                             index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
310
                        length = 1;
311
                    }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
312
                             index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
313
                        length = 2;
314
                    }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
315
                             index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
316
                        length = 3;
317
                    }else{
318
                        length = mappingData[index++];
319
                    }
320
                    /* copy mapping to destination */
321
                    dest.append(mappingData,index,length);
322
                    continue;
323

324
                }else{
325
                    ch -= val.value;
326
                }
327
            }else if(val.type == DELETE){
328
                // just consume the codepoint and contine
329
                continue;
330
            }
331
            //copy the source into destination
332
            UTF16.append(dest,ch);
333
        }
334

335
        return dest;
336
    }
337

338

339
    private StringBuffer normalize(StringBuffer src){
340
        /*
341
         * Option UNORM_BEFORE_PRI_29:
342
         *
343
         * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
344
         * requires strict adherence to Unicode 3.2 normalization,
345
         * including buggy composition from before fixing Public Review Issue #29.
346
         * Note that this results in some valid but nonsensical text to be
347
         * either corrupted or rejected, depending on the text.
348
         * See http://www.unicode.org/review/resolved-pri.html#pri29
349
         * See unorm.cpp and cnormtst.c
350
         */
351
        return new StringBuffer(
352
            Normalizer.normalize(
353
                src.toString(),
354
                java.text.Normalizer.Form.NFKC,
355
                Normalizer.UNICODE_3_2));
356
    }
357
    /*
358
    boolean isLabelSeparator(int ch){
359
        int result = getCodePointValue(ch);
360
        if( (result & 0x07)  == LABEL_SEPARATOR){
361
            return true;
362
        }
363
        return false;
364
    }
365
    */
366
     /*
367
       1) Map -- For each character in the input, check if it has a mapping
368
          and, if so, replace it with its mapping.
369

370
       2) Normalize -- Possibly normalize the result of step 1 using Unicode
371
          normalization.
372

373
       3) Prohibit -- Check for any characters that are not allowed in the
374
          output.  If any are found, return an error.
375

376
       4) Check bidi -- Possibly check for right-to-left characters, and if
377
          any are found, make sure that the whole string satisfies the
378
          requirements for bidirectional strings.  If the string does not
379
          satisfy the requirements for bidirectional strings, return an
380
          error.
381
          [Unicode3.2] defines several bidirectional categories; each character
382
           has one bidirectional category assigned to it.  For the purposes of
383
           the requirements below, an "RandALCat character" is a character that
384
           has Unicode bidirectional categories "R" or "AL"; an "LCat character"
385
           is a character that has Unicode bidirectional category "L".  Note
386

387

388
           that there are many characters which fall in neither of the above
389
           definitions; Latin digits (<U+0030> through <U+0039>) are examples of
390
           this because they have bidirectional category "EN".
391

392
           In any profile that specifies bidirectional character handling, all
393
           three of the following requirements MUST be met:
394

395
           1) The characters in section 5.8 MUST be prohibited.
396

397
           2) If a string contains any RandALCat character, the string MUST NOT
398
              contain any LCat character.
399

400
           3) If a string contains any RandALCat character, a RandALCat
401
              character MUST be the first character of the string, and a
402
              RandALCat character MUST be the last character of the string.
403
    */
404
    /**
405
     * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
406
     * checks for prohited and BiDi characters in the order defined by RFC 3454
407
     * depending on the options specified in the profile.
408
     *
409
     * @param src           A UCharacterIterator object containing the source string
410
     * @param options       A bit set of options:
411
     *
412
     *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
413
     *
414
     *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
415
     *                                  as normal Unicode code points.
416
     *
417
     * @return StringBuffer A StringBuffer containing the output
418
     * @throws ParseException
419
     * @draft ICU 2.8
420
     */
421
    public StringBuffer prepare(UCharacterIterator src, int options)
422
                        throws ParseException{
423

424
        // map
425
        StringBuffer mapOut = map(src,options);
426
        StringBuffer normOut = mapOut;// initialize
427

428
        if(doNFKC){
429
            // normalize
430
            normOut = normalize(mapOut);
431
        }
432

433
        int ch;
434
        char result;
435
        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
436
        Values val = new Values();
437
        int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
438
            firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
439
        int rtlPos=-1, ltrPos=-1;
440
        boolean rightToLeft=false, leftToRight=false;
441

442
        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
443
            result = getCodePointValue(ch);
444
            getValues(result,val);
445

446
            if(val.type == PROHIBITED ){
447
                throw new ParseException("A prohibited code point was found in the input" +
448
                                         iter.getText(), val.value);
449
            }
450

451
            direction = UCharacter.getDirection(ch);
452
            if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
453
                firstCharDir = direction;
454
            }
455
            if(direction == UCharacterDirection.LEFT_TO_RIGHT){
456
                leftToRight = true;
457
                ltrPos = iter.getIndex()-1;
458
            }
459
            if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
460
                rightToLeft = true;
461
                rtlPos = iter.getIndex()-1;
462
            }
463
        }
464
        if(checkBiDi == true){
465
            // satisfy 2
466
            if( leftToRight == true && rightToLeft == true){
467
                throw new ParseException("The input does not conform to the rules for BiDi code points." +
468
                                         iter.getText(),
469
                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
470
             }
471

472
            //satisfy 3
473
            if( rightToLeft == true &&
474
                !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
475
                (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
476
              ){
477
                throw new ParseException("The input does not conform to the rules for BiDi code points." +
478
                                         iter.getText(),
479
                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
480
            }
481
        }
482
        return normOut;
483

484
      }
485
}
486

487
Product

Resources

Company