CoCalc -- Punycode.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/Punycode.java
⁴¹¹⁶¹ views
1
/*
2
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25
/*
26
 *******************************************************************************
27
 * Copyright (C) 2003-2004, International Business Machines Corporation and    *
28
 * others. All Rights Reserved.                                                *
29
 *******************************************************************************
30
 */
31
//
32
// CHANGELOG
33
//      2005-05-19 Edward Wang
34
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
35
//          - move from package com.ibm.icu.text to package sun.net.idn
36
//          - use ParseException instead of StringPrepParseException
37
//      2007-08-14 Martin Buchholz
38
//          - remove redundant casts
39
//
40
package jdk.internal.icu.impl;
41

42
import java.text.ParseException;
43
import jdk.internal.icu.lang.UCharacter;
44
import jdk.internal.icu.text.UTF16;
45

46
/**
 * Punycode (RFC 3492) encoder and decoder, ported from ICU's punycode.c.
 *
 * <p>Punycode is the Bootstring instance used by IDNA to represent Unicode
 * labels with ASCII letters, digits and hyphens only. Both entry points are
 * static and keep no state between calls.
 *
 * <p>The class is declared public, but it lives in an internal package and is
 * only meant for use by the IDN implementation.
 *
 * @author ram
 */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE           = 36;
    private static final int TMIN           = 1;
    private static final int TMAX           = 26;
    private static final int SKEW           = 38;
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN         = 0x2d;
    private static final int DELIMITER      = HYPHEN;

    private static final int ZERO           = 0x30;
    private static final int NINE           = 0x39;

    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;

    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

    //  TODO: eliminate the 256 limitation (it now applies to encode() input only)
    private static final int MAX_CP_COUNT   = 256;

    /* encode() keeps the caller's case flag in the sign bit of each buffered code point */
    private static final int CASE_FLAG_BIT   = 0x80000000;
    private static final int CODE_POINT_MASK = 0x7fffffff;

    /**
     * Bias adaptation function of Bootstring (RFC 3492, section 6.1).
     *
     * @param delta     the delta that has just been encoded or decoded
     * @param length    number of code points handled so far, including the
     *                  current one; callers always pass a value >= 1
     * @param firstTime true for the first delta of a string, which is damped
     *                  by DAMP instead of being halved
     * @return the bias to use for the next delta
     */
    private static int adaptBias(int delta, int length, boolean firstTime){
        if(firstTime){
            delta /=DAMP;
        }else{
            delta /=  2;
        }
        delta += delta/length;

        int count=0;
        for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
            delta/=(BASE-TMIN);
        }

        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    }

    /**
     * Returns the threshold t for digit position k under the given bias:
     * k-bias clamped to the range TMIN..TMAX.
     */
    private static int threshold(int k, int bias) {
        int t=k-bias;
        if(t<TMIN) {
            return TMIN;
        }
        if(k>=(bias+TMAX)) {
            return TMAX;
        }
        return t;
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code
     * point (for use in representing integers) in the range 0 to
     * BASE-1, or -1 if the code point does not represent a value.
     * Only indexes below 0x80 are ever looked up by this class.
     */
    static final int[]    basicToDigit= new int[]{
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    /**
     * Forces an ASCII letter to upper or lower case; every other character
     * is returned unchanged.
     */
    private static char asciiCaseMap(char b, boolean uppercase) {
        if(uppercase) {
            if(SMALL_A<=b && b<=SMALL_Z) {
                b-=(SMALL_A-CAPITAL_A);
            }
        } else {
            if(CAPITAL_A<=b && b<=CAPITAL_Z) {
                b+=(SMALL_A-CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value
     * (when used for representing integers) is digit, which must be in the
     * range 0 to BASE-1. The lowercase form is used unless the uppercase flag
     * is set, in which case the uppercase form is used.
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /*  0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9         */
        if(digit<26) {
            if(uppercase) {
                return (char)(CAPITAL_A+digit);
            } else {
                return (char)(SMALL_A+digit);
            }
        } else {
            return (char)((ZERO-26)+digit);
        }
    }

    /**
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output consists of ASCII characters only and is never truncated:
     * it grows as needed, independently of the input limit.
     *
     * @param src       the UTF-16 text to encode; at most 256 code points
     * @param caseFlags optional; when non-null, caseFlags[j] is read for every
     *                  index j of src except the trail half of a surrogate
     *                  pair. It selects upper case for the basic character at
     *                  j, or for the final digit that encodes the non-basic
     *                  code point starting at j.
     * @return a new buffer holding the Punycode form of src
     * @throws ParseException if src has more than 256 code points or contains
     *         an unpaired surrogate
     * @throws RuntimeException if a delta would overflow an int
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
        int srcLength = src.length();
        int[] cpBuffer = new int[MAX_CP_COUNT];
        StringBuffer dest = new StringBuffer(srcLength);

        /*
         * Handle the basic code points and
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        int srcCPCount=0;
        for(int j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                throw new ParseException("Too many input code points", -1);
            }
            char c=src.charAt(j);
            if(isBasic(c)) {
                /* basic code points are copied now; 0 marks them as "below n" later */
                cpBuffer[srcCPCount++]=0;
                dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
            } else {
                int cp=(caseFlags!=null && caseFlags[j]) ? CASE_FLAG_BIT : 0;
                if(!Character.isSurrogate(c)) {
                    cp|=c;
                } else if(Character.isHighSurrogate(c) && (j+1)<srcLength
                          && Character.isLowSurrogate(src.charAt(j+1))) {
                    cp|=Character.toCodePoint(c, src.charAt(++j));
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++]=cp;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        int basicLength=dest.length();
        if(basicLength>0) {
            dest.append((char)DELIMITER);
        }

        /*
         * handledCPCount is the number of code points that have been handled
         * basicLength is the number of basic code points
         */

        /* Initialize the state: */
        int n=INITIAL_N;
        int delta=0;
        int bias=INITIAL_BIAS;

        /* Main encoding loop: */
        for(int handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already.
             * Find the next larger one:
             */
            int m=0x7fffffff;
            for(int j=0; j<srcCPCount; ++j) {
                int q=cpBuffer[j]&CODE_POINT_MASK; /* remove case flag from the sign bit */
                if(n<=q && q<m) {
                    m=q;
                }
            }

            /*
             * Increase delta enough to advance the decoder's
             * <n,i> state to <m,0>, but guard against overflow:
             */
            if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
                throw new RuntimeException("Internal program error");
            }
            delta+=(m-n)*(handledCPCount+1);
            n=m;

            /* Encode a sequence of same code points n */
            for(int j=0; j<srcCPCount; ++j) {
                int q=cpBuffer[j]&CODE_POINT_MASK; /* remove case flag from the sign bit */
                if(q<n) {
                    ++delta;
                } else if(q==n) {
                    /* Represent delta as a generalized variable-length integer: */
                    q=delta;
                    for(int k=BASE; /* no condition */; k+=BASE) {
                        int t=threshold(k, bias);
                        if(q<t) {
                            break;
                        }
                        dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
                        q=(q-t)/(BASE-t);
                    }

                    /* the case of the last digit carries the caller's case flag */
                    dest.append(digitToBasic(q, (cpBuffer[j]<0)));
                    bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
                    delta=0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return dest;
    }

    private static boolean isBasic(int ch){
        return (ch < INITIAL_N);
    }

    private static boolean isBasicUpperCase(int ch){
        return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    private static boolean isSurrogate(int ch){
        return (((ch)&0xfffff800)==0xd800);
    }

    /**
     * Converts Punycode to Unicode.
     * The decoded string has at most as many code points as the Punycode
     * string has characters; it may have more UTF-16 code units, because a
     * supplementary code point occupies two of them.
     *
     * @param src       the Punycode text, without any ACE prefix
     * @param caseFlags optional; when non-null it receives one flag per UTF-16
     *                  code unit of the result and must be at least that long,
     *                  otherwise an IndexOutOfBoundsException is thrown. A flag
     *                  is true if the basic character, or the last digit that
     *                  encoded the code point, was an upper case ASCII letter;
     *                  the trail half of a surrogate pair is always false.
     * @return a new buffer holding the decoded text
     * @throws ParseException if src contains a non-ASCII character, a
     *         character that is not a Punycode digit, ends in the middle of
     *         a delta, or encodes a value that overflows or is not a valid
     *         Unicode scalar value
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
                               throws ParseException{
        int srcLength = src.length();
        StringBuffer dest = new StringBuffer(srcLength);

        /*
         * Handle the basic code points:
         * Let basicLength be the number of input code points
         * before the last delimiter, or 0 if there is none,
         * then copy the first basicLength code points to the output.
         */
        int basicLength=0;
        for(int j=srcLength; j>0;) {
            if(src.charAt(--j)==DELIMITER) {
                basicLength=j;
                break;
            }
        }

        for(int j=0; j<basicLength; ++j) {
            char b=src.charAt(j);
            if(!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }
            dest.append(b);
            if(caseFlags!=null) {
                caseFlags[j]=isBasicUpperCase(b);
            }
        }

        /* Initialize the state: */
        int n=INITIAL_N;
        int i=0;
        int bias=INITIAL_BIAS;
        int destCPCount=basicLength;
        /* code unit index of the first supplementary code point; "none yet" is a huge value */
        int firstSupplementaryIndex=1000000000;

        /*
         * Main decoding loop:
         * Start just after the last delimiter if any
         * basic code points were copied; start at the beginning otherwise.
         */
        for(int in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and
             * destCPCount is the number of code points in the output.
             *
             * Decode a generalized variable-length integer into delta,
             * which gets added to i.  The overflow checking is easier
             * if we increase i as we go, then subtract off its starting
             * value at the end to obtain delta.
             */
            int oldi=i;
            int w=1;
            for(int k=BASE; /* no condition */; k+=BASE) {
                if(in>=srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                /*
                 * Only ASCII can be a digit. Looking the character up by its
                 * low byte would alias non-ASCII input onto ASCII digits or
                 * produce a negative table index.
                 */
                char c=src.charAt(in++);
                int digit=isBasic(c) ? basicToDigit[c] : -1;
                if(digit<0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if(digit>(0x7fffffff-i)/w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i+=digit*w;
                int t=threshold(k, bias);
                if(digit<t) {
                    break;
                }

                if(w>0x7fffffff/(BASE-t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w*=BASE-t;
            }

            /*
             * Modification from sample code:
             * Increments destCPCount here,
             * where needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias=adaptBias(i-oldi, destCPCount, (oldi==0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if(i/destCPCount>(0x7fffffff-n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n+=i/destCPCount;
            i%=destCPCount;

            if(n>0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at code point position i of the output: */
            int cpLength=Character.charCount(n);
            int destLength=dest.length();
            int codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string, so the code point index equals
             * the code unit index. Only positions behind the first
             * supplementary code point need a scan.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex,
                                                      i-firstSupplementaryIndex);
            }

            if(caseFlags!=null) {
                /* keep the flags aligned with the code units that get shifted */
                if(codeUnitIndex<destLength) {
                    System.arraycopy(caseFlags, codeUnitIndex,
                                     caseFlags, codeUnitIndex+cpLength,
                                     destLength-codeUnitIndex);
                }
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=false;
                }
            }

            /* use the code unit index codeUnitIndex instead of the code point index i */
            dest.insert(codeUnitIndex, Character.toChars(n));
            ++i;
        }
        return dest;
    }
}
513

514
Product

Resources

Company