CoCalc -- BreakIteratorTest.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/test/jdk/java/text/BreakIterator/BreakIteratorTest.java
⁴¹¹⁵² views
1
/*
2
 * Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.
8
 *
9
 * This code is distributed in the hope that it will be useful, but WITHOUT
10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12
 * version 2 for more details (a copy is included in the LICENSE file that
13
 * accompanied this code).
14
 *
15
 * You should have received a copy of the GNU General Public License version
16
 * 2 along with this work; if not, write to the Free Software Foundation,
17
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18
 *
19
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
 * or visit www.oracle.com if you need additional information or have any
21
 * questions.
22
 */
23

24
/*
25
 * @test
26
 * @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
27
 *      4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
28
 *      4152416 4153072 4158381 4214367 4217703 4638433 8264765
29
 * @library /java/text/testlib
30
 * @run main/timeout=2000 BreakIteratorTest
31
 * @summary test BreakIterator
32
 */
33

34
/*
35
 *
36
 *
37
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
38
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
39
 *
40
 * Portions copyright (c) 2007 Sun Microsystems, Inc.
41
 * All Rights Reserved.
42
 *
43
 * The original version of this source code and documentation
44
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
45
 * subsidiary of IBM. These materials are provided under terms
46
 * of a License Agreement between Taligent and Sun. This technology
47
 * is protected by multiple US and International patents.
48
 *
49
 * This notice and attribution to Taligent may not be removed.
50
 * Taligent is a registered trademark of Taligent, Inc.
51
 *
52
 * Permission to use, copy, modify, and distribute this software
53
 * and its documentation for NON-COMMERCIAL purposes and without
54
 * fee is hereby granted provided that this copyright notice
55
 * appears in all copies. Please refer to the file "copyright.html"
56
 * for further important copyright and licensing information.
57
 *
58
 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
59
 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
60
 * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
61
 * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
62
 * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
63
 * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
64
 *
65
 */
66

67
import java.text.BreakIterator;
68
import java.text.CharacterIterator;
69
import java.text.StringCharacterIterator;
70
import java.util.Locale;
71
import java.util.Vector;
72
import java.util.Enumeration;
73
import java.io.*;
74

75
public class BreakIteratorTest extends IntlTest
76
{
77
    private BreakIterator characterBreak;
78
    private BreakIterator wordBreak;
79
    private BreakIterator lineBreak;
80
    private BreakIterator sentenceBreak;
81

82
    public static void main(String[] args) throws Exception {
83
        new BreakIteratorTest().run(args);
84
    }
85

86
    /**
     * Obtains one iterator of each kind (character, word, line, sentence) from
     * the no-argument {@link BreakIterator} factory methods and stores them in
     * the corresponding fields for use by the individual tests.
     */
    public BreakIteratorTest() {
        this.characterBreak = BreakIterator.getCharacterInstance();
        this.wordBreak = BreakIterator.getWordInstance();
        this.lineBreak = BreakIterator.getLineInstance();
        this.sentenceBreak = BreakIterator.getSentenceInstance();
    }
93

94
    //=========================================================================
95
    // general test subroutines
96
    //=========================================================================
97

98
    private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
99
        StringBuffer buffer = new StringBuffer();
100
        String text;
101
        for (int i = 0; i < expectedResult.size(); i++) {
102
            text = (String)expectedResult.elementAt(i);
103
            buffer.append(text);
104
        }
105
        text = buffer.toString();
106

107
        bi.setText(text);
108

109
        Vector nextResults = testFirstAndNext(bi, text);
110
        Vector previousResults = testLastAndPrevious(bi, text);
111

112
        logln("comparing forward and backward...");
113
        int errs = getErrorCount();
114
        compareFragmentLists("forward iteration", "backward iteration", nextResults,
115
                        previousResults);
116
        if (getErrorCount() == errs) {
117
            logln("comparing expected and actual...");
118
            compareFragmentLists("expected result", "actual result", expectedResult,
119
                            nextResults);
120
        }
121

122
        int[] boundaries = new int[expectedResult.size() + 3];
123
        boundaries[0] = BreakIterator.DONE;
124
        boundaries[1] = 0;
125
        for (int i = 0; i < expectedResult.size(); i++)
126
            boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).
127
                            length();
128
        boundaries[boundaries.length - 1] = BreakIterator.DONE;
129

130
        testFollowing(bi, text, boundaries);
131
        testPreceding(bi, text, boundaries);
132
        testIsBoundary(bi, text, boundaries);
133

134
        doMultipleSelectionTest(bi, text);
135
    }
136

137
    private Vector testFirstAndNext(BreakIterator bi, String text) {
138
        int p = bi.first();
139
        int lastP = p;
140
        Vector<String> result = new Vector<String>();
141

142
        if (p != 0)
143
            errln("first() returned " + p + " instead of 0");
144
        while (p != BreakIterator.DONE) {
145
            p = bi.next();
146
            if (p != BreakIterator.DONE) {
147
                if (p <= lastP)
148
                    errln("next() failed to move forward: next() on position "
149
                                    + lastP + " yielded " + p);
150

151
                result.addElement(text.substring(lastP, p));
152
            }
153
            else {
154
                if (lastP != text.length())
155
                    errln("next() returned DONE prematurely: offset was "
156
                                    + lastP + " instead of " + text.length());
157
            }
158
            lastP = p;
159
        }
160
        return result;
161
    }
162

163
    private Vector testLastAndPrevious(BreakIterator bi, String text) {
164
        int p = bi.last();
165
        int lastP = p;
166
        Vector<String> result = new Vector<String>();
167

168
        if (p != text.length())
169
            errln("last() returned " + p + " instead of " + text.length());
170
        while (p != BreakIterator.DONE) {
171
            p = bi.previous();
172
            if (p != BreakIterator.DONE) {
173
                if (p >= lastP)
174
                    errln("previous() failed to move backward: previous() on position "
175
                                    + lastP + " yielded " + p);
176

177
                result.insertElementAt(text.substring(p, lastP), 0);
178
            }
179
            else {
180
                if (lastP != 0)
181
                    errln("previous() returned DONE prematurely: offset was "
182
                                    + lastP + " instead of 0");
183
            }
184
            lastP = p;
185
        }
186
        return result;
187
    }
188

189
    private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
190
        int p1 = 0;
191
        int p2 = 0;
192
        String s1;
193
        String s2;
194
        int t1 = 0;
195
        int t2 = 0;
196

197
        while (p1 < f1.size() && p2 < f2.size()) {
198
            s1 = (String)f1.elementAt(p1);
199
            s2 = (String)f2.elementAt(p2);
200
            t1 += s1.length();
201
            t2 += s2.length();
202

203
            if (s1.equals(s2)) {
204
                debugLogln("   >" + s1 + "<");
205
                ++p1;
206
                ++p2;
207
            }
208
            else {
209
                int tempT1 = t1;
210
                int tempT2 = t2;
211
                int tempP1 = p1;
212
                int tempP2 = p2;
213

214
                while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
215
                    while (tempT1 < tempT2 && tempP1 < f1.size()) {
216
                        tempT1 += ((String)f1.elementAt(tempP1)).length();
217
                        ++tempP1;
218
                    }
219
                    while (tempT2 < tempT1 && tempP2 < f2.size()) {
220
                        tempT2 += ((String)f2.elementAt(tempP2)).length();
221
                        ++tempP2;
222
                    }
223
                }
224
                logln("*** " + f1Name + " has:");
225
                while (p1 <= tempP1 && p1 < f1.size()) {
226
                    s1 = (String)f1.elementAt(p1);
227
                    t1 += s1.length();
228
                    debugLogln(" *** >" + s1 + "<");
229
                    ++p1;
230
                }
231
                logln("***** " + f2Name + " has:");
232
                while (p2 <= tempP2 && p2 < f2.size()) {
233
                    s2 = (String)f2.elementAt(p2);
234
                    t2 += s2.length();
235
                    debugLogln(" ***** >" + s2 + "<");
236
                    ++p2;
237
                }
238
                errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2);
239
            }
240
        }
241
    }
242

243
    private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
244
        logln("testFollowing():");
245
        int p = 2;
246
        int i = 0;
247
        try {
248
            for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
249
                if (i == boundaries[p])
250
                    ++p;
251

252
                int b = bi.following(i);
253
                logln("bi.following(" + i + ") -> " + b);
254
                if (b != boundaries[p])
255
                    errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
256
                          + ", got " + b);
257
            }
258
        } catch (IllegalArgumentException illargExp) {
259
            errln("IllegalArgumentException caught from following() for offset: " + i);
260
        }
261
    }
262

263
    private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
264
        logln("testPreceding():");
265
        int p = 0;
266
        int i = 0;
267
        try {
268
            for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
269
                int b = bi.preceding(i);
270
                logln("bi.preceding(" + i + ") -> " + b);
271
                if (b != boundaries[p])
272
                    errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
273
                          + ", got " + b);
274

275
                if (i == boundaries[p + 1])
276
                    ++p;
277
            }
278
        } catch (IllegalArgumentException illargExp) {
279
            errln("IllegalArgumentException caught from preceding() for offset: " + i);
280
        }
281
    }
282

283
    private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
284
        logln("testIsBoundary():");
285
        int p = 1;
286
        boolean isB;
287
        for (int i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
288
            isB = bi.isBoundary(i);
289
            logln("bi.isBoundary(" + i + ") -> " + isB);
290

291
            if (i == boundaries[p]) {
292
                if (!isB)
293
                    errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
294
                ++p;
295
            }
296
            else {
297
                if (isB)
298
                    errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
299
            }
300
        }
301
    }
302

303
    private void doMultipleSelectionTest(BreakIterator iterator, String testText)
304
    {
305
        logln("Multiple selection test...");
306
        BreakIterator testIterator = (BreakIterator)iterator.clone();
307
        int offset = iterator.first();
308
        int testOffset;
309
        int count = 0;
310

311
        do {
312
            testOffset = testIterator.first();
313
            testOffset = testIterator.next(count);
314
            logln("next(" + count + ") -> " + testOffset);
315
            if (offset != testOffset)
316
                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
317

318
            if (offset != BreakIterator.DONE) {
319
                count++;
320
                offset = iterator.next();
321
            }
322
        } while (offset != BreakIterator.DONE);
323

324
        // now do it backwards...
325
        offset = iterator.last();
326
        count = 0;
327

328
        do {
329
            testOffset = testIterator.last();
330
            testOffset = testIterator.next(count);
331
            logln("next(" + count + ") -> " + testOffset);
332
            if (offset != testOffset)
333
                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
334

335
            if (offset != BreakIterator.DONE) {
336
                count--;
337
                offset = iterator.previous();
338
            }
339
        } while (offset != BreakIterator.DONE);
340
    }
341

342
    private void doBreakInvariantTest(BreakIterator tb, String testChars)
343
    {
344
        StringBuffer work = new StringBuffer("aaa");
345
        int errorCount = 0;
346

347
        // a break should always occur after CR (unless followed by LF), LF, PS, and LS
348
        String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
349
                            // change this back when new BI code is added
350

351
        for (int i = 0; i < breaks.length(); i++) {
352
            work.setCharAt(1, breaks.charAt(i));
353
            for (int j = 0; j < testChars.length(); j++) {
354
                work.setCharAt(0, testChars.charAt(j));
355
                for (int k = 0; k < testChars.length(); k++) {
356
                    char c = testChars.charAt(k);
357

358
                    // if a cr is followed by lf, don't do the check (they stay together)
359
                    if (work.charAt(1) == '\r' && (c == '\n'))
360
                        continue;
361

362
                    // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
363
                    // for breaking purposes as per UTR14
364
                    int type1 = Character.getType(work.charAt(1));
365
                    int type2 = Character.getType(c);
366
                    if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
367
                        type2 == Character.CONTROL || type2 == Character.FORMAT) {
368
                        continue;
369
                    }
370

371
                    work.setCharAt(2, c);
372
                    tb.setText(work.toString());
373
                    boolean seen2 = false;
374
                    for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
375
                        if (l == 2)
376
                            seen2 = true;
377
                    }
378
                    if (!seen2) {
379
                        errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
380
                                    + " and U+" + Integer.toHexString((int)(work.charAt(2))));
381
                        errorCount++;
382
                        if (errorCount >= 75)
383
                            return;
384
                    }
385
                }
386
            }
387
        }
388
    }
389

390
    private void doOtherInvariantTest(BreakIterator tb, String testChars)
391
    {
392
        StringBuffer work = new StringBuffer("a\r\na");
393
        int errorCount = 0;
394

395
        // a break should never occur between CR and LF
396
        for (int i = 0; i < testChars.length(); i++) {
397
            work.setCharAt(0, testChars.charAt(i));
398
            for (int j = 0; j < testChars.length(); j++) {
399
                work.setCharAt(3, testChars.charAt(j));
400
                tb.setText(work.toString());
401
                for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
402
                    if (k == 2) {
403
                        errln("Break between CR and LF in string U+" + Integer.toHexString(
404
                                (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
405
                                (int)(work.charAt(3))));
406
                        errorCount++;
407
                        if (errorCount >= 75)
408
                            return;
409
                    }
410
            }
411
        }
412

413
        // a break should never occur before a non-spacing mark, unless it's preceded
414
        // by a line terminator
415
        work.setLength(0);
416
        work.append("aaaa");
417
        for (int i = 0; i < testChars.length(); i++) {
418
            char c = testChars.charAt(i);
419
            if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
420
                continue;
421
            work.setCharAt(1, c);
422
            for (int j = 0; j < testChars.length(); j++) {
423
                c = testChars.charAt(j);
424
                if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
425
                        != Character.ENCLOSING_MARK)
426
                    continue;
427
                work.setCharAt(2, c);
428

429
                // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
430
                // for breaking purposes as per UTR14
431
                int type1 = Character.getType(work.charAt(1));
432
                int type2 = Character.getType(work.charAt(2));
433
                if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
434
                    type2 == Character.CONTROL || type2 == Character.FORMAT) {
435
                    continue;
436
                }
437

438
                tb.setText(work.toString());
439
                for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
440
                    if (k == 2) {
441
                        errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
442
                                + " and U+" + Integer.toHexString((int)(work.charAt(2))));
443
                        errorCount++;
444
                        if (errorCount >= 75)
445
                            return;
446
                    }
447
            }
448
        }
449
    }
450

451
    public void debugLogln(String s) {
452
        final String zeros = "0000";
453
        String temp;
454
        StringBuffer out = new StringBuffer();
455
        for (int i = 0; i < s.length(); i++) {
456
            char c = s.charAt(i);
457
            if (c >= ' ' && c < '\u007f')
458
                out.append(c);
459
            else {
460
                out.append("\\u");
461
                temp = Integer.toHexString((int)c);
462
                out.append(zeros.substring(0, 4 - temp.length()));
463
                out.append(temp);
464
            }
465
        }
466
        logln(out.toString());
467
    }
468

469
    //=========================================================================
470
    // tests
471
    //=========================================================================
472

473
    public void TestWordBreak() {
474

475
        Vector<String> wordSelectionData = new Vector<String>();
476

477
        wordSelectionData.addElement("12,34");
478

479
        wordSelectionData.addElement(" ");
480
        wordSelectionData.addElement("\u00A2"); //cent sign
481
        wordSelectionData.addElement("\u00A3"); //pound sign
482
        wordSelectionData.addElement("\u00A4"); //currency sign
483
        wordSelectionData.addElement("\u00A5"); //yen sign
484
        wordSelectionData.addElement("alpha-beta-gamma");
485
        wordSelectionData.addElement(".");
486
        wordSelectionData.addElement(" ");
487
        wordSelectionData.addElement("Badges");
488
        wordSelectionData.addElement("?");
489
        wordSelectionData.addElement(" ");
490
        wordSelectionData.addElement("BADGES");
491
        wordSelectionData.addElement("!");
492
        wordSelectionData.addElement("?");
493
        wordSelectionData.addElement("!");
494
        wordSelectionData.addElement(" ");
495
        wordSelectionData.addElement("We");
496
        wordSelectionData.addElement(" ");
497
        wordSelectionData.addElement("don't");
498
        wordSelectionData.addElement(" ");
499
        wordSelectionData.addElement("need");
500
        wordSelectionData.addElement(" ");
501
        wordSelectionData.addElement("no");
502
        wordSelectionData.addElement(" ");
503
        wordSelectionData.addElement("STINKING");
504
        wordSelectionData.addElement(" ");
505
        wordSelectionData.addElement("BADGES");
506
        wordSelectionData.addElement("!");
507
        wordSelectionData.addElement("!");
508
        wordSelectionData.addElement("!");
509

510
        wordSelectionData.addElement("012.566,5");
511
        wordSelectionData.addElement(" ");
512
        wordSelectionData.addElement("123.3434,900");
513
        wordSelectionData.addElement(" ");
514
        wordSelectionData.addElement("1000,233,456.000");
515
        wordSelectionData.addElement(" ");
516
        wordSelectionData.addElement("1,23.322%");
517
        wordSelectionData.addElement(" ");
518
        wordSelectionData.addElement("123.1222");
519

520
        wordSelectionData.addElement(" ");
521
        wordSelectionData.addElement("\u0024123,000.20");
522

523
        wordSelectionData.addElement(" ");
524
        wordSelectionData.addElement("179.01\u0025");
525

526
        wordSelectionData.addElement("Hello");
527
        wordSelectionData.addElement(",");
528
        wordSelectionData.addElement(" ");
529
        wordSelectionData.addElement("how");
530
        wordSelectionData.addElement(" ");
531
        wordSelectionData.addElement("are");
532
        wordSelectionData.addElement(" ");
533
        wordSelectionData.addElement("you");
534
        wordSelectionData.addElement(" ");
535
        wordSelectionData.addElement("X");
536
        wordSelectionData.addElement(" ");
537

538
        wordSelectionData.addElement("Now");
539
        wordSelectionData.addElement("\r");
540
        wordSelectionData.addElement("is");
541
        wordSelectionData.addElement("\n");
542
        wordSelectionData.addElement("the");
543
        wordSelectionData.addElement("\r\n");
544
        wordSelectionData.addElement("time");
545
        wordSelectionData.addElement("\n");
546
        wordSelectionData.addElement("\r");
547
        wordSelectionData.addElement("for");
548
        wordSelectionData.addElement("\r");
549
        wordSelectionData.addElement("\r");
550
        wordSelectionData.addElement("all");
551
        wordSelectionData.addElement(" ");
552

553
        generalIteratorTest(wordBreak, wordSelectionData);
554
    }
555

556
    public void TestBug4097779() {
557
        Vector<String> wordSelectionData = new Vector<String>();
558

559
        wordSelectionData.addElement("aa\u0300a");
560
        wordSelectionData.addElement(" ");
561

562
        generalIteratorTest(wordBreak, wordSelectionData);
563
    }
564

565
    public void TestBug4098467Words() {
566
        Vector<String> wordSelectionData = new Vector<String>();
567

568
        // What follows is a string of Korean characters (I found it in the Yellow Pages
569
        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
570
        // it correctly), first as precomposed syllables, and then as conjoining jamo.
571
        // Both sequences should be semantically identical and break the same way.
572
        // precomposed syllables...
573
        wordSelectionData.addElement("\uc0c1\ud56d");
574
        wordSelectionData.addElement(" ");
575
        wordSelectionData.addElement("\ud55c\uc778");
576
        wordSelectionData.addElement(" ");
577
        wordSelectionData.addElement("\uc5f0\ud569");
578
        wordSelectionData.addElement(" ");
579
        wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c");
580
        wordSelectionData.addElement(" ");
581
        // conjoining jamo...
582
        wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc");
583
        wordSelectionData.addElement(" ");
584
        wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab");
585
        wordSelectionData.addElement(" ");
586
        wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8");
587
        wordSelectionData.addElement(" ");
588
        wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
589
        wordSelectionData.addElement(" ");
590

591
        generalIteratorTest(wordBreak, wordSelectionData);
592
    }
593

594
    public void TestBug4117554Words() {
595
        Vector<String> wordSelectionData = new Vector<String>();
596

597
        // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
598
        // count as a Kanji character for the purposes of word breaking
599
        wordSelectionData.addElement("abc");
600
        wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
601
        wordSelectionData.addElement("abc");
602

603
        generalIteratorTest(wordBreak, wordSelectionData);
604
    }
605

606
    public void TestSentenceBreak() {
607
        Vector<String> sentenceSelectionData = new Vector<String>();
608

609
        sentenceSelectionData.addElement("This is a simple sample sentence. ");
610
        sentenceSelectionData.addElement("(This is it.) ");
611
        sentenceSelectionData.addElement("This is a simple sample sentence. ");
612
        sentenceSelectionData.addElement("\"This isn\'t it.\" ");
613
        sentenceSelectionData.addElement("Hi! ");
614
        sentenceSelectionData.addElement("This is a simple sample sentence. ");
615
        sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
616
        sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
617
        sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
618
        sentenceSelectionData.addElement("He said, that I said, that you said!! ");
619

620
        sentenceSelectionData.addElement("Don't rock the boat.\u2029");
621

622
        sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
623
        sentenceSelectionData.addElement("Not on my time (el timo.)! ");
624

625
        sentenceSelectionData.addElement("So what!!\u2029");
626

627
        sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
628
        sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
629
        sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
630
        sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
631
        sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
632
        sentenceSelectionData.addElement("He answered, \"You may not!\" ");
633
        sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
634
        sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
635
        sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
636
        sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
637

638
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
639
    }
640

641
    public void TestBug4113835() {
642
        Vector<String> sentenceSelectionData = new Vector<String>();
643

644
        // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
645
        sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
646

647
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
648
    }
649

650
    public void TestBug4111338() {
651
        Vector<String> sentenceSelectionData = new Vector<String>();
652

653
        // test for bug #4111338: Don't break sentences at the boundary between CJK
654
        // and other letters
655
        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
656
                + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
657
                + "\u611d\u57b6\u2510\u5d46\".\u2029");
658
        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
659
                + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
660
                + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
661
        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
662
                + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
663
                + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
664
        sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029");
665

666
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
667
    }
668

669
    public void TestBug4117554Sentences() {
670
        Vector<String> sentenceSelectionData = new Vector<String>();
671

672
        // Treat fullwidth variants of .!? the same as their
673
        // normal counterparts
674
        sentenceSelectionData.addElement("I know I'm right\uff0e ");
675
        sentenceSelectionData.addElement("Right\uff1f ");
676
        sentenceSelectionData.addElement("Right\uff01 ");
677

678
        // Don't break sentences at boundary between CJK and digits
679
        sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
680
                + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
681
                + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
682

683
        // Break sentence between a sentence terminator and
684
        // opening punctuation
685
        sentenceSelectionData.addElement("no?");
686
        sentenceSelectionData.addElement("(yes)");
687

688
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
689
    }
690

691
    public void TestBug4158381() {
692
        Vector<String> sentenceSelectionData = new Vector<String>();
693

694
        // Don't break sentence after period if it isn't followed by a space
695
        sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class.  ");
696
        sentenceSelectionData.addElement("Another test.\u2029");
697

698
        // No breaks when there are no terminators around
699
        sentenceSelectionData.addElement("<P>Provides a set of "
700
                + "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
701
                + "</SUP></FONT> language) components that, "
702
                + "to the maximum degree possible, work the same on all platforms.  ");
703
        sentenceSelectionData.addElement("Another test.\u2029");
704

705
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
706
    }
707

708
    public void TestBug4143071() {
709
        Vector<String> sentenceSelectionData = new Vector<String>();
710

711
        // Make sure sentences that end with digits work right
712
        sentenceSelectionData.addElement("Today is the 27th of May, 1998.  ");
713
        sentenceSelectionData.addElement("Tomorrow with be 28 May 1998.  ");
714
        sentenceSelectionData.addElement("The day after will be the 30th.\u2029");
715

716
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
717
    }
718

719
    public void TestBug4152416() {
720
        Vector<String> sentenceSelectionData = new Vector<String>();
721

722
        // Make sure sentences ending with a capital letter are treated correctly
723
        sentenceSelectionData.addElement("The type of all primitive "
724
                + "<code>boolean</code> values accessed in the target VM.  ");
725
        sentenceSelectionData.addElement("Calls to xxx will return an "
726
                + "implementor of this interface.\u2029");
727

728
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
729
    }
730

731
    public void TestBug4152117() {
732
        Vector<String> sentenceSelectionData = new Vector<String>();
733

734
        // Make sure sentence breaking is handling punctuation correctly
735
        // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
736
        // IT DOESN'T CROP UP]
737
        sentenceSelectionData.addElement("Constructs a randomly generated "
738
                + "BigInteger, uniformly distributed over the range <tt>0</tt> "
739
                + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
740
        sentenceSelectionData.addElement("The uniformity of the distribution "
741
                + "assumes that a fair source of random bits is provided in "
742
                + "<tt>rnd</tt>.  ");
743
        sentenceSelectionData.addElement("Note that this constructor always "
744
                + "constructs a non-negative BigInteger.\u2029");
745

746
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
747
    }
748

749
    public void TestBug8264765() {
750
        Vector<String> sentenceSelectionData = new Vector<String>();
751

752
        // Comma should not be regarded as the start of a sentence,
753
        // otherwise the backwards rule would break the following sentence.
754
        sentenceSelectionData.addElement(
755
            "Due to a problem (e.g., software bug), the server is down. ");
756

757
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
758
    }
759

760
    public void TestLineBreak() {
761
        Vector<String> lineSelectionData = new Vector<String>();
762

763
        lineSelectionData.addElement("Multi-");
764
        lineSelectionData.addElement("Level ");
765
        lineSelectionData.addElement("example ");
766
        lineSelectionData.addElement("of ");
767
        lineSelectionData.addElement("a ");
768
        lineSelectionData.addElement("semi-");
769
        lineSelectionData.addElement("idiotic ");
770
        lineSelectionData.addElement("non-");
771
        lineSelectionData.addElement("sensical ");
772
        lineSelectionData.addElement("(non-");
773
        lineSelectionData.addElement("important) ");
774
        lineSelectionData.addElement("sentence. ");
775

776
        lineSelectionData.addElement("Hi  ");
777
        lineSelectionData.addElement("Hello ");
778
        lineSelectionData.addElement("How\n");
779
        lineSelectionData.addElement("are\r");
780
        lineSelectionData.addElement("you\u2028");
781
        lineSelectionData.addElement("fine.\t");
782
        lineSelectionData.addElement("good.  ");
783

784
        lineSelectionData.addElement("Now\r");
785
        lineSelectionData.addElement("is\n");
786
        lineSelectionData.addElement("the\r\n");
787
        lineSelectionData.addElement("time\n");
788
        lineSelectionData.addElement("\r");
789
        lineSelectionData.addElement("for\r");
790
        lineSelectionData.addElement("\r");
791
        lineSelectionData.addElement("all");
792

793
        generalIteratorTest(lineBreak, lineSelectionData);
794
    }
795

796
    public void TestBug4068133() {
797
        Vector<String> lineSelectionData = new Vector<String>();
798

799
        lineSelectionData.addElement("\u96f6");
800
        lineSelectionData.addElement("\u4e00\u3002");
801
        lineSelectionData.addElement("\u4e8c\u3001");
802
        lineSelectionData.addElement("\u4e09\u3002\u3001");
803
        lineSelectionData.addElement("\u56db\u3001\u3002\u3001");
804
        lineSelectionData.addElement("\u4e94,");
805
        lineSelectionData.addElement("\u516d.");
806
        lineSelectionData.addElement("\u4e03.\u3001,\u3002");
807
        lineSelectionData.addElement("\u516b");
808

809
        generalIteratorTest(lineBreak, lineSelectionData);
810
    }
811

812
    public void TestBug4086052() {
813
        Vector<String> lineSelectionData = new Vector<String>();
814

815
        lineSelectionData.addElement("foo\u00a0bar ");
816
//        lineSelectionData.addElement("foo\ufeffbar");
817

818
        generalIteratorTest(lineBreak, lineSelectionData);
819
    }
820

821
    public void TestBug4097920() {
822
        Vector<String> lineSelectionData = new Vector<String>();
823

824
        lineSelectionData.addElement("dog,");
825
        lineSelectionData.addElement("cat,");
826
        lineSelectionData.addElement("mouse ");
827
        lineSelectionData.addElement("(one)");
828
        lineSelectionData.addElement("(two)\n");
829

830
        generalIteratorTest(lineBreak, lineSelectionData);
831
    }
832
    /*
833
    public void TestBug4035266() {
834
        Vector<String> lineSelectionData = new Vector<String>();
835

836
        lineSelectionData.addElement("The ");
837
        lineSelectionData.addElement("balance ");
838
        lineSelectionData.addElement("is ");
839
        lineSelectionData.addElement("$-23,456.78, ");
840
        lineSelectionData.addElement("not ");
841
        lineSelectionData.addElement("-$32,456.78!\n");
842

843
        generalIteratorTest(lineBreak, lineSelectionData);
844
    }
845
    */
846
    public void TestBug4098467Lines() {
847
        Vector<String> lineSelectionData = new Vector<String>();
848

849
        // What follows is a string of Korean characters (I found it in the Yellow Pages
850
        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
851
        // it correctly), first as precomposed syllables, and then as conjoining jamo.
852
        // Both sequences should be semantically identical and break the same way.
853
        // precomposed syllables...
854
        lineSelectionData.addElement("\uc0c1");
855
        lineSelectionData.addElement("\ud56d ");
856
        lineSelectionData.addElement("\ud55c");
857
        lineSelectionData.addElement("\uc778 ");
858
        lineSelectionData.addElement("\uc5f0");
859
        lineSelectionData.addElement("\ud569 ");
860
        lineSelectionData.addElement("\uc7a5");
861
        lineSelectionData.addElement("\ub85c");
862
        lineSelectionData.addElement("\uad50");
863
        lineSelectionData.addElement("\ud68c ");
864
        // conjoining jamo...
865
        lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
866
        lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
867
        lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
868
        lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
869

870
        if (Locale.getDefault().getLanguage().equals("th")) {
871
            logln("This test is skipped in th locale.");
872
            return;
873
        }
874

875
        generalIteratorTest(lineBreak, lineSelectionData);
876
    }
877

878
    public void TestBug4117554Lines() {
879
        Vector<String> lineSelectionData = new Vector<String>();
880

881
        // Fullwidth .!? should be treated as postJwrd
882
        lineSelectionData.addElement("\u4e01\uff0e");
883
        lineSelectionData.addElement("\u4e02\uff01");
884
        lineSelectionData.addElement("\u4e03\uff1f");
885

886
        generalIteratorTest(lineBreak, lineSelectionData);
887
    }
888

889
    public void TestBug4217703() {
890
        if (Locale.getDefault().getLanguage().equals("th")) {
891
            logln("This test is skipped in th locale.");
892
            return;
893
        }
894

895
        Vector<String> lineSelectionData = new Vector<String>();
896

897
        // There shouldn't be a line break between sentence-ending punctuation
898
        // and a closing quote
899
        lineSelectionData.addElement("He ");
900
        lineSelectionData.addElement("said ");
901
        lineSelectionData.addElement("\"Go!\"  ");
902
        lineSelectionData.addElement("I ");
903
        lineSelectionData.addElement("went.  ");
904

905
        lineSelectionData.addElement("Hashtable$Enumeration ");
906
        lineSelectionData.addElement("getText().");
907
        lineSelectionData.addElement("getIndex()");
908

909
        generalIteratorTest(lineBreak, lineSelectionData);
910
    }
911

912
    private static final String graveS = "S\u0300";
913
    private static final String acuteBelowI = "i\u0317";
914
    private static final String acuteE = "e\u0301";
915
    private static final String circumflexA = "a\u0302";
916
    private static final String tildeE = "e\u0303";
917

918
    public void TestCharacterBreak() {
919
        Vector<String> characterSelectionData = new Vector<String>();
920

921
        characterSelectionData.addElement(graveS);
922
        characterSelectionData.addElement(acuteBelowI);
923
        characterSelectionData.addElement("m");
924
        characterSelectionData.addElement("p");
925
        characterSelectionData.addElement("l");
926
        characterSelectionData.addElement(acuteE);
927
        characterSelectionData.addElement(" ");
928
        characterSelectionData.addElement("s");
929
        characterSelectionData.addElement(circumflexA);
930
        characterSelectionData.addElement("m");
931
        characterSelectionData.addElement("p");
932
        characterSelectionData.addElement("l");
933
        characterSelectionData.addElement(tildeE);
934
        characterSelectionData.addElement(".");
935
        characterSelectionData.addElement("w");
936
        characterSelectionData.addElement(circumflexA);
937
        characterSelectionData.addElement("w");
938
        characterSelectionData.addElement("a");
939
        characterSelectionData.addElement("f");
940
        characterSelectionData.addElement("q");
941
        characterSelectionData.addElement("\n");
942
        characterSelectionData.addElement("\r");
943
        characterSelectionData.addElement("\r\n");
944
        characterSelectionData.addElement("\n");
945

946
        generalIteratorTest(characterBreak, characterSelectionData);
947
    }
948

949
    public void TestBug4098467Characters() {
950
        Vector<String> characterSelectionData = new Vector<String>();
951

952
        // What follows is a string of Korean characters (I found it in the Yellow Pages
953
        // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
954
        // it correctly), first as precomposed syllables, and then as conjoining jamo.
955
        // Both sequences should be semantically identical and break the same way.
956
        // precomposed syllables...
957
        characterSelectionData.addElement("\uc0c1");
958
        characterSelectionData.addElement("\ud56d");
959
        characterSelectionData.addElement(" ");
960
        characterSelectionData.addElement("\ud55c");
961
        characterSelectionData.addElement("\uc778");
962
        characterSelectionData.addElement(" ");
963
        characterSelectionData.addElement("\uc5f0");
964
        characterSelectionData.addElement("\ud569");
965
        characterSelectionData.addElement(" ");
966
        characterSelectionData.addElement("\uc7a5");
967
        characterSelectionData.addElement("\ub85c");
968
        characterSelectionData.addElement("\uad50");
969
        characterSelectionData.addElement("\ud68c");
970
        characterSelectionData.addElement(" ");
971
        // conjoining jamo...
972
        characterSelectionData.addElement("\u1109\u1161\u11bc");
973
        characterSelectionData.addElement("\u1112\u1161\u11bc");
974
        characterSelectionData.addElement(" ");
975
        characterSelectionData.addElement("\u1112\u1161\u11ab");
976
        characterSelectionData.addElement("\u110b\u1175\u11ab");
977
        characterSelectionData.addElement(" ");
978
        characterSelectionData.addElement("\u110b\u1167\u11ab");
979
        characterSelectionData.addElement("\u1112\u1161\u11b8");
980
        characterSelectionData.addElement(" ");
981
        characterSelectionData.addElement("\u110c\u1161\u11bc");
982
        characterSelectionData.addElement("\u1105\u1169");
983
        characterSelectionData.addElement("\u1100\u116d");
984
        characterSelectionData.addElement("\u1112\u116c");
985

986
        generalIteratorTest(characterBreak, characterSelectionData);
987
    }
988

989
    public void TestBug4153072() {
990
        BreakIterator iter = BreakIterator.getWordInstance();
991
        String str = "...Hello, World!...";
992
        int begin = 3;
993
        int end = str.length() - 3;
994
        boolean gotException = false;
995
        boolean dummy;
996

997
        iter.setText(new StringCharacterIterator(str, begin, end, begin));
998
        for (int index = -1; index < begin + 1; ++index) {
999
            try {
1000
                dummy = iter.isBoundary(index);
1001
                if (index < begin)
1002
                    errln("Didn't get exception with offset = " + index +
1003
                                    " and begin index = " + begin);
1004
            }
1005
            catch (IllegalArgumentException e) {
1006
                if (index >= begin)
1007
                    errln("Got exception with offset = " + index +
1008
                                    " and begin index = " + begin);
1009
            }
1010
        }
1011
    }
1012

1013
    public void TestBug4146175Sentences() {
1014
        Vector<String> sentenceSelectionData = new Vector<String>();
1015

1016
        // break between periods and opening punctuation even when there's no
1017
        // intervening space
1018
        sentenceSelectionData.addElement("end.");
1019
        sentenceSelectionData.addElement("(This is\u2029");
1020

1021
        // treat the fullwidth period as an unambiguous sentence terminator
1022
        sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
1023
        sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
1024

1025
        generalIteratorTest(sentenceBreak, sentenceSelectionData);
1026
    }
1027

1028
    public void TestBug4146175Lines() {
1029
        if (Locale.getDefault().getLanguage().equals("th")) {
1030
            logln("This test is skipped in th locale.");
1031
            return;
1032
        }
1033

1034
        Vector<String> lineSelectionData = new Vector<String>();
1035

1036
        // the fullwidth comma should stick to the preceding Japanese character
1037
        lineSelectionData.addElement("\u7d42\uff0c");
1038
        lineSelectionData.addElement("\u308f");
1039

1040
        generalIteratorTest(lineBreak, lineSelectionData);
1041
    }
1042

1043
    public void TestBug4214367() {
1044
        if (Locale.getDefault().getLanguage().equals("th")) {
1045
            logln("This test is skipped in th locale.");
1046
            return;
1047
        }
1048

1049
        Vector<String> wordSelectionData = new Vector<String>();
1050

1051
        // the hiragana and katakana iteration marks and the long vowel mark
1052
        // are not being treated correctly by the word-break iterator
1053
        wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
1054
        wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
1055

1056
        generalIteratorTest(wordBreak, wordSelectionData);
1057
    }
1058

1059
    private static final String cannedTestChars // characters fo the class Cc are ignorable for breaking
1060
        = /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
1061
        + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
1062
        + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
1063
        + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
1064
        + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
1065
        + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
1066

1067
    public void TestSentenceInvariants()
1068
    {
1069
        BreakIterator e = BreakIterator.getSentenceInstance();
1070
        doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
1071
    }
1072

1073
    public void TestWordInvariants()
1074
    {
1075
        if (Locale.getDefault().getLanguage().equals("th")) {
1076
            logln("This test is skipped in th locale.");
1077
            return;
1078
        }
1079

1080
        BreakIterator e = BreakIterator.getWordInstance();
1081
        doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1082
            + "\u30a3\u4e00\u4e01\u4e02");
1083
        doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1084
            + "\u30a3\u4e00\u4e01\u4e02");
1085
    }
1086

1087
    public void TestLineInvariants()
1088
    {
1089
        if (Locale.getDefault().getLanguage().equals("th")) {
1090
            logln("This test is skipped in th locale.");
1091
            return;
1092
        }
1093

1094
        BreakIterator e = BreakIterator.getLineInstance();
1095
        String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
1096
            + "\u30a3\u4e00\u4e01\u4e02";
1097
        doBreakInvariantTest(e, testChars);
1098
        doOtherInvariantTest(e, testChars);
1099

1100
        int errorCount = 0;
1101

1102
        // in addition to the other invariants, a line-break iterator should make sure that:
1103
        // it doesn't break around the non-breaking characters
1104
        String noBreak = "\u00a0\u2007\u2011\ufeff";
1105
        StringBuffer work = new StringBuffer("aaa");
1106
        for (int i = 0; i < testChars.length(); i++) {
1107
            char c = testChars.charAt(i);
1108
            if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
1109
                continue;
1110
            work.setCharAt(0, c);
1111
            for (int j = 0; j < noBreak.length(); j++) {
1112
                work.setCharAt(1, noBreak.charAt(j));
1113
                for (int k = 0; k < testChars.length(); k++) {
1114
                    work.setCharAt(2, testChars.charAt(k));
1115
                    // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
1116
                    // for breaking purposes as per UTR14
1117
                    int type1 = Character.getType(work.charAt(1));
1118
                    int type2 = Character.getType(work.charAt(2));
1119
                    if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
1120
                        type2 == Character.CONTROL || type2 == Character.FORMAT) {
1121
                        continue;
1122
                    }
1123
                    e.setText(work.toString());
1124
                    for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) {
1125
                        if (l == 1 || l == 2) {
1126
                            //errln("Got break between U+" + Integer.toHexString((int)
1127
                            //        (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1128
                            //        (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2);
1129
                            // as per UTR14 spaces followed by a GLUE character should allow
1130
                            // line breaking
1131
                            if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' ||
1132
                                                                 work.charAt(l) == '\u0f0c' ||
1133
                                                                 work.charAt(l) == '\u2007' ||
1134
                                                                 work.charAt(l) == '\u2011' ||
1135
                                                                 work.charAt(l) == '\u202f' ||
1136
                                                                 work.charAt(l) == '\ufeff')) {
1137
                                continue;
1138
                            }
1139
                            errln("Got break between U+" + Integer.toHexString((int)
1140
                                    (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1141
                                    (int)(work.charAt(l))));
1142
                            errorCount++;
1143
                            if (errorCount >= 75)
1144
                                return;
1145
                        }
1146
                    }
1147
                }
1148
            }
1149
        }
1150

1151
        // The following test has so many exceptions that it would be better to write a new set of data
1152
        // that tested exactly what should be tested
1153
        // Until that point it will be commented out
1154
        /*
1155

1156
        // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
1157
        // a currency symbol, a space, a format-control character, a regular control character,
1158
        // a line or paragraph separator, or another dash)
1159
        String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
1160
        for (int i = 0; i < testChars.length(); i++) {
1161
            work.setCharAt(0, testChars.charAt(i));
1162
            for (int j = 0; j < dashes.length(); j++) {
1163
                work.setCharAt(1, dashes.charAt(j));
1164
                for (int k = 0; k < testChars.length(); k++) {
1165
                    char c = testChars.charAt(k);
1166
                    if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
1167
                        Character.getType(c) == Character.OTHER_NUMBER ||
1168
                        Character.getType(c) == Character.NON_SPACING_MARK ||
1169
                        Character.getType(c) == Character.ENCLOSING_MARK ||
1170
                        Character.getType(c) == Character.CURRENCY_SYMBOL ||
1171
                        Character.getType(c) == Character.DASH_PUNCTUATION ||
1172
                        Character.getType(c) == Character.SPACE_SEPARATOR ||
1173
                        Character.getType(c) == Character.FORMAT ||
1174
                        Character.getType(c) == Character.CONTROL ||
1175
                        Character.getType(c) == Character.END_PUNCTUATION ||
1176
                        Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
1177
                        Character.getType(c) == Character.OTHER_PUNCTUATION ||
1178
                        c == '\'' || c == '\"' ||
1179
                        // category EX as per UTR14
1180
                        c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
1181
                        c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
1182
                        c == '\u0003' || c == '\u2007' || c == '\u2011' ||
1183
                        c == '\ufeff')
1184
                        continue;
1185
                    work.setCharAt(2, c);
1186
                    e.setText(work.toString());
1187
                    boolean saw2 = false;
1188
                    for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
1189
                        if (l == 2)
1190
                            saw2 = true;
1191
                    if (!saw2) {
1192
                        errln("Didn't get break between U+" + Integer.toHexString((int)
1193
                                    (work.charAt(1))) + " and U+" + Integer.toHexString(
1194
                                    (int)(work.charAt(2))));
1195
                        errorCount++;
1196
                        if (errorCount >= 75)
1197
                            return;
1198
                    }
1199
                }
1200
            }
1201
        }
1202
        */
1203
    }
1204

1205
    public void TestCharacterInvariants()
1206
    {
1207
        BreakIterator e = BreakIterator.getCharacterInstance();
1208
        doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1209
            + "\u11a9\u11aa");
1210
        doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1211
            + "\u11a9\u11aa");
1212
    }
1213

1214
    public void TestEmptyString()
1215
    {
1216
        String text = "";
1217
        Vector<String> x = new Vector<String>();
1218
        x.addElement(text);
1219

1220
        generalIteratorTest(lineBreak, x);
1221
    }
1222

1223
    public void TestGetAvailableLocales()
1224
    {
1225
        Locale[] locList = BreakIterator.getAvailableLocales();
1226

1227
        if (locList.length == 0)
1228
            errln("getAvailableLocales() returned an empty list!");
1229
        // I have no idea how to test this function...
1230
    }
1231

1232

1233
    /**
1234
     * Bug 4095322
1235
     */
1236
    public void TestJapaneseLineBreak()
1237
    {
1238
        StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
1239
        // Breaking on <Kanji>$<Kanji> is inconsistent
1240

1241
        /* Characters in precedingChars and followingChars have been updated
1242
         * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
1243
         * In concrete terms,
1244
         *   0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
1245
         *   0x169B & 0x169C : added since Unicode 3.0.0.
1246
         */
1247
        String precedingChars =
1248
            /* Puctuation, Open */
1249
          "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
1250
            /* Punctuation, Initial quote */
1251
          + "\u00ab\u2018\u201b\u201c\u201f\u2039"
1252
            /* Symbol, Currency */
1253
          + "\u00a5\u00a3\u00a4\u20a0";
1254

1255
        String followingChars =
1256
            /* Puctuation, Close */
1257
          ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
1258
            /* Punctuation, Final quote */
1259
          + "\u00bb\u2019\u201d\u203a"
1260
            /* Punctuation, Other */
1261
          + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
1262
            /* Punctuation, Dash */
1263
          + "\u2103\u2109"
1264
            /* Symbol, Currency */
1265
          + "\u00a2"
1266
            /* Letter, Modifier */
1267
          + "\u3005\u309d\u309e"
1268
            /* Letter, Other */
1269
          + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
1270
           /* Mark, Non-Spacing */
1271
          + "\u0300\u0301\u0302"
1272
            /* Symbol, Modifier */
1273
          + "\u309b\u309c"
1274
            /* Symbol, Other */
1275
          + "\u00b0";
1276

1277
        BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
1278

1279
        for (int i = 0; i < precedingChars.length(); i++) {
1280
            testString.setCharAt(1, precedingChars.charAt(i));
1281
            iter.setText(testString.toString());
1282
            int j = iter.first();
1283
            if (j != 0) {
1284
                errln("ja line break failure: failed to start at 0 and bounced at " + j);
1285
            }
1286
            j = iter.next();
1287
            if (j != 1) {
1288
                errln("ja line break failure: failed to stop before '"
1289
                        + precedingChars.charAt(i) + "' (\\u"
1290
                        + Integer.toString(precedingChars.charAt(i), 16)
1291
                        + ") at 1 and bounded at " + j);
1292
            }
1293
            j = iter.next();
1294
            if (j != 3) {
1295
                errln("ja line break failure: failed to skip position after '"
1296
                        + precedingChars.charAt(i) + "' (\\u"
1297
                        + Integer.toString(precedingChars.charAt(i), 16)
1298
                        + ") at 3 and bounded at " + j);
1299
            }
1300
        }
1301

1302
        for (int i = 0; i < followingChars.length(); i++) {
1303
            testString.setCharAt(1, followingChars.charAt(i));
1304
            iter.setText(testString.toString());
1305
            int j = iter.first();
1306
            if (j != 0) {
1307
                errln("ja line break failure: failed to start at 0 and bounded at " + j);
1308
            }
1309
            j = iter.next();
1310
            if (j != 2) {
1311
                errln("ja line break failure: failed to skip position before '"
1312
                        + followingChars.charAt(i) + "' (\\u"
1313
                        + Integer.toString(followingChars.charAt(i), 16)
1314
                        + ") at 2 and bounded at " + j);
1315
            }
1316
            j = iter.next();
1317
            if (j != 3) {
1318
                errln("ja line break failure: failed to stop after '"
1319
                        + followingChars.charAt(i) + "' (\\u"
1320
                        + Integer.toString(followingChars.charAt(i), 16)
1321
                        + ") at 3 and bounded at " + j);
1322
            }
1323
        }
1324
    }
1325

1326
    /**
1327
     * Bug 4638433
1328
     */
1329
    public void TestLineBreakBasedOnUnicode3_0_0()
1330
    {
1331
        BreakIterator iter;
1332
        int i;
1333

1334
        /* Latin Extend-B characters
1335
         * 0x0218-0x0233 which have been added since Unicode 3.0.0.
1336
         */
1337
        iter = BreakIterator.getWordInstance(Locale.US);
1338
        iter.setText("\u0216\u0217\u0218\u0219\u021A");
1339
        i = iter.first();
1340
        i = iter.next();
1341
        if (i != 5) {
1342
            errln("Word break failure: failed to stop at 5 and bounded at " + i);
1343
        }
1344

1345

1346
        iter = BreakIterator.getLineInstance(Locale.US);
1347

1348
        /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
1349
         * \u301f has changed its category from Ps to Pe since Unicode 2.1.
1350
         */
1351
        iter.setText("32\u301f1");
1352
        i = iter.first();
1353
        i = iter.next();
1354
        if (i != 3) {
1355
            errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
1356
        }
1357

1358
        /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
1359
         * which have been added since Unicode 3.0.0.
1360
         */
1361
        iter.setText("\u1820\u1806\u1821");
1362
        i = iter.first();
1363
        i = iter.next();
1364
        if (i != 2) {
1365
            errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
1366
        }
1367

1368
        /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
1369
         * been added since Unicode 3.0.0.
1370
         */
1371
        iter.setText("\u17E0\u17DB\u17E1");
1372
        i = iter.first();
1373
        i = iter.next();
1374
        if (i != 1) {
1375
            errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
1376
        }
1377
        i = iter.next();
1378
        if (i != 3) {
1379
            errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
1380
        }
1381

1382
        /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
1383
         * been added since Unicode 3.0.0.
1384
         */
1385
        iter.setText("\u1692\u1680\u1696");
1386
        i = iter.first();
1387
        i = iter.next();
1388
        if (i != 2) {
1389
            errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
1390
        }
1391

1392

1393
        // Confirm changes in BreakIteratorRules_th.java have been reflected.
1394
        iter = BreakIterator.getLineInstance(new Locale("th", ""));
1395

1396
        /* Thai <Seven(Nd)>
1397
         *      <Left Double Quotation Mark(Pi)>
1398
         *      <Five(Nd)>
1399
         *      <Right Double Quotation Mark(Pf)>
1400
         *      <Three(Nd)>
1401
         */
1402
        iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
1403
        i = iter.first();
1404
        i = iter.next();
1405
        if (i != 1) {
1406
            errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
1407
        }
1408
        i = iter.next();
1409
        if (i != 4) {
1410
            errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
1411
        }
1412
    }
1413

1414
    /**
1415
     * Bug 4068137
1416
     */
1417
    public void TestEndBehavior()
1418
    {
1419
        String testString = "boo.";
1420
        BreakIterator wb = BreakIterator.getWordInstance();
1421
        wb.setText(testString);
1422

1423
        if (wb.first() != 0)
1424
            errln("Didn't get break at beginning of string.");
1425
        if (wb.next() != 3)
1426
            errln("Didn't get break before period in \"boo.\"");
1427
        if (wb.current() != 4 && wb.next() != 4)
1428
            errln("Didn't get break at end of string.");
1429
    }
1430

1431
    // [serialization test has been removed pursuant to bug #4152965]
1432

1433
    /**
1434
     * Bug 4450804
1435
     */
1436
    public void TestLineBreakContractions() {
1437
        Vector<String> expected = new Vector<String>();
1438

1439
        expected.add("These ");
1440
        expected.add("are ");
1441
        expected.add("'foobles'. ");
1442
        expected.add("Don't ");
1443
        expected.add("you ");
1444
        expected.add("like ");
1445
        expected.add("them?");
1446
        generalIteratorTest(lineBreak, expected);
1447
    }
1448

1449
}
1450

1451
Product

Resources

Company