CoCalc -- DictionaryBasedBreakIterator.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/sun/text/DictionaryBasedBreakIterator.java
⁴¹¹⁵² views
1
/*
2
 * Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25

26
/*
27
 *
28
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29
 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30
 *
31
 * The original version of this source code and documentation
32
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33
 * subsidiary of IBM. These materials are provided under terms
34
 * of a License Agreement between Taligent and Sun. This technology
35
 * is protected by multiple US and International patents.
36
 *
37
 * This notice and attribution to Taligent may not be removed.
38
 * Taligent is a registered trademark of Taligent, Inc.
39
 */
40

41
package sun.text;
42

43
import java.text.CharacterIterator;
44
import java.util.ArrayList;
45
import java.util.List;
46
import java.util.Stack;
47

48
/**
49
 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
50
 * to further subdivide ranges of text beyond what is possible using just the
51
 * state-table-based algorithm.  This is necessary, for example, to handle
52
 * word and line breaking in Thai, which doesn't use spaces between words.  The
53
 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
54
 * up text as far as possible, and then contiguous ranges of letters are
55
 * repeatedly compared against a list of known words (i.e., the dictionary)
56
 * to divide them up into words.
57
 *
58
 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
59
 * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
60
 * name is used to identify characters in words in the dictionary.  The idea is that
61
 * if the iterator passes over a chunk of text that includes two or more characters
62
 * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
63
 * derives additional break positions (if possible) using the dictionary.
64
 *
65
 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
66
 * file.  It follows a prescribed search path to locate the dictionary (right now,
67
 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
68
 * and won't find it in JAR files, but this location is likely to change).  The
69
 * dictionary file is in a serialized binary format.  We have a very primitive (and
70
 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
71
 * currently making it public.  Contact us for help.
72
 */
73
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
74

75
    /**
76
     * a list of known words that is used to divide up contiguous ranges of letters,
77
     * stored in a compressed, indexed, format that offers fast access
78
     */
79
    private BreakDictionary dictionary;
80

81
    /**
82
     * a list of flags indicating which character categories are contained in
83
     * the dictionary file (this is used to determine which ranges of characters
84
     * to apply the dictionary to)
85
     */
86
    private boolean[] categoryFlags;
87

88
    /**
89
     * a temporary hiding place for the number of dictionary characters in the
90
     * last range passed over by next()
91
     */
92
    private int dictionaryCharCount;
93

94
    /**
95
     * when a range of characters is divided up using the dictionary, the break
96
     * positions that are discovered are stored here, preventing us from having
97
     * to use either the dictionary or the state table again until the iterator
98
     * leaves this range of text
99
     */
100
    private int[] cachedBreakPositions;
101

102
    /**
103
     * if cachedBreakPositions is not null, this indicates which item in the
104
     * cache the current iteration position refers to
105
     */
106
    private int positionInCache;
107

108
    /**
109
     * Constructs a DictionaryBasedBreakIterator.
110
     *
111
     * @param ruleFile       the name of the rule data file
112
     * @param ruleData       the rule data loaded from the rule data file
113
     * @param dictionaryFile the name of the dictionary file
114
     * @param dictionaryData the dictionary data loaded from the dictionary file
115
     * @throws MissingResourceException if rule data or dictionary initialization failed
116
     */
117
    public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData,
118
                                        String dictionaryFile, byte[] dictionaryData) {
119
        super(ruleFile, ruleData);
120
        byte[] tmp = super.getAdditionalData();
121
        if (tmp != null) {
122
            prepareCategoryFlags(tmp);
123
            super.setAdditionalData(null);
124
        }
125
        dictionary = new BreakDictionary(dictionaryFile, dictionaryData);
126
    }
127

128
    private void prepareCategoryFlags(byte[] data) {
129
        categoryFlags = new boolean[data.length];
130
        for (int i = 0; i < data.length; i++) {
131
            categoryFlags[i] = (data[i] == (byte)1) ? true : false;
132
        }
133
    }
134

135
    @Override
136
    public void setText(CharacterIterator newText) {
137
        super.setText(newText);
138
        cachedBreakPositions = null;
139
        dictionaryCharCount = 0;
140
        positionInCache = 0;
141
    }
142

143
    /**
144
     * Sets the current iteration position to the beginning of the text.
145
     * (i.e., the CharacterIterator's starting offset).
146
     * @return The offset of the beginning of the text.
147
     */
148
    @Override
149
    public int first() {
150
        cachedBreakPositions = null;
151
        dictionaryCharCount = 0;
152
        positionInCache = 0;
153
        return super.first();
154
    }
155

156
    /**
157
     * Sets the current iteration position to the end of the text.
158
     * (i.e., the CharacterIterator's ending offset).
159
     * @return The text's past-the-end offset.
160
     */
161
    @Override
162
    public int last() {
163
        cachedBreakPositions = null;
164
        dictionaryCharCount = 0;
165
        positionInCache = 0;
166
        return super.last();
167
    }
168

169
    /**
170
     * Advances the iterator one step backwards.
171
     * @return The position of the last boundary position before the
172
     * current iteration position
173
     */
174
    @Override
175
    public int previous() {
176
        CharacterIterator text = getText();
177

178
        // if we have cached break positions and we're still in the range
179
        // covered by them, just move one step backward in the cache
180
        if (cachedBreakPositions != null && positionInCache > 0) {
181
            --positionInCache;
182
            text.setIndex(cachedBreakPositions[positionInCache]);
183
            return cachedBreakPositions[positionInCache];
184
        }
185

186
        // otherwise, dump the cache and use the inherited previous() method to move
187
        // backward.  This may fill up the cache with new break positions, in which
188
        // case we have to mark our position in the cache
189
        else {
190
            cachedBreakPositions = null;
191
            int result = super.previous();
192
            if (cachedBreakPositions != null) {
193
                positionInCache = cachedBreakPositions.length - 2;
194
            }
195
            return result;
196
        }
197
    }
198

199
    /**
200
     * Sets the current iteration position to the last boundary position
201
     * before the specified position.
202
     * @param offset The position to begin searching from
203
     * @return The position of the last boundary before "offset"
204
     */
205
    @Override
206
    public int preceding(int offset) {
207
        CharacterIterator text = getText();
208
        checkOffset(offset, text);
209

210
        // if we have no cached break positions, or "offset" is outside the
211
        // range covered by the cache, we can just call the inherited routine
212
        // (which will eventually call other routines in this class that may
213
        // refresh the cache)
214
        if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
215
                offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
216
            cachedBreakPositions = null;
217
            return super.preceding(offset);
218
        }
219

220
        // on the other hand, if "offset" is within the range covered by the cache,
221
        // then all we have to do is search the cache for the last break position
222
        // before "offset"
223
        else {
224
            positionInCache = 0;
225
            while (positionInCache < cachedBreakPositions.length
226
                   && offset > cachedBreakPositions[positionInCache]) {
227
                ++positionInCache;
228
            }
229
            --positionInCache;
230
            text.setIndex(cachedBreakPositions[positionInCache]);
231
            return text.getIndex();
232
        }
233
    }
234

235
    /**
236
     * Sets the current iteration position to the first boundary position after
237
     * the specified position.
238
     * @param offset The position to begin searching forward from
239
     * @return The position of the first boundary after "offset"
240
     */
241
    @Override
242
    public int following(int offset) {
243
        CharacterIterator text = getText();
244
        checkOffset(offset, text);
245

246
        // if we have no cached break positions, or if "offset" is outside the
247
        // range covered by the cache, then dump the cache and call our
248
        // inherited following() method.  This will call other methods in this
249
        // class that may refresh the cache.
250
        if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
251
                offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
252
            cachedBreakPositions = null;
253
            return super.following(offset);
254
        }
255

256
        // on the other hand, if "offset" is within the range covered by the
257
        // cache, then just search the cache for the first break position
258
        // after "offset"
259
        else {
260
            positionInCache = 0;
261
            while (positionInCache < cachedBreakPositions.length
262
                   && offset >= cachedBreakPositions[positionInCache]) {
263
                ++positionInCache;
264
            }
265
            text.setIndex(cachedBreakPositions[positionInCache]);
266
            return text.getIndex();
267
        }
268
    }
269

270
    /**
271
     * This is the implementation function for next().
272
     */
273
    @Override
274
    protected int handleNext() {
275
        CharacterIterator text = getText();
276

277
        // if there are no cached break positions, or if we've just moved
278
        // off the end of the range covered by the cache, we have to dump
279
        // and possibly regenerate the cache
280
        if (cachedBreakPositions == null ||
281
            positionInCache == cachedBreakPositions.length - 1) {
282

283
            // start by using the inherited handleNext() to find a tentative return
284
            // value.   dictionaryCharCount tells us how many dictionary characters
285
            // we passed over on our way to the tentative return value
286
            int startPos = text.getIndex();
287
            dictionaryCharCount = 0;
288
            int result = super.handleNext();
289

290
            // if we passed over more than one dictionary character, then we use
291
            // divideUpDictionaryRange() to regenerate the cached break positions
292
            // for the new range
293
            if (dictionaryCharCount > 1 && result - startPos > 1) {
294
                divideUpDictionaryRange(startPos, result);
295
            }
296

297
            // otherwise, the value we got back from the inherited fuction
298
            // is our return value, and we can dump the cache
299
            else {
300
                cachedBreakPositions = null;
301
                return result;
302
            }
303
        }
304

305
        // if the cache of break positions has been regenerated (or existed all
306
        // along), then just advance to the next break position in the cache
307
        // and return it
308
        if (cachedBreakPositions != null) {
309
            ++positionInCache;
310
            text.setIndex(cachedBreakPositions[positionInCache]);
311
            return cachedBreakPositions[positionInCache];
312
        }
313
        return -9999;   // SHOULD NEVER GET HERE!
314
    }
315

316
    /**
317
     * Looks up a character category for a character.
318
     */
319
    @Override
320
    protected int lookupCategory(int c) {
321
        // this override of lookupCategory() exists only to keep track of whether we've
322
        // passed over any dictionary characters.  It calls the inherited lookupCategory()
323
        // to do the real work, and then checks whether its return value is one of the
324
        // categories represented in the dictionary.  If it is, bump the dictionary-
325
        // character count.
326
        int result = super.lookupCategory(c);
327
        if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
328
            ++dictionaryCharCount;
329
        }
330
        return result;
331
    }
332

333
    /**
334
     * This is the function that actually implements the dictionary-based
335
     * algorithm.  Given the endpoints of a range of text, it uses the
336
     * dictionary to determine the positions of any boundaries in this
337
     * range.  It stores all the boundary positions it discovers in
338
     * cachedBreakPositions so that we only have to do this work once
339
     * for each time we enter the range.
340
     */
341
    @SuppressWarnings("unchecked")
342
    private void divideUpDictionaryRange(int startPos, int endPos) {
343
        CharacterIterator text = getText();
344

345
        // the range we're dividing may begin or end with non-dictionary characters
346
        // (i.e., for line breaking, we may have leading or trailing punctuation
347
        // that needs to be kept with the word).  Seek from the beginning of the
348
        // range to the first dictionary character
349
        text.setIndex(startPos);
350
        int c = getCurrent();
351
        int category = lookupCategory(c);
352
        while (category == IGNORE || !categoryFlags[category]) {
353
            c = getNext();
354
            category = lookupCategory(c);
355
        }
356

357
        // initialize.  We maintain two stacks: currentBreakPositions contains
358
        // the list of break positions that will be returned if we successfully
359
        // finish traversing the whole range now.  possibleBreakPositions lists
360
        // all other possible word ends we've passed along the way.  (Whenever
361
        // we reach an error [a sequence of characters that can't begin any word
362
        // in the dictionary], we back up, possibly delete some breaks from
363
        // currentBreakPositions, move a break from possibleBreakPositions
364
        // to currentBreakPositions, and start over from there.  This process
365
        // continues in this way until we either successfully make it all the way
366
        // across the range, or exhaust all of our combinations of break
367
        // positions.)
368
        Stack<Integer> currentBreakPositions = new Stack<>();
369
        Stack<Integer> possibleBreakPositions = new Stack<>();
370
        List<Integer> wrongBreakPositions = new ArrayList<>();
371

372
        // the dictionary is implemented as a trie, which is treated as a state
373
        // machine.  -1 represents the end of a legal word.  Every word in the
374
        // dictionary is represented by a path from the root node to -1.  A path
375
        // that ends in state 0 is an illegal combination of characters.
376
        int state = 0;
377

378
        // these two variables are used for error handling.  We keep track of the
379
        // farthest we've gotten through the range being divided, and the combination
380
        // of breaks that got us that far.  If we use up all possible break
381
        // combinations, the text contains an error or a word that's not in the
382
        // dictionary.  In this case, we "bless" the break positions that got us the
383
        // farthest as real break positions, and then start over from scratch with
384
        // the character where the error occurred.
385
        int farthestEndPoint = text.getIndex();
386
        Stack<Integer> bestBreakPositions = null;
387

388
        // initialize (we always exit the loop with a break statement)
389
        c = getCurrent();
390
        while (true) {
391

392
            // if we can transition to state "-1" from our current state, we're
393
            // on the last character of a legal word.  Push that position onto
394
            // the possible-break-positions stack
395
            if (dictionary.getNextState(state, 0) == -1) {
396
                possibleBreakPositions.push(text.getIndex());
397
            }
398

399
            // look up the new state to transition to in the dictionary
400
            state = dictionary.getNextStateFromCharacter(state, c);
401

402
            // if the character we're sitting on causes us to transition to
403
            // the "end of word" state, then it was a non-dictionary character
404
            // and we've successfully traversed the whole range.  Drop out
405
            // of the loop.
406
            if (state == -1) {
407
                currentBreakPositions.push(text.getIndex());
408
                break;
409
            }
410

411
            // if the character we're sitting on causes us to transition to
412
            // the error state, or if we've gone off the end of the range
413
            // without transitioning to the "end of word" state, we've hit
414
            // an error...
415
            else if (state == 0 || text.getIndex() >= endPos) {
416

417
                // if this is the farthest we've gotten, take note of it in
418
                // case there's an error in the text
419
                if (text.getIndex() > farthestEndPoint) {
420
                    farthestEndPoint = text.getIndex();
421

422
                    @SuppressWarnings("unchecked")
423
                    Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
424

425
                    bestBreakPositions = currentBreakPositionsCopy;
426
                }
427

428
                // wrongBreakPositions is a list of all break positions
429
                // we've tried starting that didn't allow us to traverse
430
                // all the way through the text.  Every time we pop a
431
                // break position off of currentBreakPositions, we put it
432
                // into wrongBreakPositions to avoid trying it again later.
433
                // If we make it to this spot, we're either going to back
434
                // up to a break in possibleBreakPositions and try starting
435
                // over from there, or we've exhausted all possible break
436
                // positions and are going to do the fallback procedure.
437
                // This loop prevents us from messing with anything in
438
                // possibleBreakPositions that didn't work as a starting
439
                // point the last time we tried it (this is to prevent a bunch of
440
                // repetitive checks from slowing down some extreme cases)
441
                while (!possibleBreakPositions.isEmpty()
442
                        && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
443
                    possibleBreakPositions.pop();
444
                }
445

446
                // if we've used up all possible break-position combinations, there's
447
                // an error or an unknown word in the text.  In this case, we start
448
                // over, treating the farthest character we've reached as the beginning
449
                // of the range, and "blessing" the break positions that got us that
450
                // far as real break positions
451
                if (possibleBreakPositions.isEmpty()) {
452
                    if (bestBreakPositions != null) {
453
                        currentBreakPositions = bestBreakPositions;
454
                        if (farthestEndPoint < endPos) {
455
                            text.setIndex(farthestEndPoint + 1);
456
                        }
457
                        else {
458
                            break;
459
                        }
460
                    }
461
                    else {
462
                        if ((currentBreakPositions.size() == 0 ||
463
                             currentBreakPositions.peek().intValue() != text.getIndex())
464
                            && text.getIndex() != startPos) {
465
                            currentBreakPositions.push(text.getIndex());
466
                        }
467
                        getNext();
468
                        currentBreakPositions.push(text.getIndex());
469
                    }
470
                }
471

472
                // if we still have more break positions we can try, then promote the
473
                // last break in possibleBreakPositions into currentBreakPositions,
474
                // and get rid of all entries in currentBreakPositions that come after
475
                // it.  Then back up to that position and start over from there (i.e.,
476
                // treat that position as the beginning of a new word)
477
                else {
478
                    Integer temp = possibleBreakPositions.pop();
479
                    Integer temp2 = null;
480
                    while (!currentBreakPositions.isEmpty() && temp.intValue() <
481
                           currentBreakPositions.peek().intValue()) {
482
                        temp2 = currentBreakPositions.pop();
483
                        wrongBreakPositions.add(temp2);
484
                    }
485
                    currentBreakPositions.push(temp);
486
                    text.setIndex(currentBreakPositions.peek().intValue());
487
                }
488

489
                // re-sync "c" for the next go-round, and drop out of the loop if
490
                // we've made it off the end of the range
491
                c = getCurrent();
492
                if (text.getIndex() >= endPos) {
493
                    break;
494
                }
495
            }
496

497
            // if we didn't hit any exceptional conditions on this last iteration,
498
            // just advance to the next character and loop
499
            else {
500
                c = getNext();
501
            }
502
        }
503

504
        // dump the last break position in the list, and replace it with the actual
505
        // end of the range (which may be the same character, or may be further on
506
        // because the range actually ended with non-dictionary characters we want to
507
        // keep with the word)
508
        if (!currentBreakPositions.isEmpty()) {
509
            currentBreakPositions.pop();
510
        }
511
        currentBreakPositions.push(endPos);
512

513
        // create a regular array to hold the break positions and copy
514
        // the break positions from the stack to the array (in addition,
515
        // our starting position goes into this array as a break position).
516
        // This array becomes the cache of break positions used by next()
517
        // and previous(), so this is where we actually refresh the cache.
518
        cachedBreakPositions = new int[currentBreakPositions.size() + 1];
519
        cachedBreakPositions[0] = startPos;
520

521
        for (int i = 0; i < currentBreakPositions.size(); i++) {
522
            cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
523
        }
524
        positionInCache = 0;
525
    }
526
}
527

528
Product

Resources

Company