CoCalc -- UnicodeSetStringSpan.java

GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java
⁴¹¹⁶¹ views
1
/*
2
 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25

26
/*
27
 ******************************************************************************
28
 *
29
 *   Copyright (C) 2009-2014, International Business Machines
30
 *   Corporation and others.  All Rights Reserved.
31
 *
32
 ******************************************************************************
33
 */
34

35
package jdk.internal.icu.impl;
36

37
import java.util.ArrayList;
38

39
import jdk.internal.icu.text.UTF16;
40
import jdk.internal.icu.text.UnicodeSet;
41
import jdk.internal.icu.text.UnicodeSet.SpanCondition;
42
import jdk.internal.icu.util.OutputInt;
43

44
/*
45
 * Implement span() etc. for a set with strings.
46
 * Avoid recursion because of its exponential complexity.
47
 * Instead, try multiple paths at once and track them with an IndexList.
48
 */
49
public class UnicodeSetStringSpan {
50

51
    /*
52
     * Which span() variant will be used? The object is either built for one variant and used once,
53
     * or built for all and may be used many times.
54
     */
55
    public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
56
    public static final int FWD           = 0x20;
57
    public static final int BACK          = 0x10;
58
    // public static final int UTF16      = 8;
59
    public static final int CONTAINED     = 2;
60
    public static final int NOT_CONTAINED = 1;
61

62
    public static final int ALL = 0x7f;
63

64
    public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
65
    public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
66
    public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
67
    public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
68

69
    /**
70
     * Special spanLength short values. (since Java has no unsigned byte type)
71
     * All code points in the string are contained in the parent set.
72
     */
73
    static final short ALL_CP_CONTAINED = 0xff;
74

75
    /** The spanLength is >=0xfe. */
76
    static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
77

78
    /** Set for span(). Same as parent but without strings. */
79
    private UnicodeSet spanSet;
80

81
    /**
82
     * Set for span(not contained).
83
     * Same as spanSet, plus characters that start or end strings.
84
     */
85
    private UnicodeSet spanNotSet;
86

87
    /** The strings of the parent set. */
88
    private ArrayList<String> strings;
89

90
    /** The lengths of span(), spanBack() etc. for each string. */
91
    private short[] spanLengths;
92

93
    /** Maximum lengths of relevant strings. */
94
    private int maxLength16;
95

96
    /** Are there strings that are not fully contained in the code point set? */
97
    private boolean someRelevant;
98

99
    /** Set up for all variants of span()? */
100
    private boolean all;
101

102
    /** Span helper */
103
    private OffsetList offsets;
104

105
    /**
106
     * Constructs for all variants of span(), or only for any one variant.
107
     * Initializes as little as possible, for single use.
108
     */
109
    public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
110
        spanSet = new UnicodeSet(0, 0x10ffff);
111
        // TODO: With Java 6, just take the parent set's strings as is,
112
        // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
113
        // Then iterate via the first() and higher() methods.
114
        // (We do not want to create multiple Iterator objects in each span().)
115
        // See ICU ticket #7454.
116
        strings = setStrings;
117
        all = (which == ALL);
118
        spanSet.retainAll(set);
119
        if (0 != (which & NOT_CONTAINED)) {
120
            // Default to the same sets.
121
            // addToSpanNotSet() will create a separate set if necessary.
122
            spanNotSet = spanSet;
123
        }
124
        offsets = new OffsetList();
125

126
        // Determine if the strings even need to be taken into account at all for span() etc.
127
        // If any string is relevant, then all strings need to be used for
128
        // span(longest match) but only the relevant ones for span(while contained).
129
        // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
130
        // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
131
        // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
132
        // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
133
        int stringsLength = strings.size();
134

135
        int i, spanLength;
136
        someRelevant = false;
137
        for (i = 0; i < stringsLength; ++i) {
138
            String string = strings.get(i);
139
            int length16 = string.length();
140
            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
141
            if (spanLength < length16) { // Relevant string.
142
                someRelevant = true;
143
            }
144
            if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
145
                maxLength16 = length16;
146
            }
147
        }
148
        if (!someRelevant && (which & WITH_COUNT) == 0) {
149
            return;
150
        }
151

152
        // Freeze after checking for the need to use strings at all because freezing
153
        // a set takes some time and memory which are wasted if there are no relevant strings.
154
        if (all) {
155
            spanSet.freeze();
156
        }
157

158
        int spanBackLengthsOffset;
159

160
        // Allocate a block of meta data.
161
        int allocSize;
162
        if (all) {
163
            // 2 sets of span lengths
164
            allocSize = stringsLength * (2);
165
        } else {
166
            allocSize = stringsLength; // One set of span lengths.
167
        }
168
        spanLengths = new short[allocSize];
169

170
        if (all) {
171
            // Store span lengths for all span() variants.
172
            spanBackLengthsOffset = stringsLength;
173
        } else {
174
            // Store span lengths for only one span() variant.
175
            spanBackLengthsOffset = 0;
176
        }
177

178
        // Set the meta data and spanNotSet and write the UTF-8 strings.
179

180
        for (i = 0; i < stringsLength; ++i) {
181
            String string = strings.get(i);
182
            int length16 = string.length();
183
            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
184
            if (spanLength < length16) { // Relevant string.
185
                if (true /* 0 != (which & UTF16) */) {
186
                    if (0 != (which & CONTAINED)) {
187
                        if (0 != (which & FWD)) {
188
                            spanLengths[i] = makeSpanLengthByte(spanLength);
189
                        }
190
                        if (0 != (which & BACK)) {
191
                            spanLength = length16
192
                                    - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
193
                            spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
194
                        }
195
                    } else /* not CONTAINED, not all, but NOT_CONTAINED */{
196
                        spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
197
                                                                                     // flag.
198
                    }
199
                }
200
                if (0 != (which & NOT_CONTAINED)) {
201
                    // Add string start and end code points to the spanNotSet so that
202
                    // a span(while not contained) stops before any string.
203
                    int c;
204
                    if (0 != (which & FWD)) {
205
                        c = string.codePointAt(0);
206
                        addToSpanNotSet(c);
207
                    }
208
                    if (0 != (which & BACK)) {
209
                        c = string.codePointBefore(length16);
210
                        addToSpanNotSet(c);
211
                    }
212
                }
213
            } else { // Irrelevant string.
214
                if (all) {
215
                    spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
216
                } else {
217
                    // All spanXYZLengths pointers contain the same address.
218
                    spanLengths[i] = ALL_CP_CONTAINED;
219
                }
220
            }
221
        }
222

223
        // Finish.
224
        if (all) {
225
            spanNotSet.freeze();
226
        }
227
    }
228

229
    /**
230
     * Do the strings need to be checked in span() etc.?
231
     *
232
     * @return true if strings need to be checked (call span() here),
233
     *         false if not (use a BMPSet for best performance).
234
     */
235
    public boolean needsStringSpanUTF16() {
236
        return someRelevant;
237
    }
238

239
    /** For fast UnicodeSet::contains(c). */
240
    public boolean contains(int c) {
241
        return spanSet.contains(c);
242
    }
243

244
    /**
245
     * Adds a starting or ending string character to the spanNotSet
246
     * so that a character span ends before any string.
247
     */
248
    private void addToSpanNotSet(int c) {
249
        if (spanNotSet == null || spanNotSet == spanSet) {
250
            if (spanSet.contains(c)) {
251
                return; // Nothing to do.
252
            }
253
            spanNotSet = spanSet.cloneAsThawed();
254
        }
255
        spanNotSet.add(c);
256
    }
257

258
    /*
259
     * Note: In span() when spanLength==0
260
     * (after a string match, or at the beginning after an empty code point span)
261
     * and in spanNot() and spanNotUTF8(),
262
     * string matching could use a binary search because all string matches are done
263
     * from the same start index.
264
     *
265
     * For UTF-8, this would require a comparison function that returns UTF-16 order.
266
     *
267
     * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
268
     * with strings have very few very short strings. For cases with many strings, it might be better to use a different
269
     * API and implementation with a DFA (state machine).
270
     */
271

272
    /*
273
     * Algorithm for span(SpanCondition.CONTAINED)
274
     *
275
     * Theoretical algorithm:
276
     * - Iterate through the string, and at each code point boundary:
277
     *   + If the code point there is in the set, then remember to continue after it.
278
     *   + If a set string matches at the current position, then remember to continue after it.
279
     *   + Either recursively span for each code point or string match, or recursively span
280
     *     for all but the shortest one and iteratively continue the span with the shortest local match.
281
     *   + Remember the longest recursive span (the farthest end point).
282
     *   + If there is no match at the current position,
283
     *     neither for the code point there nor for any set string,
284
     *     then stop and return the longest recursive span length.
285
     *
286
     * Optimized implementation:
287
     *
288
     * (We assume that most sets will have very few very short strings.
289
     * A span using a string-less set is extremely fast.)
290
     *
291
     * Create and cache a spanSet which contains all of the single code points of the original set
292
     * but none of its strings.
293
     *
294
     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
295
     * - Loop:
296
     *   + Try to match each set string at the end of the spanLength.
297
     *     ~ Set strings that start with set-contained code points
298
     *       must be matched with a partial overlap
299
     *       because the recursive algorithm would have tried to match them at every position.
300
     *     ~ Set strings that entirely consist of set-contained code points
301
     *       are irrelevant for span(SpanCondition.CONTAINED)
302
     *       because the recursive algorithm would continue after them anyway and
303
     *       find the longest recursive match from their end.
304
     *     ~ Rather than recursing, note each end point of a set string match.
305
     *   + If no set string matched after spanSet.span(),
306
     *     then return with where the spanSet.span() ended.
307
     *   + If at least one set string matched after spanSet.span(),
308
     *     then pop the shortest string match end point and continue the loop,
309
     *     trying to match all set strings from there.
310
     *   + If at least one more set string matched after a previous string match, then test if the
311
     *     code point after the previous string match is also contained in the set.
312
     *     Continue the loop with the shortest end point of
313
     *     either this code point or a matching set string.
314
     *   + If no more set string matched after a previous string match,
315
     *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
316
     *     Stop if spanLength==0, otherwise continue the loop.
317
     *
318
     * By noting each end point of a set string match, the function visits each string position at most once and
319
     * finishes in linear time.
320
     *
321
     * The recursive algorithm may visit the same string position many times
322
     * if multiple paths lead to it and finishes in exponential time.
323
     */
324

325
    /*
326
     * Algorithm for span(SIMPLE)
327
     *
328
     * Theoretical algorithm:
329
     * - Iterate through the string, and at each code point boundary:
330
     *   + If the code point there is in the set, then remember to continue after it.
331
     *   + If a set string matches at the current position, then remember to continue after it.
332
     *   + Continue from the farthest match position and ignore all others.
333
     *   + If there is no match at the current position, then stop and return the current position.
334
     *
335
     * Optimized implementation:
336
     *
337
     * (Same assumption and spanSet as above.)
338
     *
339
     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
340
     * - Loop:
341
     *   + Try to match each set string at the end of the spanLength.
342
     *     ~ Set strings that start with set-contained code points
343
     *       must be matched with a partial overlap
344
     *       because the standard algorithm would have tried to match them earlier.
345
     *     ~ Set strings that entirely consist of set-contained code points
346
     *       must be matched with a full overlap because the longest-match algorithm
347
     *       would hide set string matches that end earlier.
348
     *       Such set strings need not be matched earlier inside the code point span
349
     *       because the standard algorithm would then have
350
     *       continued after the set string match anyway.
351
     *     ~ Remember the longest set string match (farthest end point)
352
     *       from the earliest starting point.
353
     *   + If no set string matched after spanSet.span(),
354
     *     then return with where the spanSet.span() ended.
355
     *   + If at least one set string matched,
356
     *     then continue the loop after the longest match from the earliest position.
357
     *   + If no more set string matched after a previous string match,
358
     *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
359
     *     Stop if spanLength==0, otherwise continue the loop.
360
     */
361
    /**
362
     * Spans a string.
363
     *
364
     * @param s The string to be spanned
365
     * @param start The start index that the span begins
366
     * @param spanCondition The span condition
367
     * @return the limit (exclusive end) of the span
368
     */
369
    public int span(CharSequence s, int start, SpanCondition spanCondition) {
370
        if (spanCondition == SpanCondition.NOT_CONTAINED) {
371
            return spanNot(s, start, null);
372
        }
373
        int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
374
        if (spanLimit == s.length()) {
375
            return spanLimit;
376
        }
377
        return spanWithStrings(s, start, spanLimit, spanCondition);
378
    }
379

380
    /**
381
     * Synchronized method for complicated spans using the offsets.
382
     * Avoids synchronization for simple cases.
383
     *
384
     * @param spanLimit = spanSet.span(s, start, CONTAINED)
385
     */
386
    private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
387
            SpanCondition spanCondition) {
388
        // Consider strings; they may overlap with the span.
389
        int initSize = 0;
390
        if (spanCondition == SpanCondition.CONTAINED) {
391
            // Use offset list to try all possibilities.
392
            initSize = maxLength16;
393
        }
394
        offsets.setMaxLength(initSize);
395
        int length = s.length();
396
        int pos = spanLimit, rest = length - spanLimit;
397
        int spanLength = spanLimit - start;
398
        int i, stringsLength = strings.size();
399
        for (;;) {
400
            if (spanCondition == SpanCondition.CONTAINED) {
401
                for (i = 0; i < stringsLength; ++i) {
402
                    int overlap = spanLengths[i];
403
                    if (overlap == ALL_CP_CONTAINED) {
404
                        continue; // Irrelevant string.
405
                    }
406
                    String string = strings.get(i);
407

408
                    int length16 = string.length();
409

410
                    // Try to match this string at pos-overlap..pos.
411
                    if (overlap >= LONG_SPAN) {
412
                        overlap = length16;
413
                        // While contained: No point matching fully inside the code point span.
414
                        overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
415
                                                                          // point.
416
                    }
417
                    if (overlap > spanLength) {
418
                        overlap = spanLength;
419
                    }
420
                    int inc = length16 - overlap; // Keep overlap+inc==length16.
421
                    for (;;) {
422
                        if (inc > rest) {
423
                            break;
424
                        }
425
                        // Try to match if the increment is not listed already.
426
                        if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
427
                            if (inc == rest) {
428
                                return length; // Reached the end of the string.
429
                            }
430
                            offsets.addOffset(inc);
431
                        }
432
                        if (overlap == 0) {
433
                            break;
434
                        }
435
                        --overlap;
436
                        ++inc;
437
                    }
438
                }
439
            } else /* SIMPLE */{
440
                int maxInc = 0, maxOverlap = 0;
441
                for (i = 0; i < stringsLength; ++i) {
442
                    int overlap = spanLengths[i];
443
                    // For longest match, we do need to try to match even an all-contained string
444
                    // to find the match from the earliest start.
445

446
                    String string = strings.get(i);
447

448
                    int length16 = string.length();
449

450
                    // Try to match this string at pos-overlap..pos.
451
                    if (overlap >= LONG_SPAN) {
452
                        overlap = length16;
453
                        // Longest match: Need to match fully inside the code point span
454
                        // to find the match from the earliest start.
455
                    }
456
                    if (overlap > spanLength) {
457
                        overlap = spanLength;
458
                    }
459
                    int inc = length16 - overlap; // Keep overlap+inc==length16.
460
                    for (;;) {
461
                        if (inc > rest || overlap < maxOverlap) {
462
                            break;
463
                        }
464
                        // Try to match if the string is longer or starts earlier.
465
                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
466
                                && matches16CPB(s, pos - overlap, length, string, length16)) {
467
                            maxInc = inc; // Longest match from earliest start.
468
                            maxOverlap = overlap;
469
                            break;
470
                        }
471
                        --overlap;
472
                        ++inc;
473
                    }
474
                }
475

476
                if (maxInc != 0 || maxOverlap != 0) {
477
                    // Longest-match algorithm, and there was a string match.
478
                    // Simply continue after it.
479
                    pos += maxInc;
480
                    rest -= maxInc;
481
                    if (rest == 0) {
482
                        return length; // Reached the end of the string.
483
                    }
484
                    spanLength = 0; // Match strings from after a string match.
485
                    continue;
486
                }
487
            }
488
            // Finished trying to match all strings at pos.
489

490
            if (spanLength != 0 || pos == 0) {
491
                // The position is after an unlimited code point span (spanLength!=0),
492
                // not after a string match.
493
                // The only position where spanLength==0 after a span is pos==0.
494
                // Otherwise, an unlimited code point span is only tried again when no
495
                // strings match, and if such a non-initial span fails we stop.
496
                if (offsets.isEmpty()) {
497
                    return pos; // No strings matched after a span.
498
                }
499
                // Match strings from after the next string match.
500
            } else {
501
                // The position is after a string match (or a single code point).
502
                if (offsets.isEmpty()) {
503
                    // No more strings matched after a previous string match.
504
                    // Try another code point span from after the last string match.
505
                    spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
506
                    spanLength = spanLimit - pos;
507
                    if (spanLength == rest || // Reached the end of the string, or
508
                            spanLength == 0 // neither strings nor span progressed.
509
                    ) {
510
                        return spanLimit;
511
                    }
512
                    pos += spanLength;
513
                    rest -= spanLength;
514
                    continue; // spanLength>0: Match strings from after a span.
515
                } else {
516
                    // Try to match only one code point from after a string match if some
517
                    // string matched beyond it, so that we try all possible positions
518
                    // and don't overshoot.
519
                    spanLength = spanOne(spanSet, s, pos, rest);
520
                    if (spanLength > 0) {
521
                        if (spanLength == rest) {
522
                            return length; // Reached the end of the string.
523
                        }
524
                        // Match strings after this code point.
525
                        // There cannot be any increments below it because UnicodeSet strings
526
                        // contain multiple code points.
527
                        pos += spanLength;
528
                        rest -= spanLength;
529
                        offsets.shift(spanLength);
530
                        spanLength = 0;
531
                        continue; // Match strings from after a single code point.
532
                    }
533
                    // Match strings from after the next string match.
534
                }
535
            }
536
            int minOffset = offsets.popMinimum(null);
537
            pos += minOffset;
538
            rest -= minOffset;
539
            spanLength = 0; // Match strings from after a string match.
540
        }
541
    }
542

543
    /**
544
     * Spans a string and counts the smallest number of set elements on any path across the span.
545
     *
546
     * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
547
     *
548
     * <p>If the set does not have any fully-contained strings, then we could optimize this
549
     * like span(), but such sets are likely rare, and this is at least still linear.
550
     *
551
     * @param s The string to be spanned
552
     * @param start The start index that the span begins
553
     * @param spanCondition The span condition
554
     * @param outCount The count
555
     * @return the limit (exclusive end) of the span
556
     */
557
    public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
558
            OutputInt outCount) {
559
        if (spanCondition == SpanCondition.NOT_CONTAINED) {
560
            return spanNot(s, start, outCount);
561
        }
562
        // Consider strings; they may overlap with the span,
563
        // and they may result in a smaller count that with just code points.
564
        if (spanCondition == SpanCondition.CONTAINED) {
565
            return spanContainedAndCount(s, start, outCount);
566
        }
567
        // SIMPLE (not synchronized, does not use offsets)
568
        int stringsLength = strings.size();
569
        int length = s.length();
570
        int pos = start;
571
        int rest = length - start;
572
        int count = 0;
573
        while (rest != 0) {
574
            // Try to match the next code point.
575
            int cpLength = spanOne(spanSet, s, pos, rest);
576
            int maxInc = (cpLength > 0) ? cpLength : 0;
577
            // Try to match all of the strings.
578
            for (int i = 0; i < stringsLength; ++i) {
579
                String string = strings.get(i);
580
                int length16 = string.length();
581
                if (maxInc < length16 && length16 <= rest &&
582
                        matches16CPB(s, pos, length, string, length16)) {
583
                    maxInc = length16;
584
                }
585
            }
586
            // We are done if there is no match beyond pos.
587
            if (maxInc == 0) {
588
                outCount.value = count;
589
                return pos;
590
            }
591
            // Continue from the longest match.
592
            ++count;
593
            pos += maxInc;
594
            rest -= maxInc;
595
        }
596
        outCount.value = count;
597
        return pos;
598
    }
599

600
    private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
601
        // Use offset list to try all possibilities.
602
        offsets.setMaxLength(maxLength16);
603
        int stringsLength = strings.size();
604
        int length = s.length();
605
        int pos = start;
606
        int rest = length - start;
607
        int count = 0;
608
        while (rest != 0) {
609
            // Try to match the next code point.
610
            int cpLength = spanOne(spanSet, s, pos, rest);
611
            if (cpLength > 0) {
612
                offsets.addOffsetAndCount(cpLength, count + 1);
613
            }
614
            // Try to match all of the strings.
615
            for (int i = 0; i < stringsLength; ++i) {
616
                String string = strings.get(i);
617
                int length16 = string.length();
618
                // Note: If the strings were sorted by length, then we could also
619
                // avoid trying to match if there is already a match of the same length.
620
                if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
621
                        matches16CPB(s, pos, length, string, length16)) {
622
                    offsets.addOffsetAndCount(length16, count + 1);
623
                }
624
            }
625
            // We are done if there is no match beyond pos.
626
            if (offsets.isEmpty()) {
627
                outCount.value = count;
628
                return pos;
629
            }
630
            // Continue from the nearest match.
631
            int minOffset = offsets.popMinimum(outCount);
632
            count = outCount.value;
633
            pos += minOffset;
634
            rest -= minOffset;
635
        }
636
        outCount.value = count;
637
        return pos;
638
    }
639

640
    /**
641
     * Span a string backwards.
642
     *
643
     * @param s The string to be spanned
644
     * @param spanCondition The span condition
645
     * @return The string index which starts the span (i.e. inclusive).
646
     */
647
    public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
648
        if (spanCondition == SpanCondition.NOT_CONTAINED) {
649
            return spanNotBack(s, length);
650
        }
651
        int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
652
        if (pos == 0) {
653
            return 0;
654
        }
655
        int spanLength = length - pos;
656

657
        // Consider strings; they may overlap with the span.
658
        int initSize = 0;
659
        if (spanCondition == SpanCondition.CONTAINED) {
660
            // Use offset list to try all possibilities.
661
            initSize = maxLength16;
662
        }
663
        offsets.setMaxLength(initSize);
664
        int i, stringsLength = strings.size();
665
        int spanBackLengthsOffset = 0;
666
        if (all) {
667
            spanBackLengthsOffset = stringsLength;
668
        }
669
        for (;;) {
670
            if (spanCondition == SpanCondition.CONTAINED) {
671
                for (i = 0; i < stringsLength; ++i) {
672
                    int overlap = spanLengths[spanBackLengthsOffset + i];
673
                    if (overlap == ALL_CP_CONTAINED) {
674
                        continue; // Irrelevant string.
675
                    }
676
                    String string = strings.get(i);
677

678
                    int length16 = string.length();
679

680
                    // Try to match this string at pos-(length16-overlap)..pos-length16.
681
                    if (overlap >= LONG_SPAN) {
682
                        overlap = length16;
683
                        // While contained: No point matching fully inside the code point span.
684
                        int len1 = 0;
685
                        len1 = string.offsetByCodePoints(0, 1);
686
                        overlap -= len1; // Length of the string minus the first code point.
687
                    }
688
                    if (overlap > spanLength) {
689
                        overlap = spanLength;
690
                    }
691
                    int dec = length16 - overlap; // Keep dec+overlap==length16.
692
                    for (;;) {
693
                        if (dec > pos) {
694
                            break;
695
                        }
696
                        // Try to match if the decrement is not listed already.
697
                        if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
698
                            if (dec == pos) {
699
                                return 0; // Reached the start of the string.
700
                            }
701
                            offsets.addOffset(dec);
702
                        }
703
                        if (overlap == 0) {
704
                            break;
705
                        }
706
                        --overlap;
707
                        ++dec;
708
                    }
709
                }
710
            } else /* SIMPLE */{
711
                int maxDec = 0, maxOverlap = 0;
712
                for (i = 0; i < stringsLength; ++i) {
713
                    int overlap = spanLengths[spanBackLengthsOffset + i];
714
                    // For longest match, we do need to try to match even an all-contained string
715
                    // to find the match from the latest end.
716

717
                    String string = strings.get(i);
718

719
                    int length16 = string.length();
720

721
                    // Try to match this string at pos-(length16-overlap)..pos-length16.
722
                    if (overlap >= LONG_SPAN) {
723
                        overlap = length16;
724
                        // Longest match: Need to match fully inside the code point span
725
                        // to find the match from the latest end.
726
                    }
727
                    if (overlap > spanLength) {
728
                      overlap = spanLength;
729
                    }
730
                    int dec = length16 - overlap; // Keep dec+overlap==length16.
731
                    for (;;) {
732
                        if (dec > pos || overlap < maxOverlap) {
733
                            break;
734
                        }
735
                        // Try to match if the string is longer or ends later.
736
                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
737
                                && matches16CPB(s, pos - dec, length, string, length16)) {
738
                            maxDec = dec; // Longest match from latest end.
739
                            maxOverlap = overlap;
740
                            break;
741
                        }
742
                        --overlap;
743
                        ++dec;
744
                    }
745
                }
746

747
                if (maxDec != 0 || maxOverlap != 0) {
748
                    // Longest-match algorithm, and there was a string match.
749
                    // Simply continue before it.
750
                    pos -= maxDec;
751
                    if (pos == 0) {
752
                        return 0; // Reached the start of the string.
753
                    }
754
                    spanLength = 0; // Match strings from before a string match.
755
                    continue;
756
                }
757
            }
758
            // Finished trying to match all strings at pos.
759

760
            if (spanLength != 0 || pos == length) {
761
                // The position is before an unlimited code point span (spanLength!=0),
762
                // not before a string match.
763
                // The only position where spanLength==0 before a span is pos==length.
764
                // Otherwise, an unlimited code point span is only tried again when no
765
                // strings match, and if such a non-initial span fails we stop.
766
                if (offsets.isEmpty()) {
767
                    return pos; // No strings matched before a span.
768
                }
769
                // Match strings from before the next string match.
770
            } else {
771
                // The position is before a string match (or a single code point).
772
                if (offsets.isEmpty()) {
773
                    // No more strings matched before a previous string match.
774
                    // Try another code point span from before the last string match.
775
                    int oldPos = pos;
776
                    pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
777
                    spanLength = oldPos - pos;
778
                    if (pos == 0 || // Reached the start of the string, or
779
                            spanLength == 0 // neither strings nor span progressed.
780
                    ) {
781
                        return pos;
782
                    }
783
                    continue; // spanLength>0: Match strings from before a span.
784
                } else {
785
                    // Try to match only one code point from before a string match if some
786
                    // string matched beyond it, so that we try all possible positions
787
                    // and don't overshoot.
788
                    spanLength = spanOneBack(spanSet, s, pos);
789
                    if (spanLength > 0) {
790
                        if (spanLength == pos) {
791
                            return 0; // Reached the start of the string.
792
                        }
793
                        // Match strings before this code point.
794
                        // There cannot be any decrements below it because UnicodeSet strings
795
                        // contain multiple code points.
796
                        pos -= spanLength;
797
                        offsets.shift(spanLength);
798
                        spanLength = 0;
799
                        continue; // Match strings from before a single code point.
800
                    }
801
                    // Match strings from before the next string match.
802
                }
803
            }
804
            pos -= offsets.popMinimum(null);
805
            spanLength = 0; // Match strings from before a string match.
806
        }
807
    }
808

809
    /**
810
     * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
811
     *
812
     * Theoretical algorithm:
813
     * - Iterate through the string, and at each code point boundary:
814
     *   + If the code point there is in the set, then return with the current position.
815
     *   + If a set string matches at the current position, then return with the current position.
816
     *
817
     * Optimized implementation:
818
     *
819
     * (Same assumption as for span() above.)
820
     *
821
     * Create and cache a spanNotSet which contains
822
     * all of the single code points of the original set but none of its strings.
823
     * For each set string add its initial code point to the spanNotSet.
824
     * (Also add its final code point for spanNotBack().)
825
     *
826
     * - Loop:
827
     *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
828
     *   + If the current code point is in the original set, then return the current position.
829
     *   + If any set string matches at the current position, then return the current position.
830
     *   + If there is no match at the current position, neither for the code point
831
     *     there nor for any set string, then skip this code point and continue the loop.
832
     *     This happens for set-string-initial code points that were added to spanNotSet
833
     *     when there is not actually a match for such a set string.
834
     *
835
     * @param s The string to be spanned
836
     * @param start The start index that the span begins
837
     * @param outCount If not null: Receives the number of code points across the span.
838
     * @return the limit (exclusive end) of the span
839
     */
840
    private int spanNot(CharSequence s, int start, OutputInt outCount) {
841
        int length = s.length();
842
        int pos = start, rest = length - start;
843
        int stringsLength = strings.size();
844
        int count = 0;
845
        do {
846
            // Span until we find a code point from the set,
847
            // or a code point that starts or ends some string.
848
            int spanLimit;
849
            if (outCount == null) {
850
                spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
851
            } else {
852
                spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
853
                outCount.value = count = count + outCount.value;
854
            }
855
            if (spanLimit == length) {
856
                return length; // Reached the end of the string.
857
            }
858
            pos = spanLimit;
859
            rest = length - spanLimit;
860

861
            // Check whether the current code point is in the original set,
862
            // without the string starts and ends.
863
            int cpLength = spanOne(spanSet, s, pos, rest);
864
            if (cpLength > 0) {
865
                return pos; // There is a set element at pos.
866
            }
867

868
            // Try to match the strings at pos.
869
            for (int i = 0; i < stringsLength; ++i) {
870
                if (spanLengths[i] == ALL_CP_CONTAINED) {
871
                    continue; // Irrelevant string.
872
                }
873
                String string = strings.get(i);
874

875
                int length16 = string.length();
876
                if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
877
                    return pos; // There is a set element at pos.
878
                }
879
            }
880

881
            // The span(while not contained) ended on a string start/end which is
882
            // not in the original set. Skip this code point and continue.
883
            // cpLength<0
884
            pos -= cpLength;
885
            rest += cpLength;
886
            ++count;
887
        } while (rest != 0);
888
        if (outCount != null) {
889
            outCount.value = count;
890
        }
891
        return length; // Reached the end of the string.
892
    }
893

894
    private int spanNotBack(CharSequence s, int length) {
895
        int pos = length;
896
        int i, stringsLength = strings.size();
897
        do {
898
            // Span until we find a code point from the set,
899
            // or a code point that starts or ends some string.
900
            pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
901
            if (pos == 0) {
902
                return 0; // Reached the start of the string.
903
            }
904

905
            // Check whether the current code point is in the original set,
906
            // without the string starts and ends.
907
            int cpLength = spanOneBack(spanSet, s, pos);
908
            if (cpLength > 0) {
909
                return pos; // There is a set element at pos.
910
            }
911

912
            // Try to match the strings at pos.
913
            for (i = 0; i < stringsLength; ++i) {
914
                // Use spanLengths rather than a spanLengths pointer because
915
                // it is easier and we only need to know whether the string is irrelevant
916
                // which is the same in either array.
917
                if (spanLengths[i] == ALL_CP_CONTAINED) {
918
                    continue; // Irrelevant string.
919
                }
920
                String string = strings.get(i);
921

922
                int length16 = string.length();
923
                if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
924
                    return pos; // There is a set element at pos.
925
                }
926
            }
927

928
            // The span(while not contained) ended on a string start/end which is
929
            // not in the original set. Skip this code point and continue.
930
            // cpLength<0
931
            pos += cpLength;
932
        } while (pos != 0);
933
        return 0; // Reached the start of the string.
934
    }
935

936
    static short makeSpanLengthByte(int spanLength) {
937
        // 0xfe==UnicodeSetStringSpan::LONG_SPAN
938
        return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
939
    }
940

941
    // Compare strings without any argument checks. Requires length>0.
942
    private static boolean matches16(CharSequence s, int start, final String t, int length) {
943
        int end = start + length;
944
        while (length-- > 0) {
945
            if (s.charAt(--end) != t.charAt(length)) {
946
                return false;
947
            }
948
        }
949
        return true;
950
    }
951

952
    /**
953
     * Compare 16-bit Unicode strings (which may be malformed UTF-16)
954
     * at code point boundaries.
955
     * That is, each edge of a match must not be in the middle of a surrogate pair.
956
     * @param s       The string to match in.
957
     * @param start   The start index of s.
958
     * @param limit   The limit of the subsequence of s being spanned.
959
     * @param t       The substring to be matched in s.
960
     * @param tlength The length of t.
961
     */
962
    static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
963
        return matches16(s, start, t, tlength)
964
                && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
965
                        Character.isLowSurrogate(s.charAt(start)))
966
                && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
967
                        Character.isLowSurrogate(s.charAt(start + tlength)));
968
    }
969

970
    /**
971
     * Does the set contain the next code point?
972
     * If so, return its length; otherwise return its negative length.
973
     */
974
    static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
975
        char c = s.charAt(start);
976
        if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
977
            char c2 = s.charAt(start + 1);
978
            if (UTF16.isTrailSurrogate(c2)) {
979
                int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
980
                return set.contains(supplementary) ? 2 : -2;
981
            }
982
        }
983
        return set.contains(c) ? 1 : -1;
984
    }
985

986
    static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
987
        char c = s.charAt(length - 1);
988
        if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
989
            char c2 = s.charAt(length - 2);
990
            if (UTF16.isLeadSurrogate(c2)) {
991
                int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
992
                return set.contains(supplementary) ? 2 : -2;
993
            }
994
        }
995
        return set.contains(c) ? 1 : -1;
996
    }
997

998
    /**
999
     * Helper class for UnicodeSetStringSpan.
1000
     *
1001
     * <p>List of offsets from the current position from where to try matching
1002
     * a code point or a string.
1003
     * Stores offsets rather than indexes to simplify the code and use the same list
1004
     * for both increments (in span()) and decrements (in spanBack()).
1005
     *
1006
     * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1007
     * are relatively dense, that is,
1008
     * there are normally no gaps of hundreds or thousands of offset values.
1009
     *
1010
     * <p>This class optionally also tracks the minimum non-negative count for each position,
1011
     * intended to count the smallest number of elements of any path leading to that position.
1012
     *
1013
     * <p>The implementation uses a circular buffer of count integers,
1014
     * each indicating whether the corresponding offset is in the list,
1015
     * and its path element count.
1016
     * This avoids inserting into a sorted list of offsets (or absolute indexes)
1017
     * and physically moving part of the list.
1018
     *
1019
     * <p>Note: In principle, the caller should setMaxLength() to
1020
     * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1021
     * to account for "long" single code points.
1022
     *
1023
     * <p>Note: An earlier version did not track counts and stored only byte flags.
1024
     * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1025
     * the list could be stored as bit flags in a single integer.
1026
     * Rather than handling a circular buffer with a start list index,
1027
     * the integer would simply be shifted when lower offsets are removed.
1028
     * UnicodeSet does not have a limit on the lengths of strings.
1029
     */
1030
    private static final class OffsetList {
1031
        private int[] list;
1032
        private int length;
1033
        private int start;
1034

1035
        public OffsetList() {
1036
            list = new int[16];  // default size
1037
        }
1038

1039
        public void setMaxLength(int maxLength) {
1040
            if (maxLength > list.length) {
1041
                list = new int[maxLength];
1042
            }
1043
            clear();
1044
        }
1045

1046
        public void clear() {
1047
            for (int i = list.length; i-- > 0;) {
1048
                list[i] = 0;
1049
            }
1050
            start = length = 0;
1051
        }
1052

1053
        public boolean isEmpty() {
1054
            return (length == 0);
1055
        }
1056

1057
        /**
1058
         * Reduces all stored offsets by delta, used when the current position moves by delta.
1059
         * There must not be any offsets lower than delta.
1060
         * If there is an offset equal to delta, it is removed.
1061
         *
1062
         * @param delta [1..maxLength]
1063
         */
1064
        public void shift(int delta) {
1065
            int i = start + delta;
1066
            if (i >= list.length) {
1067
                i -= list.length;
1068
            }
1069
            if (list[i] != 0) {
1070
                list[i] = 0;
1071
                --length;
1072
            }
1073
            start = i;
1074
        }
1075

1076
        /**
1077
         * Adds an offset. The list must not contain it yet.
1078
         * @param offset [1..maxLength]
1079
         */
1080
        public void addOffset(int offset) {
1081
            int i = start + offset;
1082
            if (i >= list.length) {
1083
                i -= list.length;
1084
            }
1085
            assert list[i] == 0;
1086
            list[i] = 1;
1087
            ++length;
1088
        }
1089

1090
        /**
1091
         * Adds an offset and updates its count.
1092
         * The list may already contain the offset.
1093
         * @param offset [1..maxLength]
1094
         */
1095
        public void addOffsetAndCount(int offset, int count) {
1096
            assert count > 0;
1097
            int i = start + offset;
1098
            if (i >= list.length) {
1099
                i -= list.length;
1100
            }
1101
            if (list[i] == 0) {
1102
                list[i] = count;
1103
                ++length;
1104
            } else if (count < list[i]) {
1105
                list[i] = count;
1106
            }
1107
        }
1108

1109
        /**
1110
         * @param offset [1..maxLength]
1111
         */
1112
        public boolean containsOffset(int offset) {
1113
            int i = start + offset;
1114
            if (i >= list.length) {
1115
                i -= list.length;
1116
            }
1117
            return list[i] != 0;
1118
        }
1119

1120
        /**
1121
         * @param offset [1..maxLength]
1122
         */
1123
        public boolean hasCountAtOffset(int offset, int count) {
1124
            int i = start + offset;
1125
            if (i >= list.length) {
1126
                i -= list.length;
1127
            }
1128
            int oldCount = list[i];
1129
            return oldCount != 0 && oldCount <= count;
1130
        }
1131

1132
        /**
1133
         * Finds the lowest stored offset from a non-empty list, removes it,
1134
         * and reduces all other offsets by this minimum.
1135
         * @return min=[1..maxLength]
1136
         */
1137
        public int popMinimum(OutputInt outCount) {
1138
            // Look for the next offset in list[start+1..list.length-1].
1139
            int i = start, result;
1140
            while (++i < list.length) {
1141
                int count = list[i];
1142
                if (count != 0) {
1143
                    list[i] = 0;
1144
                    --length;
1145
                    result = i - start;
1146
                    start = i;
1147
                    if (outCount != null) { outCount.value = count; }
1148
                    return result;
1149
                }
1150
            }
1151
            // i==list.length
1152

1153
            // Wrap around and look for the next offset in list[0..start].
1154
            // Since the list is not empty, there will be one.
1155
            result = list.length - start;
1156
            i = 0;
1157
            int count;
1158
            while ((count = list[i]) == 0) {
1159
                ++i;
1160
            }
1161
            list[i] = 0;
1162
            --length;
1163
            start = i;
1164
            if (outCount != null) { outCount.value = count; }
1165
            return result + i;
1166
        }
1167
    }
1168
}
1169

1170
Product

Resources

Company