Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java
41161 views
1
/*
2
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
/*
27
******************************************************************************
28
*
29
* Copyright (C) 2009-2014, International Business Machines
30
* Corporation and others. All Rights Reserved.
31
*
32
******************************************************************************
33
*/
34
35
package jdk.internal.icu.impl;
36
37
import java.util.ArrayList;
38
39
import jdk.internal.icu.text.UTF16;
40
import jdk.internal.icu.text.UnicodeSet;
41
import jdk.internal.icu.text.UnicodeSet.SpanCondition;
42
import jdk.internal.icu.util.OutputInt;
43
44
/*
45
* Implement span() etc. for a set with strings.
46
* Avoid recursion because of its exponential complexity.
47
* Instead, try multiple paths at once and track them with an IndexList.
48
*/
49
public class UnicodeSetStringSpan {
50
51
/*
52
* Which span() variant will be used? The object is either built for one variant and used once,
53
* or built for all and may be used many times.
54
*/
55
public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
56
public static final int FWD = 0x20;
57
public static final int BACK = 0x10;
58
// public static final int UTF16 = 8;
59
public static final int CONTAINED = 2;
60
public static final int NOT_CONTAINED = 1;
61
62
public static final int ALL = 0x7f;
63
64
public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;
65
public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;
66
public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;
67
public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
68
69
/**
70
* Special spanLength short values. (since Java has not unsigned byte type)
71
* All code points in the string are contained in the parent set.
72
*/
73
static final short ALL_CP_CONTAINED = 0xff;
74
75
/** The spanLength is >=0xfe. */
76
static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
77
78
/** Set for span(). Same as parent but without strings. */
79
private UnicodeSet spanSet;
80
81
/**
82
* Set for span(not contained).
83
* Same as spanSet, plus characters that start or end strings.
84
*/
85
private UnicodeSet spanNotSet;
86
87
/** The strings of the parent set. */
88
private ArrayList<String> strings;
89
90
/** The lengths of span(), spanBack() etc. for each string. */
91
private short[] spanLengths;
92
93
/** Maximum lengths of relevant strings. */
94
private int maxLength16;
95
96
/** Are there strings that are not fully contained in the code point set? */
97
private boolean someRelevant;
98
99
/** Set up for all variants of span()? */
100
private boolean all;
101
102
/** Span helper */
103
private OffsetList offsets;
104
105
/**
106
* Constructs for all variants of span(), or only for any one variant.
107
* Initializes as little as possible, for single use.
108
*/
109
public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
110
spanSet = new UnicodeSet(0, 0x10ffff);
111
// TODO: With Java 6, just take the parent set's strings as is,
112
// as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
113
// Then iterate via the first() and higher() methods.
114
// (We do not want to create multiple Iterator objects in each span().)
115
// See ICU ticket #7454.
116
strings = setStrings;
117
all = (which == ALL);
118
spanSet.retainAll(set);
119
if (0 != (which & NOT_CONTAINED)) {
120
// Default to the same sets.
121
// addToSpanNotSet() will create a separate set if necessary.
122
spanNotSet = spanSet;
123
}
124
offsets = new OffsetList();
125
126
// Determine if the strings even need to be taken into account at all for span() etc.
127
// If any string is relevant, then all strings need to be used for
128
// span(longest match) but only the relevant ones for span(while contained).
129
// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
130
// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
131
// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
132
// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
133
int stringsLength = strings.size();
134
135
int i, spanLength;
136
someRelevant = false;
137
for (i = 0; i < stringsLength; ++i) {
138
String string = strings.get(i);
139
int length16 = string.length();
140
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
141
if (spanLength < length16) { // Relevant string.
142
someRelevant = true;
143
}
144
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
145
maxLength16 = length16;
146
}
147
}
148
if (!someRelevant && (which & WITH_COUNT) == 0) {
149
return;
150
}
151
152
// Freeze after checking for the need to use strings at all because freezing
153
// a set takes some time and memory which are wasted if there are no relevant strings.
154
if (all) {
155
spanSet.freeze();
156
}
157
158
int spanBackLengthsOffset;
159
160
// Allocate a block of meta data.
161
int allocSize;
162
if (all) {
163
// 2 sets of span lengths
164
allocSize = stringsLength * (2);
165
} else {
166
allocSize = stringsLength; // One set of span lengths.
167
}
168
spanLengths = new short[allocSize];
169
170
if (all) {
171
// Store span lengths for all span() variants.
172
spanBackLengthsOffset = stringsLength;
173
} else {
174
// Store span lengths for only one span() variant.
175
spanBackLengthsOffset = 0;
176
}
177
178
// Set the meta data and spanNotSet and write the UTF-8 strings.
179
180
for (i = 0; i < stringsLength; ++i) {
181
String string = strings.get(i);
182
int length16 = string.length();
183
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
184
if (spanLength < length16) { // Relevant string.
185
if (true /* 0 != (which & UTF16) */) {
186
if (0 != (which & CONTAINED)) {
187
if (0 != (which & FWD)) {
188
spanLengths[i] = makeSpanLengthByte(spanLength);
189
}
190
if (0 != (which & BACK)) {
191
spanLength = length16
192
- spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
193
spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
194
}
195
} else /* not CONTAINED, not all, but NOT_CONTAINED */{
196
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
197
// flag.
198
}
199
}
200
if (0 != (which & NOT_CONTAINED)) {
201
// Add string start and end code points to the spanNotSet so that
202
// a span(while not contained) stops before any string.
203
int c;
204
if (0 != (which & FWD)) {
205
c = string.codePointAt(0);
206
addToSpanNotSet(c);
207
}
208
if (0 != (which & BACK)) {
209
c = string.codePointBefore(length16);
210
addToSpanNotSet(c);
211
}
212
}
213
} else { // Irrelevant string.
214
if (all) {
215
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
216
} else {
217
// All spanXYZLengths pointers contain the same address.
218
spanLengths[i] = ALL_CP_CONTAINED;
219
}
220
}
221
}
222
223
// Finish.
224
if (all) {
225
spanNotSet.freeze();
226
}
227
}
228
229
/**
230
* Do the strings need to be checked in span() etc.?
231
*
232
* @return true if strings need to be checked (call span() here),
233
* false if not (use a BMPSet for best performance).
234
*/
235
public boolean needsStringSpanUTF16() {
236
return someRelevant;
237
}
238
239
/** For fast UnicodeSet::contains(c). */
240
public boolean contains(int c) {
241
return spanSet.contains(c);
242
}
243
244
/**
245
* Adds a starting or ending string character to the spanNotSet
246
* so that a character span ends before any string.
247
*/
248
private void addToSpanNotSet(int c) {
249
if (spanNotSet == null || spanNotSet == spanSet) {
250
if (spanSet.contains(c)) {
251
return; // Nothing to do.
252
}
253
spanNotSet = spanSet.cloneAsThawed();
254
}
255
spanNotSet.add(c);
256
}
257
258
/*
259
* Note: In span() when spanLength==0
260
* (after a string match, or at the beginning after an empty code point span)
261
* and in spanNot() and spanNotUTF8(),
262
* string matching could use a binary search because all string matches are done
263
* from the same start index.
264
*
265
* For UTF-8, this would require a comparison function that returns UTF-16 order.
266
*
267
* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
268
* with strings have very few very short strings. For cases with many strings, it might be better to use a different
269
* API and implementation with a DFA (state machine).
270
*/
271
272
/*
273
* Algorithm for span(SpanCondition.CONTAINED)
274
*
275
* Theoretical algorithm:
276
* - Iterate through the string, and at each code point boundary:
277
* + If the code point there is in the set, then remember to continue after it.
278
* + If a set string matches at the current position, then remember to continue after it.
279
* + Either recursively span for each code point or string match, or recursively span
280
* for all but the shortest one and iteratively continue the span with the shortest local match.
281
* + Remember the longest recursive span (the farthest end point).
282
* + If there is no match at the current position,
283
* neither for the code point there nor for any set string,
284
* then stop and return the longest recursive span length.
285
*
286
* Optimized implementation:
287
*
288
* (We assume that most sets will have very few very short strings.
289
* A span using a string-less set is extremely fast.)
290
*
291
* Create and cache a spanSet which contains all of the single code points of the original set
292
* but none of its strings.
293
*
294
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
295
* - Loop:
296
* + Try to match each set string at the end of the spanLength.
297
* ~ Set strings that start with set-contained code points
298
* must be matched with a partial overlap
299
* because the recursive algorithm would have tried to match them at every position.
300
* ~ Set strings that entirely consist of set-contained code points
301
* are irrelevant for span(SpanCondition.CONTAINED)
302
* because the recursive algorithm would continue after them anyway and
303
* find the longest recursive match from their end.
304
* ~ Rather than recursing, note each end point of a set string match.
305
* + If no set string matched after spanSet.span(),
306
* then return with where the spanSet.span() ended.
307
* + If at least one set string matched after spanSet.span(),
308
* then pop the shortest string match end point and continue the loop,
309
* trying to match all set strings from there.
310
* + If at least one more set string matched after a previous string match, then test if the
311
* code point after the previous string match is also contained in the set.
312
* Continue the loop with the shortest end point of
313
* either this code point or a matching set string.
314
* + If no more set string matched after a previous string match,
315
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
316
* Stop if spanLength==0, otherwise continue the loop.
317
*
318
* By noting each end point of a set string match, the function visits each string position at most once and
319
* finishes in linear time.
320
*
321
* The recursive algorithm may visit the same string position many times
322
* if multiple paths lead to it and finishes in exponential time.
323
*/
324
325
/*
326
* Algorithm for span(SIMPLE)
327
*
328
* Theoretical algorithm:
329
* - Iterate through the string, and at each code point boundary:
330
* + If the code point there is in the set, then remember to continue after it.
331
* + If a set string matches at the current position, then remember to continue after it.
332
* + Continue from the farthest match position and ignore all others.
333
* + If there is no match at the current position, then stop and return the current position.
334
*
335
* Optimized implementation:
336
*
337
* (Same assumption and spanSet as above.)
338
*
339
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
340
* - Loop:
341
* + Try to match each set string at the end of the spanLength.
342
* ~ Set strings that start with set-contained code points
343
* must be matched with a partial overlap
344
* because the standard algorithm would have tried to match them earlier.
345
* ~ Set strings that entirely consist of set-contained code points
346
* must be matched with a full overlap because the longest-match algorithm
347
* would hide set string matches that end earlier.
348
* Such set strings need not be matched earlier inside the code point span
349
* because the standard algorithm would then have
350
* continued after the set string match anyway.
351
* ~ Remember the longest set string match (farthest end point)
352
* from the earliest starting point.
353
* + If no set string matched after spanSet.span(),
354
* then return with where the spanSet.span() ended.
355
* + If at least one set string matched,
356
* then continue the loop after the longest match from the earliest position.
357
* + If no more set string matched after a previous string match,
358
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
359
* Stop if spanLength==0, otherwise continue the loop.
360
*/
361
/**
362
* Spans a string.
363
*
364
* @param s The string to be spanned
365
* @param start The start index that the span begins
366
* @param spanCondition The span condition
367
* @return the limit (exclusive end) of the span
368
*/
369
public int span(CharSequence s, int start, SpanCondition spanCondition) {
370
if (spanCondition == SpanCondition.NOT_CONTAINED) {
371
return spanNot(s, start, null);
372
}
373
int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
374
if (spanLimit == s.length()) {
375
return spanLimit;
376
}
377
return spanWithStrings(s, start, spanLimit, spanCondition);
378
}
379
380
/**
381
* Synchronized method for complicated spans using the offsets.
382
* Avoids synchronization for simple cases.
383
*
384
* @param spanLimit = spanSet.span(s, start, CONTAINED)
385
*/
386
private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
387
SpanCondition spanCondition) {
388
// Consider strings; they may overlap with the span.
389
int initSize = 0;
390
if (spanCondition == SpanCondition.CONTAINED) {
391
// Use offset list to try all possibilities.
392
initSize = maxLength16;
393
}
394
offsets.setMaxLength(initSize);
395
int length = s.length();
396
int pos = spanLimit, rest = length - spanLimit;
397
int spanLength = spanLimit - start;
398
int i, stringsLength = strings.size();
399
for (;;) {
400
if (spanCondition == SpanCondition.CONTAINED) {
401
for (i = 0; i < stringsLength; ++i) {
402
int overlap = spanLengths[i];
403
if (overlap == ALL_CP_CONTAINED) {
404
continue; // Irrelevant string.
405
}
406
String string = strings.get(i);
407
408
int length16 = string.length();
409
410
// Try to match this string at pos-overlap..pos.
411
if (overlap >= LONG_SPAN) {
412
overlap = length16;
413
// While contained: No point matching fully inside the code point span.
414
overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
415
// point.
416
}
417
if (overlap > spanLength) {
418
overlap = spanLength;
419
}
420
int inc = length16 - overlap; // Keep overlap+inc==length16.
421
for (;;) {
422
if (inc > rest) {
423
break;
424
}
425
// Try to match if the increment is not listed already.
426
if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
427
if (inc == rest) {
428
return length; // Reached the end of the string.
429
}
430
offsets.addOffset(inc);
431
}
432
if (overlap == 0) {
433
break;
434
}
435
--overlap;
436
++inc;
437
}
438
}
439
} else /* SIMPLE */{
440
int maxInc = 0, maxOverlap = 0;
441
for (i = 0; i < stringsLength; ++i) {
442
int overlap = spanLengths[i];
443
// For longest match, we do need to try to match even an all-contained string
444
// to find the match from the earliest start.
445
446
String string = strings.get(i);
447
448
int length16 = string.length();
449
450
// Try to match this string at pos-overlap..pos.
451
if (overlap >= LONG_SPAN) {
452
overlap = length16;
453
// Longest match: Need to match fully inside the code point span
454
// to find the match from the earliest start.
455
}
456
if (overlap > spanLength) {
457
overlap = spanLength;
458
}
459
int inc = length16 - overlap; // Keep overlap+inc==length16.
460
for (;;) {
461
if (inc > rest || overlap < maxOverlap) {
462
break;
463
}
464
// Try to match if the string is longer or starts earlier.
465
if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
466
&& matches16CPB(s, pos - overlap, length, string, length16)) {
467
maxInc = inc; // Longest match from earliest start.
468
maxOverlap = overlap;
469
break;
470
}
471
--overlap;
472
++inc;
473
}
474
}
475
476
if (maxInc != 0 || maxOverlap != 0) {
477
// Longest-match algorithm, and there was a string match.
478
// Simply continue after it.
479
pos += maxInc;
480
rest -= maxInc;
481
if (rest == 0) {
482
return length; // Reached the end of the string.
483
}
484
spanLength = 0; // Match strings from after a string match.
485
continue;
486
}
487
}
488
// Finished trying to match all strings at pos.
489
490
if (spanLength != 0 || pos == 0) {
491
// The position is after an unlimited code point span (spanLength!=0),
492
// not after a string match.
493
// The only position where spanLength==0 after a span is pos==0.
494
// Otherwise, an unlimited code point span is only tried again when no
495
// strings match, and if such a non-initial span fails we stop.
496
if (offsets.isEmpty()) {
497
return pos; // No strings matched after a span.
498
}
499
// Match strings from after the next string match.
500
} else {
501
// The position is after a string match (or a single code point).
502
if (offsets.isEmpty()) {
503
// No more strings matched after a previous string match.
504
// Try another code point span from after the last string match.
505
spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
506
spanLength = spanLimit - pos;
507
if (spanLength == rest || // Reached the end of the string, or
508
spanLength == 0 // neither strings nor span progressed.
509
) {
510
return spanLimit;
511
}
512
pos += spanLength;
513
rest -= spanLength;
514
continue; // spanLength>0: Match strings from after a span.
515
} else {
516
// Try to match only one code point from after a string match if some
517
// string matched beyond it, so that we try all possible positions
518
// and don't overshoot.
519
spanLength = spanOne(spanSet, s, pos, rest);
520
if (spanLength > 0) {
521
if (spanLength == rest) {
522
return length; // Reached the end of the string.
523
}
524
// Match strings after this code point.
525
// There cannot be any increments below it because UnicodeSet strings
526
// contain multiple code points.
527
pos += spanLength;
528
rest -= spanLength;
529
offsets.shift(spanLength);
530
spanLength = 0;
531
continue; // Match strings from after a single code point.
532
}
533
// Match strings from after the next string match.
534
}
535
}
536
int minOffset = offsets.popMinimum(null);
537
pos += minOffset;
538
rest -= minOffset;
539
spanLength = 0; // Match strings from after a string match.
540
}
541
}
542
543
/**
544
* Spans a string and counts the smallest number of set elements on any path across the span.
545
*
546
* <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
547
*
548
* <p>If the set does not have any fully-contained strings, then we could optimize this
549
* like span(), but such sets are likely rare, and this is at least still linear.
550
*
551
* @param s The string to be spanned
552
* @param start The start index that the span begins
553
* @param spanCondition The span condition
554
* @param outCount The count
555
* @return the limit (exclusive end) of the span
556
*/
557
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
558
OutputInt outCount) {
559
if (spanCondition == SpanCondition.NOT_CONTAINED) {
560
return spanNot(s, start, outCount);
561
}
562
// Consider strings; they may overlap with the span,
563
// and they may result in a smaller count that with just code points.
564
if (spanCondition == SpanCondition.CONTAINED) {
565
return spanContainedAndCount(s, start, outCount);
566
}
567
// SIMPLE (not synchronized, does not use offsets)
568
int stringsLength = strings.size();
569
int length = s.length();
570
int pos = start;
571
int rest = length - start;
572
int count = 0;
573
while (rest != 0) {
574
// Try to match the next code point.
575
int cpLength = spanOne(spanSet, s, pos, rest);
576
int maxInc = (cpLength > 0) ? cpLength : 0;
577
// Try to match all of the strings.
578
for (int i = 0; i < stringsLength; ++i) {
579
String string = strings.get(i);
580
int length16 = string.length();
581
if (maxInc < length16 && length16 <= rest &&
582
matches16CPB(s, pos, length, string, length16)) {
583
maxInc = length16;
584
}
585
}
586
// We are done if there is no match beyond pos.
587
if (maxInc == 0) {
588
outCount.value = count;
589
return pos;
590
}
591
// Continue from the longest match.
592
++count;
593
pos += maxInc;
594
rest -= maxInc;
595
}
596
outCount.value = count;
597
return pos;
598
}
599
600
private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
601
// Use offset list to try all possibilities.
602
offsets.setMaxLength(maxLength16);
603
int stringsLength = strings.size();
604
int length = s.length();
605
int pos = start;
606
int rest = length - start;
607
int count = 0;
608
while (rest != 0) {
609
// Try to match the next code point.
610
int cpLength = spanOne(spanSet, s, pos, rest);
611
if (cpLength > 0) {
612
offsets.addOffsetAndCount(cpLength, count + 1);
613
}
614
// Try to match all of the strings.
615
for (int i = 0; i < stringsLength; ++i) {
616
String string = strings.get(i);
617
int length16 = string.length();
618
// Note: If the strings were sorted by length, then we could also
619
// avoid trying to match if there is already a match of the same length.
620
if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
621
matches16CPB(s, pos, length, string, length16)) {
622
offsets.addOffsetAndCount(length16, count + 1);
623
}
624
}
625
// We are done if there is no match beyond pos.
626
if (offsets.isEmpty()) {
627
outCount.value = count;
628
return pos;
629
}
630
// Continue from the nearest match.
631
int minOffset = offsets.popMinimum(outCount);
632
count = outCount.value;
633
pos += minOffset;
634
rest -= minOffset;
635
}
636
outCount.value = count;
637
return pos;
638
}
639
640
/**
641
* Span a string backwards.
642
*
643
* @param s The string to be spanned
644
* @param spanCondition The span condition
645
* @return The string index which starts the span (i.e. inclusive).
646
*/
647
public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
648
if (spanCondition == SpanCondition.NOT_CONTAINED) {
649
return spanNotBack(s, length);
650
}
651
int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
652
if (pos == 0) {
653
return 0;
654
}
655
int spanLength = length - pos;
656
657
// Consider strings; they may overlap with the span.
658
int initSize = 0;
659
if (spanCondition == SpanCondition.CONTAINED) {
660
// Use offset list to try all possibilities.
661
initSize = maxLength16;
662
}
663
offsets.setMaxLength(initSize);
664
int i, stringsLength = strings.size();
665
int spanBackLengthsOffset = 0;
666
if (all) {
667
spanBackLengthsOffset = stringsLength;
668
}
669
for (;;) {
670
if (spanCondition == SpanCondition.CONTAINED) {
671
for (i = 0; i < stringsLength; ++i) {
672
int overlap = spanLengths[spanBackLengthsOffset + i];
673
if (overlap == ALL_CP_CONTAINED) {
674
continue; // Irrelevant string.
675
}
676
String string = strings.get(i);
677
678
int length16 = string.length();
679
680
// Try to match this string at pos-(length16-overlap)..pos-length16.
681
if (overlap >= LONG_SPAN) {
682
overlap = length16;
683
// While contained: No point matching fully inside the code point span.
684
int len1 = 0;
685
len1 = string.offsetByCodePoints(0, 1);
686
overlap -= len1; // Length of the string minus the first code point.
687
}
688
if (overlap > spanLength) {
689
overlap = spanLength;
690
}
691
int dec = length16 - overlap; // Keep dec+overlap==length16.
692
for (;;) {
693
if (dec > pos) {
694
break;
695
}
696
// Try to match if the decrement is not listed already.
697
if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
698
if (dec == pos) {
699
return 0; // Reached the start of the string.
700
}
701
offsets.addOffset(dec);
702
}
703
if (overlap == 0) {
704
break;
705
}
706
--overlap;
707
++dec;
708
}
709
}
710
} else /* SIMPLE */{
711
int maxDec = 0, maxOverlap = 0;
712
for (i = 0; i < stringsLength; ++i) {
713
int overlap = spanLengths[spanBackLengthsOffset + i];
714
// For longest match, we do need to try to match even an all-contained string
715
// to find the match from the latest end.
716
717
String string = strings.get(i);
718
719
int length16 = string.length();
720
721
// Try to match this string at pos-(length16-overlap)..pos-length16.
722
if (overlap >= LONG_SPAN) {
723
overlap = length16;
724
// Longest match: Need to match fully inside the code point span
725
// to find the match from the latest end.
726
}
727
if (overlap > spanLength) {
728
overlap = spanLength;
729
}
730
int dec = length16 - overlap; // Keep dec+overlap==length16.
731
for (;;) {
732
if (dec > pos || overlap < maxOverlap) {
733
break;
734
}
735
// Try to match if the string is longer or ends later.
736
if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
737
&& matches16CPB(s, pos - dec, length, string, length16)) {
738
maxDec = dec; // Longest match from latest end.
739
maxOverlap = overlap;
740
break;
741
}
742
--overlap;
743
++dec;
744
}
745
}
746
747
if (maxDec != 0 || maxOverlap != 0) {
748
// Longest-match algorithm, and there was a string match.
749
// Simply continue before it.
750
pos -= maxDec;
751
if (pos == 0) {
752
return 0; // Reached the start of the string.
753
}
754
spanLength = 0; // Match strings from before a string match.
755
continue;
756
}
757
}
758
// Finished trying to match all strings at pos.
759
760
if (spanLength != 0 || pos == length) {
761
// The position is before an unlimited code point span (spanLength!=0),
762
// not before a string match.
763
// The only position where spanLength==0 before a span is pos==length.
764
// Otherwise, an unlimited code point span is only tried again when no
765
// strings match, and if such a non-initial span fails we stop.
766
if (offsets.isEmpty()) {
767
return pos; // No strings matched before a span.
768
}
769
// Match strings from before the next string match.
770
} else {
771
// The position is before a string match (or a single code point).
772
if (offsets.isEmpty()) {
773
// No more strings matched before a previous string match.
774
// Try another code point span from before the last string match.
775
int oldPos = pos;
776
pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
777
spanLength = oldPos - pos;
778
if (pos == 0 || // Reached the start of the string, or
779
spanLength == 0 // neither strings nor span progressed.
780
) {
781
return pos;
782
}
783
continue; // spanLength>0: Match strings from before a span.
784
} else {
785
// Try to match only one code point from before a string match if some
786
// string matched beyond it, so that we try all possible positions
787
// and don't overshoot.
788
spanLength = spanOneBack(spanSet, s, pos);
789
if (spanLength > 0) {
790
if (spanLength == pos) {
791
return 0; // Reached the start of the string.
792
}
793
// Match strings before this code point.
794
// There cannot be any decrements below it because UnicodeSet strings
795
// contain multiple code points.
796
pos -= spanLength;
797
offsets.shift(spanLength);
798
spanLength = 0;
799
continue; // Match strings from before a single code point.
800
}
801
// Match strings from before the next string match.
802
}
803
}
804
pos -= offsets.popMinimum(null);
805
spanLength = 0; // Match strings from before a string match.
806
}
807
}
808
809
/**
810
* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
811
*
812
* Theoretical algorithm:
813
* - Iterate through the string, and at each code point boundary:
814
* + If the code point there is in the set, then return with the current position.
815
* + If a set string matches at the current position, then return with the current position.
816
*
817
* Optimized implementation:
818
*
819
* (Same assumption as for span() above.)
820
*
821
* Create and cache a spanNotSet which contains
822
* all of the single code points of the original set but none of its strings.
823
* For each set string add its initial code point to the spanNotSet.
824
* (Also add its final code point for spanNotBack().)
825
*
826
* - Loop:
827
* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
828
* + If the current code point is in the original set, then return the current position.
829
* + If any set string matches at the current position, then return the current position.
830
* + If there is no match at the current position, neither for the code point
831
* there nor for any set string, then skip this code point and continue the loop.
832
* This happens for set-string-initial code points that were added to spanNotSet
833
* when there is not actually a match for such a set string.
834
*
835
* @param s The string to be spanned
836
* @param start The start index that the span begins
837
* @param outCount If not null: Receives the number of code points across the span.
838
* @return the limit (exclusive end) of the span
839
*/
840
private int spanNot(CharSequence s, int start, OutputInt outCount) {
841
int length = s.length();
842
int pos = start, rest = length - start;
843
int stringsLength = strings.size();
844
int count = 0;
845
do {
846
// Span until we find a code point from the set,
847
// or a code point that starts or ends some string.
848
int spanLimit;
849
if (outCount == null) {
850
spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
851
} else {
852
spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
853
outCount.value = count = count + outCount.value;
854
}
855
if (spanLimit == length) {
856
return length; // Reached the end of the string.
857
}
858
pos = spanLimit;
859
rest = length - spanLimit;
860
861
// Check whether the current code point is in the original set,
862
// without the string starts and ends.
863
int cpLength = spanOne(spanSet, s, pos, rest);
864
if (cpLength > 0) {
865
return pos; // There is a set element at pos.
866
}
867
868
// Try to match the strings at pos.
869
for (int i = 0; i < stringsLength; ++i) {
870
if (spanLengths[i] == ALL_CP_CONTAINED) {
871
continue; // Irrelevant string.
872
}
873
String string = strings.get(i);
874
875
int length16 = string.length();
876
if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
877
return pos; // There is a set element at pos.
878
}
879
}
880
881
// The span(while not contained) ended on a string start/end which is
882
// not in the original set. Skip this code point and continue.
883
// cpLength<0
884
pos -= cpLength;
885
rest += cpLength;
886
++count;
887
} while (rest != 0);
888
if (outCount != null) {
889
outCount.value = count;
890
}
891
return length; // Reached the end of the string.
892
}
893
894
private int spanNotBack(CharSequence s, int length) {
895
int pos = length;
896
int i, stringsLength = strings.size();
897
do {
898
// Span until we find a code point from the set,
899
// or a code point that starts or ends some string.
900
pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
901
if (pos == 0) {
902
return 0; // Reached the start of the string.
903
}
904
905
// Check whether the current code point is in the original set,
906
// without the string starts and ends.
907
int cpLength = spanOneBack(spanSet, s, pos);
908
if (cpLength > 0) {
909
return pos; // There is a set element at pos.
910
}
911
912
// Try to match the strings at pos.
913
for (i = 0; i < stringsLength; ++i) {
914
// Use spanLengths rather than a spanLengths pointer because
915
// it is easier and we only need to know whether the string is irrelevant
916
// which is the same in either array.
917
if (spanLengths[i] == ALL_CP_CONTAINED) {
918
continue; // Irrelevant string.
919
}
920
String string = strings.get(i);
921
922
int length16 = string.length();
923
if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
924
return pos; // There is a set element at pos.
925
}
926
}
927
928
// The span(while not contained) ended on a string start/end which is
929
// not in the original set. Skip this code point and continue.
930
// cpLength<0
931
pos += cpLength;
932
} while (pos != 0);
933
return 0; // Reached the start of the string.
934
}
935
936
static short makeSpanLengthByte(int spanLength) {
937
// 0xfe==UnicodeSetStringSpan::LONG_SPAN
938
return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
939
}
940
941
// Compare strings without any argument checks. Requires length>0.
942
private static boolean matches16(CharSequence s, int start, final String t, int length) {
943
int end = start + length;
944
while (length-- > 0) {
945
if (s.charAt(--end) != t.charAt(length)) {
946
return false;
947
}
948
}
949
return true;
950
}
951
952
/**
953
* Compare 16-bit Unicode strings (which may be malformed UTF-16)
954
* at code point boundaries.
955
* That is, each edge of a match must not be in the middle of a surrogate pair.
956
* @param s The string to match in.
957
* @param start The start index of s.
958
* @param limit The limit of the subsequence of s being spanned.
959
* @param t The substring to be matched in s.
960
* @param tlength The length of t.
961
*/
962
static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
963
return matches16(s, start, t, tlength)
964
&& !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
965
Character.isLowSurrogate(s.charAt(start)))
966
&& !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
967
Character.isLowSurrogate(s.charAt(start + tlength)));
968
}
969
970
/**
971
* Does the set contain the next code point?
972
* If so, return its length; otherwise return its negative length.
973
*/
974
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
975
char c = s.charAt(start);
976
if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
977
char c2 = s.charAt(start + 1);
978
if (UTF16.isTrailSurrogate(c2)) {
979
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
980
return set.contains(supplementary) ? 2 : -2;
981
}
982
}
983
return set.contains(c) ? 1 : -1;
984
}
985
986
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
987
char c = s.charAt(length - 1);
988
if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
989
char c2 = s.charAt(length - 2);
990
if (UTF16.isLeadSurrogate(c2)) {
991
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
992
return set.contains(supplementary) ? 2 : -2;
993
}
994
}
995
return set.contains(c) ? 1 : -1;
996
}
997
998
/**
999
* Helper class for UnicodeSetStringSpan.
1000
*
1001
* <p>List of offsets from the current position from where to try matching
1002
* a code point or a string.
1003
* Stores offsets rather than indexes to simplify the code and use the same list
1004
* for both increments (in span()) and decrements (in spanBack()).
1005
*
1006
* <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1007
* are relatively dense, that is,
1008
* there are normally no gaps of hundreds or thousands of offset values.
1009
*
1010
* <p>This class optionally also tracks the minimum non-negative count for each position,
1011
* intended to count the smallest number of elements of any path leading to that position.
1012
*
1013
* <p>The implementation uses a circular buffer of count integers,
1014
* each indicating whether the corresponding offset is in the list,
1015
* and its path element count.
1016
* This avoids inserting into a sorted list of offsets (or absolute indexes)
1017
* and physically moving part of the list.
1018
*
1019
* <p>Note: In principle, the caller should setMaxLength() to
1020
* the maximum of the max string length and U16_LENGTH/U8_LENGTH
1021
* to account for "long" single code points.
1022
*
1023
* <p>Note: An earlier version did not track counts and stored only byte flags.
1024
* With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1025
* the list could be stored as bit flags in a single integer.
1026
* Rather than handling a circular buffer with a start list index,
1027
* the integer would simply be shifted when lower offsets are removed.
1028
* UnicodeSet does not have a limit on the lengths of strings.
1029
*/
1030
private static final class OffsetList {
1031
private int[] list;
1032
private int length;
1033
private int start;
1034
1035
public OffsetList() {
1036
list = new int[16]; // default size
1037
}
1038
1039
public void setMaxLength(int maxLength) {
1040
if (maxLength > list.length) {
1041
list = new int[maxLength];
1042
}
1043
clear();
1044
}
1045
1046
public void clear() {
1047
for (int i = list.length; i-- > 0;) {
1048
list[i] = 0;
1049
}
1050
start = length = 0;
1051
}
1052
1053
public boolean isEmpty() {
1054
return (length == 0);
1055
}
1056
1057
/**
1058
* Reduces all stored offsets by delta, used when the current position moves by delta.
1059
* There must not be any offsets lower than delta.
1060
* If there is an offset equal to delta, it is removed.
1061
*
1062
* @param delta [1..maxLength]
1063
*/
1064
public void shift(int delta) {
1065
int i = start + delta;
1066
if (i >= list.length) {
1067
i -= list.length;
1068
}
1069
if (list[i] != 0) {
1070
list[i] = 0;
1071
--length;
1072
}
1073
start = i;
1074
}
1075
1076
/**
1077
* Adds an offset. The list must not contain it yet.
1078
* @param offset [1..maxLength]
1079
*/
1080
public void addOffset(int offset) {
1081
int i = start + offset;
1082
if (i >= list.length) {
1083
i -= list.length;
1084
}
1085
assert list[i] == 0;
1086
list[i] = 1;
1087
++length;
1088
}
1089
1090
/**
1091
* Adds an offset and updates its count.
1092
* The list may already contain the offset.
1093
* @param offset [1..maxLength]
1094
*/
1095
public void addOffsetAndCount(int offset, int count) {
1096
assert count > 0;
1097
int i = start + offset;
1098
if (i >= list.length) {
1099
i -= list.length;
1100
}
1101
if (list[i] == 0) {
1102
list[i] = count;
1103
++length;
1104
} else if (count < list[i]) {
1105
list[i] = count;
1106
}
1107
}
1108
1109
/**
1110
* @param offset [1..maxLength]
1111
*/
1112
public boolean containsOffset(int offset) {
1113
int i = start + offset;
1114
if (i >= list.length) {
1115
i -= list.length;
1116
}
1117
return list[i] != 0;
1118
}
1119
1120
/**
1121
* @param offset [1..maxLength]
1122
*/
1123
public boolean hasCountAtOffset(int offset, int count) {
1124
int i = start + offset;
1125
if (i >= list.length) {
1126
i -= list.length;
1127
}
1128
int oldCount = list[i];
1129
return oldCount != 0 && oldCount <= count;
1130
}
1131
1132
/**
1133
* Finds the lowest stored offset from a non-empty list, removes it,
1134
* and reduces all other offsets by this minimum.
1135
* @return min=[1..maxLength]
1136
*/
1137
public int popMinimum(OutputInt outCount) {
1138
// Look for the next offset in list[start+1..list.length-1].
1139
int i = start, result;
1140
while (++i < list.length) {
1141
int count = list[i];
1142
if (count != 0) {
1143
list[i] = 0;
1144
--length;
1145
result = i - start;
1146
start = i;
1147
if (outCount != null) { outCount.value = count; }
1148
return result;
1149
}
1150
}
1151
// i==list.length
1152
1153
// Wrap around and look for the next offset in list[0..start].
1154
// Since the list is not empty, there will be one.
1155
result = list.length - start;
1156
i = 0;
1157
int count;
1158
while ((count = list[i]) == 0) {
1159
++i;
1160
}
1161
list[i] = 0;
1162
--length;
1163
start = i;
1164
if (outCount != null) { outCount.value = count; }
1165
return result + i;
1166
}
1167
}
1168
}
1169
1170