Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java
41161 views
/*1* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26******************************************************************************27*28* Copyright (C) 2009-2014, International Business Machines29* Corporation and others. All Rights Reserved.30*31******************************************************************************32*/3334package jdk.internal.icu.impl;3536import java.util.ArrayList;3738import jdk.internal.icu.text.UTF16;39import jdk.internal.icu.text.UnicodeSet;40import jdk.internal.icu.text.UnicodeSet.SpanCondition;41import jdk.internal.icu.util.OutputInt;4243/*44* Implement span() etc. for a set with strings.45* Avoid recursion because of its exponential complexity.46* Instead, try multiple paths at once and track them with an IndexList.47*/48public class UnicodeSetStringSpan {4950/*51* Which span() variant will be used? The object is either built for one variant and used once,52* or built for all and may be used many times.53*/54public static final int WITH_COUNT = 0x40; // spanAndCount() may be called55public static final int FWD = 0x20;56public static final int BACK = 0x10;57// public static final int UTF16 = 8;58public static final int CONTAINED = 2;59public static final int NOT_CONTAINED = 1;6061public static final int ALL = 0x7f;6263public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;64public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;65public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;66public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;6768/**69* Special spanLength short values. (since Java has not unsigned byte type)70* All code points in the string are contained in the parent set.71*/72static final short ALL_CP_CONTAINED = 0xff;7374/** The spanLength is >=0xfe. */75static final short LONG_SPAN = ALL_CP_CONTAINED - 1;7677/** Set for span(). Same as parent but without strings. */78private UnicodeSet spanSet;7980/**81* Set for span(not contained).82* Same as spanSet, plus characters that start or end strings.83*/84private UnicodeSet spanNotSet;8586/** The strings of the parent set. */87private ArrayList<String> strings;8889/** The lengths of span(), spanBack() etc. for each string. */90private short[] spanLengths;9192/** Maximum lengths of relevant strings. */93private int maxLength16;9495/** Are there strings that are not fully contained in the code point set? */96private boolean someRelevant;9798/** Set up for all variants of span()? */99private boolean all;100101/** Span helper */102private OffsetList offsets;103104/**105* Constructs for all variants of span(), or only for any one variant.106* Initializes as little as possible, for single use.107*/108public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {109spanSet = new UnicodeSet(0, 0x10ffff);110// TODO: With Java 6, just take the parent set's strings as is,111// as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.112// Then iterate via the first() and higher() methods.113// (We do not want to create multiple Iterator objects in each span().)114// See ICU ticket #7454.115strings = setStrings;116all = (which == ALL);117spanSet.retainAll(set);118if (0 != (which & NOT_CONTAINED)) {119// Default to the same sets.120// addToSpanNotSet() will create a separate set if necessary.121spanNotSet = spanSet;122}123offsets = new OffsetList();124125// Determine if the strings even need to be taken into account at all for span() etc.126// If any string is relevant, then all strings need to be used for127// span(longest match) but only the relevant ones for span(while contained).128// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH129// and do not store UTF-8 strings if !thisRelevant and CONTAINED.130// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)131// Also count the lengths of the UTF-8 versions of the strings for memory allocation.132int stringsLength = strings.size();133134int i, spanLength;135someRelevant = false;136for (i = 0; i < stringsLength; ++i) {137String string = strings.get(i);138int length16 = string.length();139spanLength = spanSet.span(string, SpanCondition.CONTAINED);140if (spanLength < length16) { // Relevant string.141someRelevant = true;142}143if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {144maxLength16 = length16;145}146}147if (!someRelevant && (which & WITH_COUNT) == 0) {148return;149}150151// Freeze after checking for the need to use strings at all because freezing152// a set takes some time and memory which are wasted if there are no relevant strings.153if (all) {154spanSet.freeze();155}156157int spanBackLengthsOffset;158159// Allocate a block of meta data.160int allocSize;161if (all) {162// 2 sets of span lengths163allocSize = stringsLength * (2);164} else {165allocSize = stringsLength; // One set of span lengths.166}167spanLengths = new short[allocSize];168169if (all) {170// Store span lengths for all span() variants.171spanBackLengthsOffset = stringsLength;172} else {173// Store span lengths for only one span() variant.174spanBackLengthsOffset = 0;175}176177// Set the meta data and spanNotSet and write the UTF-8 strings.178179for (i = 0; i < stringsLength; ++i) {180String string = strings.get(i);181int length16 = string.length();182spanLength = spanSet.span(string, SpanCondition.CONTAINED);183if (spanLength < length16) { // Relevant string.184if (true /* 0 != (which & UTF16) */) {185if (0 != (which & CONTAINED)) {186if (0 != (which & FWD)) {187spanLengths[i] = makeSpanLengthByte(spanLength);188}189if (0 != (which & BACK)) {190spanLength = length16191- spanSet.spanBack(string, length16, SpanCondition.CONTAINED);192spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);193}194} else /* not CONTAINED, not all, but NOT_CONTAINED */{195spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant196// flag.197}198}199if (0 != (which & NOT_CONTAINED)) {200// Add string start and end code points to the spanNotSet so that201// a span(while not contained) stops before any string.202int c;203if (0 != (which & FWD)) {204c = string.codePointAt(0);205addToSpanNotSet(c);206}207if (0 != (which & BACK)) {208c = string.codePointBefore(length16);209addToSpanNotSet(c);210}211}212} else { // Irrelevant string.213if (all) {214spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;215} else {216// All spanXYZLengths pointers contain the same address.217spanLengths[i] = ALL_CP_CONTAINED;218}219}220}221222// Finish.223if (all) {224spanNotSet.freeze();225}226}227228/**229* Do the strings need to be checked in span() etc.?230*231* @return true if strings need to be checked (call span() here),232* false if not (use a BMPSet for best performance).233*/234public boolean needsStringSpanUTF16() {235return someRelevant;236}237238/** For fast UnicodeSet::contains(c). */239public boolean contains(int c) {240return spanSet.contains(c);241}242243/**244* Adds a starting or ending string character to the spanNotSet245* so that a character span ends before any string.246*/247private void addToSpanNotSet(int c) {248if (spanNotSet == null || spanNotSet == spanSet) {249if (spanSet.contains(c)) {250return; // Nothing to do.251}252spanNotSet = spanSet.cloneAsThawed();253}254spanNotSet.add(c);255}256257/*258* Note: In span() when spanLength==0259* (after a string match, or at the beginning after an empty code point span)260* and in spanNot() and spanNotUTF8(),261* string matching could use a binary search because all string matches are done262* from the same start index.263*264* For UTF-8, this would require a comparison function that returns UTF-16 order.265*266* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets267* with strings have very few very short strings. For cases with many strings, it might be better to use a different268* API and implementation with a DFA (state machine).269*/270271/*272* Algorithm for span(SpanCondition.CONTAINED)273*274* Theoretical algorithm:275* - Iterate through the string, and at each code point boundary:276* + If the code point there is in the set, then remember to continue after it.277* + If a set string matches at the current position, then remember to continue after it.278* + Either recursively span for each code point or string match, or recursively span279* for all but the shortest one and iteratively continue the span with the shortest local match.280* + Remember the longest recursive span (the farthest end point).281* + If there is no match at the current position,282* neither for the code point there nor for any set string,283* then stop and return the longest recursive span length.284*285* Optimized implementation:286*287* (We assume that most sets will have very few very short strings.288* A span using a string-less set is extremely fast.)289*290* Create and cache a spanSet which contains all of the single code points of the original set291* but none of its strings.292*293* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).294* - Loop:295* + Try to match each set string at the end of the spanLength.296* ~ Set strings that start with set-contained code points297* must be matched with a partial overlap298* because the recursive algorithm would have tried to match them at every position.299* ~ Set strings that entirely consist of set-contained code points300* are irrelevant for span(SpanCondition.CONTAINED)301* because the recursive algorithm would continue after them anyway and302* find the longest recursive match from their end.303* ~ Rather than recursing, note each end point of a set string match.304* + If no set string matched after spanSet.span(),305* then return with where the spanSet.span() ended.306* + If at least one set string matched after spanSet.span(),307* then pop the shortest string match end point and continue the loop,308* trying to match all set strings from there.309* + If at least one more set string matched after a previous string match, then test if the310* code point after the previous string match is also contained in the set.311* Continue the loop with the shortest end point of312* either this code point or a matching set string.313* + If no more set string matched after a previous string match,314* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).315* Stop if spanLength==0, otherwise continue the loop.316*317* By noting each end point of a set string match, the function visits each string position at most once and318* finishes in linear time.319*320* The recursive algorithm may visit the same string position many times321* if multiple paths lead to it and finishes in exponential time.322*/323324/*325* Algorithm for span(SIMPLE)326*327* Theoretical algorithm:328* - Iterate through the string, and at each code point boundary:329* + If the code point there is in the set, then remember to continue after it.330* + If a set string matches at the current position, then remember to continue after it.331* + Continue from the farthest match position and ignore all others.332* + If there is no match at the current position, then stop and return the current position.333*334* Optimized implementation:335*336* (Same assumption and spanSet as above.)337*338* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).339* - Loop:340* + Try to match each set string at the end of the spanLength.341* ~ Set strings that start with set-contained code points342* must be matched with a partial overlap343* because the standard algorithm would have tried to match them earlier.344* ~ Set strings that entirely consist of set-contained code points345* must be matched with a full overlap because the longest-match algorithm346* would hide set string matches that end earlier.347* Such set strings need not be matched earlier inside the code point span348* because the standard algorithm would then have349* continued after the set string match anyway.350* ~ Remember the longest set string match (farthest end point)351* from the earliest starting point.352* + If no set string matched after spanSet.span(),353* then return with where the spanSet.span() ended.354* + If at least one set string matched,355* then continue the loop after the longest match from the earliest position.356* + If no more set string matched after a previous string match,357* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).358* Stop if spanLength==0, otherwise continue the loop.359*/360/**361* Spans a string.362*363* @param s The string to be spanned364* @param start The start index that the span begins365* @param spanCondition The span condition366* @return the limit (exclusive end) of the span367*/368public int span(CharSequence s, int start, SpanCondition spanCondition) {369if (spanCondition == SpanCondition.NOT_CONTAINED) {370return spanNot(s, start, null);371}372int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);373if (spanLimit == s.length()) {374return spanLimit;375}376return spanWithStrings(s, start, spanLimit, spanCondition);377}378379/**380* Synchronized method for complicated spans using the offsets.381* Avoids synchronization for simple cases.382*383* @param spanLimit = spanSet.span(s, start, CONTAINED)384*/385private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,386SpanCondition spanCondition) {387// Consider strings; they may overlap with the span.388int initSize = 0;389if (spanCondition == SpanCondition.CONTAINED) {390// Use offset list to try all possibilities.391initSize = maxLength16;392}393offsets.setMaxLength(initSize);394int length = s.length();395int pos = spanLimit, rest = length - spanLimit;396int spanLength = spanLimit - start;397int i, stringsLength = strings.size();398for (;;) {399if (spanCondition == SpanCondition.CONTAINED) {400for (i = 0; i < stringsLength; ++i) {401int overlap = spanLengths[i];402if (overlap == ALL_CP_CONTAINED) {403continue; // Irrelevant string.404}405String string = strings.get(i);406407int length16 = string.length();408409// Try to match this string at pos-overlap..pos.410if (overlap >= LONG_SPAN) {411overlap = length16;412// While contained: No point matching fully inside the code point span.413overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code414// point.415}416if (overlap > spanLength) {417overlap = spanLength;418}419int inc = length16 - overlap; // Keep overlap+inc==length16.420for (;;) {421if (inc > rest) {422break;423}424// Try to match if the increment is not listed already.425if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {426if (inc == rest) {427return length; // Reached the end of the string.428}429offsets.addOffset(inc);430}431if (overlap == 0) {432break;433}434--overlap;435++inc;436}437}438} else /* SIMPLE */{439int maxInc = 0, maxOverlap = 0;440for (i = 0; i < stringsLength; ++i) {441int overlap = spanLengths[i];442// For longest match, we do need to try to match even an all-contained string443// to find the match from the earliest start.444445String string = strings.get(i);446447int length16 = string.length();448449// Try to match this string at pos-overlap..pos.450if (overlap >= LONG_SPAN) {451overlap = length16;452// Longest match: Need to match fully inside the code point span453// to find the match from the earliest start.454}455if (overlap > spanLength) {456overlap = spanLength;457}458int inc = length16 - overlap; // Keep overlap+inc==length16.459for (;;) {460if (inc > rest || overlap < maxOverlap) {461break;462}463// Try to match if the string is longer or starts earlier.464if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)465&& matches16CPB(s, pos - overlap, length, string, length16)) {466maxInc = inc; // Longest match from earliest start.467maxOverlap = overlap;468break;469}470--overlap;471++inc;472}473}474475if (maxInc != 0 || maxOverlap != 0) {476// Longest-match algorithm, and there was a string match.477// Simply continue after it.478pos += maxInc;479rest -= maxInc;480if (rest == 0) {481return length; // Reached the end of the string.482}483spanLength = 0; // Match strings from after a string match.484continue;485}486}487// Finished trying to match all strings at pos.488489if (spanLength != 0 || pos == 0) {490// The position is after an unlimited code point span (spanLength!=0),491// not after a string match.492// The only position where spanLength==0 after a span is pos==0.493// Otherwise, an unlimited code point span is only tried again when no494// strings match, and if such a non-initial span fails we stop.495if (offsets.isEmpty()) {496return pos; // No strings matched after a span.497}498// Match strings from after the next string match.499} else {500// The position is after a string match (or a single code point).501if (offsets.isEmpty()) {502// No more strings matched after a previous string match.503// Try another code point span from after the last string match.504spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);505spanLength = spanLimit - pos;506if (spanLength == rest || // Reached the end of the string, or507spanLength == 0 // neither strings nor span progressed.508) {509return spanLimit;510}511pos += spanLength;512rest -= spanLength;513continue; // spanLength>0: Match strings from after a span.514} else {515// Try to match only one code point from after a string match if some516// string matched beyond it, so that we try all possible positions517// and don't overshoot.518spanLength = spanOne(spanSet, s, pos, rest);519if (spanLength > 0) {520if (spanLength == rest) {521return length; // Reached the end of the string.522}523// Match strings after this code point.524// There cannot be any increments below it because UnicodeSet strings525// contain multiple code points.526pos += spanLength;527rest -= spanLength;528offsets.shift(spanLength);529spanLength = 0;530continue; // Match strings from after a single code point.531}532// Match strings from after the next string match.533}534}535int minOffset = offsets.popMinimum(null);536pos += minOffset;537rest -= minOffset;538spanLength = 0; // Match strings from after a string match.539}540}541542/**543* Spans a string and counts the smallest number of set elements on any path across the span.544*545* <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.546*547* <p>If the set does not have any fully-contained strings, then we could optimize this548* like span(), but such sets are likely rare, and this is at least still linear.549*550* @param s The string to be spanned551* @param start The start index that the span begins552* @param spanCondition The span condition553* @param outCount The count554* @return the limit (exclusive end) of the span555*/556public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,557OutputInt outCount) {558if (spanCondition == SpanCondition.NOT_CONTAINED) {559return spanNot(s, start, outCount);560}561// Consider strings; they may overlap with the span,562// and they may result in a smaller count that with just code points.563if (spanCondition == SpanCondition.CONTAINED) {564return spanContainedAndCount(s, start, outCount);565}566// SIMPLE (not synchronized, does not use offsets)567int stringsLength = strings.size();568int length = s.length();569int pos = start;570int rest = length - start;571int count = 0;572while (rest != 0) {573// Try to match the next code point.574int cpLength = spanOne(spanSet, s, pos, rest);575int maxInc = (cpLength > 0) ? cpLength : 0;576// Try to match all of the strings.577for (int i = 0; i < stringsLength; ++i) {578String string = strings.get(i);579int length16 = string.length();580if (maxInc < length16 && length16 <= rest &&581matches16CPB(s, pos, length, string, length16)) {582maxInc = length16;583}584}585// We are done if there is no match beyond pos.586if (maxInc == 0) {587outCount.value = count;588return pos;589}590// Continue from the longest match.591++count;592pos += maxInc;593rest -= maxInc;594}595outCount.value = count;596return pos;597}598599private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {600// Use offset list to try all possibilities.601offsets.setMaxLength(maxLength16);602int stringsLength = strings.size();603int length = s.length();604int pos = start;605int rest = length - start;606int count = 0;607while (rest != 0) {608// Try to match the next code point.609int cpLength = spanOne(spanSet, s, pos, rest);610if (cpLength > 0) {611offsets.addOffsetAndCount(cpLength, count + 1);612}613// Try to match all of the strings.614for (int i = 0; i < stringsLength; ++i) {615String string = strings.get(i);616int length16 = string.length();617// Note: If the strings were sorted by length, then we could also618// avoid trying to match if there is already a match of the same length.619if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&620matches16CPB(s, pos, length, string, length16)) {621offsets.addOffsetAndCount(length16, count + 1);622}623}624// We are done if there is no match beyond pos.625if (offsets.isEmpty()) {626outCount.value = count;627return pos;628}629// Continue from the nearest match.630int minOffset = offsets.popMinimum(outCount);631count = outCount.value;632pos += minOffset;633rest -= minOffset;634}635outCount.value = count;636return pos;637}638639/**640* Span a string backwards.641*642* @param s The string to be spanned643* @param spanCondition The span condition644* @return The string index which starts the span (i.e. inclusive).645*/646public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {647if (spanCondition == SpanCondition.NOT_CONTAINED) {648return spanNotBack(s, length);649}650int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);651if (pos == 0) {652return 0;653}654int spanLength = length - pos;655656// Consider strings; they may overlap with the span.657int initSize = 0;658if (spanCondition == SpanCondition.CONTAINED) {659// Use offset list to try all possibilities.660initSize = maxLength16;661}662offsets.setMaxLength(initSize);663int i, stringsLength = strings.size();664int spanBackLengthsOffset = 0;665if (all) {666spanBackLengthsOffset = stringsLength;667}668for (;;) {669if (spanCondition == SpanCondition.CONTAINED) {670for (i = 0; i < stringsLength; ++i) {671int overlap = spanLengths[spanBackLengthsOffset + i];672if (overlap == ALL_CP_CONTAINED) {673continue; // Irrelevant string.674}675String string = strings.get(i);676677int length16 = string.length();678679// Try to match this string at pos-(length16-overlap)..pos-length16.680if (overlap >= LONG_SPAN) {681overlap = length16;682// While contained: No point matching fully inside the code point span.683int len1 = 0;684len1 = string.offsetByCodePoints(0, 1);685overlap -= len1; // Length of the string minus the first code point.686}687if (overlap > spanLength) {688overlap = spanLength;689}690int dec = length16 - overlap; // Keep dec+overlap==length16.691for (;;) {692if (dec > pos) {693break;694}695// Try to match if the decrement is not listed already.696if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {697if (dec == pos) {698return 0; // Reached the start of the string.699}700offsets.addOffset(dec);701}702if (overlap == 0) {703break;704}705--overlap;706++dec;707}708}709} else /* SIMPLE */{710int maxDec = 0, maxOverlap = 0;711for (i = 0; i < stringsLength; ++i) {712int overlap = spanLengths[spanBackLengthsOffset + i];713// For longest match, we do need to try to match even an all-contained string714// to find the match from the latest end.715716String string = strings.get(i);717718int length16 = string.length();719720// Try to match this string at pos-(length16-overlap)..pos-length16.721if (overlap >= LONG_SPAN) {722overlap = length16;723// Longest match: Need to match fully inside the code point span724// to find the match from the latest end.725}726if (overlap > spanLength) {727overlap = spanLength;728}729int dec = length16 - overlap; // Keep dec+overlap==length16.730for (;;) {731if (dec > pos || overlap < maxOverlap) {732break;733}734// Try to match if the string is longer or ends later.735if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)736&& matches16CPB(s, pos - dec, length, string, length16)) {737maxDec = dec; // Longest match from latest end.738maxOverlap = overlap;739break;740}741--overlap;742++dec;743}744}745746if (maxDec != 0 || maxOverlap != 0) {747// Longest-match algorithm, and there was a string match.748// Simply continue before it.749pos -= maxDec;750if (pos == 0) {751return 0; // Reached the start of the string.752}753spanLength = 0; // Match strings from before a string match.754continue;755}756}757// Finished trying to match all strings at pos.758759if (spanLength != 0 || pos == length) {760// The position is before an unlimited code point span (spanLength!=0),761// not before a string match.762// The only position where spanLength==0 before a span is pos==length.763// Otherwise, an unlimited code point span is only tried again when no764// strings match, and if such a non-initial span fails we stop.765if (offsets.isEmpty()) {766return pos; // No strings matched before a span.767}768// Match strings from before the next string match.769} else {770// The position is before a string match (or a single code point).771if (offsets.isEmpty()) {772// No more strings matched before a previous string match.773// Try another code point span from before the last string match.774int oldPos = pos;775pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);776spanLength = oldPos - pos;777if (pos == 0 || // Reached the start of the string, or778spanLength == 0 // neither strings nor span progressed.779) {780return pos;781}782continue; // spanLength>0: Match strings from before a span.783} else {784// Try to match only one code point from before a string match if some785// string matched beyond it, so that we try all possible positions786// and don't overshoot.787spanLength = spanOneBack(spanSet, s, pos);788if (spanLength > 0) {789if (spanLength == pos) {790return 0; // Reached the start of the string.791}792// Match strings before this code point.793// There cannot be any decrements below it because UnicodeSet strings794// contain multiple code points.795pos -= spanLength;796offsets.shift(spanLength);797spanLength = 0;798continue; // Match strings from before a single code point.799}800// Match strings from before the next string match.801}802}803pos -= offsets.popMinimum(null);804spanLength = 0; // Match strings from before a string match.805}806}807808/**809* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)810*811* Theoretical algorithm:812* - Iterate through the string, and at each code point boundary:813* + If the code point there is in the set, then return with the current position.814* + If a set string matches at the current position, then return with the current position.815*816* Optimized implementation:817*818* (Same assumption as for span() above.)819*820* Create and cache a spanNotSet which contains821* all of the single code points of the original set but none of its strings.822* For each set string add its initial code point to the spanNotSet.823* (Also add its final code point for spanNotBack().)824*825* - Loop:826* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).827* + If the current code point is in the original set, then return the current position.828* + If any set string matches at the current position, then return the current position.829* + If there is no match at the current position, neither for the code point830* there nor for any set string, then skip this code point and continue the loop.831* This happens for set-string-initial code points that were added to spanNotSet832* when there is not actually a match for such a set string.833*834* @param s The string to be spanned835* @param start The start index that the span begins836* @param outCount If not null: Receives the number of code points across the span.837* @return the limit (exclusive end) of the span838*/839private int spanNot(CharSequence s, int start, OutputInt outCount) {840int length = s.length();841int pos = start, rest = length - start;842int stringsLength = strings.size();843int count = 0;844do {845// Span until we find a code point from the set,846// or a code point that starts or ends some string.847int spanLimit;848if (outCount == null) {849spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);850} else {851spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);852outCount.value = count = count + outCount.value;853}854if (spanLimit == length) {855return length; // Reached the end of the string.856}857pos = spanLimit;858rest = length - spanLimit;859860// Check whether the current code point is in the original set,861// without the string starts and ends.862int cpLength = spanOne(spanSet, s, pos, rest);863if (cpLength > 0) {864return pos; // There is a set element at pos.865}866867// Try to match the strings at pos.868for (int i = 0; i < stringsLength; ++i) {869if (spanLengths[i] == ALL_CP_CONTAINED) {870continue; // Irrelevant string.871}872String string = strings.get(i);873874int length16 = string.length();875if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {876return pos; // There is a set element at pos.877}878}879880// The span(while not contained) ended on a string start/end which is881// not in the original set. Skip this code point and continue.882// cpLength<0883pos -= cpLength;884rest += cpLength;885++count;886} while (rest != 0);887if (outCount != null) {888outCount.value = count;889}890return length; // Reached the end of the string.891}892893private int spanNotBack(CharSequence s, int length) {894int pos = length;895int i, stringsLength = strings.size();896do {897// Span until we find a code point from the set,898// or a code point that starts or ends some string.899pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);900if (pos == 0) {901return 0; // Reached the start of the string.902}903904// Check whether the current code point is in the original set,905// without the string starts and ends.906int cpLength = spanOneBack(spanSet, s, pos);907if (cpLength > 0) {908return pos; // There is a set element at pos.909}910911// Try to match the strings at pos.912for (i = 0; i < stringsLength; ++i) {913// Use spanLengths rather than a spanLengths pointer because914// it is easier and we only need to know whether the string is irrelevant915// which is the same in either array.916if (spanLengths[i] == ALL_CP_CONTAINED) {917continue; // Irrelevant string.918}919String string = strings.get(i);920921int length16 = string.length();922if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {923return pos; // There is a set element at pos.924}925}926927// The span(while not contained) ended on a string start/end which is928// not in the original set. Skip this code point and continue.929// cpLength<0930pos += cpLength;931} while (pos != 0);932return 0; // Reached the start of the string.933}934935static short makeSpanLengthByte(int spanLength) {936// 0xfe==UnicodeSetStringSpan::LONG_SPAN937return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;938}939940// Compare strings without any argument checks. Requires length>0.941private static boolean matches16(CharSequence s, int start, final String t, int length) {942int end = start + length;943while (length-- > 0) {944if (s.charAt(--end) != t.charAt(length)) {945return false;946}947}948return true;949}950951/**952* Compare 16-bit Unicode strings (which may be malformed UTF-16)953* at code point boundaries.954* That is, each edge of a match must not be in the middle of a surrogate pair.955* @param s The string to match in.956* @param start The start index of s.957* @param limit The limit of the subsequence of s being spanned.958* @param t The substring to be matched in s.959* @param tlength The length of t.960*/961static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {962return matches16(s, start, t, tlength)963&& !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&964Character.isLowSurrogate(s.charAt(start)))965&& !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&966Character.isLowSurrogate(s.charAt(start + tlength)));967}968969/**970* Does the set contain the next code point?971* If so, return its length; otherwise return its negative length.972*/973static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {974char c = s.charAt(start);975if (c >= 0xd800 && c <= 0xdbff && length >= 2) {976char c2 = s.charAt(start + 1);977if (UTF16.isTrailSurrogate(c2)) {978int supplementary = UCharacterProperty.getRawSupplementary(c, c2);979return set.contains(supplementary) ? 2 : -2;980}981}982return set.contains(c) ? 1 : -1;983}984985static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {986char c = s.charAt(length - 1);987if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {988char c2 = s.charAt(length - 2);989if (UTF16.isLeadSurrogate(c2)) {990int supplementary = UCharacterProperty.getRawSupplementary(c2, c);991return set.contains(supplementary) ? 2 : -2;992}993}994return set.contains(c) ? 1 : -1;995}996997/**998* Helper class for UnicodeSetStringSpan.999*1000* <p>List of offsets from the current position from where to try matching1001* a code point or a string.1002* Stores offsets rather than indexes to simplify the code and use the same list1003* for both increments (in span()) and decrements (in spanBack()).1004*1005* <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time1006* are relatively dense, that is,1007* there are normally no gaps of hundreds or thousands of offset values.1008*1009* <p>This class optionally also tracks the minimum non-negative count for each position,1010* intended to count the smallest number of elements of any path leading to that position.1011*1012* <p>The implementation uses a circular buffer of count integers,1013* each indicating whether the corresponding offset is in the list,1014* and its path element count.1015* This avoids inserting into a sorted list of offsets (or absolute indexes)1016* and physically moving part of the list.1017*1018* <p>Note: In principle, the caller should setMaxLength() to1019* the maximum of the max string length and U16_LENGTH/U8_LENGTH1020* to account for "long" single code points.1021*1022* <p>Note: An earlier version did not track counts and stored only byte flags.1023* With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,1024* the list could be stored as bit flags in a single integer.1025* Rather than handling a circular buffer with a start list index,1026* the integer would simply be shifted when lower offsets are removed.1027* UnicodeSet does not have a limit on the lengths of strings.1028*/1029private static final class OffsetList {1030private int[] list;1031private int length;1032private int start;10331034public OffsetList() {1035list = new int[16]; // default size1036}10371038public void setMaxLength(int maxLength) {1039if (maxLength > list.length) {1040list = new int[maxLength];1041}1042clear();1043}10441045public void clear() {1046for (int i = list.length; i-- > 0;) {1047list[i] = 0;1048}1049start = length = 0;1050}10511052public boolean isEmpty() {1053return (length == 0);1054}10551056/**1057* Reduces all stored offsets by delta, used when the current position moves by delta.1058* There must not be any offsets lower than delta.1059* If there is an offset equal to delta, it is removed.1060*1061* @param delta [1..maxLength]1062*/1063public void shift(int delta) {1064int i = start + delta;1065if (i >= list.length) {1066i -= list.length;1067}1068if (list[i] != 0) {1069list[i] = 0;1070--length;1071}1072start = i;1073}10741075/**1076* Adds an offset. The list must not contain it yet.1077* @param offset [1..maxLength]1078*/1079public void addOffset(int offset) {1080int i = start + offset;1081if (i >= list.length) {1082i -= list.length;1083}1084assert list[i] == 0;1085list[i] = 1;1086++length;1087}10881089/**1090* Adds an offset and updates its count.1091* The list may already contain the offset.1092* @param offset [1..maxLength]1093*/1094public void addOffsetAndCount(int offset, int count) {1095assert count > 0;1096int i = start + offset;1097if (i >= list.length) {1098i -= list.length;1099}1100if (list[i] == 0) {1101list[i] = count;1102++length;1103} else if (count < list[i]) {1104list[i] = count;1105}1106}11071108/**1109* @param offset [1..maxLength]1110*/1111public boolean containsOffset(int offset) {1112int i = start + offset;1113if (i >= list.length) {1114i -= list.length;1115}1116return list[i] != 0;1117}11181119/**1120* @param offset [1..maxLength]1121*/1122public boolean hasCountAtOffset(int offset, int count) {1123int i = start + offset;1124if (i >= list.length) {1125i -= list.length;1126}1127int oldCount = list[i];1128return oldCount != 0 && oldCount <= count;1129}11301131/**1132* Finds the lowest stored offset from a non-empty list, removes it,1133* and reduces all other offsets by this minimum.1134* @return min=[1..maxLength]1135*/1136public int popMinimum(OutputInt outCount) {1137// Look for the next offset in list[start+1..list.length-1].1138int i = start, result;1139while (++i < list.length) {1140int count = list[i];1141if (count != 0) {1142list[i] = 0;1143--length;1144result = i - start;1145start = i;1146if (outCount != null) { outCount.value = count; }1147return result;1148}1149}1150// i==list.length11511152// Wrap around and look for the next offset in list[0..start].1153// Since the list is not empty, there will be one.1154result = list.length - start;1155i = 0;1156int count;1157while ((count = list[i]) == 0) {1158++i;1159}1160list[i] = 0;1161--length;1162start = i;1163if (outCount != null) { outCount.value = count; }1164return result + i;1165}1166}1167}116811691170