Path: blob/master/src/java.base/share/classes/jdk/internal/icu/util/CodePointMap.java
41161 views
/*1* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24// (c) 2018 and later: Unicode, Inc. and others.25// License & terms of use: http://www.unicode.org/copyright.html#License2627// created: 2018may10 Markus W. Scherer2829package jdk.internal.icu.util;3031import java.util.Iterator;32import java.util.NoSuchElementException;3334/**35* Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.36* This does not implement java.util.Map.37*38* @stable ICU 6339*/40public abstract class CodePointMap implements Iterable<CodePointMap.Range> {41/**42* Selectors for how getRange() should report value ranges overlapping with surrogates.43* Most users should use NORMAL.44*45* @see #getRange46* @stable ICU 6347*/48public enum RangeOption {49/**50* getRange() enumerates all same-value ranges as stored in the map.51* Most users should use this option.52*53* @stable ICU 6354*/55NORMAL,56/**57* getRange() enumerates all same-value ranges as stored in the map,58* except that lead surrogates (U+D800..U+DBFF) are treated as having the59* surrogateValue, which is passed to getRange() as a separate parameter.60* The surrogateValue is not transformed via filter().61* See {@link Character#isHighSurrogate}.62*63* <p>Most users should use NORMAL instead.64*65* <p>This option is useful for maps that map surrogate code *units* to66* special values optimized for UTF-16 string processing67* or for special error behavior for unpaired surrogates,68* but those values are not to be associated with the lead surrogate code *points*.69*70* @stable ICU 6371*/72FIXED_LEAD_SURROGATES,73/**74* getRange() enumerates all same-value ranges as stored in the map,75* except that all surrogates (U+D800..U+DFFF) are treated as having the76* surrogateValue, which is passed to getRange() as a separate parameter.77* The surrogateValue is not transformed via filter().78* See {@link Character#isSurrogate}.79*80* <p>Most users should use NORMAL instead.81*82* <p>This option is useful for maps that map surrogate code *units* to83* special values optimized for UTF-16 string processing84* or for special error behavior for unpaired surrogates,85* but those values are not to be associated with the lead surrogate code *points*.86*87* @stable ICU 6388*/89FIXED_ALL_SURROGATES90}9192/**93* Callback function interface: Modifies a map value.94* Optionally called by getRange().95* The modified value will be returned by the getRange() function.96*97* <p>Can be used to ignore some of the value bits,98* make a filter for one of several values,99* return a value index computed from the map value, etc.100*101* @see #getRange102* @see #iterator103* @stable ICU 63104*/105public interface ValueFilter {106/**107* Modifies the map value.108*109* @param value map value110* @return modified value111* @stable ICU 63112*/113public int apply(int value);114}115116/**117* Range iteration result data.118* Code points from start to end map to the same value.119* The value may have been modified by {@link ValueFilter#apply(int)},120* or it may be the surrogateValue if a RangeOption other than "normal" was used.121*122* @see #getRange123* @see #iterator124* @stable ICU 63125*/126public static final class Range {127private int start;128private int end;129private int value;130131/**132* Constructor. Sets start and end to -1 and value to 0.133*134* @stable ICU 63135*/136public Range() {137start = end = -1;138value = 0;139}140141/**142* @return the start code point143* @stable ICU 63144*/145public int getStart() { return start; }146/**147* @return the (inclusive) end code point148* @stable ICU 63149*/150public int getEnd() { return end; }151/**152* @return the range value153* @stable ICU 63154*/155public int getValue() { return value; }156/**157* Sets the range. When using {@link #iterator()},158* iteration will resume after the newly set end.159*160* @param start new start code point161* @param end new end code point162* @param value new value163* @stable ICU 63164*/165public void set(int start, int end, int value) {166this.start = start;167this.end = end;168this.value = value;169}170}171172private final class RangeIterator implements Iterator<Range> {173private Range range = new Range();174175@Override176public boolean hasNext() {177return -1 <= range.end && range.end < 0x10ffff;178}179180@Override181public Range next() {182if (getRange(range.end + 1, null, range)) {183return range;184} else {185throw new NoSuchElementException();186}187}188189@Override190public final void remove() {191throw new UnsupportedOperationException();192}193}194195/**196* Iterates over code points of a string and fetches map values.197* This does not implement java.util.Iterator.198*199* <pre>200* void onString(CodePointMap map, CharSequence s, int start) {201* CodePointMap.StringIterator iter = map.stringIterator(s, start);202* while (iter.next()) {203* int end = iter.getIndex(); // code point from between start and end204* useValue(s, start, end, iter.getCodePoint(), iter.getValue());205* start = end;206* }207* }208* </pre>209*210* <p>This class is not intended for public subclassing.211*212* @stable ICU 63213*/214public class StringIterator {215/**216* @internal217* @deprecated This API is ICU internal only.218*/219@Deprecated220protected CharSequence s;221/**222* @internal223* @deprecated This API is ICU internal only.224*/225@Deprecated226protected int sIndex;227/**228* @internal229* @deprecated This API is ICU internal only.230*/231@Deprecated232protected int c;233/**234* @internal235* @deprecated This API is ICU internal only.236*/237@Deprecated238protected int value;239240/**241* @internal242* @deprecated This API is ICU internal only.243*/244@Deprecated245protected StringIterator(CharSequence s, int sIndex) {246this.s = s;247this.sIndex = sIndex;248c = -1;249value = 0;250}251252/**253* Resets the iterator to a new string and/or a new string index.254*255* @param s string to iterate over256* @param sIndex string index where the iteration will start257* @stable ICU 63258*/259public void reset(CharSequence s, int sIndex) {260this.s = s;261this.sIndex = sIndex;262c = -1;263value = 0;264}265266/**267* Reads the next code point, post-increments the string index,268* and gets a value from the map.269* Sets an implementation-defined error value if the code point is an unpaired surrogate.270*271* @return true if the string index was not yet at the end of the string;272* otherwise the iterator did not advance273* @stable ICU 63274*/275public boolean next() {276if (sIndex >= s.length()) {277return false;278}279c = Character.codePointAt(s, sIndex);280sIndex += Character.charCount(c);281value = get(c);282return true;283}284285/**286* Reads the previous code point, pre-decrements the string index,287* and gets a value from the map.288* Sets an implementation-defined error value if the code point is an unpaired surrogate.289*290* @return true if the string index was not yet at the start of the string;291* otherwise the iterator did not advance292* @stable ICU 63293*/294public boolean previous() {295if (sIndex <= 0) {296return false;297}298c = Character.codePointBefore(s, sIndex);299sIndex -= Character.charCount(c);300value = get(c);301return true;302}303/**304* @return the string index305* @stable ICU 63306*/307public final int getIndex() { return sIndex; }308/**309* @return the code point310* @stable ICU 63311*/312public final int getCodePoint() { return c; }313/**314* @return the map value,315* or an implementation-defined error value if316* the code point is an unpaired surrogate317* @stable ICU 63318*/319public final int getValue() { return value; }320}321322/**323* Protected no-args constructor.324*325* @stable ICU 63326*/327protected CodePointMap() {328}329330/**331* Returns the value for a code point as stored in the map, with range checking.332* Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.333*334* @param c the code point335* @return the map value,336* or an implementation-defined error value if337* the code point is not in the range 0..U+10FFFF338* @stable ICU 63339*/340public abstract int get(int c);341342/**343* Sets the range object to a range of code points beginning with the start parameter.344* The range start is the same as the start input parameter345* (even if there are preceding code points that have the same value).346* The range end is the last code point such that347* all those from start to there have the same value.348* Returns false if start is not 0..U+10FFFF.349* Can be used to efficiently iterate over all same-value ranges in a map.350* (This is normally faster than iterating over code points and get()ting each value,351* but may be much slower than a data structure that stores ranges directly.)352*353* <p>If the {@link ValueFilter} parameter is not null, then354* the value to be delivered is passed through that filter, and the return value is the end355* of the range where all values are modified to the same actual value.356* The value is unchanged if that parameter is null.357*358* <p>Example:359* <pre>360* int start = 0;361* CodePointMap.Range range = new CodePointMap.Range();362* while (map.getRange(start, null, range)) {363* int end = range.getEnd();364* int value = range.getValue();365* // Work with the range start..end and its value.366* start = end + 1;367* }368* </pre>369*370* @param start range start371* @param filter an object that may modify the map data value,372* or null if the values from the map are to be used unmodified373* @param range the range object that will be set to the code point range and value374* @return true if start is 0..U+10FFFF; otherwise no new range is fetched375* @stable ICU 63376*/377public abstract boolean getRange(int start, ValueFilter filter, Range range);378379/**380* Sets the range object to a range of code points beginning with the start parameter.381* The range start is the same as the start input parameter382* (even if there are preceding code points that have the same value).383* The range end is the last code point such that384* all those from start to there have the same value.385* Returns false if start is not 0..U+10FFFF.386*387* <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally388* modifies the range if it overlaps with surrogate code points.389*390* @param start range start391* @param option defines whether surrogates are treated normally,392* or as having the surrogateValue; usually {@link RangeOption#NORMAL}393* @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}394* @param filter an object that may modify the map data value,395* or null if the values from the map are to be used unmodified396* @param range the range object that will be set to the code point range and value397* @return true if start is 0..U+10FFFF; otherwise no new range is fetched398* @stable ICU 63399*/400public boolean getRange(int start, RangeOption option, int surrogateValue,401ValueFilter filter, Range range) {402assert option != null;403if (!getRange(start, filter, range)) {404return false;405}406if (option == RangeOption.NORMAL) {407return true;408}409int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;410int end = range.end;411if (end < 0xd7ff || start > surrEnd) {412return true;413}414// The range overlaps with surrogates, or ends just before the first one.415if (range.value == surrogateValue) {416if (end >= surrEnd) {417// Surrogates followed by a non-surrValue range,418// or surrogates are part of a larger surrValue range.419return true;420}421} else {422if (start <= 0xd7ff) {423range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates.424return true;425}426// Start is a surrogate with a non-surrValue code *unit* value.427// Return a surrValue code *point* range.428range.value = surrogateValue;429if (end > surrEnd) {430range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range.431return true;432}433}434// See if the surrValue surrogate range can be merged with435// an immediately following range.436if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {437range.start = start;438return true;439}440range.start = start;441range.end = surrEnd;442range.value = surrogateValue;443return true;444}445446/**447* Convenience iterator over same-map-value code point ranges.448* Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}449* without filtering.450* Adjacent ranges have different map values.451*452* <p>The iterator always returns the same Range object.453*454* @return a Range iterator455* @stable ICU 63456*/457@Override458public Iterator<Range> iterator() {459return new RangeIterator();460}461462/**463* Returns an iterator (not a java.util.Iterator) over code points of a string464* for fetching map values.465*466* @param s string to iterate over467* @param sIndex string index where the iteration will start468* @return the iterator469* @stable ICU 63470*/471public StringIterator stringIterator(CharSequence s, int sIndex) {472return new StringIterator(s, sIndex);473}474}475476477