Path: blob/master/src/java.base/share/classes/jdk/internal/icu/impl/BMPSet.java
41161 views
/*1* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26******************************************************************************27*28* Copyright (C) 2009-2014, International Business Machines29* Corporation and others. All Rights Reserved.30*31******************************************************************************32*/3334package jdk.internal.icu.impl;3536import jdk.internal.icu.text.UnicodeSet.SpanCondition;37import jdk.internal.icu.util.OutputInt;3839/**40* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.41*42* Latin-1: Look up bytes.43* 2-byte characters: Bits organized vertically.44* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.45* Supplementary characters: Call contains() on the parent set.46*/47public final class BMPSet {4849/**50* One boolean ('true' or 'false') per Latin-1 character.51*/52private boolean[] latin1Contains;5354/**55* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points56* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}57* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)58*59* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at60* runtime.61*/62private int[] table7FF;6364/**65* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks66* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}67* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit68* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed69* and set.contains(c) must be called.70*71* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster72* validity checking at runtime.73*/74private int[] bmpBlockBits;7576/**77* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,78* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are79* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.80*/81private int[] list4kStarts;8283/**84* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for85* supplementary code points. The list is terminated with list[listLength-1]=0x110000.86*/87private final int[] list;88private final int listLength; // length used; list may be longer to minimize reallocs8990public BMPSet(final int[] parentList, int parentListLength) {91list = parentList;92listLength = parentListLength;93latin1Contains = new boolean[0x100];94table7FF = new int[64];95bmpBlockBits = new int[64];96list4kStarts = new int[18];9798/*99* Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the100* first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of101* indexes is for finding supplementary code points.102*/103list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);104int i;105for (i = 1; i <= 0x10; ++i) {106list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);107}108list4kStarts[0x11] = listLength - 1;109110initBits();111}112113public boolean contains(int c) {114if (c <= 0xff) {115return (latin1Contains[c]);116} else if (c <= 0x7ff) {117return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);118} else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {119int lead = c >> 12;120int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;121if (twoBits <= 1) {122// All 64 code points with the same bits 15..6123// are either in the set or not.124return (0 != twoBits);125} else {126// Look up the code point in its 4k block of code points.127return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);128}129} else if (c <= 0x10ffff) {130// surrogate or supplementary code point131return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);132} else {133// Out-of-range code points get false, consistent with long-standing134// behavior of UnicodeSet.contains(c).135return false;136}137}138139/**140* Span the initial substring for which each character c has spanCondition==contains(c). It must be141* spanCondition==0 or 1.142*143* @param start The start index144* @param outCount If not null: Receives the number of code points in the span.145* @return the limit (exclusive end) of the span146*147* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for148* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points149* as usual in ICU.150*/151public final int span(CharSequence s, int start, SpanCondition spanCondition,152OutputInt outCount) {153char c, c2;154int i = start;155int limit = s.length();156int numSupplementary = 0;157if (SpanCondition.NOT_CONTAINED != spanCondition) {158// span159while (i < limit) {160c = s.charAt(i);161if (c <= 0xff) {162if (!latin1Contains[c]) {163break;164}165} else if (c <= 0x7ff) {166if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {167break;168}169} else if (c < 0xd800 ||170c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {171int lead = c >> 12;172int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;173if (twoBits <= 1) {174// All 64 code points with the same bits 15..6175// are either in the set or not.176if (twoBits == 0) {177break;178}179} else {180// Look up the code point in its 4k block of code points.181if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {182break;183}184}185} else {186// surrogate pair187int supplementary = UCharacterProperty.getRawSupplementary(c, c2);188if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {189break;190}191++numSupplementary;192++i;193}194++i;195}196} else {197// span not198while (i < limit) {199c = s.charAt(i);200if (c <= 0xff) {201if (latin1Contains[c]) {202break;203}204} else if (c <= 0x7ff) {205if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {206break;207}208} else if (c < 0xd800 ||209c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {210int lead = c >> 12;211int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;212if (twoBits <= 1) {213// All 64 code points with the same bits 15..6214// are either in the set or not.215if (twoBits != 0) {216break;217}218} else {219// Look up the code point in its 4k block of code points.220if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {221break;222}223}224} else {225// surrogate pair226int supplementary = UCharacterProperty.getRawSupplementary(c, c2);227if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {228break;229}230++numSupplementary;231++i;232}233++i;234}235}236if (outCount != null) {237int spanLength = i - start;238outCount.value = spanLength - numSupplementary; // number of code points239}240return i;241}242243/**244* Symmetrical with span().245* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=246* limit and spanCondition==0 or 1.247*248* @return The string index which starts the span (i.e. inclusive).249*/250public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {251char c, c2;252253if (SpanCondition.NOT_CONTAINED != spanCondition) {254// span255for (;;) {256c = s.charAt(--limit);257if (c <= 0xff) {258if (!latin1Contains[c]) {259break;260}261} else if (c <= 0x7ff) {262if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {263break;264}265} else if (c < 0xd800 ||266c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {267int lead = c >> 12;268int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;269if (twoBits <= 1) {270// All 64 code points with the same bits 15..6271// are either in the set or not.272if (twoBits == 0) {273break;274}275} else {276// Look up the code point in its 4k block of code points.277if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {278break;279}280}281} else {282// surrogate pair283int supplementary = UCharacterProperty.getRawSupplementary(c2, c);284if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {285break;286}287--limit;288}289if (0 == limit) {290return 0;291}292}293} else {294// span not295for (;;) {296c = s.charAt(--limit);297if (c <= 0xff) {298if (latin1Contains[c]) {299break;300}301} else if (c <= 0x7ff) {302if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {303break;304}305} else if (c < 0xd800 ||306c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {307int lead = c >> 12;308int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;309if (twoBits <= 1) {310// All 64 code points with the same bits 15..6311// are either in the set or not.312if (twoBits != 0) {313break;314}315} else {316// Look up the code point in its 4k block of code points.317if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {318break;319}320}321} else {322// surrogate pair323int supplementary = UCharacterProperty.getRawSupplementary(c2, c);324if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {325break;326}327--limit;328}329if (0 == limit) {330return 0;331}332}333}334return limit + 1;335}336337/**338* Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800339*/340private static void set32x64Bits(int[] table, int start, int limit) {341assert (64 == table.length);342int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.343int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.344345// Set one bit indicating an all-one block.346int bits = 1 << lead;347if ((start + 1) == limit) { // Single-character shortcut.348table[trail] |= bits;349return;350}351352int limitLead = limit >> 6;353int limitTrail = limit & 0x3f;354355if (lead == limitLead) {356// Partial vertical bit column.357while (trail < limitTrail) {358table[trail++] |= bits;359}360} else {361// Partial vertical bit column,362// followed by a bit rectangle,363// followed by another partial vertical bit column.364if (trail > 0) {365do {366table[trail++] |= bits;367} while (trail < 64);368++lead;369}370if (lead < limitLead) {371bits = ~((1 << lead) - 1);372if (limitLead < 0x20) {373bits &= (1 << limitLead) - 1;374}375for (trail = 0; trail < 64; ++trail) {376table[trail] |= bits;377}378}379// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.380// In that case, bits=1<<limitLead == 1<<0 == 1381// (because Java << uses only the lower 5 bits of the shift operand)382// but the bits value is not used because trail<limitTrail is already false.383bits = 1 << limitLead;384for (trail = 0; trail < limitTrail; ++trail) {385table[trail] |= bits;386}387}388}389390private void initBits() {391int start, limit;392int listIndex = 0;393394// Set latin1Contains[].395do {396start = list[listIndex++];397if (listIndex < listLength) {398limit = list[listIndex++];399} else {400limit = 0x110000;401}402if (start >= 0x100) {403break;404}405do {406latin1Contains[start++] = true;407} while (start < limit && start < 0x100);408} while (limit <= 0x100);409410// Set table7FF[].411while (start < 0x800) {412set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);413if (limit > 0x800) {414start = 0x800;415break;416}417418start = list[listIndex++];419if (listIndex < listLength) {420limit = list[listIndex++];421} else {422limit = 0x110000;423}424}425426// Set bmpBlockBits[].427int minStart = 0x800;428while (start < 0x10000) {429if (limit > 0x10000) {430limit = 0x10000;431}432433if (start < minStart) {434start = minStart;435}436if (start < limit) { // Else: Another range entirely in a known mixed-value block.437if (0 != (start & 0x3f)) {438// Mixed-value block of 64 code points.439start >>= 6;440bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);441start = (start + 1) << 6; // Round up to the next block boundary.442minStart = start; // Ignore further ranges in this block.443}444if (start < limit) {445if (start < (limit & ~0x3f)) {446// Multiple all-ones blocks of 64 code points each.447set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);448}449450if (0 != (limit & 0x3f)) {451// Mixed-value block of 64 code points.452limit >>= 6;453bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);454limit = (limit + 1) << 6; // Round up to the next block boundary.455minStart = limit; // Ignore further ranges in this block.456}457}458}459460if (limit == 0x10000) {461break;462}463464start = list[listIndex++];465if (listIndex < listLength) {466limit = list[listIndex++];467} else {468limit = 0x110000;469}470}471}472473/**474* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code475* points in a certain range.476*477* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and478* hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.479*480* @param c481* a character in a subrange of MIN_VALUE..MAX_VALUE482* @param lo483* The lowest index to be returned.484* @param hi485* The highest index to be returned.486* @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]487*/488private int findCodePoint(int c, int lo, int hi) {489/* Examples:490findCodePoint(c)491set list[] c=0 1 3 4 7 8492=== ============== ===========493[] [110000] 0 0 0 0 0 0494[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2495[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2496[:Any:] [0, 110000] 1 1 1 1 1 1497*/498499// Return the smallest i such that c < list[i]. Assume500// list[len - 1] == HIGH and that c is legal (0..HIGH-1).501if (c < list[lo])502return lo;503// High runner test. c is often after the last range, so an504// initial check for this condition pays off.505if (lo >= hi || c >= list[hi - 1])506return hi;507// invariant: c >= list[lo]508// invariant: c < list[hi]509for (;;) {510int i = (lo + hi) >>> 1;511if (i == lo) {512break; // Found!513} else if (c < list[i]) {514hi = i;515} else {516lo = i;517}518}519return hi;520}521522private final boolean containsSlow(int c, int lo, int hi) {523return (0 != (findCodePoint(c, lo, hi) & 1));524}525}526527528529