// Path: blob/master/thirdparty/icu4c/common/bmpset.cpp
// (code-viewer residue: 10278 views)
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3******************************************************************************4*5* Copyright (C) 2007-2012, International Business Machines6* Corporation and others. All Rights Reserved.7*8******************************************************************************9* file name: bmpset.cpp10* encoding: UTF-811* tab size: 8 (not used)12* indentation:413*14* created on: 2007jan2915* created by: Markus W. Scherer16*/1718#include "unicode/utypes.h"19#include "unicode/uniset.h"20#include "unicode/utf8.h"21#include "unicode/utf16.h"22#include "cmemory.h"23#include "bmpset.h"24#include "uassert.h"2526U_NAMESPACE_BEGIN2728BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :29list(parentList), listLength(parentListLength) {30uprv_memset(latin1Contains, 0, sizeof(latin1Contains));31uprv_memset(table7FF, 0, sizeof(table7FF));32uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));3334/*35* Set the list indexes for binary searches for36* U+0800, U+1000, U+2000, .., U+F000, U+10000.37* U+0800 is the first 3-byte-UTF-8 code point. 
Lower code points are38* looked up in the bit tables.39* The last pair of indexes is for finding supplementary code points.40*/41list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);42int32_t i;43for(i=1; i<=0x10; ++i) {44list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);45}46list4kStarts[0x11]=listLength-1;47containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);4849initBits();50overrideIllegal();51}5253BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :54containsFFFD(otherBMPSet.containsFFFD),55list(newParentList), listLength(newParentListLength) {56uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));57uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));58uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));59uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));60}6162BMPSet::~BMPSet() {63}6465/*66* Set bits in a bit rectangle in "vertical" bit organization.67* start<limit<=0x80068*/69static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {70U_ASSERT(start<limit);71U_ASSERT(limit<=0x800);7273int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.74int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.7576// Set one bit indicating an all-one block.77uint32_t bits = static_cast<uint32_t>(1) << lead;78if((start+1)==limit) { // Single-character shortcut.79table[trail]|=bits;80return;81}8283int32_t limitLead=limit>>6;84int32_t limitTrail=limit&0x3f;8586if(lead==limitLead) {87// Partial vertical bit column.88while(trail<limitTrail) {89table[trail++]|=bits;90}91} else {92// Partial vertical bit column,93// followed by a bit rectangle,94// followed by another partial vertical bit column.95if(trail>0) {96do {97table[trail++]|=bits;98} while(trail<64);99++lead;100}101if(lead<limitLead) {102bits = ~((static_cast<unsigned>(1) << lead) - 
1);103if(limitLead<0x20) {104bits &= (static_cast<unsigned>(1) << limitLead) - 1;105}106for(trail=0; trail<64; ++trail) {107table[trail]|=bits;108}109}110// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.111// In that case, bits=1<<limitLead is undefined but the bits value112// is not used because trail<limitTrail is already false.113bits = static_cast<uint32_t>(1) << ((limitLead == 0x20) ? (limitLead - 1) : limitLead);114for(trail=0; trail<limitTrail; ++trail) {115table[trail]|=bits;116}117}118}119120void BMPSet::initBits() {121UChar32 start, limit;122int32_t listIndex=0;123124// Set latin1Contains[].125do {126start=list[listIndex++];127if(listIndex<listLength) {128limit=list[listIndex++];129} else {130limit=0x110000;131}132if(start>=0x100) {133break;134}135do {136latin1Contains[start++]=1;137} while(start<limit && start<0x100);138} while(limit<=0x100);139140// Find the first range overlapping with (or after) 80..FF again,141// to include them in table7FF as well.142for(listIndex=0;;) {143start=list[listIndex++];144if(listIndex<listLength) {145limit=list[listIndex++];146} else {147limit=0x110000;148}149if(limit>0x80) {150if(start<0x80) {151start=0x80;152}153break;154}155}156157// Set table7FF[].158while(start<0x800) {159set32x64Bits(table7FF, start, limit<=0x800 ? 
limit : 0x800);160if(limit>0x800) {161start=0x800;162break;163}164165start=list[listIndex++];166if(listIndex<listLength) {167limit=list[listIndex++];168} else {169limit=0x110000;170}171}172173// Set bmpBlockBits[].174int32_t minStart=0x800;175while(start<0x10000) {176if(limit>0x10000) {177limit=0x10000;178}179180if(start<minStart) {181start=minStart;182}183if(start<limit) { // Else: Another range entirely in a known mixed-value block.184if(start&0x3f) {185// Mixed-value block of 64 code points.186start>>=6;187bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);188start=(start+1)<<6; // Round up to the next block boundary.189minStart=start; // Ignore further ranges in this block.190}191if(start<limit) {192if(start<(limit&~0x3f)) {193// Multiple all-ones blocks of 64 code points each.194set32x64Bits(bmpBlockBits, start>>6, limit>>6);195}196197if(limit&0x3f) {198// Mixed-value block of 64 code points.199limit>>=6;200bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);201limit=(limit+1)<<6; // Round up to the next block boundary.202minStart=limit; // Ignore further ranges in this block.203}204}205}206207if(limit==0x10000) {208break;209}210211start=list[listIndex++];212if(listIndex<listLength) {213limit=list[listIndex++];214} else {215limit=0x110000;216}217}218}219220/*221* Override some bits and bytes to the result of contains(FFFD)222* for faster validity checking at runtime.223* No need to set 0 values where they were reset to 0 in the constructor224* and not modified by initBits().225* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)226* Need to set 0 values for surrogates D800..DFFF.227*/228void BMPSet::overrideIllegal() {229uint32_t bits, mask;230int32_t i;231232if(containsFFFD) {233bits=3; // Lead bytes 0xC0 and 0xC1.234for(i=0; i<64; ++i) {235table7FF[i]|=bits;236}237238bits=1; // Lead byte 0xE0.239for(i=0; i<32; ++i) { // First half of 4k block.240bmpBlockBits[i]|=bits;241}242243mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.244bits=1<<0xd;245for(i=32; i<64; ++i) 
{ // Second half of 4k block.246bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;247}248} else {249mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.250for(i=32; i<64; ++i) { // Second half of 4k block.251bmpBlockBits[i]&=mask;252}253}254}255256int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {257/* Examples:258findCodePoint(c)259set list[] c=0 1 3 4 7 8260=== ============== ===========261[] [110000] 0 0 0 0 0 0262[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2263[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2264[:Any:] [0, 110000] 1 1 1 1 1 1265*/266267// Return the smallest i such that c < list[i]. Assume268// list[len - 1] == HIGH and that c is legal (0..HIGH-1).269if (c < list[lo])270return lo;271// High runner test. c is often after the last range, so an272// initial check for this condition pays off.273if (lo >= hi || c >= list[hi-1])274return hi;275// invariant: c >= list[lo]276// invariant: c < list[hi]277for (;;) {278int32_t i = (lo + hi) >> 1;279if (i == lo) {280break; // Found!281} else if (c < list[i]) {282hi = i;283} else {284lo = i;285}286}287return hi;288}289290UBool291BMPSet::contains(UChar32 c) const {292if (static_cast<uint32_t>(c) <= 0xff) {293return latin1Contains[c];294} else if (static_cast<uint32_t>(c) <= 0x7ff) {295return (table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0;296} else if (static_cast<uint32_t>(c) < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {297int lead=c>>12;298uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;299if(twoBits<=1) {300// All 64 code points with the same bits 15..6301// are either in the set or not.302return twoBits;303} else {304// Look up the code point in its 4k block of code points.305return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);306}307} else if (static_cast<uint32_t>(c) <= 0x10ffff) {308// surrogate or supplementary code point309return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);310} else {311// Out-of-range code points get false, 
consistent with long-standing312// behavior of UnicodeSet::contains(c).313return false;314}315}316317/*318* Check for sufficient length for trail unit for each surrogate pair.319* Handle single surrogates as surrogate code points as usual in ICU.320*/321const char16_t *322BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {323char16_t c, c2;324325if(spanCondition) {326// span327do {328c=*s;329if(c<=0xff) {330if(!latin1Contains[c]) {331break;332}333} else if(c<=0x7ff) {334if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {335break;336}337} else if(c<0xd800 || c>=0xe000) {338int lead=c>>12;339uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;340if(twoBits<=1) {341// All 64 code points with the same bits 15..6342// are either in the set or not.343if(twoBits==0) {344break;345}346} else {347// Look up the code point in its 4k block of code points.348if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {349break;350}351}352} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {353// surrogate code point354if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {355break;356}357} else {358// surrogate pair359if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {360break;361}362++s;363}364} while(++s<limit);365} else {366// span not367do {368c=*s;369if(c<=0xff) {370if(latin1Contains[c]) {371break;372}373} else if(c<=0x7ff) {374if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {375break;376}377} else if(c<0xd800 || c>=0xe000) {378int lead=c>>12;379uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;380if(twoBits<=1) {381// All 64 code points with the same bits 15..6382// are either in the set or not.383if(twoBits!=0) {384break;385}386} else {387// Look up the code point in its 4k block of code points.388if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {389break;390}391}392} else if(c>=0xdc00 || 
(s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {393// surrogate code point394if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {395break;396}397} else {398// surrogate pair399if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {400break;401}402++s;403}404} while(++s<limit);405}406return s;407}408409/* Symmetrical with span(). */410const char16_t *411BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {412char16_t c, c2;413414if(spanCondition) {415// span416for(;;) {417c=*(--limit);418if(c<=0xff) {419if(!latin1Contains[c]) {420break;421}422} else if(c<=0x7ff) {423if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {424break;425}426} else if(c<0xd800 || c>=0xe000) {427int lead=c>>12;428uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;429if(twoBits<=1) {430// All 64 code points with the same bits 15..6431// are either in the set or not.432if(twoBits==0) {433break;434}435} else {436// Look up the code point in its 4k block of code points.437if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {438break;439}440}441} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {442// surrogate code point443if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {444break;445}446} else {447// surrogate pair448if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {449break;450}451--limit;452}453if(s==limit) {454return s;455}456}457} else {458// span not459for(;;) {460c=*(--limit);461if(c<=0xff) {462if(latin1Contains[c]) {463break;464}465} else if(c<=0x7ff) {466if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {467break;468}469} else if(c<0xd800 || c>=0xe000) {470int lead=c>>12;471uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;472if(twoBits<=1) {473// All 64 code points with the same bits 15..6474// are either in the set or not.475if(twoBits!=0) {476break;477}478} 
else {479// Look up the code point in its 4k block of code points.480if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {481break;482}483}484} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {485// surrogate code point486if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {487break;488}489} else {490// surrogate pair491if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {492break;493}494--limit;495}496if(s==limit) {497return s;498}499}500}501return limit+1;502}503504/*505* Precheck for sufficient trail bytes at end of string only once per span.506* Check validity.507*/508const uint8_t *509BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {510const uint8_t *limit=s+length;511uint8_t b=*s;512if(U8_IS_SINGLE(b)) {513// Initial all-ASCII span.514if(spanCondition) {515do {516if(!latin1Contains[b] || ++s==limit) {517return s;518}519b=*s;520} while(U8_IS_SINGLE(b));521} else {522do {523if(latin1Contains[b] || ++s==limit) {524return s;525}526b=*s;527} while(U8_IS_SINGLE(b));528}529length = static_cast<int32_t>(limit - s);530}531532if(spanCondition!=USET_SPAN_NOT_CONTAINED) {533spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.534}535536const uint8_t *limit0=limit;537538/*539* Make sure that the last 1/2/3/4-byte sequence before limit is complete540* or runs into a lead byte.541* In the span loop compare s with limit only once542* per multi-byte character.543*544* Give a trailing illegal sequence the same value as the result of contains(FFFD),545* including it if that is part of the span, otherwise set limit0 to before546* the truncated sequence.547*/548b=*(limit-1);549if (static_cast<int8_t>(b) < 0) {550// b>=0x80: lead or trail byte551if(b<0xc0) {552// single trail byte, check for preceding 3- or 4-byte lead byte553if(length>=2 && (b=*(limit-2))>=0xe0) {554limit-=2;555if(containsFFFD!=spanCondition) {556limit0=limit;557}558} else if(b<0xc0 && b>=0x80 && 
length>=3 && (b=*(limit-3))>=0xf0) {559// 4-byte lead byte with only two trail bytes560limit-=3;561if(containsFFFD!=spanCondition) {562limit0=limit;563}564}565} else {566// lead byte with no trail bytes567--limit;568if(containsFFFD!=spanCondition) {569limit0=limit;570}571}572}573574uint8_t t1, t2, t3;575576while(s<limit) {577b=*s;578if(U8_IS_SINGLE(b)) {579// ASCII580if(spanCondition) {581do {582if(!latin1Contains[b]) {583return s;584} else if(++s==limit) {585return limit0;586}587b=*s;588} while(U8_IS_SINGLE(b));589} else {590do {591if(latin1Contains[b]) {592return s;593} else if(++s==limit) {594return limit0;595}596b=*s;597} while(U8_IS_SINGLE(b));598}599}600++s; // Advance past the lead byte.601if(b>=0xe0) {602if(b<0xf0) {603if( /* handle U+0000..U+FFFF inline */604(t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&605(t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f606) {607b&=0xf;608uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;609if(twoBits<=1) {610// All 64 code points with this lead byte and middle trail byte611// are either in the set or not.612if (twoBits != static_cast<uint32_t>(spanCondition)) {613return s-1;614}615} else {616// Look up the code point in its 4k block of code points.617UChar32 c=(b<<12)|(t1<<6)|t2;618if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {619return s-1;620}621}622s+=2;623continue;624}625} else if( /* handle U+10000..U+10FFFF inline */626(t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&627(t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f &&628(t3 = static_cast<uint8_t>(s[2] - 0x80)) <= 0x3f629) {630// Give an illegal sequence the same value as the result of contains(FFFD).631UChar32 c = (static_cast<UChar32>(b - 0xf0) << 18) | (static_cast<UChar32>(t1) << 12) | (t2 << 6) | t3;632if( ( (0x10000<=c && c<=0x10ffff) ?633containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :634containsFFFD635) != spanCondition636) {637return s-1;638}639s+=3;640continue;641}642} else {643if( /* handle U+0000..U+07FF inline 
*/644b>=0xc0 &&645(t1 = static_cast<uint8_t>(*s - 0x80)) <= 0x3f646) {647if (static_cast<USetSpanCondition>((table7FF[t1] & (static_cast<uint32_t>(1) << (b & 0x1f))) != 0) != spanCondition) {648return s-1;649}650++s;651continue;652}653}654655// Give an illegal sequence the same value as the result of contains(FFFD).656// Handle each byte of an illegal sequence separately to simplify the code;657// no need to optimize error handling.658if(containsFFFD!=spanCondition) {659return s-1;660}661}662663return limit0;664}665666/*667* While going backwards through UTF-8 optimize only for ASCII.668* Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not669* possible to tell from the last byte in a multi-byte sequence how many670* preceding bytes there should be. Therefore, going backwards through UTF-8671* is much harder than going forward.672*/673int32_t674BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {675if(spanCondition!=USET_SPAN_NOT_CONTAINED) {676spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.677}678679uint8_t b;680681do {682b=s[--length];683if(U8_IS_SINGLE(b)) {684// ASCII sub-span685if(spanCondition) {686do {687if(!latin1Contains[b]) {688return length+1;689} else if(length==0) {690return 0;691}692b=s[--length];693} while(U8_IS_SINGLE(b));694} else {695do {696if(latin1Contains[b]) {697return length+1;698} else if(length==0) {699return 0;700}701b=s[--length];702} while(U8_IS_SINGLE(b));703}704}705706int32_t prev=length;707UChar32 c;708// trail byte: collect a multi-byte character709// (or lead byte in last-trail position)710c=utf8_prevCharSafeBody(s, 0, &length, b, -3);711// c is a valid code point, not ASCII, not a surrogate712if(c<=0x7ff) {713if (static_cast<USetSpanCondition>((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) != spanCondition) {714return prev+1;715}716} else if(c<=0xffff) {717int lead=c>>12;718uint32_t 
twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;719if(twoBits<=1) {720// All 64 code points with the same bits 15..6721// are either in the set or not.722if (twoBits != static_cast<uint32_t>(spanCondition)) {723return prev+1;724}725} else {726// Look up the code point in its 4k block of code points.727if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {728return prev+1;729}730}731} else {732if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {733return prev+1;734}735}736} while(length>0);737return 0;738}739740U_NAMESPACE_END741742743