Path: blob/master/thirdparty/icu4c/common/dictbe.cpp
10278 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/**3*******************************************************************************4* Copyright (C) 2006-2016, International Business Machines Corporation5* and others. All Rights Reserved.6*******************************************************************************7*/89#include <utility>1011#include "unicode/utypes.h"1213#if !UCONFIG_NO_BREAK_ITERATION1415#include "brkeng.h"16#include "dictbe.h"17#include "unicode/uniset.h"18#include "unicode/chariter.h"19#include "unicode/resbund.h"20#include "unicode/ubrk.h"21#include "unicode/usetiter.h"22#include "ubrkimpl.h"23#include "utracimp.h"24#include "uvectr32.h"25#include "uvector.h"26#include "uassert.h"27#include "unicode/normlzr.h"28#include "cmemory.h"29#include "dictionarydata.h"3031U_NAMESPACE_BEGIN3233/*34******************************************************************35*/3637DictionaryBreakEngine::DictionaryBreakEngine() {38}3940DictionaryBreakEngine::~DictionaryBreakEngine() {41}4243UBool44DictionaryBreakEngine::handles(UChar32 c, const char*) const {45return fSet.contains(c);46}4748int32_t49DictionaryBreakEngine::findBreaks( UText *text,50int32_t startPos,51int32_t endPos,52UVector32 &foundBreaks,53UBool isPhraseBreaking,54UErrorCode& status) const {55if (U_FAILURE(status)) return 0;56int32_t result = 0;5758// Find the span of characters included in the set.59// The span to break begins at the current position in the text, and60// extends towards the start or end of the text, depending on 'reverse'.6162utext_setNativeIndex(text, startPos);63int32_t start = static_cast<int32_t>(utext_getNativeIndex(text));64int32_t current;65int32_t rangeStart;66int32_t rangeEnd;67UChar32 c = utext_current32(text);68while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && fSet.contains(c)) {69utext_next32(text); // TODO: recast loop for postincrement70c = utext_current32(text);71}72rangeStart = start;73rangeEnd = current;74result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);75utext_setNativeIndex(text, current);7677return result;78}7980void81DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {82fSet = set;83// Compact for caching84fSet.compact();85}8687/*88******************************************************************89* PossibleWord90*/9192// Helper class for improving readability of the Thai/Lao/Khmer word break93// algorithm. The implementation is completely inline.9495// List size, limited by the maximum number of words in the dictionary96// that form a nested sequence.97static const int32_t POSSIBLE_WORD_LIST_MAX = 20;9899class PossibleWord {100private:101// list of word candidate lengths, in increasing length order102// TODO: bytes would be sufficient for word lengths.103int32_t count; // Count of candidates104int32_t prefix; // The longest match with a dictionary word105int32_t offset; // Offset in the text of these candidates106int32_t mark; // The preferred candidate's offset107int32_t current; // The candidate we're currently looking at108int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.109int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.110111public:112PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}113~PossibleWord() {}114115// Fill the list of candidates if needed, select the longest, and return the number found116int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );117118// Select the currently marked candidate, point after it in the text, and invalidate self119int32_t acceptMarked( UText *text );120121// Back up from the current candidate to the next shorter one; return true if that exists122// and point the text after it123UBool backUp( UText *text );124125// Return the longest prefix this candidate location shares with a dictionary word126// Return value is in code points.127int32_t longestPrefix() { return prefix; }128129// Mark the current candidate as the one we like130void markCurrent() { mark = current; }131132// Get length in code points of the marked word.133int32_t markedCPLength() { return cpLengths[mark]; }134};135136137int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {138// TODO: If getIndex is too slow, use offset < 0 and add discardAll()139int32_t start = static_cast<int32_t>(utext_getNativeIndex(text));140if (start != offset) {141offset = start;142count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);143// Dictionary leaves text after longest prefix, not longest word. Back up.144if (count <= 0) {145utext_setNativeIndex(text, start);146}147}148if (count > 0) {149utext_setNativeIndex(text, start+cuLengths[count-1]);150}151current = count-1;152mark = current;153return count;154}155156int32_t157PossibleWord::acceptMarked( UText *text ) {158utext_setNativeIndex(text, offset + cuLengths[mark]);159return cuLengths[mark];160}161162163UBool164PossibleWord::backUp( UText *text ) {165if (current > 0) {166utext_setNativeIndex(text, offset + cuLengths[--current]);167return true;168}169return false;170}171172/*173******************************************************************174* ThaiBreakEngine175*/176177// How many words in a row are "good enough"?178static const int32_t THAI_LOOKAHEAD = 3;179180// Will not combine a non-word with a preceding dictionary word longer than this181static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;182183// Will not combine a non-word that shares at least this much prefix with a184// dictionary word, with a preceding word185static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;186187// Elision character188static const int32_t THAI_PAIYANNOI = 0x0E2F;189190// Repeat character191static const int32_t THAI_MAIYAMOK = 0x0E46;192193// Minimum word size194static const int32_t THAI_MIN_WORD = 2;195196// Minimum number of characters for two words197static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;198199ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)200: DictionaryBreakEngine(),201fDictionary(adoptDictionary)202{203UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);204UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");205UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);206if (U_SUCCESS(status)) {207setCharacters(thaiWordSet);208}209fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);210fMarkSet.add(0x0020);211fEndWordSet = thaiWordSet;212fEndWordSet.remove(0x0E31); // MAI HAN-AKAT213fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI214fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK215fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI216fSuffixSet.add(THAI_PAIYANNOI);217fSuffixSet.add(THAI_MAIYAMOK);218219// Compact for caching.220fMarkSet.compact();221fEndWordSet.compact();222fBeginWordSet.compact();223fSuffixSet.compact();224UTRACE_EXIT_STATUS(status);225}226227ThaiBreakEngine::~ThaiBreakEngine() {228delete fDictionary;229}230231int32_t232ThaiBreakEngine::divideUpDictionaryRange( UText *text,233int32_t rangeStart,234int32_t rangeEnd,235UVector32 &foundBreaks,236UBool /* isPhraseBreaking */,237UErrorCode& status) const {238if (U_FAILURE(status)) return 0;239utext_setNativeIndex(text, rangeStart);240utext_moveIndex32(text, THAI_MIN_WORD_SPAN);241if (utext_getNativeIndex(text) >= rangeEnd) {242return 0; // Not enough characters for two words243}244utext_setNativeIndex(text, rangeStart);245246247uint32_t wordsFound = 0;248int32_t cpWordLength = 0; // Word Length in Code Points.249int32_t cuWordLength = 0; // Word length in code units (UText native indexing)250int32_t current;251PossibleWord words[THAI_LOOKAHEAD];252253utext_setNativeIndex(text, rangeStart);254255while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {256cpWordLength = 0;257cuWordLength = 0;258259// Look for candidate words at the current position260int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);261262// If we found exactly one, use that263if (candidates == 1) {264cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);265cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();266wordsFound += 1;267}268// If there was more than one, see which one can take us forward the most words269else if (candidates > 1) {270// If we're already at the end of the range, we're done271if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {272goto foundBest;273}274do {275if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {276// Followed by another dictionary word; mark first word as a good candidate277words[wordsFound%THAI_LOOKAHEAD].markCurrent();278279// If we're already at the end of the range, we're done280if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {281goto foundBest;282}283284// See if any of the possible second words is followed by a third word285do {286// If we find a third word, stop right away287if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {288words[wordsFound % THAI_LOOKAHEAD].markCurrent();289goto foundBest;290}291}292while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));293}294}295while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));296foundBest:297// Set UText position to after the accepted word.298cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);299cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();300wordsFound += 1;301}302303// We come here after having either found a word or not. We look ahead to the304// next word. If it's not a dictionary word, we will combine it with the word we305// just found (if there is one), but only if the preceding word does not exceed306// the threshold.307// The text iterator should now be positioned at the end of the word we found.308309UChar32 uc = 0;310if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {311// if it is a dictionary word, do nothing. If it isn't, then if there is312// no preceding word, or the non-word shares less than the minimum threshold313// of characters with a dictionary word, then scan to resynchronize314if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0315&& (cuWordLength == 0316|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {317// Look for a plausible word boundary318int32_t remaining = rangeEnd - (current+cuWordLength);319UChar32 pc;320int32_t chars = 0;321for (;;) {322int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));323pc = utext_next32(text);324int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;325chars += pcSize;326remaining -= pcSize;327if (remaining <= 0) {328break;329}330uc = utext_current32(text);331if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {332// Maybe. See if it's in the dictionary.333// NOTE: In the original Apple code, checked that the next334// two characters after uc were not 0x0E4C THANTHAKHAT before335// checking the dictionary. That is just a performance filter,336// but it's not clear it's faster than checking the trie.337int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);338utext_setNativeIndex(text, current + cuWordLength + chars);339if (num_candidates > 0) {340break;341}342}343}344345// Bump the word count if there wasn't already one346if (cuWordLength <= 0) {347wordsFound += 1;348}349350// Update the length with the passed-over characters351cuWordLength += chars;352}353else {354// Back up to where we were for next iteration355utext_setNativeIndex(text, current+cuWordLength);356}357}358359// Never stop before a combining mark.360int32_t currPos;361while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {362utext_next32(text);363cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;364}365366// Look ahead for possible suffixes if a dictionary word does not follow.367// We do this in code rather than using a rule so that the heuristic368// resynch continues to function. For example, one of the suffix characters369// could be a typo in the middle of a word.370if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cuWordLength > 0) {371if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0372&& fSuffixSet.contains(uc = utext_current32(text))) {373if (uc == THAI_PAIYANNOI) {374if (!fSuffixSet.contains(utext_previous32(text))) {375// Skip over previous end and PAIYANNOI376utext_next32(text);377int32_t paiyannoiIndex = static_cast<int32_t>(utext_getNativeIndex(text));378utext_next32(text);379cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - paiyannoiIndex; // Add PAIYANNOI to word380uc = utext_current32(text); // Fetch next character381}382else {383// Restore prior position384utext_next32(text);385}386}387if (uc == THAI_MAIYAMOK) {388if (utext_previous32(text) != THAI_MAIYAMOK) {389// Skip over previous end and MAIYAMOK390utext_next32(text);391int32_t maiyamokIndex = static_cast<int32_t>(utext_getNativeIndex(text));392utext_next32(text);393cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - maiyamokIndex; // Add MAIYAMOK to word394}395else {396// Restore prior position397utext_next32(text);398}399}400}401else {402utext_setNativeIndex(text, current+cuWordLength);403}404}405406// Did we find a word on this iteration? If so, push it on the break stack407if (cuWordLength > 0) {408foundBreaks.push((current+cuWordLength), status);409}410}411412// Don't return a break for the end of the dictionary range if there is one there.413if (foundBreaks.peeki() >= rangeEnd) {414(void) foundBreaks.popi();415wordsFound -= 1;416}417418return wordsFound;419}420421/*422******************************************************************423* LaoBreakEngine424*/425426// How many words in a row are "good enough"?427static const int32_t LAO_LOOKAHEAD = 3;428429// Will not combine a non-word with a preceding dictionary word longer than this430static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;431432// Will not combine a non-word that shares at least this much prefix with a433// dictionary word, with a preceding word434static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;435436// Minimum word size437static const int32_t LAO_MIN_WORD = 2;438439// Minimum number of characters for two words440static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;441442LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)443: DictionaryBreakEngine(),444fDictionary(adoptDictionary)445{446UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);447UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");448UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);449if (U_SUCCESS(status)) {450setCharacters(laoWordSet);451}452fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);453fMarkSet.add(0x0020);454fEndWordSet = laoWordSet;455fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels456fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)457fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)458fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels459460// Compact for caching.461fMarkSet.compact();462fEndWordSet.compact();463fBeginWordSet.compact();464UTRACE_EXIT_STATUS(status);465}466467LaoBreakEngine::~LaoBreakEngine() {468delete fDictionary;469}470471int32_t472LaoBreakEngine::divideUpDictionaryRange( UText *text,473int32_t rangeStart,474int32_t rangeEnd,475UVector32 &foundBreaks,476UBool /* isPhraseBreaking */,477UErrorCode& status) const {478if (U_FAILURE(status)) return 0;479if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {480return 0; // Not enough characters for two words481}482483uint32_t wordsFound = 0;484int32_t cpWordLength = 0;485int32_t cuWordLength = 0;486int32_t current;487PossibleWord words[LAO_LOOKAHEAD];488489utext_setNativeIndex(text, rangeStart);490491while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {492cuWordLength = 0;493cpWordLength = 0;494495// Look for candidate words at the current position496int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);497498// If we found exactly one, use that499if (candidates == 1) {500cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);501cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();502wordsFound += 1;503}504// If there was more than one, see which one can take us forward the most words505else if (candidates > 1) {506// If we're already at the end of the range, we're done507if (utext_getNativeIndex(text) >= rangeEnd) {508goto foundBest;509}510do {511if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {512// Followed by another dictionary word; mark first word as a good candidate513words[wordsFound%LAO_LOOKAHEAD].markCurrent();514515// If we're already at the end of the range, we're done516if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {517goto foundBest;518}519520// See if any of the possible second words is followed by a third word521do {522// If we find a third word, stop right away523if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {524words[wordsFound % LAO_LOOKAHEAD].markCurrent();525goto foundBest;526}527}528while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));529}530}531while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));532foundBest:533cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);534cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();535wordsFound += 1;536}537538// We come here after having either found a word or not. We look ahead to the539// next word. If it's not a dictionary word, we will combine it with the word we540// just found (if there is one), but only if the preceding word does not exceed541// the threshold.542// The text iterator should now be positioned at the end of the word we found.543if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {544// if it is a dictionary word, do nothing. If it isn't, then if there is545// no preceding word, or the non-word shares less than the minimum threshold546// of characters with a dictionary word, then scan to resynchronize547if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0548&& (cuWordLength == 0549|| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {550// Look for a plausible word boundary551int32_t remaining = rangeEnd - (current + cuWordLength);552UChar32 pc;553UChar32 uc;554int32_t chars = 0;555for (;;) {556int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));557pc = utext_next32(text);558int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;559chars += pcSize;560remaining -= pcSize;561if (remaining <= 0) {562break;563}564uc = utext_current32(text);565if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {566// Maybe. See if it's in the dictionary.567// TODO: this looks iffy; compare with old code.568int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);569utext_setNativeIndex(text, current + cuWordLength + chars);570if (num_candidates > 0) {571break;572}573}574}575576// Bump the word count if there wasn't already one577if (cuWordLength <= 0) {578wordsFound += 1;579}580581// Update the length with the passed-over characters582cuWordLength += chars;583}584else {585// Back up to where we were for next iteration586utext_setNativeIndex(text, current + cuWordLength);587}588}589590// Never stop before a combining mark.591int32_t currPos;592while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {593utext_next32(text);594cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;595}596597// Look ahead for possible suffixes if a dictionary word does not follow.598// We do this in code rather than using a rule so that the heuristic599// resynch continues to function. For example, one of the suffix characters600// could be a typo in the middle of a word.601// NOT CURRENTLY APPLICABLE TO LAO602603// Did we find a word on this iteration? If so, push it on the break stack604if (cuWordLength > 0) {605foundBreaks.push((current+cuWordLength), status);606}607}608609// Don't return a break for the end of the dictionary range if there is one there.610if (foundBreaks.peeki() >= rangeEnd) {611(void) foundBreaks.popi();612wordsFound -= 1;613}614615return wordsFound;616}617618/*619******************************************************************620* BurmeseBreakEngine621*/622623// How many words in a row are "good enough"?624static const int32_t BURMESE_LOOKAHEAD = 3;625626// Will not combine a non-word with a preceding dictionary word longer than this627static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;628629// Will not combine a non-word that shares at least this much prefix with a630// dictionary word, with a preceding word631static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;632633// Minimum word size634static const int32_t BURMESE_MIN_WORD = 2;635636// Minimum number of characters for two words637static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;638639BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)640: DictionaryBreakEngine(),641fDictionary(adoptDictionary)642{643UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);644UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");645fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels646fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);647fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);648fMarkSet.add(0x0020);649if (U_SUCCESS(status)) {650setCharacters(fEndWordSet);651}652653// Compact for caching.654fMarkSet.compact();655fEndWordSet.compact();656fBeginWordSet.compact();657UTRACE_EXIT_STATUS(status);658}659660BurmeseBreakEngine::~BurmeseBreakEngine() {661delete fDictionary;662}663664int32_t665BurmeseBreakEngine::divideUpDictionaryRange( UText *text,666int32_t rangeStart,667int32_t rangeEnd,668UVector32 &foundBreaks,669UBool /* isPhraseBreaking */,670UErrorCode& status ) const {671if (U_FAILURE(status)) return 0;672if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {673return 0; // Not enough characters for two words674}675676uint32_t wordsFound = 0;677int32_t cpWordLength = 0;678int32_t cuWordLength = 0;679int32_t current;680PossibleWord words[BURMESE_LOOKAHEAD];681682utext_setNativeIndex(text, rangeStart);683684while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {685cuWordLength = 0;686cpWordLength = 0;687688// Look for candidate words at the current position689int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);690691// If we found exactly one, use that692if (candidates == 1) {693cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);694cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();695wordsFound += 1;696}697// If there was more than one, see which one can take us forward the most words698else if (candidates > 1) {699// If we're already at the end of the range, we're done700if (utext_getNativeIndex(text) >= rangeEnd) {701goto foundBest;702}703do {704if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {705// Followed by another dictionary word; mark first word as a good candidate706words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();707708// If we're already at the end of the range, we're done709if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {710goto foundBest;711}712713// See if any of the possible second words is followed by a third word714do {715// If we find a third word, stop right away716if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {717words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();718goto foundBest;719}720}721while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));722}723}724while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));725foundBest:726cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);727cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();728wordsFound += 1;729}730731// We come here after having either found a word or not. We look ahead to the732// next word. If it's not a dictionary word, we will combine it with the word we733// just found (if there is one), but only if the preceding word does not exceed734// the threshold.735// The text iterator should now be positioned at the end of the word we found.736if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {737// if it is a dictionary word, do nothing. If it isn't, then if there is738// no preceding word, or the non-word shares less than the minimum threshold739// of characters with a dictionary word, then scan to resynchronize740if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0741&& (cuWordLength == 0742|| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {743// Look for a plausible word boundary744int32_t remaining = rangeEnd - (current + cuWordLength);745UChar32 pc;746UChar32 uc;747int32_t chars = 0;748for (;;) {749int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));750pc = utext_next32(text);751int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;752chars += pcSize;753remaining -= pcSize;754if (remaining <= 0) {755break;756}757uc = utext_current32(text);758if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {759// Maybe. See if it's in the dictionary.760// TODO: this looks iffy; compare with old code.761int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);762utext_setNativeIndex(text, current + cuWordLength + chars);763if (num_candidates > 0) {764break;765}766}767}768769// Bump the word count if there wasn't already one770if (cuWordLength <= 0) {771wordsFound += 1;772}773774// Update the length with the passed-over characters775cuWordLength += chars;776}777else {778// Back up to where we were for next iteration779utext_setNativeIndex(text, current + cuWordLength);780}781}782783// Never stop before a combining mark.784int32_t currPos;785while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {786utext_next32(text);787cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;788}789790// Look ahead for possible suffixes if a dictionary word does not follow.791// We do this in code rather than using a rule so that the heuristic792// resynch continues to function. For example, one of the suffix characters793// could be a typo in the middle of a word.794// NOT CURRENTLY APPLICABLE TO BURMESE795796// Did we find a word on this iteration? If so, push it on the break stack797if (cuWordLength > 0) {798foundBreaks.push((current+cuWordLength), status);799}800}801802// Don't return a break for the end of the dictionary range if there is one there.803if (foundBreaks.peeki() >= rangeEnd) {804(void) foundBreaks.popi();805wordsFound -= 1;806}807808return wordsFound;809}810811/*812******************************************************************813* KhmerBreakEngine814*/815816// How many words in a row are "good enough"?817static const int32_t KHMER_LOOKAHEAD = 3;818819// Will not combine a non-word with a preceding dictionary word longer than this820static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;821822// Will not combine a non-word that shares at least this much prefix with a823// dictionary word, with a preceding word824static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;825826// Minimum word size827static const int32_t KHMER_MIN_WORD = 2;828829// Minimum number of characters for two words830static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;831832KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)833: DictionaryBreakEngine(),834fDictionary(adoptDictionary)835{836UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);837UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");838UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);839if (U_SUCCESS(status)) {840setCharacters(khmerWordSet);841}842fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);843fMarkSet.add(0x0020);844fEndWordSet = khmerWordSet;845fBeginWordSet.add(0x1780, 0x17B3);846//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels847//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word848//fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word849fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters850//fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels851// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT852// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI853// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK854// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI855// fSuffixSet.add(THAI_PAIYANNOI);856// fSuffixSet.add(THAI_MAIYAMOK);857858// Compact for caching.859fMarkSet.compact();860fEndWordSet.compact();861fBeginWordSet.compact();862// fSuffixSet.compact();863UTRACE_EXIT_STATUS(status);864}865866KhmerBreakEngine::~KhmerBreakEngine() {867delete fDictionary;868}869870int32_t871KhmerBreakEngine::divideUpDictionaryRange( UText *text,872int32_t rangeStart,873int32_t rangeEnd,874UVector32 &foundBreaks,875UBool /* isPhraseBreaking */,876UErrorCode& status ) const {877if (U_FAILURE(status)) return 0;878if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {879return 0; // Not enough characters for two words880}881882uint32_t wordsFound = 0;883int32_t cpWordLength = 0;884int32_t cuWordLength = 0;885int32_t current;886PossibleWord words[KHMER_LOOKAHEAD];887888utext_setNativeIndex(text, rangeStart);889890while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {891cuWordLength = 0;892cpWordLength = 0;893894// Look for candidate words at the current position895int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);896897// If we found exactly one, use that898if (candidates == 1) {899cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);900cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();901wordsFound += 1;902}903904// If there was more than one, see which one can take us forward the most words905else if (candidates > 1) {906// If we're already at the end of the range, we're done907if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {908goto foundBest;909}910do {911if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {912// Followed by another dictionary word; mark first word as a good candidate913words[wordsFound % KHMER_LOOKAHEAD].markCurrent();914915// If we're already at the end of the range, we're done916if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {917goto foundBest;918}919920// See if any of the possible second words is followed by a third word921do {922// If we find a third word, stop right away923if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {924words[wordsFound % KHMER_LOOKAHEAD].markCurrent();925goto foundBest;926}927}928while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));929}930}931while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));932foundBest:933cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);934cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();935wordsFound += 1;936}937938// We come here after having either found a word or not. We look ahead to the939// next word. If it's not a dictionary word, we will combine it with the word we940// just found (if there is one), but only if the preceding word does not exceed941// the threshold.942// The text iterator should now be positioned at the end of the word we found.943if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {944// if it is a dictionary word, do nothing. If it isn't, then if there is945// no preceding word, or the non-word shares less than the minimum threshold946// of characters with a dictionary word, then scan to resynchronize947if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0948&& (cuWordLength == 0949|| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {950// Look for a plausible word boundary951int32_t remaining = rangeEnd - (current+cuWordLength);952UChar32 pc;953UChar32 uc;954int32_t chars = 0;955for (;;) {956int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));957pc = utext_next32(text);958int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;959chars += pcSize;960remaining -= pcSize;961if (remaining <= 0) {962break;963}964uc = utext_current32(text);965if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {966// Maybe. See if it's in the dictionary.967int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);968utext_setNativeIndex(text, current+cuWordLength+chars);969if (num_candidates > 0) {970break;971}972}973}974975// Bump the word count if there wasn't already one976if (cuWordLength <= 0) {977wordsFound += 1;978}979980// Update the length with the passed-over characters981cuWordLength += chars;982}983else {984// Back up to where we were for next iteration985utext_setNativeIndex(text, current+cuWordLength);986}987}988989// Never stop before a combining mark.990int32_t currPos;991while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {992utext_next32(text);993cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;994}995996// Look ahead for possible suffixes if a dictionary word does not follow.997// We do this in code rather than using a rule so that the heuristic998// resynch continues to function. For example, one of the suffix characters999// could be a typo in the middle of a word.1000// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {1001// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 01002// && fSuffixSet.contains(uc = utext_current32(text))) {1003// if (uc == KHMER_PAIYANNOI) {1004// if (!fSuffixSet.contains(utext_previous32(text))) {1005// // Skip over previous end and PAIYANNOI1006// utext_next32(text);1007// utext_next32(text);1008// wordLength += 1; // Add PAIYANNOI to word1009// uc = utext_current32(text); // Fetch next character1010// }1011// else {1012// // Restore prior position1013// utext_next32(text);1014// }1015// }1016// if (uc == KHMER_MAIYAMOK) {1017// if (utext_previous32(text) != KHMER_MAIYAMOK) {1018// // Skip over previous end and MAIYAMOK1019// utext_next32(text);1020// utext_next32(text);1021// wordLength += 1; // Add MAIYAMOK to word1022// }1023// else {1024// // Restore prior position1025// utext_next32(text);1026// }1027// }1028// }1029// else {1030// utext_setNativeIndex(text, current+wordLength);1031// }1032// }10331034// Did we find a word on this iteration? If so, push it on the break stack1035if (cuWordLength > 0) {1036foundBreaks.push((current+cuWordLength), status);1037}1038}10391040// Don't return a break for the end of the dictionary range if there is one there.1041if (foundBreaks.peeki() >= rangeEnd) {1042(void) foundBreaks.popi();1043wordsFound -= 1;1044}10451046return wordsFound;1047}10481049#if !UCONFIG_NO_NORMALIZATION1050/*1051******************************************************************1052* CjkBreakEngine1053*/1054static const uint32_t kuint32max = 0xFFFFFFFF;1055CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)1056: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {1057UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);1058UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");1059fMlBreakEngine = nullptr;1060nfkcNorm2 = Normalizer2::getNFKCInstance(status);1061// Korean dictionary only includes Hangul syllables1062fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);1063fHangulWordSet.compact();1064// Digits, open puncutation and Alphabetic characters.1065fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(1066UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);1067fDigitOrOpenPunctuationOrAlphabetSet.compact();1068fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);1069fClosePunctuationSet.compact();10701071// handle Korean and Japanese/Chinese using different dictionaries1072if (type == kKorean) {1073if (U_SUCCESS(status)) {1074setCharacters(fHangulWordSet);1075}1076} else { // Chinese and Japanese1077UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);1078isCj = true;1079if (U_SUCCESS(status)) {1080setCharacters(cjSet);1081#if UCONFIG_USE_ML_PHRASE_BREAKING1082fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,1083fClosePunctuationSet, status);1084if (fMlBreakEngine == nullptr) {1085status = U_MEMORY_ALLOCATION_ERROR;1086}1087#else1088initJapanesePhraseParameter(status);1089#endif1090}1091}1092UTRACE_EXIT_STATUS(status);1093}10941095CjkBreakEngine::~CjkBreakEngine(){1096delete fDictionary;1097delete fMlBreakEngine;1098}10991100// The katakanaCost values below are based on the length frequencies of all1101// katakana phrases in the dictionary1102static const int32_t kMaxKatakanaLength = 8;1103static const int32_t kMaxKatakanaGroupLength = 20;1104static const uint32_t maxSnlp = 255;11051106static inline uint32_t getKatakanaCost(int32_t wordLength){1107//TODO: fill array with actual values from dictionary!1108static const uint32_t katakanaCost[kMaxKatakanaLength + 1]1109= {8192, 984, 408, 240, 204, 252, 300, 372, 480};1110return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];1111}11121113static inline bool isKatakana(UChar32 value) {1114return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||1115(value >= 0xFF66 && value <= 0xFF9f);1116}11171118// Function for accessing internal utext flags.1119// Replicates an internal UText function.11201121static inline int32_t utext_i32_flag(int32_t bitIndex) {1122return static_cast<int32_t>(1) << bitIndex;1123}11241125/*1126* @param text A UText representing the text1127* @param rangeStart The start of the range of dictionary characters1128* @param rangeEnd The end of the range of dictionary characters1129* @param foundBreaks vector<int32> to receive the break positions1130* @return The number of breaks found1131*/1132int32_t1133CjkBreakEngine::divideUpDictionaryRange( UText *inText,1134int32_t rangeStart,1135int32_t rangeEnd,1136UVector32 &foundBreaks,1137UBool isPhraseBreaking,1138UErrorCode& status) const {1139if (U_FAILURE(status)) return 0;1140if (rangeStart >= rangeEnd) {1141return 0;1142}11431144// UnicodeString version of input UText, NFKC normalized if necessary.1145UnicodeString inString;11461147// inputMap[inStringIndex] = corresponding native index from UText inText.1148// If nullptr then mapping is 1:11149LocalPointer<UVector32> inputMap;11501151// if UText has the input string as one contiguous UTF-16 chunk1152if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&1153inText->chunkNativeStart <= rangeStart &&1154inText->chunkNativeLimit >= rangeEnd &&1155inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {11561157// Input UText is in one contiguous UTF-16 chunk.1158// Use Read-only aliasing UnicodeString.1159inString.setTo(false,1160inText->chunkContents + rangeStart - inText->chunkNativeStart,1161rangeEnd - rangeStart);1162} else {1163// Copy the text from the original inText (UText) to inString (UnicodeString).1164// Create a map from UnicodeString indices -> UText offsets.1165utext_setNativeIndex(inText, rangeStart);1166int32_t limit = rangeEnd;1167U_ASSERT(limit <= utext_nativeLength(inText));1168if (limit > utext_nativeLength(inText)) {1169limit = static_cast<int32_t>(utext_nativeLength(inText));1170}1171inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);1172if (U_FAILURE(status)) {1173return 0;1174}1175while (utext_getNativeIndex(inText) < limit) {1176int32_t nativePosition = static_cast<int32_t>(utext_getNativeIndex(inText));1177UChar32 c = utext_next32(inText);1178U_ASSERT(c != U_SENTINEL);1179inString.append(c);1180while (inputMap->size() < inString.length()) {1181inputMap->addElement(nativePosition, status);1182}1183}1184inputMap->addElement(limit, status);1185}118611871188if (!nfkcNorm2->isNormalized(inString, status)) {1189UnicodeString normalizedInput;1190// normalizedMap[normalizedInput position] == original UText position.1191LocalPointer<UVector32> normalizedMap(new UVector32(status), status);1192if (U_FAILURE(status)) {1193return 0;1194}11951196UnicodeString fragment;1197UnicodeString normalizedFragment;1198for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk1199fragment.remove();1200int32_t fragmentStartI = srcI;1201UChar32 c = inString.char32At(srcI);1202for (;;) {1203fragment.append(c);1204srcI = inString.moveIndex32(srcI, 1);1205if (srcI == inString.length()) {1206break;1207}1208c = inString.char32At(srcI);1209if (nfkcNorm2->hasBoundaryBefore(c)) {1210break;1211}1212}1213nfkcNorm2->normalize(fragment, normalizedFragment, status);1214normalizedInput.append(normalizedFragment);12151216// Map every position in the normalized chunk to the start of the chunk1217// in the original input.1218int32_t fragmentOriginalStart = inputMap.isValid() ?1219inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;1220while (normalizedMap->size() < normalizedInput.length()) {1221normalizedMap->addElement(fragmentOriginalStart, status);1222if (U_FAILURE(status)) {1223break;1224}1225}1226}1227U_ASSERT(normalizedMap->size() == normalizedInput.length());1228int32_t nativeEnd = inputMap.isValid() ?1229inputMap->elementAti(inString.length()) : inString.length()+rangeStart;1230normalizedMap->addElement(nativeEnd, status);12311232inputMap = std::move(normalizedMap);1233inString = std::move(normalizedInput);1234}12351236int32_t numCodePts = inString.countChar32();1237if (numCodePts != inString.length()) {1238// There are supplementary characters in the input.1239// The dictionary will produce boundary positions in terms of code point indexes,1240// not in terms of code unit string indexes.1241// Use the inputMap mechanism to take care of this in addition to indexing differences1242// from normalization and/or UTF-8 input.1243UBool hadExistingMap = inputMap.isValid();1244if (!hadExistingMap) {1245inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);1246if (U_FAILURE(status)) {1247return 0;1248}1249}1250int32_t cpIdx = 0;1251for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {1252U_ASSERT(cuIdx >= cpIdx);1253if (hadExistingMap) {1254inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);1255} else {1256inputMap->addElement(cuIdx+rangeStart, status);1257}1258cpIdx++;1259if (cuIdx == inString.length()) {1260break;1261}1262}1263}12641265#if UCONFIG_USE_ML_PHRASE_BREAKING1266// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.1267if (isPhraseBreaking && isCj) {1268return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,1269inputMap, status);1270}1271#endif12721273// bestSnlp[i] is the snlp of the best segmentation of the first i1274// code points in the range to be matched.1275UVector32 bestSnlp(numCodePts + 1, status);1276bestSnlp.addElement(0, status);1277for(int32_t i = 1; i <= numCodePts; i++) {1278bestSnlp.addElement(kuint32max, status);1279}128012811282// prev[i] is the index of the last CJK code point in the previous word in1283// the best segmentation of the first i characters.1284UVector32 prev(numCodePts + 1, status);1285for(int32_t i = 0; i <= numCodePts; i++){1286prev.addElement(-1, status);1287}12881289const int32_t maxWordSize = 20;1290UVector32 values(numCodePts, status);1291values.setSize(numCodePts);1292UVector32 lengths(numCodePts, status);1293lengths.setSize(numCodePts);12941295UText fu = UTEXT_INITIALIZER;1296utext_openUnicodeString(&fu, &inString, &status);12971298// Dynamic programming to find the best segmentation.12991300// In outer loop, i is the code point index,1301// ix is the corresponding string (code unit) index.1302// They differ when the string contains supplementary characters.1303int32_t ix = 0;1304bool is_prev_katakana = false;1305for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {1306if (static_cast<uint32_t>(bestSnlp.elementAti(i)) == kuint32max) {1307continue;1308}13091310int32_t count;1311utext_setNativeIndex(&fu, ix);1312count = fDictionary->matches(&fu, maxWordSize, numCodePts,1313nullptr, lengths.getBuffer(), values.getBuffer(), nullptr);1314// Note: lengths is filled with code point lengths1315// The nullptr parameter is the ignored code unit lengths.13161317// if there are no single character matches found in the dictionary1318// starting with this character, treat character as a 1-character word1319// with the highest value possible, i.e. the least likely to occur.1320// Exclude Korean characters from this treatment, as they should be left1321// together by default.1322if ((count == 0 || lengths.elementAti(0) != 1) &&1323!fHangulWordSet.contains(inString.char32At(ix))) {1324values.setElementAt(maxSnlp, count); // 2551325lengths.setElementAt(1, count++);1326}13271328for (int32_t j = 0; j < count; j++) {1329uint32_t newSnlp = static_cast<uint32_t>(bestSnlp.elementAti(i)) + static_cast<uint32_t>(values.elementAti(j));1330int32_t ln_j_i = lengths.elementAti(j) + i;1331if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(ln_j_i))) {1332bestSnlp.setElementAt(newSnlp, ln_j_i);1333prev.setElementAt(i, ln_j_i);1334}1335}13361337// In Japanese,1338// Katakana word in single character is pretty rare. So we apply1339// the following heuristic to Katakana: any continuous run of Katakana1340// characters is considered a candidate word with a default cost1341// specified in the katakanaCost table according to its length.13421343bool is_katakana = isKatakana(inString.char32At(ix));1344int32_t katakanaRunLength = 1;1345if (!is_prev_katakana && is_katakana) {1346int32_t j = inString.moveIndex32(ix, 1);1347// Find the end of the continuous run of Katakana characters1348while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&1349isKatakana(inString.char32At(j))) {1350j = inString.moveIndex32(j, 1);1351katakanaRunLength++;1352}1353if (katakanaRunLength < kMaxKatakanaGroupLength) {1354uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);1355if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(i + katakanaRunLength))) {1356bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);1357prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;1358}1359}1360}1361is_prev_katakana = is_katakana;1362}1363utext_close(&fu);13641365// Start pushing the optimal offset index into t_boundary (t for tentative).1366// prev[numCodePts] is guaranteed to be meaningful.1367// We'll first push in the reverse order, i.e.,1368// t_boundary[0] = numCodePts, and afterwards do a swap.1369UVector32 t_boundary(numCodePts+1, status);13701371int32_t numBreaks = 0;1372// No segmentation found, set boundary to end of range1373if (static_cast<uint32_t>(bestSnlp.elementAti(numCodePts)) == kuint32max) {1374t_boundary.addElement(numCodePts, status);1375numBreaks++;1376} else if (isPhraseBreaking) {1377t_boundary.addElement(numCodePts, status);1378if(U_SUCCESS(status)) {1379numBreaks++;1380int32_t prevIdx = numCodePts;13811382int32_t codeUnitIdx = -1;1383int32_t prevCodeUnitIdx = -1;1384int32_t length = -1;1385for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {1386codeUnitIdx = inString.moveIndex32(0, i);1387prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);1388// Calculate the length by using the code unit.1389length = prevCodeUnitIdx - codeUnitIdx;1390prevIdx = i;1391// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana1392// characters don't occur.1393if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))1394&& (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))1395|| !isKatakana(inString.char32At(codeUnitIdx)))) {1396t_boundary.addElement(i, status);1397numBreaks++;1398}1399}1400}1401} else {1402for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {1403t_boundary.addElement(i, status);1404numBreaks++;1405}1406U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);1407}14081409// Add a break for the start of the dictionary range if there is not one1410// there already.1411if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {1412t_boundary.addElement(0, status);1413numBreaks++;1414}14151416// Now that we're done, convert positions in t_boundary[] (indices in1417// the normalized input string) back to indices in the original input UText1418// while reversing t_boundary and pushing values to foundBreaks.1419int32_t prevCPPos = -1;1420int32_t prevUTextPos = -1;1421int32_t correctedNumBreaks = 0;1422for (int32_t i = numBreaks - 1; i >= 0; i--) {1423int32_t cpPos = t_boundary.elementAti(i);1424U_ASSERT(cpPos > prevCPPos);1425int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;1426U_ASSERT(utextPos >= prevUTextPos);1427if (utextPos > prevUTextPos) {1428// Boundaries are added to foundBreaks output in ascending order.1429U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);1430// In phrase breaking, there has to be a breakpoint between Cj character and close1431// punctuation.1432// E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正1433if (utextPos != rangeStart1434|| (isPhraseBreaking && utextPos > 01435&& fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {1436foundBreaks.push(utextPos, status);1437correctedNumBreaks++;1438}1439} else {1440// Normalization expanded the input text, the dictionary found a boundary1441// within the expansion, giving two boundaries with the same index in the1442// original text. Ignore the second. See ticket #12918.1443--numBreaks;1444}1445prevCPPos = cpPos;1446prevUTextPos = utextPos;1447}1448(void)prevCPPos; // suppress compiler warnings about unused variable14491450UChar32 nextChar = utext_char32At(inText, rangeEnd);1451if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {1452// In phrase breaking, there has to be a breakpoint between Cj character and1453// the number/open punctuation.1454// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「1455// E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 91456// E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U1457if (isPhraseBreaking) {1458if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {1459foundBreaks.popi();1460correctedNumBreaks--;1461}1462} else {1463foundBreaks.popi();1464correctedNumBreaks--;1465}1466}14671468// inString goes out of scope1469// inputMap goes out of scope1470return correctedNumBreaks;1471}14721473void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {1474loadJapaneseExtensions(error);1475loadHiragana(error);1476}14771478void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {1479const char* tag = "extensions";1480ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);1481if (U_SUCCESS(error)) {1482ResourceBundle bundle = ja.get(tag, error);1483while (U_SUCCESS(error) && bundle.hasNext()) {1484fSkipSet.puti(bundle.getNextString(error), 1, error);1485}1486}1487}14881489void CjkBreakEngine::loadHiragana(UErrorCode& error) {1490UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);1491hiraganaWordSet.compact();1492UnicodeSetIterator iterator(hiraganaWordSet);1493while (iterator.next()) {1494fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);1495}1496}1497#endif14981499U_NAMESPACE_END15001501#endif /* #if !UCONFIG_NO_BREAK_ITERATION */1502150315041505