Path: blob/master/src/java.base/share/classes/sun/text/RuleBasedBreakIterator.java
41152 views
/*1* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26*27* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved28* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved29*30* The original version of this source code and documentation31* is copyrighted and owned by Taligent, Inc., a wholly-owned32* subsidiary of IBM. These materials are provided under terms33* of a License Agreement between Taligent and Sun. This technology34* is protected by multiple US and International patents.35*36* This notice and attribution to Taligent may not be removed.37* Taligent is a registered trademark of Taligent, Inc.38*/3940package sun.text;4142import java.nio.BufferUnderflowException;43import java.nio.ByteBuffer;44import java.text.BreakIterator;45import java.text.CharacterIterator;46import java.text.StringCharacterIterator;47import java.util.MissingResourceException;48import sun.text.CompactByteArray;49import sun.text.SupplementaryCharacterData;5051/**52* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>53*54* <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>55* and <i>regular expressions.</i></p>56*57* <p>A substitution rule defines a name that can be used in place of an expression. It58* consists of a name, which is a string of characters contained in angle brackets, an equals59* sign, and an expression. (There can be no whitespace on either side of the equals sign.)60* To keep its syntactic meaning intact, the expression must be enclosed in parentheses or61* square brackets. A substitution is visible after its definition, and is filled in using62* simple textual substitution. Substitution definitions can contain other substitutions, as63* long as those substitutions have been defined first. Substitutions are generally used to64* make the regular expressions (which can get quite complex) shorted and easier to read.65* They typically define either character categories or commonly-used subexpressions.</p>66*67* <p>There is one special substitution. If the description defines a substitution68* called "<ignore>", the expression must be a [] expression, and the69* expression defines a set of characters (the "<em>ignore characters</em>") that70* will be transparent to the BreakIterator. A sequence of characters will break the71* same way it would if any ignore characters it contains are taken out. Break72* positions never occur befoer ignore characters.</p>73*74* <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and75* defines a sequence of characters to be kept together. With one significant exception, the76* iterator uses a longest-possible-match algorithm when matching text to regular77* expressions. The iterator also treats descriptions containing multiple regular expressions78* as if they were ORed together (i.e., as if they were separated by |).</p>79*80* <p>The special characters recognized by the regular-expression parser are as follows:</p>81*82* <blockquote>83* <table border="1" width="100%">84* <tr>85* <td width="6%">*</td>86* <td width="94%">Specifies that the expression preceding the asterisk may occur any number87* of times (including not at all).</td>88* </tr>89* <tr>90* <td width="6%">{}</td>91* <td width="94%">Encloses a sequence of characters that is optional.</td>92* </tr>93* <tr>94* <td width="6%">()</td>95* <td width="94%">Encloses a sequence of characters. If followed by *, the sequence96* repeats. Otherwise, the parentheses are just a grouping device and a way to delimit97* the ends of expressions containing |.</td>98* </tr>99* <tr>100* <td width="6%">|</td>101* <td width="94%">Separates two alternative sequences of characters. Either one102* sequence or the other, but not both, matches this expression. The | character can103* only occur inside ().</td>104* </tr>105* <tr>106* <td width="6%">.</td>107* <td width="94%">Matches any character.</td>108* </tr>109* <tr>110* <td width="6%">*?</td>111* <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except112* when there is overlap between the last group of characters in the expression preceding the113* * and the first group of characters following the *. When there is this kind of114* overlap, * will match the longest sequence of characters that match the expression before115* the *, and *? will match the shortest sequence of characters matching the expression116* before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,117* "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",118* but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>119* </tr>120* <tr>121* <td width="6%">[]</td>122* <td width="94%">Specifies a group of alternative characters. A [] expression will123* match any single character that is specified in the [] expression. For more on the124* syntax of [] expressions, see below.</td>125* </tr>126* <tr>127* <td width="6%">/</td>128* <td width="94%">Specifies where the break position should go if text matches this129* expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run130* of letters, followed by a run of whitespace, followed by a digit, but the break position131* will actually go before the whitespace). Expressions that don't contain / put the132* break position at the end of the matching text.</td>133* </tr>134* <tr>135* <td width="6%">\</td>136* <td width="94%">Escape character. The \ itself is ignored, but causes the next137* character to be treated as literal character. This has no effect for many138* characters, but for the characters listed above, this deprives them of their special139* meaning. (There are no special escape sequences for Unicode characters, or tabs and140* newlines; these are all handled by a higher-level protocol. In a Java string,141* "\n" will be converted to a literal newline character by the time the142* regular-expression parser sees it. Of course, this means that \ sequences that are143* visible to the regexp parser must be written as \\ when inside a Java string.) All144* characters in the ASCII range except for letters, digits, and control characters are145* reserved characters to the parser and must be preceded by \ even if they currently don't146* mean anything.</td>147* </tr>148* <tr>149* <td width="6%">!</td>150* <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp151* parser that this expression specifies the backwards-iteration behavior of the iterator,152* and not its normal iteration behavior. This is generally only used in situations153* where the automatically-generated backwards-iteration brhavior doesn't produce154* satisfactory results and must be supplemented with extra client-specified rules.</td>155* </tr>156* <tr>157* <td width="6%"><em>(all others)</em></td>158* <td width="94%">All other characters are treated as literal characters, which must match159* the corresponding character(s) in the text exactly.</td>160* </tr>161* </table>162* </blockquote>163*164* <p>Within a [] expression, a number of other special characters can be used to specify165* groups of characters:</p>166*167* <blockquote>168* <table border="1" width="100%">169* <tr>170* <td width="6%">-</td>171* <td width="94%">Specifies a range of matching characters. For example172* "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -173* sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a174* language's alphabetical order: "[a-z]" doesn't include capital letters, nor does175* it include accented letters such as a-umlaut.</td>176* </tr>177* <tr>178* <td width="6%">::</td>179* <td width="94%">A pair of colons containing a one- or two-letter code matches all180* characters in the corresponding Unicode category. The two-letter codes are the same181* as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"182* matches all currency symbols and all math symbols). Specifying a one-letter code is183* the same as specifying all two-letter codes that begin with that letter (for example,184* "[:L:]" matches all letters, and is equivalent to185* "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode186* category code or a single letter that begins a Unicode category code is illegal within187* colons.</td>188* </tr>189* <tr>190* <td width="6%">[]</td>191* <td width="94%">[] expressions can nest. This has no effect, except when used in192* conjunction with the ^ token.</td>193* </tr>194* <tr>195* <td width="6%">^</td>196* <td width="94%">Excludes the character (or the characters in the [] expression) following197* it from the group of characters. For example, "[a-z^p]" matches all Latin198* lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters199* except the Han ideographs.</td>200* </tr>201* <tr>202* <td width="6%"><em>(all others)</em></td>203* <td width="94%">All other characters are treated as literal characters. (For204* example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>205* </tr>206* </table>207* </blockquote>208*209* <p>For a more complete explanation, see <a210* href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.211* For examples, see the resource data (which is annotated).</p>212*213* @author Richard Gillam214*/215public class RuleBasedBreakIterator extends BreakIterator {216217/**218* A token used as a character-category value to identify ignore characters219*/220protected static final byte IGNORE = -1;221222/**223* The state number of the starting state224*/225private static final short START_STATE = 1;226227/**228* The state-transition value indicating "stop"229*/230private static final short STOP_STATE = 0;231232/**233* Magic number for the BreakIterator data file format.234*/235static final byte[] LABEL = {236(byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',237(byte)'\0'238};239static final int LABEL_LENGTH = LABEL.length;240241/**242* Version number of the dictionary that was read in.243*/244static final byte supportedVersion = 1;245246/**247* An array length of indices for BMP characters248*/249private static final int BMP_INDICES_LENGTH = 512;250251/**252* Tables that indexes from character values to character category numbers253*/254private CompactByteArray charCategoryTable = null;255private SupplementaryCharacterData supplementaryCharCategoryTable = null;256257/**258* The table of state transitions used for forward iteration259*/260private short[] stateTable = null;261262/**263* The table of state transitions used to sync up the iterator with the264* text in backwards and random-access iteration265*/266private short[] backwardsStateTable = null;267268/**269* A list of flags indicating which states in the state table are accepting270* ("end") states271*/272private boolean[] endStates = null;273274/**275* A list of flags indicating which states in the state table are276* lookahead states (states which turn lookahead on and off)277*/278private boolean[] lookaheadStates = null;279280/**281* A table for additional data. May be used by a subclass of282* RuleBasedBreakIterator.283*/284private byte[] additionalData = null;285286/**287* The number of character categories (and, thus, the number of columns in288* the state tables)289*/290private int numCategories;291292/**293* The character iterator through which this BreakIterator accesses the text294*/295private CharacterIterator text = null;296297/**298* A CRC32 value of all data in datafile299*/300private long checksum;301302//=======================================================================303// constructors304//=======================================================================305306/**307* Constructs a RuleBasedBreakIterator using the given rule data.308*309* @throws MissingResourceException if the rule data is invalid or corrupted310*/311public RuleBasedBreakIterator(String ruleFile, byte[] ruleData) {312ByteBuffer bb = ByteBuffer.wrap(ruleData);313try {314validateRuleData(ruleFile, bb);315setupTables(ruleFile, bb);316} catch (BufferUnderflowException bue) {317MissingResourceException e;318e = new MissingResourceException("Corrupted rule data file", ruleFile, "");319e.initCause(bue);320throw e;321}322}323324/**325* Initializes the fields with the given rule data.326* The data format is as follows:327* <pre>328* BreakIteratorData {329* u1 magic[7];330* u1 version;331* u4 totalDataSize;332* header_info header;333* body value;334* }335* </pre>336* <code>totalDataSize</code> is the summation of the size of337* <code>header_info</code> and <code>body</code> in byte count.338* <p>339* In <code>header</code>, each field except for checksum implies the340* length of each field. Since <code>BMPdataLength</code> is a fixed-length341* data(512 entries), its length isn't included in <code>header</code>.342* <code>checksum</code> is a CRC32 value of all in <code>body</code>.343* <pre>344* header_info {345* u4 stateTableLength;346* u4 backwardsStateTableLength;347* u4 endStatesLength;348* u4 lookaheadStatesLength;349* u4 BMPdataLength;350* u4 nonBMPdataLength;351* u4 additionalDataLength;352* u8 checksum;353* }354* </pre>355* <p>356*357* Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to358* <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to359* <code>supplementaryCharCategoryTable</code>.360* <pre>361* body {362* u2 stateTable[stateTableLength];363* u2 backwardsStateTable[backwardsStateTableLength];364* u1 endStates[endStatesLength];365* u1 lookaheadStates[lookaheadStatesLength];366* u2 BMPindices[512];367* u1 BMPdata[BMPdataLength];368* u4 nonBMPdata[numNonBMPdataLength];369* u1 additionalData[additionalDataLength];370* }371* </pre>372*373* @throws BufferUnderflowException if the end-of-data is reached before374* setting up all the tables375*/376private void setupTables(String ruleFile, ByteBuffer bb) {377/* Read header_info. */378int stateTableLength = bb.getInt();379int backwardsStateTableLength = bb.getInt();380int endStatesLength = bb.getInt();381int lookaheadStatesLength = bb.getInt();382int BMPdataLength = bb.getInt();383int nonBMPdataLength = bb.getInt();384int additionalDataLength = bb.getInt();385checksum = bb.getLong();386387/* Read stateTable[numCategories * numRows] */388stateTable = new short[stateTableLength];389for (int i = 0; i < stateTableLength; i++) {390stateTable[i] = bb.getShort();391}392393/* Read backwardsStateTable[numCategories * numRows] */394backwardsStateTable = new short[backwardsStateTableLength];395for (int i = 0; i < backwardsStateTableLength; i++) {396backwardsStateTable[i] = bb.getShort();397}398399/* Read endStates[numRows] */400endStates = new boolean[endStatesLength];401for (int i = 0; i < endStatesLength; i++) {402endStates[i] = bb.get() == 1;403}404405/* Read lookaheadStates[numRows] */406lookaheadStates = new boolean[lookaheadStatesLength];407for (int i = 0; i < lookaheadStatesLength; i++) {408lookaheadStates[i] = bb.get() == 1;409}410411/* Read a category table and indices for BMP characters. */412short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices413for (int i = 0; i < BMP_INDICES_LENGTH; i++) {414temp1[i] = bb.getShort();415}416byte[] temp2 = new byte[BMPdataLength]; // BMPdata417bb.get(temp2);418charCategoryTable = new CompactByteArray(temp1, temp2);419420/* Read a category table for non-BMP characters. */421int[] temp3 = new int[nonBMPdataLength];422for (int i = 0; i < nonBMPdataLength; i++) {423temp3[i] = bb.getInt();424}425supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);426427/* Read additional data */428if (additionalDataLength > 0) {429additionalData = new byte[additionalDataLength];430bb.get(additionalData);431}432assert bb.position() == bb.limit();433434/* Set numCategories */435numCategories = stateTable.length / endStates.length;436}437438/**439* Validates the magic number, version, and the length of the given data.440*441* @throws BufferUnderflowException if the end-of-data is reached while442* validating data443* @throws MissingResourceException if valification failed444*/445void validateRuleData(String ruleFile, ByteBuffer bb) {446/* Verify the magic number. */447for (int i = 0; i < LABEL_LENGTH; i++) {448if (bb.get() != LABEL[i]) {449throw new MissingResourceException("Wrong magic number",450ruleFile, "");451}452}453454/* Verify the version number. */455byte version = bb.get();456if (version != supportedVersion) {457throw new MissingResourceException("Unsupported version(" + version + ")",458ruleFile, "");459}460461// Check the length of the rest of data462int len = bb.getInt();463if (bb.position() + len != bb.limit()) {464throw new MissingResourceException("Wrong data length",465ruleFile, "");466}467}468469byte[] getAdditionalData() {470return additionalData;471}472473void setAdditionalData(byte[] b) {474additionalData = b;475}476477//=======================================================================478// boilerplate479//=======================================================================480/**481* Clones this iterator.482* @return A newly-constructed RuleBasedBreakIterator with the same483* behavior as this one.484*/485@Override486public Object clone() {487RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();488if (text != null) {489result.text = (CharacterIterator) text.clone();490}491return result;492}493494/**495* Returns true if both BreakIterators are of the same class, have the same496* rules, and iterate over the same text.497*/498@Override499public boolean equals(Object that) {500try {501if (that == null) {502return false;503}504505RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;506if (checksum != other.checksum) {507return false;508}509if (text == null) {510return other.text == null;511} else {512return text.equals(other.text);513}514}515catch(ClassCastException e) {516return false;517}518}519520/**521* Returns text522*/523@Override524public String toString() {525return "[checksum=0x" + Long.toHexString(checksum) + ']';526}527528/**529* Compute a hashcode for this BreakIterator530* @return A hash code531*/532@Override533public int hashCode() {534return (int)checksum;535}536537//=======================================================================538// BreakIterator overrides539//=======================================================================540541/**542* Sets the current iteration position to the beginning of the text.543* (i.e., the CharacterIterator's starting offset).544* @return The offset of the beginning of the text.545*/546@Override547public int first() {548CharacterIterator t = getText();549550t.first();551return t.getIndex();552}553554/**555* Sets the current iteration position to the end of the text.556* (i.e., the CharacterIterator's ending offset).557* @return The text's past-the-end offset.558*/559@Override560public int last() {561CharacterIterator t = getText();562563// I'm not sure why, but t.last() returns the offset of the last character,564// rather than the past-the-end offset565t.setIndex(t.getEndIndex());566return t.getIndex();567}568569/**570* Advances the iterator either forward or backward the specified number of steps.571* Negative values move backward, and positive values move forward. This is572* equivalent to repeatedly calling next() or previous().573* @param n The number of steps to move. The sign indicates the direction574* (negative is backwards, and positive is forwards).575* @return The character offset of the boundary position n boundaries away from576* the current one.577*/578@Override579public int next(int n) {580int result = current();581while (n > 0) {582result = handleNext();583--n;584}585while (n < 0) {586result = previous();587++n;588}589return result;590}591592/**593* Advances the iterator to the next boundary position.594* @return The position of the first boundary after this one.595*/596@Override597public int next() {598return handleNext();599}600601private int cachedLastKnownBreak = BreakIterator.DONE;602603/**604* Advances the iterator backwards, to the last boundary preceding this one.605* @return The position of the last boundary position preceding this one.606*/607@Override608public int previous() {609// if we're already sitting at the beginning of the text, return DONE610CharacterIterator text = getText();611if (current() == text.getBeginIndex()) {612return BreakIterator.DONE;613}614615// set things up. handlePrevious() will back us up to some valid616// break position before the current position (we back our internal617// iterator up one step to prevent handlePrevious() from returning618// the current position), but not necessarily the last one before619// where we started620int start = current();621int lastResult = cachedLastKnownBreak;622if (lastResult >= start || lastResult <= BreakIterator.DONE) {623getPrevious();624lastResult = handlePrevious();625} else {626//it might be better to check if handlePrevious() give us closer627//safe value but handlePrevious() is slow too628//So, this has to be done carefully629text.setIndex(lastResult);630}631int result = lastResult;632633// iterate forward from the known break position until we pass our634// starting point. The last break position before the starting635// point is our return value636while (result != BreakIterator.DONE && result < start) {637lastResult = result;638result = handleNext();639}640641// set the current iteration position to be the last break position642// before where we started, and then return that value643text.setIndex(lastResult);644cachedLastKnownBreak = lastResult;645return lastResult;646}647648/**649* Returns previous character650*/651private int getPrevious() {652char c2 = text.previous();653if (Character.isLowSurrogate(c2) &&654text.getIndex() > text.getBeginIndex()) {655char c1 = text.previous();656if (Character.isHighSurrogate(c1)) {657return Character.toCodePoint(c1, c2);658} else {659text.next();660}661}662return (int)c2;663}664665/**666* Returns current character667*/668int getCurrent() {669char c1 = text.current();670if (Character.isHighSurrogate(c1) &&671text.getIndex() < text.getEndIndex()) {672char c2 = text.next();673text.previous();674if (Character.isLowSurrogate(c2)) {675return Character.toCodePoint(c1, c2);676}677}678return (int)c1;679}680681/**682* Returns the count of next character.683*/684private int getCurrentCodePointCount() {685char c1 = text.current();686if (Character.isHighSurrogate(c1) &&687text.getIndex() < text.getEndIndex()) {688char c2 = text.next();689text.previous();690if (Character.isLowSurrogate(c2)) {691return 2;692}693}694return 1;695}696697/**698* Returns next character699*/700int getNext() {701int index = text.getIndex();702int endIndex = text.getEndIndex();703if (index == endIndex ||704(index += getCurrentCodePointCount()) >= endIndex) {705return CharacterIterator.DONE;706}707text.setIndex(index);708return getCurrent();709}710711/**712* Returns the position of next character.713*/714private int getNextIndex() {715int index = text.getIndex() + getCurrentCodePointCount();716int endIndex = text.getEndIndex();717if (index > endIndex) {718return endIndex;719} else {720return index;721}722}723724/**725* Throw IllegalArgumentException unless begin <= offset < end.726*/727protected static final void checkOffset(int offset, CharacterIterator text) {728if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {729throw new IllegalArgumentException("offset out of bounds");730}731}732733/**734* Sets the iterator to refer to the first boundary position following735* the specified position.736* @offset The position from which to begin searching for a break position.737* @return The position of the first break after the current position.738*/739@Override740public int following(int offset) {741742CharacterIterator text = getText();743checkOffset(offset, text);744745// Set our internal iteration position (temporarily)746// to the position passed in. If this is the _beginning_ position,747// then we can just use next() to get our return value748text.setIndex(offset);749if (offset == text.getBeginIndex()) {750cachedLastKnownBreak = handleNext();751return cachedLastKnownBreak;752}753754// otherwise, we have to sync up first. Use handlePrevious() to back755// us up to a known break position before the specified position (if756// we can determine that the specified position is a break position,757// we don't back up at all). This may or may not be the last break758// position at or before our starting position. Advance forward759// from here until we've passed the starting position. The position760// we stop on will be the first break position after the specified one.761int result = cachedLastKnownBreak;762if (result >= offset || result <= BreakIterator.DONE) {763result = handlePrevious();764} else {765//it might be better to check if handlePrevious() give us closer766//safe value but handlePrevious() is slow too767//So, this has to be done carefully768text.setIndex(result);769}770while (result != BreakIterator.DONE && result <= offset) {771result = handleNext();772}773cachedLastKnownBreak = result;774return result;775}776777/**778* Sets the iterator to refer to the last boundary position before the779* specified position.780* @offset The position to begin searching for a break from.781* @return The position of the last boundary before the starting position.782*/783@Override784public int preceding(int offset) {785// if we start by updating the current iteration position to the786// position specified by the caller, we can just use previous()787// to carry out this operation788CharacterIterator text = getText();789checkOffset(offset, text);790text.setIndex(offset);791return previous();792}793794/**795* Returns true if the specified position is a boundary position. As a side796* effect, leaves the iterator pointing to the first boundary position at797* or after "offset".798* @param offset the offset to check.799* @return True if "offset" is a boundary position.800*/801@Override802public boolean isBoundary(int offset) {803CharacterIterator text = getText();804checkOffset(offset, text);805if (offset == text.getBeginIndex()) {806return true;807}808809// to check whether this is a boundary, we can use following() on the810// position before the specified one and return true if the position we811// get back is the one the user specified812else {813return following(offset - 1) == offset;814}815}816817/**818* Returns the current iteration position.819* @return The current iteration position.820*/821@Override822public int current() {823return getText().getIndex();824}825826/**827* Return a CharacterIterator over the text being analyzed. This version828* of this method returns the actual CharacterIterator we're using internally.829* Changing the state of this iterator can have undefined consequences. If830* you need to change it, clone it first.831* @return An iterator over the text being analyzed.832*/833@Override834public CharacterIterator getText() {835// The iterator is initialized pointing to no text at all, so if this836// function is called while we're in that state, we have to fudge an837// iterator to return.838if (text == null) {839text = new StringCharacterIterator("");840}841return text;842}843844/**845* Set the iterator to analyze a new piece of text. This function resets846* the current iteration position to the beginning of the text.847* @param newText An iterator over the text to analyze.848*/849@Override850public void setText(CharacterIterator newText) {851// Test iterator to see if we need to wrap it in a SafeCharIterator.852// The correct behavior for CharacterIterators is to allow the853// position to be set to the endpoint of the iterator. Many854// CharacterIterators do not uphold this, so this is a workaround855// to permit them to use this class.856int end = newText.getEndIndex();857boolean goodIterator;858try {859newText.setIndex(end); // some buggy iterators throw an exception here860goodIterator = newText.getIndex() == end;861}862catch(IllegalArgumentException e) {863goodIterator = false;864}865866if (goodIterator) {867text = newText;868}869else {870text = new SafeCharIterator(newText);871}872text.first();873874cachedLastKnownBreak = BreakIterator.DONE;875}876877878//=======================================================================879// implementation880//=======================================================================881882/**883* This method is the actual implementation of the next() method. All iteration884* vectors through here. This method initializes the state machine to state 1885* and advances through the text character by character until we reach the end886* of the text or the state machine transitions to state 0. We update our return887* value every time the state machine passes through a possible end state.888*/889protected int handleNext() {890// if we're already at the end of the text, return DONE.891CharacterIterator text = getText();892if (text.getIndex() == text.getEndIndex()) {893return BreakIterator.DONE;894}895896// no matter what, we always advance at least one character forward897int result = getNextIndex();898int lookaheadResult = 0;899900// begin in state 1901int state = START_STATE;902int category;903int c = getCurrent();904905// loop until we reach the end of the text or transition to state 0906while (c != CharacterIterator.DONE && state != STOP_STATE) {907908// look up the current character's character category (which tells us909// which column in the state table to look at)910category = lookupCategory(c);911912// if the character isn't an ignore character, look up a state913// transition in the state table914if (category != IGNORE) {915state = lookupState(state, category);916}917918// if the state we've just transitioned to is a lookahead state,919// (but not also an end state), save its position. If it's920// both a lookahead state and an end state, update the break position921// to the last saved lookup-state position922if (lookaheadStates[state]) {923if (endStates[state]) {924result = lookaheadResult;925}926else {927lookaheadResult = getNextIndex();928}929}930931// otherwise, if the state we've just transitioned to is an accepting932// state, update the break position to be the current iteration position933else {934if (endStates[state]) {935result = getNextIndex();936}937}938939c = getNext();940}941942// if we've run off the end of the text, and the very last character took us into943// a lookahead state, advance the break position to the lookahead position944// (the theory here is that if there are no characters at all after the lookahead945// position, that always matches the lookahead criteria)946if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {947result = lookaheadResult;948}949950text.setIndex(result);951return result;952}953954/**955* This method backs the iterator back up to a "safe position" in the text.956* This is a position that we know, without any context, must be a break position.957* The various calling methods then iterate forward from this safe position to958* the appropriate position to return. (For more information, see the description959* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)960*/961protected int handlePrevious() {962CharacterIterator text = getText();963int state = START_STATE;964int category = 0;965int lastCategory = 0;966int c = getCurrent();967968// loop until we reach the beginning of the text or transition to state 0969while (c != CharacterIterator.DONE && state != STOP_STATE) {970971// save the last character's category and look up the current972// character's category973lastCategory = category;974category = lookupCategory(c);975976// if the current character isn't an ignore character, look up a977// state transition in the backwards state table978if (category != IGNORE) {979state = lookupBackwardState(state, category);980}981982// then advance one character backwards983c = getPrevious();984}985986// if we didn't march off the beginning of the text, we're either one or two987// positions away from the real break position. (One because of the call to988// previous() at the end of the loop above, and another because the character989// that takes us into the stop state will always be the character BEFORE990// the break position.)991if (c != CharacterIterator.DONE) {992if (lastCategory != IGNORE) {993getNext();994getNext();995}996else {997getNext();998}999}1000return text.getIndex();1001}10021003/**1004* Looks up a character's category (i.e., its category for breaking purposes,1005* not its Unicode category)1006*/1007protected int lookupCategory(int c) {1008if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {1009return charCategoryTable.elementAt((char)c);1010} else {1011return supplementaryCharCategoryTable.getValue(c);1012}1013}10141015/**1016* Given a current state and a character category, looks up the1017* next state to transition to in the state table.1018*/1019protected int lookupState(int state, int category) {1020return stateTable[state * numCategories + category];1021}10221023/**1024* Given a current state and a character category, looks up the1025* next state to transition to in the backwards state table.1026*/1027protected int lookupBackwardState(int state, int category) {1028return backwardsStateTable[state * numCategories + category];1029}10301031/*1032* This class exists to work around a bug in incorrect implementations1033* of CharacterIterator, which incorrectly handle setIndex(endIndex).1034* This iterator relies only on base.setIndex(n) where n is less than1035* endIndex.1036*1037* One caveat: if the base iterator's begin and end indices change1038* the change will not be reflected by this wrapper. Does that matter?1039*/1040// TODO: Review this class to see if it's still required.1041private static final class SafeCharIterator implements CharacterIterator,1042Cloneable {10431044private CharacterIterator base;1045private int rangeStart;1046private int rangeLimit;1047private int currentIndex;10481049SafeCharIterator(CharacterIterator base) {1050this.base = base;1051this.rangeStart = base.getBeginIndex();1052this.rangeLimit = base.getEndIndex();1053this.currentIndex = base.getIndex();1054}10551056@Override1057public char first() {1058return setIndex(rangeStart);1059}10601061@Override1062public char last() {1063return setIndex(rangeLimit - 1);1064}10651066@Override1067public char current() {1068if (currentIndex < rangeStart || currentIndex >= rangeLimit) {1069return DONE;1070}1071else {1072return base.setIndex(currentIndex);1073}1074}10751076@Override1077public char next() {10781079currentIndex++;1080if (currentIndex >= rangeLimit) {1081currentIndex = rangeLimit;1082return DONE;1083}1084else {1085return base.setIndex(currentIndex);1086}1087}10881089@Override1090public char previous() {10911092currentIndex--;1093if (currentIndex < rangeStart) {1094currentIndex = rangeStart;1095return DONE;1096}1097else {1098return base.setIndex(currentIndex);1099}1100}11011102@Override1103public char setIndex(int i) {11041105if (i < rangeStart || i > rangeLimit) {1106throw new IllegalArgumentException("Invalid position");1107}1108currentIndex = i;1109return current();1110}11111112@Override1113public int getBeginIndex() {1114return rangeStart;1115}11161117@Override1118public int getEndIndex() {1119return rangeLimit;1120}11211122@Override1123public int getIndex() {1124return currentIndex;1125}11261127@Override1128public Object clone() {11291130SafeCharIterator copy = null;1131try {1132copy = (SafeCharIterator) super.clone();1133}1134catch(CloneNotSupportedException e) {1135throw new Error("Clone not supported: " + e);1136}11371138CharacterIterator copyOfBase = (CharacterIterator) base.clone();1139copy.base = copyOfBase;1140return copy;1141}1142}1143}114411451146