1. /*
  2. * @(#)RuleBasedBreakIterator.java 1.17 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)RuleBasedBreakIterator.java 1.3 99/04/07
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.io.BufferedInputStream;
  24. import java.io.IOException;
  25. import java.security.AccessController;
  26. import java.security.PrivilegedActionException;
  27. import java.security.PrivilegedExceptionAction;
  28. import java.util.Vector;
  29. import java.util.Stack;
  30. import java.util.Hashtable;
  31. import java.util.Enumeration;
  32. import java.util.MissingResourceException;
  33. import java.text.CharacterIterator;
  34. import java.text.StringCharacterIterator;
  35. import sun.text.CompactByteArray;
  36. import sun.text.SupplementaryCharacterData;
  37. /**
  38. * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
  39. *
  40. * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
  41. * and <i>regular expressions.</i></p>
  42. *
  43. * <p>A substitution rule defines a name that can be used in place of an expression. It
  44. * consists of a name, which is a string of characters contained in angle brackets, an equals
  45. * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
  46. * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
  47. * square brackets. A substitution is visible after its definition, and is filled in using
  48. * simple textual substitution. Substitution definitions can contain other substitutions, as
  49. * long as those substitutions have been defined first. Substitutions are generally used to
  50. * make the regular expressions (which can get quite complex) shorted and easier to read.
  51. * They typically define either character categories or commonly-used subexpressions.</p>
  52. *
  53. * <p>There is one special substitution.  If the description defines a substitution
  54. * called "<ignore>", the expression must be a [] expression, and the
  55. * expression defines a set of characters (the "<em>ignore characters</em>") that
  56. * will be transparent to the BreakIterator.  A sequence of characters will break the
  57. * same way it would if any ignore characters it contains are taken out.  Break
  58. * positions never occur befoer ignore characters.</p>
  59. *
  60. * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
  61. * defines a sequence of characters to be kept together. With one significant exception, the
  62. * iterator uses a longest-possible-match algorithm when matching text to regular
  63. * expressions. The iterator also treats descriptions containing multiple regular expressions
  64. * as if they were ORed together (i.e., as if they were separated by |).</p>
  65. *
  66. * <p>The special characters recognized by the regular-expression parser are as follows:</p>
  67. *
  68. * <blockquote>
  69. * <table border="1" width="100%">
  70. * <tr>
  71. * <td width="6%">*</td>
  72. * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
  73. * of times (including not at all).</td>
  74. * </tr>
  75. * <tr>
  76. * <td width="6%">{}</td>
  77. * <td width="94%">Encloses a sequence of characters that is optional.</td>
  78. * </tr>
  79. * <tr>
  80. * <td width="6%">()</td>
  81. * <td width="94%">Encloses a sequence of characters.  If followed by *, the sequence
  82. * repeats.  Otherwise, the parentheses are just a grouping device and a way to delimit
  83. * the ends of expressions containing |.</td>
  84. * </tr>
  85. * <tr>
  86. * <td width="6%">|</td>
  87. * <td width="94%">Separates two alternative sequences of characters.  Either one
  88. * sequence or the other, but not both, matches this expression.  The | character can
  89. * only occur inside ().</td>
  90. * </tr>
  91. * <tr>
  92. * <td width="6%">.</td>
  93. * <td width="94%">Matches any character.</td>
  94. * </tr>
  95. * <tr>
  96. * <td width="6%">*?</td>
  97. * <td width="94%">Specifies a non-greedy asterisk.  *? works the same way as *, except
  98. * when there is overlap between the last group of characters in the expression preceding the
  99. * * and the first group of characters following the *.  When there is this kind of
  100. * overlap, * will match the longest sequence of characters that match the expression before
  101. * the *, and *? will match the shortest sequence of characters matching the expression
  102. * before the *?.  For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
  103. * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",
  104. * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
  105. * </tr>
  106. * <tr>
  107. * <td width="6%">[]</td>
  108. * <td width="94%">Specifies a group of alternative characters.  A [] expression will
  109. * match any single character that is specified in the [] expression.  For more on the
  110. * syntax of [] expressions, see below.</td>
  111. * </tr>
  112. * <tr>
  113. * <td width="6%">/</td>
  114. * <td width="94%">Specifies where the break position should go if text matches this
  115. * expression.  (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run
  116. * of letters, followed by a run of whitespace, followed by a digit, but the break position
  117. * will actually go before the whitespace).  Expressions that don't contain / put the
  118. * break position at the end of the matching text.</td>
  119. * </tr>
  120. * <tr>
  121. * <td width="6%">\</td>
  122. * <td width="94%">Escape character.  The \ itself is ignored, but causes the next
  123. * character to be treated as literal character.  This has no effect for many
  124. * characters, but for the characters listed above, this deprives them of their special
  125. * meaning.  (There are no special escape sequences for Unicode characters, or tabs and
  126. * newlines; these are all handled by a higher-level protocol.  In a Java string,
  127. * "\n" will be converted to a literal newline character by the time the
  128. * regular-expression parser sees it.  Of course, this means that \ sequences that are
  129. * visible to the regexp parser must be written as \\ when inside a Java string.)  All
  130. * characters in the ASCII range except for letters, digits, and control characters are
  131. * reserved characters to the parser and must be preceded by \ even if they currently don't
  132. * mean anything.</td>
  133. * </tr>
  134. * <tr>
  135. * <td width="6%">!</td>
  136. * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
  137. * parser that this expression specifies the backwards-iteration behavior of the iterator,
  138. * and not its normal iteration behavior.  This is generally only used in situations
  139. * where the automatically-generated backwards-iteration brhavior doesn't produce
  140. * satisfactory results and must be supplemented with extra client-specified rules.</td>
  141. * </tr>
  142. * <tr>
  143. * <td width="6%"><em>(all others)</em></td>
  144. * <td width="94%">All other characters are treated as literal characters, which must match
  145. * the corresponding character(s) in the text exactly.</td>
  146. * </tr>
  147. * </table>
  148. * </blockquote>
  149. *
  150. * <p>Within a [] expression, a number of other special characters can be used to specify
  151. * groups of characters:</p>
  152. *
  153. * <blockquote>
  154. * <table border="1" width="100%">
  155. * <tr>
  156. * <td width="6%">-</td>
  157. * <td width="94%">Specifies a range of matching characters.  For example
  158. * "[a-p]" matches all lowercase Latin letters from a to p (inclusive).  The -
  159. * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
  160. * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
  161. * it include accented letters such as a-umlaut.</td>
  162. * </tr>
  163. * <tr>
  164. * <td width="6%">::</td>
  165. * <td width="94%">A pair of colons containing a one- or two-letter code matches all
  166. * characters in the corresponding Unicode category.  The two-letter codes are the same
  167. * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
  168. * matches all currency symbols and all math symbols).  Specifying a one-letter code is
  169. * the same as specifying all two-letter codes that begin with that letter (for example,
  170. * "[:L:]" matches all letters, and is equivalent to
  171. * "[:Lu::Ll::Lo::Lm::Lt:]").  Anything other than a valid two-letter Unicode
  172. * category code or a single letter that begins a Unicode category code is illegal within
  173. * colons.</td>
  174. * </tr>
  175. * <tr>
  176. * <td width="6%">[]</td>
  177. * <td width="94%">[] expressions can nest.  This has no effect, except when used in
  178. * conjunction with the ^ token.</td>
  179. * </tr>
  180. * <tr>
  181. * <td width="6%">^</td>
  182. * <td width="94%">Excludes the character (or the characters in the [] expression) following
  183. * it from the group of characters.  For example, "[a-z^p]" matches all Latin
  184. * lowercase letters except p.  "[:L:^[\u4e00-\u9fff]]" matches all letters
  185. * except the Han ideographs.</td>
  186. * </tr>
  187. * <tr>
  188. * <td width="6%"><em>(all others)</em></td>
  189. * <td width="94%">All other characters are treated as literal characters.  (For
  190. * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
  191. * </tr>
  192. * </table>
  193. * </blockquote>
  194. *
  195. * <p>For a more complete explanation, see <a
  196. * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
  197. *   For examples, see the resource data (which is annotated).</p>
  198. *
  199. * @author Richard Gillam
  200. * @version $RCSFile$ $Revision: 1.1 $ $Date: 1998/11/05 19:32:04 $
  201. */
  202. class RuleBasedBreakIterator extends BreakIterator {
  203. /**
  204. * A token used as a character-category value to identify ignore characters
  205. */
  206. protected static final byte IGNORE = -1;
  207. /**
  208. * The state number of the starting state
  209. */
  210. private static final short START_STATE = 1;
  211. /**
  212. * The state-transition value indicating "stop"
  213. */
  214. private static final short STOP_STATE = 0;
  215. /**
  216. * Magic number for the BreakIterator data file format.
  217. */
  218. static final byte[] LABEL = {
  219. (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
  220. (byte)'\0'
  221. };
  222. static final int LABEL_LENGTH = LABEL.length;
  223. /**
  224. * Version number of the dictionary that was read in.
  225. */
  226. static final byte supportedVersion = 1;
  227. /**
  228. * Header size in byte count
  229. */
  230. private static final int HEADER_LENGTH = 36;
  231. /**
  232. * An array length of indices for BMP characters
  233. */
  234. private static final int BMP_INDICES_LENGTH = 512;
  235. /**
  236. * Tables that indexes from character values to character category numbers
  237. */
  238. private CompactByteArray charCategoryTable = null;
  239. private SupplementaryCharacterData supplementaryCharCategoryTable = null;
  240. /**
  241. * The table of state transitions used for forward iteration
  242. */
  243. private short[] stateTable = null;
  244. /**
  245. * The table of state transitions used to sync up the iterator with the
  246. * text in backwards and random-access iteration
  247. */
  248. private short[] backwardsStateTable = null;
  249. /**
  250. * A list of flags indicating which states in the state table are accepting
  251. * ("end") states
  252. */
  253. private boolean[] endStates = null;
  254. /**
  255. * A list of flags indicating which states in the state table are
  256. * lookahead states (states which turn lookahead on and off)
  257. */
  258. private boolean[] lookaheadStates = null;
  259. /**
  260. * A table for additional data. May be used by a subclass of
  261. * RuleBasedBreakIterator.
  262. */
  263. private byte[] additionalData = null;
  264. /**
  265. * The number of character categories (and, thus, the number of columns in
  266. * the state tables)
  267. */
  268. private int numCategories;
  269. /**
  270. * The character iterator through which this BreakIterator accesses the text
  271. */
  272. private CharacterIterator text = null;
  273. /**
  274. * A CRC32 value of all data in datafile
  275. */
  276. private long checksum;
  277. //=======================================================================
  278. // constructors
  279. //=======================================================================
  280. /**
  281. * Constructs a RuleBasedBreakIterator according to the datafile
  282. * provided.
  283. */
  284. public RuleBasedBreakIterator(String datafile)
  285. throws IOException, MissingResourceException {
  286. readTables(datafile);
  287. }
  288. /**
  289. * Read datafile. The datafile's format is as follows:
  290. * <pre>
  291. * BreakIteratorData {
  292. * u1 magic[7];
  293. * u1 version;
  294. * u4 totalDataSize;
  295. * header_info header;
  296. * body value;
  297. * }
  298. * </pre>
  299. * <code>totalDataSize</code> is the summation of the size of
  300. * <code>header_info</code> and <code>body</code> in byte count.
  301. * <p>
  302. * In <code>header</code>, each field except for checksum implies the
  303. * length of each field. Since <code>BMPdataLength</code> is a fixed-length
  304. * data(512 entries), its length isn't included in <code>header</code>.
  305. * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
  306. * <pre>
  307. * header_info {
  308. * u4 stateTableLength;
  309. * u4 backwardsStateTableLength;
  310. * u4 endStatesLength;
  311. * u4 lookaheadStatesLength;
  312. * u4 BMPdataLength;
  313. * u4 nonBMPdataLength;
  314. * u4 additionalDataLength;
  315. * u8 checksum;
  316. * }
  317. * </pre>
  318. * <p>
  319. *
  320. * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
  321. * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
  322. * <code>supplementaryCharCategoryTable</code>.
  323. * <pre>
  324. * body {
  325. * u2 stateTable[stateTableLength];
  326. * u2 backwardsStateTable[backwardsStateTableLength];
  327. * u1 endStates[endStatesLength];
  328. * u1 lookaheadStates[lookaheadStatesLength];
  329. * u2 BMPindices[512];
  330. * u1 BMPdata[BMPdataLength];
  331. * u4 nonBMPdata[numNonBMPdataLength];
  332. * u1 additionalData[additionalDataLength];
  333. * }
  334. * </pre>
  335. */
  336. protected void readTables(String datafile)
  337. throws IOException, MissingResourceException {
  338. byte[] buffer = readFile(datafile);
  339. /* Read header_info. */
  340. int stateTableLength = BreakIterator.getInt(buffer, 0);
  341. int backwardsStateTableLength = BreakIterator.getInt(buffer, 4);
  342. int endStatesLength = BreakIterator.getInt(buffer, 8);
  343. int lookaheadStatesLength = BreakIterator.getInt(buffer, 12);
  344. int BMPdataLength = BreakIterator.getInt(buffer, 16);
  345. int nonBMPdataLength = BreakIterator.getInt(buffer, 20);
  346. int additionalDataLength = BreakIterator.getInt(buffer, 24);
  347. checksum = BreakIterator.getLong(buffer, 28);
  348. /* Read stateTable[numCategories * numRows] */
  349. stateTable = new short[stateTableLength];
  350. int offset = HEADER_LENGTH;
  351. for (int i = 0; i < stateTableLength; i++, offset+=2) {
  352. stateTable[i] = BreakIterator.getShort(buffer, offset);
  353. }
  354. /* Read backwardsStateTable[numCategories * numRows] */
  355. backwardsStateTable = new short[backwardsStateTableLength];
  356. for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
  357. backwardsStateTable[i] = BreakIterator.getShort(buffer, offset);
  358. }
  359. /* Read endStates[numRows] */
  360. endStates = new boolean[endStatesLength];
  361. for (int i = 0; i < endStatesLength; i++, offset++) {
  362. endStates[i] = buffer[offset] == 1;
  363. }
  364. /* Read lookaheadStates[numRows] */
  365. lookaheadStates = new boolean[lookaheadStatesLength];
  366. for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
  367. lookaheadStates[i] = buffer[offset] == 1;
  368. }
  369. /* Read a category table and indices for BMP characters. */
  370. short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices
  371. for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
  372. temp1[i] = BreakIterator.getShort(buffer, offset);
  373. }
  374. byte[] temp2 = new byte[BMPdataLength]; // BMPdata
  375. System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
  376. offset += BMPdataLength;
  377. charCategoryTable = new CompactByteArray(temp1, temp2);
  378. /* Read a category table for non-BMP characters. */
  379. int[] temp3 = new int[nonBMPdataLength];
  380. for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
  381. temp3[i] = BreakIterator.getInt(buffer, offset);
  382. }
  383. supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
  384. /* Read additional data */
  385. if (additionalDataLength > 0) {
  386. additionalData = new byte[additionalDataLength];
  387. System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
  388. }
  389. /* Set numCategories */
  390. numCategories = stateTable.length / endStates.length;
  391. }
  392. protected byte[] readFile(final String datafile)
  393. throws IOException, MissingResourceException {
  394. BufferedInputStream is;
  395. try {
  396. is = (BufferedInputStream)AccessController.doPrivileged(
  397. new PrivilegedExceptionAction() {
  398. public Object run() throws Exception {
  399. return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile));
  400. }
  401. }
  402. );
  403. }
  404. catch (PrivilegedActionException e) {
  405. throw new InternalError(e.toString());
  406. }
  407. int offset = 0;
  408. /* First, read magic, version, and header_info. */
  409. int len = LABEL_LENGTH + 5;
  410. byte[] buf = new byte[len];
  411. if (is.read(buf) != len) {
  412. throw new MissingResourceException("Wrong header length",
  413. datafile, "");
  414. }
  415. /* Validate the magic number. */
  416. for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
  417. if (buf[offset] != LABEL[offset]) {
  418. throw new MissingResourceException("Wrong magic number",
  419. datafile, "");
  420. }
  421. }
  422. /* Validate the version number. */
  423. if (buf[offset] != supportedVersion) {
  424. throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
  425. datafile, "");
  426. }
  427. /* Read data: totalDataSize + 8(for checksum) */
  428. len = BreakIterator.getInt(buf, ++offset);
  429. buf = new byte[len];
  430. if (is.read(buf) != len) {
  431. throw new MissingResourceException("Wrong data length",
  432. datafile, "");
  433. }
  434. is.close();
  435. return buf;
  436. }
  437. byte[] getAdditionalData() {
  438. return additionalData;
  439. }
  440. void setAdditionalData(byte[] b) {
  441. additionalData = b;
  442. }
  443. //=======================================================================
  444. // boilerplate
  445. //=======================================================================
  446. /**
  447. * Clones this iterator.
  448. * @return A newly-constructed RuleBasedBreakIterator with the same
  449. * behavior as this one.
  450. */
  451. public Object clone() {
  452. RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
  453. if (text != null) {
  454. result.text = (CharacterIterator) text.clone();
  455. }
  456. return result;
  457. }
  458. /**
  459. * Returns true if both BreakIterators are of the same class, have the same
  460. * rules, and iterate over the same text.
  461. */
  462. public boolean equals(Object that) {
  463. try {
  464. if (that == null) {
  465. return false;
  466. }
  467. RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
  468. if (checksum != other.checksum) {
  469. return false;
  470. }
  471. if (text == null) {
  472. return other.text == null;
  473. } else {
  474. return text.equals(other.text);
  475. }
  476. }
  477. catch(ClassCastException e) {
  478. return false;
  479. }
  480. }
  481. /**
  482. * Returns text
  483. */
  484. public String toString() {
  485. StringBuffer sb = new StringBuffer();
  486. sb.append('[');
  487. sb.append("checksum=0x" + Long.toHexString(checksum));
  488. sb.append(']');
  489. return sb.toString();
  490. }
  491. /**
  492. * Compute a hashcode for this BreakIterator
  493. * @return A hash code
  494. */
  495. public int hashCode() {
  496. return (int)checksum;
  497. }
  498. //=======================================================================
  499. // BreakIterator overrides
  500. //=======================================================================
  501. /**
  502. * Sets the current iteration position to the beginning of the text.
  503. * (i.e., the CharacterIterator's starting offset).
  504. * @return The offset of the beginning of the text.
  505. */
  506. public int first() {
  507. CharacterIterator t = getText();
  508. t.first();
  509. return t.getIndex();
  510. }
  511. /**
  512. * Sets the current iteration position to the end of the text.
  513. * (i.e., the CharacterIterator's ending offset).
  514. * @return The text's past-the-end offset.
  515. */
  516. public int last() {
  517. CharacterIterator t = getText();
  518. // I'm not sure why, but t.last() returns the offset of the last character,
  519. // rather than the past-the-end offset
  520. t.setIndex(t.getEndIndex());
  521. return t.getIndex();
  522. }
  523. /**
  524. * Advances the iterator either forward or backward the specified number of steps.
  525. * Negative values move backward, and positive values move forward. This is
  526. * equivalent to repeatedly calling next() or previous().
  527. * @param n The number of steps to move. The sign indicates the direction
  528. * (negative is backwards, and positive is forwards).
  529. * @return The character offset of the boundary position n boundaries away from
  530. * the current one.
  531. */
  532. public int next(int n) {
  533. int result = current();
  534. while (n > 0) {
  535. result = handleNext();
  536. --n;
  537. }
  538. while (n < 0) {
  539. result = previous();
  540. ++n;
  541. }
  542. return result;
  543. }
  544. /**
  545. * Advances the iterator to the next boundary position.
  546. * @return The position of the first boundary after this one.
  547. */
  548. public int next() {
  549. return handleNext();
  550. }
  551. /**
  552. * Advances the iterator backwards, to the last boundary preceding this one.
  553. * @return The position of the last boundary position preceding this one.
  554. */
  555. public int previous() {
  556. // if we're already sitting at the beginning of the text, return DONE
  557. CharacterIterator text = getText();
  558. if (current() == text.getBeginIndex()) {
  559. return BreakIterator.DONE;
  560. }
  561. // set things up. handlePrevious() will back us up to some valid
  562. // break position before the current position (we back our internal
  563. // iterator up one step to prevent handlePrevious() from returning
  564. // the current position), but not necessarily the last one before
  565. // where we started
  566. int start = current();
  567. getPrevious();
  568. int lastResult = handlePrevious();
  569. int result = lastResult;
  570. // iterate forward from the known break position until we pass our
  571. // starting point. The last break position before the starting
  572. // point is our return value
  573. while (result != BreakIterator.DONE && result < start) {
  574. lastResult = result;
  575. result = handleNext();
  576. }
  577. // set the current iteration position to be the last break position
  578. // before where we started, and then return that value
  579. text.setIndex(lastResult);
  580. return lastResult;
  581. }
  582. /**
  583. * Returns previous character
  584. */
  585. private int getPrevious() {
  586. char c2 = text.previous();
  587. if (Character.isLowSurrogate(c2) &&
  588. text.getIndex() > text.getBeginIndex()) {
  589. char c1 = text.previous();
  590. if (Character.isHighSurrogate(c1)) {
  591. return Character.toCodePoint(c1, c2);
  592. } else {
  593. text.next();
  594. }
  595. }
  596. return (int)c2;
  597. }
  598. /**
  599. * Returns current character
  600. */
  601. int getCurrent() {
  602. char c1 = text.current();
  603. if (Character.isHighSurrogate(c1) &&
  604. text.getIndex() < text.getEndIndex()) {
  605. char c2 = text.next();
  606. text.previous();
  607. if (Character.isLowSurrogate(c2)) {
  608. return Character.toCodePoint(c1, c2);
  609. }
  610. }
  611. return (int)c1;
  612. }
  613. /**
  614. * Returns the count of next character.
  615. */
  616. private int getCurrentCodePointCount() {
  617. char c1 = text.current();
  618. if (Character.isHighSurrogate(c1) &&
  619. text.getIndex() < text.getEndIndex()) {
  620. char c2 = text.next();
  621. text.previous();
  622. if (Character.isLowSurrogate(c2)) {
  623. return 2;
  624. }
  625. }
  626. return 1;
  627. }
  628. /**
  629. * Returns next character
  630. */
  631. int getNext() {
  632. int index = text.getIndex();
  633. int endIndex = text.getEndIndex();
  634. if (index == endIndex ||
  635. (index = index + getCurrentCodePointCount()) >= endIndex) {
  636. return CharacterIterator.DONE;
  637. }
  638. text.setIndex(index);
  639. return getCurrent();
  640. }
  641. /**
  642. * Returns the position of next character.
  643. */
  644. private int getNextIndex() {
  645. int index = text.getIndex() + getCurrentCodePointCount();
  646. int endIndex = text.getEndIndex();
  647. if (index > endIndex) {
  648. return endIndex;
  649. } else {
  650. return index;
  651. }
  652. }
  653. /**
  654. * Throw IllegalArgumentException unless begin <= offset < end.
  655. */
  656. protected static final void checkOffset(int offset, CharacterIterator text) {
  657. if (offset < text.getBeginIndex() || offset >= text.getEndIndex()) {
  658. throw new IllegalArgumentException("offset out of bounds");
  659. }
  660. }
  661. /**
  662. * Sets the iterator to refer to the first boundary position following
  663. * the specified position.
  664. * @offset The position from which to begin searching for a break position.
  665. * @return The position of the first break after the current position.
  666. */
  667. public int following(int offset) {
  668. CharacterIterator text = getText();
  669. checkOffset(offset, text);
  670. // Set our internal iteration position (temporarily)
  671. // to the position passed in. If this is the _beginning_ position,
  672. // then we can just use next() to get our return value
  673. text.setIndex(offset);
  674. if (offset == text.getBeginIndex()) {
  675. return handleNext();
  676. }
  677. // otherwise, we have to sync up first. Use handlePrevious() to back
  678. // us up to a known break position before the specified position (if
  679. // we can determine that the specified position is a break position,
  680. // we don't back up at all). This may or may not be the last break
  681. // position at or before our starting position. Advance forward
  682. // from here until we've passed the starting position. The position
  683. // we stop on will be the first break position after the specified one.
  684. int result = handlePrevious();
  685. while (result != BreakIterator.DONE && result <= offset) {
  686. result = handleNext();
  687. }
  688. return result;
  689. }
  690. /**
  691. * Sets the iterator to refer to the last boundary position before the
  692. * specified position.
  693. * @offset The position to begin searching for a break from.
  694. * @return The position of the last boundary before the starting position.
  695. */
  696. public int preceding(int offset) {
  697. // if we start by updating the current iteration position to the
  698. // position specified by the caller, we can just use previous()
  699. // to carry out this operation
  700. CharacterIterator text = getText();
  701. checkOffset(offset, text);
  702. text.setIndex(offset);
  703. return previous();
  704. }
  705. /**
  706. * Returns true if the specfied position is a boundary position. As a side
  707. * effect, leaves the iterator pointing to the first boundary position at
  708. * or after "offset".
  709. * @param offset the offset to check.
  710. * @return True if "offset" is a boundary position.
  711. */
  712. public boolean isBoundary(int offset) {
  713. CharacterIterator text = getText();
  714. checkOffset(offset, text);
  715. if (offset == text.getBeginIndex()) {
  716. return true;
  717. }
  718. // to check whether this is a boundary, we can use following() on the
  719. // position before the specified one and return true if the position we
  720. // get back is the one the user specified
  721. else {
  722. return following(offset - 1) == offset;
  723. }
  724. }
  725. /**
  726. * Returns the current iteration position.
  727. * @return The current iteration position.
  728. */
  729. public int current() {
  730. return getText().getIndex();
  731. }
  732. /**
  733. * Return a CharacterIterator over the text being analyzed. This version
  734. * of this method returns the actual CharacterIterator we're using internally.
  735. * Changing the state of this iterator can have undefined consequences. If
  736. * you need to change it, clone it first.
  737. * @return An iterator over the text being analyzed.
  738. */
  739. public CharacterIterator getText() {
  740. // The iterator is initialized pointing to no text at all, so if this
  741. // function is called while we're in that state, we have to fudge an
  742. // iterator to return.
  743. if (text == null) {
  744. text = new StringCharacterIterator("");
  745. }
  746. return text;
  747. }
  748. /**
  749. * Set the iterator to analyze a new piece of text. This function resets
  750. * the current iteration position to the beginning of the text.
  751. * @param newText An iterator over the text to analyze.
  752. */
  753. public void setText(CharacterIterator newText) {
  754. // Test iterator to see if we need to wrap it in a SafeCharIterator.
  755. // The correct behavior for CharacterIterators is to allow the
  756. // position to be set to the endpoint of the iterator. Many
  757. // CharacterIterators do not uphold this, so this is a workaround
  758. // to permit them to use this class.
  759. int end = newText.getEndIndex();
  760. boolean goodIterator;
  761. try {
  762. newText.setIndex(end); // some buggy iterators throw an exception here
  763. goodIterator = newText.getIndex() == end;
  764. }
  765. catch(IllegalArgumentException e) {
  766. goodIterator = false;
  767. }
  768. if (goodIterator) {
  769. text = newText;
  770. }
  771. else {
  772. text = new SafeCharIterator(newText);
  773. }
  774. text.first();
  775. }
  776. //=======================================================================
  777. // implementation
  778. //=======================================================================
  779. /**
  780. * This method is the actual implementation of the next() method. All iteration
  781. * vectors through here. This method initializes the state machine to state 1
  782. * and advances through the text character by character until we reach the end
  783. * of the text or the state machine transitions to state 0. We update our return
  784. * value every time the state machine passes through a possible end state.
  785. */
  786. protected int handleNext() {
  787. // if we're already at the end of the text, return DONE.
  788. CharacterIterator text = getText();
  789. if (text.getIndex() == text.getEndIndex()) {
  790. return BreakIterator.DONE;
  791. }
  792. // no matter what, we always advance at least one character forward
  793. int result = getNextIndex();
  794. int lookaheadResult = 0;
  795. // begin in state 1
  796. int state = START_STATE;
  797. int category;
  798. int c = getCurrent();
  799. // loop until we reach the end of the text or transition to state 0
  800. while (c != CharacterIterator.DONE && state != STOP_STATE) {
  801. // look up the current character's character category (which tells us
  802. // which column in the state table to look at)
  803. category = lookupCategory(c);
  804. // if the character isn't an ignore character, look up a state
  805. // transition in the state table
  806. if (category != IGNORE) {
  807. state = lookupState(state, category);
  808. }
  809. // if the state we've just transitioned to is a lookahead state,
  810. // (but not also an end state), save its position. If it's
  811. // both a lookahead state and an end state, update the break position
  812. // to the last saved lookup-state position
  813. if (lookaheadStates[state]) {
  814. if (endStates[state]) {
  815. result = lookaheadResult;
  816. }
  817. else {
  818. lookaheadResult = getNextIndex();
  819. }
  820. }
  821. // otherwise, if the state we've just transitioned to is an accepting
  822. // state, update the break position to be the current iteration position
  823. else {
  824. if (endStates[state]) {
  825. result = getNextIndex();
  826. }
  827. }
  828. c = getNext();
  829. }
  830. // if we've run off the end of the text, and the very last character took us into
  831. // a lookahead state, advance the break position to the lookahead position
  832. // (the theory here is that if there are no characters at all after the lookahead
  833. // position, that always matches the lookahead criteria)
  834. if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
  835. result = lookaheadResult;
  836. }
  837. text.setIndex(result);
  838. return result;
  839. }
  840. /**
  841. * This method backs the iterator back up to a "safe position" in the text.
  842. * This is a position that we know, without any context, must be a break position.
  843. * The various calling methods then iterate forward from this safe position to
  844. * the appropriate position to return. (For more information, see the description
  845. * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
  846. */
  847. protected int handlePrevious() {
  848. CharacterIterator text = getText();
  849. int state = START_STATE;
  850. int category = 0;
  851. int lastCategory = 0;
  852. int c = getCurrent();
  853. // loop until we reach the beginning of the text or transition to state 0
  854. while (c != CharacterIterator.DONE && state != STOP_STATE) {
  855. // save the last character's category and look up the current
  856. // character's category
  857. lastCategory = category;
  858. category = lookupCategory(c);
  859. // if the current character isn't an ignore character, look up a
  860. // state transition in the backwards state table
  861. if (category != IGNORE) {
  862. state = lookupBackwardState(state, category);
  863. }
  864. // then advance one character backwards
  865. c = getPrevious();
  866. }
  867. // if we didn't march off the beginning of the text, we're either one or two
  868. // positions away from the real break position. (One because of the call to
  869. // previous() at the end of the loop above, and another because the character
  870. // that takes us into the stop state will always be the character BEFORE
  871. // the break position.)
  872. if (c != CharacterIterator.DONE) {
  873. if (lastCategory != IGNORE) {
  874. getNext();
  875. getNext();
  876. }
  877. else {
  878. getNext();
  879. }
  880. }
  881. return text.getIndex();
  882. }
  883. /**
  884. * Looks up a character's category (i.e., its category for breaking purposes,
  885. * not its Unicode category)
  886. */
  887. protected int lookupCategory(int c) {
  888. if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
  889. return charCategoryTable.elementAt((char)c);
  890. } else {
  891. return supplementaryCharCategoryTable.getValue(c);
  892. }
  893. }
  894. /**
  895. * Given a current state and a character category, looks up the
  896. * next state to transition to in the state table.
  897. */
  898. protected int lookupState(int state, int category) {
  899. return stateTable[state * numCategories + category];
  900. }
  901. /**
  902. * Given a current state and a character category, looks up the
  903. * next state to transition to in the backwards state table.
  904. */
  905. protected int lookupBackwardState(int state, int category) {
  906. return backwardsStateTable[state * numCategories + category];
  907. }
  908. /*
  909. * This class exists to work around a bug in incorrect implementations
  910. * of CharacterIterator, which incorrectly handle setIndex(endIndex).
  911. * This iterator relies only on base.setIndex(n) where n is less than
  912. * endIndex.
  913. *
  914. * One caveat: if the base iterator's begin and end indices change
  915. * the change will not be reflected by this wrapper. Does that matter?
  916. */
  917. private static final class SafeCharIterator implements CharacterIterator,
  918. Cloneable {
  919. private CharacterIterator base;
  920. private int rangeStart;
  921. private int rangeLimit;
  922. private int currentIndex;
  923. SafeCharIterator(CharacterIterator base) {
  924. this.base = base;
  925. this.rangeStart = base.getBeginIndex();
  926. this.rangeLimit = base.getEndIndex();
  927. this.currentIndex = base.getIndex();
  928. }
  929. public char first() {
  930. return setIndex(rangeStart);
  931. }
  932. public char last() {
  933. return setIndex(rangeLimit - 1);
  934. }
  935. public char current() {
  936. if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
  937. return DONE;
  938. }
  939. else {
  940. return base.setIndex(currentIndex);
  941. }
  942. }
  943. public char next() {
  944. currentIndex++;
  945. if (currentIndex >= rangeLimit) {
  946. currentIndex = rangeLimit;
  947. return DONE;
  948. }
  949. else {
  950. return base.setIndex(currentIndex);
  951. }
  952. }
  953. public char previous() {
  954. currentIndex--;
  955. if (currentIndex < rangeStart) {
  956. currentIndex = rangeStart;
  957. return DONE;
  958. }
  959. else {
  960. return base.setIndex(currentIndex);
  961. }
  962. }
  963. public char setIndex(int i) {
  964. if (i < rangeStart || i > rangeLimit) {
  965. throw new IllegalArgumentException("Invalid position");
  966. }
  967. currentIndex = i;
  968. return current();
  969. }
  970. public int getBeginIndex() {
  971. return rangeStart;
  972. }
  973. public int getEndIndex() {
  974. return rangeLimit;
  975. }
  976. public int getIndex() {
  977. return currentIndex;
  978. }
  979. public Object clone() {
  980. SafeCharIterator copy = null;
  981. try {
  982. copy = (SafeCharIterator) super.clone();
  983. }
  984. catch(CloneNotSupportedException e) {
  985. throw new Error("Clone not supported: " + e);
  986. }
  987. CharacterIterator copyOfBase = (CharacterIterator) base.clone();
  988. copy.base = copyOfBase;
  989. return copy;
  990. }
  991. }
  992. }