1. /*
  2. * @(#)RuleBasedBreakIterator.java 1.13 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)RuleBasedBreakIterator.java 1.3 99/04/07
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.util.Vector;
  24. import java.util.Stack;
  25. import java.util.Hashtable;
  26. import java.util.Enumeration;
  27. import java.text.CharacterIterator;
  28. import java.text.StringCharacterIterator;
  29. import sun.text.CompactByteArray;
  30. /**
  31. * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
  32. *
  33. * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
  34. * and <i>regular expressions.</i></p>
  35. *
  36. * <p>A substitution rule defines a name that can be used in place of an expression. It
  37. * consists of a name, which is a string of characters contained in angle brackets, an equals
  38. * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
  39. * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
  40. * square brackets. A substitution is visible after its definition, and is filled in using
  41. * simple textual substitution. Substitution definitions can contain other substitutions, as
  42. * long as those substitutions have been defined first. Substitutions are generally used to
  43. * make the regular expressions (which can get quite complex) shorted and easier to read.
  44. * They typically define either character categories or commonly-used subexpressions.</p>
  45. *
  46. * <p>There is one special substitution.  If the description defines a substitution
  47. * called "<ignore>", the expression must be a [] expression, and the
  48. * expression defines a set of characters (the "<em>ignore characters</em>") that
  49. * will be transparent to the BreakIterator.  A sequence of characters will break the
  50. * same way it would if any ignore characters it contains are taken out.  Break
  51. * positions never occur befoer ignore characters.</p>
  52. *
  53. * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
  54. * defines a sequence of characters to be kept together. With one significant exception, the
  55. * iterator uses a longest-possible-match algorithm when matching text to regular
  56. * expressions. The iterator also treats descriptions containing multiple regular expressions
  57. * as if they were ORed together (i.e., as if they were separated by |).</p>
  58. *
  59. * <p>The special characters recognized by the regular-expression parser are as follows:</p>
  60. *
  61. * <blockquote>
  62. * <table border="1" width="100%">
  63. * <tr>
  64. * <td width="6%">*</td>
  65. * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
  66. * of times (including not at all).</td>
  67. * </tr>
  68. * <tr>
  69. * <td width="6%">{}</td>
  70. * <td width="94%">Encloses a sequence of characters that is optional.</td>
  71. * </tr>
  72. * <tr>
  73. * <td width="6%">()</td>
  74. * <td width="94%">Encloses a sequence of characters.  If followed by *, the sequence
  75. * repeats.  Otherwise, the parentheses are just a grouping device and a way to delimit
  76. * the ends of expressions containing |.</td>
  77. * </tr>
  78. * <tr>
  79. * <td width="6%">|</td>
  80. * <td width="94%">Separates two alternative sequences of characters.  Either one
  81. * sequence or the other, but not both, matches this expression.  The | character can
  82. * only occur inside ().</td>
  83. * </tr>
  84. * <tr>
  85. * <td width="6%">.</td>
  86. * <td width="94%">Matches any character.</td>
  87. * </tr>
  88. * <tr>
  89. * <td width="6%">*?</td>
  90. * <td width="94%">Specifies a non-greedy asterisk.  *? works the same way as *, except
  91. * when there is overlap between the last group of characters in the expression preceding the
  92. * * and the first group of characters following the *.  When there is this kind of
  93. * overlap, * will match the longest sequence of characters that match the expression before
  94. * the *, and *? will match the shortest sequence of characters matching the expression
  95. * before the *?.  For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
  96. * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",
  97. * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
  98. * </tr>
  99. * <tr>
  100. * <td width="6%">[]</td>
  101. * <td width="94%">Specifies a group of alternative characters.  A [] expression will
  102. * match any single character that is specified in the [] expression.  For more on the
  103. * syntax of [] expressions, see below.</td>
  104. * </tr>
  105. * <tr>
  106. * <td width="6%">/</td>
  107. * <td width="94%">Specifies where the break position should go if text matches this
  108. * expression.  (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run
  109. * of letters, followed by a run of whitespace, followed by a digit, but the break position
  110. * will actually go before the whitespace).  Expressions that don't contain / put the
  111. * break position at the end of the matching text.</td>
  112. * </tr>
  113. * <tr>
  114. * <td width="6%">\</td>
  115. * <td width="94%">Escape character.  The \ itself is ignored, but causes the next
  116. * character to be treated as literal character.  This has no effect for many
  117. * characters, but for the characters listed above, this deprives them of their special
  118. * meaning.  (There are no special escape sequences for Unicode characters, or tabs and
  119. * newlines; these are all handled by a higher-level protocol.  In a Java string,
  120. * "\n" will be converted to a literal newline character by the time the
  121. * regular-expression parser sees it.  Of course, this means that \ sequences that are
  122. * visible to the regexp parser must be written as \\ when inside a Java string.)  All
  123. * characters in the ASCII range except for letters, digits, and control characters are
  124. * reserved characters to the parser and must be preceded by \ even if they currently don't
  125. * mean anything.</td>
  126. * </tr>
  127. * <tr>
  128. * <td width="6%">!</td>
  129. * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
  130. * parser that this expression specifies the backwards-iteration behavior of the iterator,
  131. * and not its normal iteration behavior.  This is generally only used in situations
  132. * where the automatically-generated backwards-iteration brhavior doesn't produce
  133. * satisfactory results and must be supplemented with extra client-specified rules.</td>
  134. * </tr>
  135. * <tr>
  136. * <td width="6%"><em>(all others)</em></td>
  137. * <td width="94%">All other characters are treated as literal characters, which must match
  138. * the corresponding character(s) in the text exactly.</td>
  139. * </tr>
  140. * </table>
  141. * </blockquote>
  142. *
  143. * <p>Within a [] expression, a number of other special characters can be used to specify
  144. * groups of characters:</p>
  145. *
  146. * <blockquote>
  147. * <table border="1" width="100%">
  148. * <tr>
  149. * <td width="6%">-</td>
  150. * <td width="94%">Specifies a range of matching characters.  For example
  151. * "[a-p]" matches all lowercase Latin letters from a to p (inclusive).  The -
  152. * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
  153. * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
  154. * it include accented letters such as a-umlaut.</td>
  155. * </tr>
  156. * <tr>
  157. * <td width="6%">::</td>
  158. * <td width="94%">A pair of colons containing a one- or two-letter code matches all
  159. * characters in the corresponding Unicode category.  The two-letter codes are the same
  160. * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
  161. * matches all currency symbols and all math symbols).  Specifying a one-letter code is
  162. * the same as specifying all two-letter codes that begin with that letter (for example,
  163. * "[:L:]" matches all letters, and is equivalent to
  164. * "[:Lu::Ll::Lo::Lm::Lt:]").  Anything other than a valid two-letter Unicode
  165. * category code or a single letter that begins a Unicode category code is illegal within
  166. * colons.</td>
  167. * </tr>
  168. * <tr>
  169. * <td width="6%">[]</td>
  170. * <td width="94%">[] expressions can nest.  This has no effect, except when used in
  171. * conjunction with the ^ token.</td>
  172. * </tr>
  173. * <tr>
  174. * <td width="6%">^</td>
  175. * <td width="94%">Excludes the character (or the characters in the [] expression) following
  176. * it from the group of characters.  For example, "[a-z^p]" matches all Latin
  177. * lowercase letters except p.  "[:L:^[\u4e00-\u9fff]]" matches all letters
  178. * except the Han ideographs.</td>
  179. * </tr>
  180. * <tr>
  181. * <td width="6%"><em>(all others)</em></td>
  182. * <td width="94%">All other characters are treated as literal characters.  (For
  183. * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
  184. * </tr>
  185. * </table>
  186. * </blockquote>
  187. *
  188. * <p>For a more complete explanation, see <a
  189. * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
  190. *   For examples, see the resource data (which is annotated).</p>
  191. *
  192. * @author Richard Gillam
  193. * @version $RCSFile$ $Revision: 1.1 $ $Date: 1998/11/05 19:32:04 $
  194. */
  195. class RuleBasedBreakIterator extends BreakIterator {
  196. /**
  197. * A token used as a character-category value to identify ignore characters
  198. */
  199. protected static final byte IGNORE = -1;
  200. /**
  201. * The state number of the starting state
  202. */
  203. private static final short START_STATE = 1;
  204. /**
  205. * The state-transition value indicating "stop"
  206. */
  207. private static final short STOP_STATE = 0;
  208. /**
  209. * The textual description this iterator was created from
  210. */
  211. private String description;
  212. /**
  213. * A table that indexes from character values to character category numbers
  214. */
  215. private CompactByteArray charCategoryTable = null;
  216. /**
  217. * The table of state transitions used for forward iteration
  218. */
  219. private short[] stateTable = null;
  220. /**
  221. * The table of state transitions used to sync up the iterator with the
  222. * text in backwards and random-access iteration
  223. */
  224. private short[] backwardsStateTable = null;
  225. /**
  226. * A list of flags indicating which states in the state table are accepting
  227. * ("end") states
  228. */
  229. private boolean[] endStates = null;
  230. /**
  231. * A list of flags indicating which states in the state table are
  232. * lookahead states (states which turn lookahead on and off)
  233. */
  234. private boolean[] lookaheadStates = null;
  235. /**
  236. * The number of character categories (and, thus, the number of columns in
  237. * the state tables)
  238. */
  239. private int numCategories;
  240. /**
  241. * The character iterator through which this BreakIterator accesses the text
  242. */
  243. private CharacterIterator text = null;
  244. //=======================================================================
  245. // constructors
  246. //=======================================================================
  247. /**
  248. * Constructs a RuleBasedBreakIterator according to the description
  249. * provided. If the description is malformed, throws an
  250. * IllegalArgumentException. Normally, instead of constructing a
  251. * RuleBasedBreakIterator directory, you'll use the factory methods
  252. * on BreakIterator to create one indirectly from a description
  253. * in the framework's resource files. You'd use this when you want
  254. * special behavior not provided by the built-in iterators.
  255. */
  256. public RuleBasedBreakIterator(String description) {
  257. this.description = description;
  258. // the actual work is done by the Builder class
  259. Builder builder = makeBuilder();
  260. builder.buildBreakIterator();
  261. }
  262. /**
  263. * Creates a Builder.
  264. */
  265. protected Builder makeBuilder() {
  266. return new Builder();
  267. }
  268. //=======================================================================
  269. // boilerplate
  270. //=======================================================================
  271. /**
  272. * Clones this iterator.
  273. * @return A newly-constructed RuleBasedBreakIterator with the same
  274. * behavior as this one.
  275. */
  276. public Object clone()
  277. {
  278. RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
  279. if (text != null) {
  280. result.text = (CharacterIterator) text.clone();
  281. }
  282. return result;
  283. }
  284. /**
  285. * Returns true if both BreakIterators are of the same class, have the same
  286. * rules, and iterate over the same text.
  287. */
  288. public boolean equals(Object that) {
  289. try {
  290. RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
  291. if (!description.equals(other.description)) {
  292. return false;
  293. }
  294. if (text == null) {
  295. return other.text == null;
  296. }
  297. else {
  298. return text.equals(other.text);
  299. }
  300. }
  301. catch(ClassCastException e) {
  302. return false;
  303. }
  304. }
  305. /**
  306. * Returns the description used to create this iterator
  307. */
  308. public String toString() {
  309. return description;
  310. }
  311. /**
  312. * Compute a hashcode for this BreakIterator
  313. * @return A hash code
  314. */
  315. public int hashCode()
  316. {
  317. return description.hashCode();
  318. }
  319. //=======================================================================
  320. // BreakIterator overrides
  321. //=======================================================================
  322. /**
  323. * Sets the current iteration position to the beginning of the text.
  324. * (i.e., the CharacterIterator's starting offset).
  325. * @return The offset of the beginning of the text.
  326. */
  327. public int first() {
  328. CharacterIterator t = getText();
  329. t.first();
  330. return t.getIndex();
  331. }
  332. /**
  333. * Sets the current iteration position to the end of the text.
  334. * (i.e., the CharacterIterator's ending offset).
  335. * @return The text's past-the-end offset.
  336. */
  337. public int last() {
  338. CharacterIterator t = getText();
  339. // I'm not sure why, but t.last() returns the offset of the last character,
  340. // rather than the past-the-end offset
  341. t.setIndex(t.getEndIndex());
  342. return t.getIndex();
  343. }
  344. /**
  345. * Advances the iterator either forward or backward the specified number of steps.
  346. * Negative values move backward, and positive values move forward. This is
  347. * equivalent to repeatedly calling next() or previous().
  348. * @param n The number of steps to move. The sign indicates the direction
  349. * (negative is backwards, and positive is forwards).
  350. * @return The character offset of the boundary position n boundaries away from
  351. * the current one.
  352. */
  353. public int next(int n) {
  354. int result = current();
  355. while (n > 0) {
  356. result = handleNext();
  357. --n;
  358. }
  359. while (n < 0) {
  360. result = previous();
  361. ++n;
  362. }
  363. return result;
  364. }
  365. /**
  366. * Advances the iterator to the next boundary position.
  367. * @return The position of the first boundary after this one.
  368. */
  369. public int next() {
  370. return handleNext();
  371. }
  372. /**
  373. * Advances the iterator backwards, to the last boundary preceding this one.
  374. * @return The position of the last boundary position preceding this one.
  375. */
  376. public int previous() {
  377. // if we're already sitting at the beginning of the text, return DONE
  378. CharacterIterator text = getText();
  379. if (current() == text.getBeginIndex()) {
  380. return BreakIterator.DONE;
  381. }
  382. // set things up. handlePrevious() will back us up to some valid
  383. // break position before the current position (we back our internal
  384. // iterator up one step to prevent handlePrevious() from returning
  385. // the current position), but not necessarily the last one before
  386. // where we started
  387. int start = current();
  388. text.previous();
  389. int lastResult = handlePrevious();
  390. int result = lastResult;
  391. // iterate forward from the known break position until we pass our
  392. // starting point. The last break position before the starting
  393. // point is our return value
  394. while (result != BreakIterator.DONE && result < start) {
  395. lastResult = result;
  396. result = handleNext();
  397. }
  398. // set the current iteration position to be the last break position
  399. // before where we started, and then return that value
  400. text.setIndex(lastResult);
  401. return lastResult;
  402. }
  403. /**
  404. * Throw IllegalArgumentException unless begin <= offset < end.
  405. */
  406. protected static final void checkOffset(int offset, CharacterIterator text) {
  407. if (offset < text.getBeginIndex() || offset >= text.getEndIndex()) {
  408. throw new IllegalArgumentException("offset out of bounds");
  409. }
  410. }
  411. /**
  412. * Sets the iterator to refer to the first boundary position following
  413. * the specified position.
  414. * @offset The position from which to begin searching for a break position.
  415. * @return The position of the first break after the current position.
  416. */
  417. public int following(int offset) {
  418. CharacterIterator text = getText();
  419. checkOffset(offset, text);
  420. // Set our internal iteration position (temporarily)
  421. // to the position passed in. If this is the _beginning_ position,
  422. // then we can just use next() to get our return value
  423. text.setIndex(offset);
  424. if (offset == text.getBeginIndex()) {
  425. return handleNext();
  426. }
  427. // otherwise, we have to sync up first. Use handlePrevious() to back
  428. // us up to a known break position before the specified position (if
  429. // we can determine that the specified position is a break position,
  430. // we don't back up at all). This may or may not be the last break
  431. // position at or before our starting position. Advance forward
  432. // from here until we've passed the starting position. The position
  433. // we stop on will be the first break position after the specified one.
  434. int result = handlePrevious();
  435. while (result != BreakIterator.DONE && result <= offset) {
  436. result = handleNext();
  437. }
  438. return result;
  439. }
  440. /**
  441. * Sets the iterator to refer to the last boundary position before the
  442. * specified position.
  443. * @offset The position to begin searching for a break from.
  444. * @return The position of the last boundary before the starting position.
  445. */
  446. public int preceding(int offset) {
  447. // if we start by updating the current iteration position to the
  448. // position specified by the caller, we can just use previous()
  449. // to carry out this operation
  450. CharacterIterator text = getText();
  451. checkOffset(offset, text);
  452. text.setIndex(offset);
  453. return previous();
  454. }
  455. /**
  456. * Returns true if the specfied position is a boundary position. As a side
  457. * effect, leaves the iterator pointing to the first boundary position at
  458. * or after "offset".
  459. * @param offset the offset to check.
  460. * @return True if "offset" is a boundary position.
  461. */
  462. public boolean isBoundary(int offset) {
  463. CharacterIterator text = getText();
  464. checkOffset(offset, text);
  465. if (offset == text.getBeginIndex()) {
  466. return true;
  467. }
  468. // to check whether this is a boundary, we can use following() on the
  469. // position before the specified one and return true if the position we
  470. // get back is the one the user specified
  471. else {
  472. return following(offset - 1) == offset;
  473. }
  474. }
  475. /**
  476. * Returns the current iteration position.
  477. * @return The current iteration position.
  478. */
  479. public int current() {
  480. return getText().getIndex();
  481. }
  482. /**
  483. * Return a CharacterIterator over the text being analyzed. This version
  484. * of this method returns the actual CharacterIterator we're using internally.
  485. * Changing the state of this iterator can have undefined consequences. If
  486. * you need to change it, clone it first.
  487. * @return An iterator over the text being analyzed.
  488. */
  489. public CharacterIterator getText() {
  490. // The iterator is initialized pointing to no text at all, so if this
  491. // function is called while we're in that state, we have to fudge an
  492. // an iterator to return.
  493. if (text == null) {
  494. text = new StringCharacterIterator("");
  495. }
  496. return text;
  497. }
  498. /**
  499. * Set the iterator to analyze a new piece of text. This function resets
  500. * the current iteration position to the beginning of the text.
  501. * @param newText An iterator over the text to analyze.
  502. */
  503. public void setText(CharacterIterator newText) {
  504. // Test iterator to see if we need to wrap it in a SafeCharIterator.
  505. // The correct behavior for CharacterIterators is to allow the
  506. // position to be set to the endpoint of the iterator. Many
  507. // CharacterIterators do not uphold this, so this is a workaround
  508. // to permit them to use this class.
  509. int end = newText.getEndIndex();
  510. boolean goodIterator;
  511. try {
  512. newText.setIndex(end); // some buggy iterators throw an exception here
  513. goodIterator = newText.getIndex() == end;
  514. }
  515. catch(IllegalArgumentException e) {
  516. goodIterator = false;
  517. }
  518. if (goodIterator) {
  519. text = newText;
  520. }
  521. else {
  522. text = new SafeCharIterator(newText);
  523. }
  524. text.first();
  525. }
  526. //=======================================================================
  527. // implementation
  528. //=======================================================================
  529. /**
  530. * This method is the actual implementation of the next() method. All iteration
  531. * vectors through here. This method initializes the state machine to state 1
  532. * and advances through the text character by character until we reach the end
  533. * of the text or the state machine transitions to state 0. We update our return
  534. * value every time the state machine passes through a possible end state.
  535. */
  536. protected int handleNext() {
  537. // if we're already at the end of the text, return DONE.
  538. CharacterIterator text = getText();
  539. if (text.getIndex() == text.getEndIndex()) {
  540. return BreakIterator.DONE;
  541. }
  542. // no matter what, we always advance at least one character forward
  543. int result = text.getIndex() + 1;
  544. int lookaheadResult = 0;
  545. // begin in state 1
  546. int state = START_STATE;
  547. int category;
  548. char c = text.current();
  549. // loop until we reach the end of the text or transition to state 0
  550. while (c != CharacterIterator.DONE && state != STOP_STATE) {
  551. // look up the current character's character category (which tells us
  552. // which column in the state table to look at)
  553. category = lookupCategory(c);
  554. // if the character isn't an ignore character, look up a state
  555. // transition in the state table
  556. if (category != IGNORE) {
  557. state = lookupState(state, category);
  558. }
  559. // if the state we've just transitioned to is a lookahead state,
  560. // (but not also an end state), save its position. If it's
  561. // both a lookahead state and an end state, update the break position
  562. // to the last saved lookup-state position
  563. if (lookaheadStates[state]) {
  564. if (endStates[state]) {
  565. result = lookaheadResult;
  566. }
  567. else {
  568. lookaheadResult = text.getIndex() + 1;
  569. }
  570. }
  571. // otherwise, if the state we've just transitioned to is an accepting
  572. // state, update the break position to be the current iteration position
  573. else {
  574. if (endStates[state]) {
  575. result = text.getIndex() + 1;
  576. }
  577. }
  578. c = text.next();
  579. }
  580. // if we've run off the end of the text, and the very last character took us into
  581. // a lookahead state, advance the break position to the lookahead position
  582. // (the theory here is that if there are no characters at all after the lookahead
  583. // position, that always matches the lookahead criteria)
  584. if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
  585. result = lookaheadResult;
  586. }
  587. text.setIndex(result);
  588. return result;
  589. }
  590. /**
  591. * This method backs the iterator back up to a "safe position" in the text.
  592. * This is a position that we know, without any context, must be a break position.
  593. * The various calling methods then iterate forward from this safe position to
  594. * the appropriate position to return. (For more information, see the description
  595. * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
  596. */
  597. protected int handlePrevious() {
  598. CharacterIterator text = getText();
  599. int state = START_STATE;
  600. int category = 0;
  601. int lastCategory = 0;
  602. char c = text.current();
  603. // loop until we reach the beginning of the text or transition to state 0
  604. while (c != CharacterIterator.DONE && state != STOP_STATE) {
  605. // save the last character's category and look up the current
  606. // character's category
  607. lastCategory = category;
  608. category = lookupCategory(c);
  609. // if the current character isn't an ignore character, look up a
  610. // state transition in the backwards state table
  611. if (category != IGNORE) {
  612. state = lookupBackwardState(state, category);
  613. }
  614. // then advance one character backwards
  615. c = text.previous();
  616. }
  617. // if we didn't march off the beginning of the text, we're either one or two
  618. // positions away from the real break position. (One because of the call to
  619. // previous() at the end of the loop above, and another because the character
  620. // that takes us into the stop state will always be the character BEFORE
  621. // the break position.)
  622. if (c != CharacterIterator.DONE) {
  623. if (lastCategory != IGNORE) {
  624. text.setIndex(text.getIndex() + 2);
  625. }
  626. else {
  627. text.next();
  628. }
  629. }
  630. return text.getIndex();
  631. }
  632. /**
  633. * Looks up a character's category (i.e., its category for breaking purposes,
  634. * not its Unicode category)
  635. */
  636. protected int lookupCategory(char c) {
  637. return charCategoryTable.elementAt(c);
  638. }
  639. /**
  640. * Given a current state and a character category, looks up the
  641. * next state to transition to in the state table.
  642. */
  643. protected int lookupState(int state, int category) {
  644. return stateTable[state * numCategories + category];
  645. }
  646. /**
  647. * Given a current state and a character category, looks up the
  648. * next state to transition to in the backwards state table.
  649. */
  650. protected int lookupBackwardState(int state, int category) {
  651. return backwardsStateTable[state * numCategories + category];
  652. }
  653. //=======================================================================
  654. // RuleBasedBreakIterator.Builder
  655. //=======================================================================
  656. /**
  657. * The Builder class has the job of constructing a RuleBasedBreakIterator from a
  658. * textual description. A Builder is constructed by RuleBasedBreakIterator's
  659. * constructor, which uses it to construct the iterator itself and then throws it
  660. * away.
  661. * <p>The construction logic is separated out into its own class for two primary
  662. * reasons:
  663. * <ul><li>The construction logic is quite sophisticated and large. Separating it
  664. * out into its own class means the code must only be loaded into memory while a
  665. * RuleBasedBreakIterator is being constructed, and can be purged after that.
  666. * <li>There is a fair amount of state that must be maintained throughout the
  667. * construction process that is not needed by the iterator after construction.
  668. * Separating this state out into another class prevents all of the functions that
  669. * construct the iterator from having to have really long parameter lists,
  670. * (hopefully) contributing to readability and maintainability.</ul>
  671. * <p>It'd be really nice if this could be an independent class rather than an
  672. * inner class, because that would shorten the source file considerably, but
  673. * making Builder an inner class of RuleBasedBreakIterator allows it direct access
  674. * to RuleBasedBreakIterator's private members, which saves us from having to
  675. * provide some kind of "back door" to the Builder class that could then also be
  676. * used by other classes.
  677. */
  678. protected class Builder {
  679. /**
  680. * A temporary holding place used for calculating the character categories.
  681. * This object contains CharSet objects.
  682. */
  683. protected Vector categories = null;
  684. /**
  685. * A table used to map parts of regexp text to lists of character categories,
  686. * rather than having to figure them out from scratch each time
  687. */
  688. protected Hashtable expressions = null;
  689. /**
  690. * A temporary holding place for the list of ignore characters
  691. */
  692. protected CharSet ignoreChars = null;
  693. /**
  694. * A temporary holding place where the forward state table is built
  695. */
  696. protected Vector tempStateTable = null;
  697. /**
  698. * A list of all the states that have to be filled in with transitions to the
  699. * next state that is created. Used when building the state table from the
  700. * regular expressions.
  701. */
  702. protected Vector decisionPointList = null;
  703. /**
  704. * A stack for holding decision point lists. This is used to handle nested
  705. * parentheses and braces in regexps.
  706. */
  707. protected Stack decisionPointStack = null;
  708. /**
  709. * A list of states that loop back on themselves. Used to handle .*?
  710. */
  711. protected Vector loopingStates = null;
  712. /**
  713. * Looping states actually have to be backfilled later in the process
  714. * than everything else. This is where a the list of states to backfill
  715. * is accumulated. This is also used to handle .*?
  716. */
  717. protected Vector statesToBackfill = null;
  718. /**
  719. * A list mapping pairs of state numbers for states that are to be combined
  720. * to the state number of the state representing their combination. Used
  721. * in the process of making the state table deterministic to prevent
  722. * infinite recursion.
  723. */
  724. protected Vector mergeList = null;
  725. /**
  726. * A flag that is used to indicate when the list of looping states can
  727. * be reset.
  728. */
  729. protected boolean clearLoopingStates = false;
  730. /**
  731. * A bit mask used to indicate a bit in the table's flags column that marks a
  732. * state as an accepting state.
  733. */
  734. protected static final int END_STATE_FLAG = 0x8000;
  735. /**
  736. * A bit mask used to indicate a bit in the table's flags column that marks a
  737. * state as one the builder shouldn't loop to any looping states
  738. */
  739. protected static final int DONT_LOOP_FLAG = 0x4000;
  740. /**
  741. * A bit mask used to indicate a bit in the table's flags column that marks a
  742. * state as a lookahead state.
  743. */
  744. protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;
  745. /**
  746. * A bit mask representing the union of the mask values listed above.
  747. * Used for clearing or masking off the flag bits.
  748. */
  749. protected static final int ALL_FLAGS = END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
  750. | DONT_LOOP_FLAG;
  751. /**
  752. * No special construction is required for the Builder.
  753. */
  754. public Builder() {
  755. }
  756. /**
  757. * This is the main function for setting up the BreakIterator's tables. It
  758. * just vectors different parts of the job off to other functions.
  759. */
  760. public void buildBreakIterator() {
  761. Vector tempRuleList = buildRuleList(description);
  762. buildCharCategories(tempRuleList);
  763. buildStateTable(tempRuleList);
  764. buildBackwardsStateTable(tempRuleList);
  765. }
  766. /**
  767. * Thus function has three main purposes:
  768. * <ul><li>Perform general syntax checking on the description, so the rest of the
  769. * build code can assume that it's parsing a legal description.
  770. * <li>Split the description into separate rules
  771. * <li>Perform variable-name substitutions (so that no one else sees variable names)
  772. * </ul>
  773. */
  774. private Vector buildRuleList(String description) {
  775. // invariants:
  776. // - parentheses must be balanced: ()[]{}<>
  777. // - nothing can be nested inside <>
  778. // - nothing can be nested inside [] except more []s
  779. // - pairs of ()[]{}<> must not be empty
  780. // - ; can only occur at the outer level
  781. // - | can only appear inside ()
  782. // - only one = or / can occur in a single rule
  783. // - = and / cannot both occur in the same rule
  784. // - <> can only occur on the left side of a = expression
  785. // (because we'll perform substitutions to eliminate them other places)
  786. // - the left-hand side of a = expression can only be a single character
  787. // (possibly with \) or text inside <>
  788. // - the right-hand side of a = expression must be enclosed in [] or ()
  789. // - * may not occur at the beginning of a rule, nor may it follow
  790. // =, /, (, (, |, }, ;, or *
  791. // - ? may only follow *
  792. // - the rule list must contain at least one / rule
  793. // - no rule may be empty
  794. // - all printing characters in the ASCII range except letters and digits
  795. // are reserved and must be preceded by \
  796. // - ! may only occur at the beginning of a rule
  797. // set up a vector to contain the broken-up description (each entry in the
  798. // vector is a separate rule) and a stack for keeping track of opening
  799. // punctuation
  800. Vector tempRuleList = new Vector();
  801. Stack parenStack = new Stack();
  802. int p = 0;
  803. int ruleStart = 0;
  804. char c = '\u0000';
  805. char lastC = '\u0000';
  806. char lastOpen = '\u0000';
  807. boolean haveEquals = false;
  808. boolean havePipe = false;
  809. boolean sawVarName = false;
  810. final String charsThatCantPrecedeAsterisk = "=/{(|}*;\u0000";
  811. // if the description doesn't end with a semicolon, tack a semicolon onto the end
  812. if (description.length() != 0 && description.charAt(description.length() - 1) != ';') {
  813. description = description + ";";
  814. }
  815. // for each character, do...
  816. while (p < description.length()) {
  817. c = description.charAt(p);
  818. switch (c) {
  819. // if the character is a backslash, skip the character that follows it
  820. // (it'll get treated as a literal character)
  821. case '\\':
  822. ++p;
  823. break;
  824. // if the character is opening punctuation, verify that no nesting
  825. // rules are broken, and push the character onto the stack
  826. case '{':
  827. case '<':
  828. case '[':
  829. case '(':
  830. if (lastOpen == '<') {
  831. error("Can't nest brackets inside <>", p, description);
  832. }
  833. if (lastOpen == '[' && c != '[') {
  834. error("Can't nest anything in [] but []", p, description);
  835. }
  836. // if we see < anywhere except on the left-hand side of =,
  837. // we must be seeing a variable name that was never defined
  838. if (c == '<' && (haveEquals || havePipe)) {
  839. error("Unknown variable name", p, description);
  840. }
  841. lastOpen = c;
  842. parenStack.push(new Character(c));
  843. if (c == '<') {
  844. sawVarName = true;
  845. }
  846. break;
  847. // if the character is closing punctuation, verify that it matches the
  848. // last opening punctuation we saw, and that the brackets contain
  849. // something, then pop the stack
  850. case '}':
  851. case '>':
  852. case ']':
  853. case ')':
  854. char expectedClose = '\u0000';
  855. switch (lastOpen) {
  856. case '{':
  857. expectedClose = '}';
  858. break;
  859. case '[':
  860. expectedClose = ']';
  861. break;
  862. case '(':
  863. expectedClose = ')';
  864. break;
  865. case '<':
  866. expectedClose = '>';
  867. break;
  868. }
  869. if (c != expectedClose) {
  870. error("Unbalanced parentheses", p, description);
  871. }
  872. if (lastC == lastOpen) {
  873. error("Parens don't contain anything", p, description);
  874. }
  875. parenStack.pop();
  876. if (!parenStack.empty()) {
  877. lastOpen = ((Character)(parenStack.peek())).charValue();
  878. }
  879. else {
  880. lastOpen = '\u0000';
  881. }
  882. break;
  883. // if the character is an asterisk, make sure it occurs in a place
  884. // where an asterisk can legally go
  885. case '*':
  886. if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1) {
  887. error("Misplaced asterisk", p, description);
  888. }
  889. break;
  890. // if the character is a question mark, make sure it follows an asterisk
  891. case '?':
  892. if (lastC != '*') {
  893. error("Misplaced ?", p, description);
  894. }
  895. break;
  896. // if the character is an equals sign, make sure we haven't seen another
  897. // equals sign or a slash yet
  898. case '=':
  899. if (haveEquals || havePipe) {
  900. error("More than one = or / in rule", p, description);
  901. }
  902. haveEquals = true;
  903. break;
  904. // if the character is a slash, make sure we haven't seen another slash
  905. // or an equals sign yet
  906. case '/':
  907. if (haveEquals || havePipe) {
  908. error("More than one = or / in rule", p, description);
  909. }
  910. if (sawVarName) {
  911. error("Unknown variable name", p, description);
  912. }
  913. havePipe = true;
  914. break;
  915. // if the character is an exclamation point, make sure it occurs only
  916. // at the beginning of a rule
  917. case '!':
  918. if (lastC != ';' && lastC != '\u0000') {
  919. error("! can only occur at the beginning of a rule", p, description);
  920. }
  921. break;
  922. // we don't have to do anything special on a period
  923. case '.':
  924. break;
  925. // if the character is a syntax character that can only occur
  926. // inside [], make sure that it does in fact only occur inside [].
  927. case '^':
  928. case '-':
  929. case ':':
  930. if (lastOpen != '[' && lastOpen != '<') {
  931. error("Illegal character", p, description);
  932. }
  933. break;
  934. // if the character is a semicolon, do the following...
  935. case ';':
  936. // make sure the rule contains something and that there are no
  937. // unbalanced parentheses or brackets
  938. if (lastC == ';' || lastC == '\u0000') {
  939. error("Empty rule", p, description);
  940. }
  941. if (!parenStack.empty()) {
  942. error("Unbalanced parenheses", p, description);
  943. }
  944. if (parenStack.empty()) {
  945. // if the rule contained an = sign, call processSubstitution()
  946. // to replace the substitution name with the substitution text
  947. // wherever it appears in the description
  948. if (haveEquals) {
  949. description = processSubstitution(description.substring(ruleStart,
  950. p), description, p + 1);
  951. }
  952. else {
  953. // otherwise, check to make sure the rule doesn't reference
  954. // any undefined substitutions
  955. if (sawVarName) {
  956. error("Unknown variable name", p, description);
  957. }
  958. // then add it to tempRuleList
  959. tempRuleList.addElement(description.substring(ruleStart, p));
  960. }
  961. // and reset everything to process the next rule
  962. ruleStart = p + 1;
  963. haveEquals = havePipe = sawVarName = false;
  964. }
  965. break;
  966. // if the character is a vertical bar, check to make sure that it
  967. // occurs inside a () expression and that the character that precedes
  968. // it isn't also a vertical bar
  969. case '|':
  970. if (lastC == '|') {
  971. error("Empty alternative", p, description);
  972. }
  973. if (parenStack.empty() || lastOpen != '(') {
  974. error("Misplaced |", p, description);
  975. }
  976. break;
  977. // if the character is anything else (escaped characters are
  978. // skipped and don't make it here), it's an error
  979. default:
  980. if (c >= ' ' && c < '\u007f' && !Character.isLetter(c)
  981. && !Character.isDigit(c)) {
  982. error("Illegal character", p, description);
  983. }
  984. break;
  985. }
  986. lastC = c;
  987. ++p;
  988. }
  989. if (tempRuleList.size() == 0) {
  990. error("No valid rules in description", p, description);
  991. }
  992. return tempRuleList;
  993. }
  994. /**
  995. * This function performs variable-name substitutions. First it does syntax
  996. * checking on the variable-name definition. If it's syntactically valid, it
  997. * then goes through the remainder of the description and does a simple
  998. * find-and-replace of the variable name with its text. (The variable text
  999. * must be enclosed in either [] or () for this to work.)
  1000. */
  1001. protected String processSubstitution(String substitutionRule, String description,
  1002. int startPos) {
  1003. // isolate out the text on either side of the equals sign
  1004. String replace;
  1005. String replaceWith;
  1006. int equalPos = substitutionRule.indexOf('=');
  1007. replace = substitutionRule.substring(0, equalPos);
  1008. replaceWith = substitutionRule.substring(equalPos + 1);
  1009. // check to see whether the substitution name is something we've declared
  1010. // to be "special". For RuleBasedBreakIterator itself, this is "<ignore>".
  1011. // This function takes care of any extra processing that has to be done
  1012. // with "special" substitution names.
  1013. handleSpecialSubstitution(replace, replaceWith, startPos, description);
  1014. // perform various other syntax checks on the rule
  1015. if (replaceWith.length() == 0) {
  1016. error("Nothing on right-hand side of =", startPos, description);
  1017. }
  1018. if (replace.length() == 0) {
  1019. error("Nothing on left-hand side of =", startPos, description);
  1020. }
  1021. if (replace.length() == 2 && replace.charAt(0) != '\\') {
  1022. error("Illegal left-hand side for =", startPos, description);
  1023. }
  1024. if (replace.length() >= 3 && replace.charAt(0) != '<' && replace.charAt(equalPos - 1)
  1025. != '>') {
  1026. error("Illegal left-hand side for =", startPos, description);
  1027. }
  1028. if (!(replaceWith.charAt(0) == '[' && replaceWith.charAt(replaceWith.length() - 1)
  1029. == ']') && !(replaceWith.charAt(0) == '(' && replaceWith.charAt(
  1030. replaceWith.length() - 1) == ')')) {
  1031. error("Illegal right-hand side for =", startPos, description);
  1032. }
  1033. // now go through the rest of the description (which hasn't been broken up
  1034. // into separate rules yet) and replace every occurrence of the
  1035. // substitution name with the substitution body
  1036. StringBuffer result = new StringBuffer();
  1037. result.append(description.substring(0, startPos));
  1038. int lastPos = startPos;
  1039. int pos = description.indexOf(replace, startPos);
  1040. while (pos != -1) {
  1041. result.append(description.substring(lastPos, pos));
  1042. result.append(replaceWith);
  1043. lastPos = pos + replace.length();
  1044. pos = description.indexOf(replace, lastPos);
  1045. }
  1046. result.append(description.substring(lastPos));
  1047. return result.toString();
  1048. }
  1049. /**
  1050. * This function defines a protocol for handling substitution names that
  1051. * are "special," i.e., that have some property beyond just being
  1052. * substitutions. At the RuleBasedBreakIterator level, we have one
  1053. * special substitution name, "<ignore>". Subclasses can override this
  1054. * function to add more. Any special processing that has to go on beyond
  1055. * that which is done by the normal substitution-processing code is done
  1056. * here.
  1057. */
  1058. protected void handleSpecialSubstitution(String replace, String replaceWith,
  1059. int startPos, String description) {
  1060. // if we get a definition for a substitution called "ignore", it defines
  1061. // the ignore characters for the iterator. Check to make sure the expression
  1062. // is a [] expression, and if it is, parse it and store the characters off
  1063. // to the side.
  1064. if (replace.equals("<ignore>")) {
  1065. if (replaceWith.charAt(0) == '(') {
  1066. error("Ignore group can't be enclosed in (", startPos, description);
  1067. }
  1068. ignoreChars = CharSet.parseString(replaceWith);
  1069. }
  1070. }
  1071. /**
  1072. * This function builds the character category table. On entry,
  1073. * tempRuleList is a vector of break rules that has had variable names substituted.
  1074. * On exit, the charCategoryTable data member has been initialized to hold the
  1075. * character category table, and tempRuleList's rules have been munged to contain
  1076. * character category numbers everywhere a literal character or a [] expression
  1077. * originally occurred.
  1078. */
  1079. protected void buildCharCategories(Vector tempRuleList) {
  1080. int bracketLevel = 0;
  1081. int p = 0;
  1082. int lineNum = 0;
  1083. // build hash table of every literal character or [] expression in the rule list
  1084. // and use CharSet.parseString() to derive a CharSet object representing the
  1085. // characters each refers to
  1086. expressions = new Hashtable();
  1087. while (lineNum < tempRuleList.size()) {
  1088. String line = (String)(tempRuleList.elementAt(lineNum));
  1089. p = 0;
  1090. while (p < line.length()) {
  1091. char c = line.charAt(p);
  1092. switch (c) {
  1093. // skip over all syntax characters except [
  1094. case '{': case '}': case '(': case ')': case '*': case '.':
  1095. case '/': case '|': case ';': case '?': case '!':
  1096. break;
  1097. // for [, find the matching ] (taking nested [] pairs into account)
  1098. // and add the whole expression to the expression list
  1099. case '[':
  1100. int q = p + 1;
  1101. ++bracketLevel;
  1102. while (q < line.length() && bracketLevel != 0) {
  1103. c = line.charAt(q);
  1104. switch (c) {
  1105. case '\\':
  1106. q++;
  1107. break;
  1108. case '[':
  1109. ++bracketLevel;
  1110. break;
  1111. case ']':
  1112. --bracketLevel;
  1113. break;
  1114. }
  1115. ++q;
  1116. }
  1117. if (expressions.get(line.substring(p, q)) == null) {
  1118. expressions.put(line.substring(p, q), CharSet.parseString(line.
  1119. substring(p, q)));
  1120. }
  1121. p = q - 1;
  1122. break;
  1123. // for \ sequences, just move to the next character and treat
  1124. // it as a single character
  1125. case '\\':
  1126. ++p;
  1127. c = line.charAt(p);
  1128. // DON'T break; fall through into "default" clause
  1129. // for an isolated single character, add it to the expression list
  1130. default:
  1131. expressions.put(line.substring(p, p + 1), CharSet.parseString(line.
  1132. substring(p, p + 1)));
  1133. break;
  1134. }
  1135. ++p;
  1136. }
  1137. ++lineNum;
  1138. }
  1139. // dump CharSet's internal expression cache
  1140. CharSet.releaseExpressionCache();
  1141. // create the temporary category table (which is a vector of CharSet objects)
  1142. categories = new Vector();
  1143. if (ignoreChars != null) {
  1144. categories.addElement(ignoreChars);
  1145. }
  1146. else {
  1147. categories.addElement(new CharSet());
  1148. }
  1149. ignoreChars = null;
  1150. // this is a hook to allow subclasses to add categories on their own
  1151. mungeExpressionList(expressions);
  1152. // Derive the character categories. Go through the existing character categories
  1153. // looking for overlap. Any time there's overlap, we create a new character
  1154. // category for the characters that overlapped and remove them from their original
  1155. // category. At the end, any characters that are left in the expression haven't
  1156. // been mentioned in any category, so another new category is created for them.
  1157. // For example, if the first expression is [abc], then a, b, and c will be placed
  1158. // into a single character category. If the next expression is [bcd], we will first
  1159. // remove b and c from their existing category (leaving a behind), create a new
  1160. // category for b and c, and then create another new category for d (which hadn't
  1161. // been mentioned in the previous expression).
  1162. // At no time should a character ever occur in more than one character category.
  1163. // for each expression in the expressions list, do...
  1164. Enumeration iter = expressions.elements();
  1165. while (iter.hasMoreElements()) {
  1166. // initialize the working char set to the chars in the current expression
  1167. CharSet e = (CharSet)iter.nextElement();
  1168. // for each category in the category list, do...
  1169. for (int j = categories.size() - 1; !e.empty() && j > 0; j--) {
  1170. // if there's overlap between the current working set of chars
  1171. // and the current category...
  1172. CharSet that = (CharSet)(categories.elementAt(j));
  1173. if (!that.intersection(e).empty()) {
  1174. // add a new category for the characters that were in the
  1175. // current category but not in the working char set
  1176. CharSet temp = that.difference(e);
  1177. if (!temp.empty()) {
  1178. categories.addElement(temp);
  1179. }
  1180. // remove those characters from the working char set and replace
  1181. // the current category with the characters that it did
  1182. // have in common with the current working char set
  1183. temp = e.intersection(that);
  1184. e = e.difference(that);
  1185. if (!temp.equals(that)) {
  1186. categories.setElementAt(temp, j);
  1187. }
  1188. }
  1189. }
  1190. // if there are still characters left in the working char set,
  1191. // add a new category containing them
  1192. if (!e.empty()) {
  1193. categories.addElement(e);
  1194. }
  1195. }
  1196. // we have the ignore characters stored in position 0. Make an extra pass through
  1197. // the character category list and remove anything from the ignore list that shows
  1198. // up in some other category
  1199. CharSet allChars = new CharSet();
  1200. for (int i = 1; i < categories.size(); i++)
  1201. allChars = allChars.union((CharSet)(categories.elementAt(i)));
  1202. CharSet ignoreChars = (CharSet)(categories.elementAt(0));
  1203. ignoreChars = ignoreChars.difference(allChars);
  1204. categories.setElementAt(ignoreChars, 0);
  1205. // now that we've derived the character categories, go back through the expression
  1206. // list and replace each CharSet object with a String that represents the
  1207. // character categories that expression refers to. The String is encoded: each
  1208. // character is a character category number (plus 0x100 to avoid confusing them
  1209. // with syntax characters in the rule grammar)
  1210. iter = expressions.keys();
  1211. while (iter.hasMoreElements()) {
  1212. String key = (String)iter.nextElement();
  1213. CharSet cs = (CharSet)expressions.get(key);
  1214. StringBuffer cats = new StringBuffer();
  1215. // for each category...
  1216. for (int j = 0; j < categories.size(); j++) {
  1217. // if the current expression contains characters in that category...
  1218. CharSet temp = cs.intersection((CharSet)(categories.elementAt(j)));
  1219. if (!temp.empty()) {
  1220. // then add the encoded category number to the String for this
  1221. // expression
  1222. cats.append((char)(0x100 + j));
  1223. if (temp.equals(cs)) {
  1224. break;
  1225. }
  1226. }
  1227. }
  1228. // once we've finished building the encoded String for this expression,
  1229. // replace the CharSet object with it
  1230. expressions.put(key, cats.toString());
  1231. }
  1232. // and finally, we turn the temporary category table into a permanent category
  1233. // table, which is a CompactByteArray. (we skip category 0, which by definition
  1234. // refers to all characters not mentioned specifically in the rules)
  1235. charCategoryTable = new CompactByteArray((byte)0);
  1236. // for each category...
  1237. for (int i = 0; i < categories.size(); i++) {
  1238. CharSet chars = (CharSet)(categories.elementAt(i));
  1239. // go through the character ranges in the category one by one...
  1240. Enumeration enum = chars.getChars();
  1241. while (enum.hasMoreElements()) {
  1242. char[] range = (char[])(enum.nextElement());
  1243. // and set the corresponding elements in the CompactArray accordingly
  1244. if (i != 0) {
  1245. charCategoryTable.setElementAt(range[0], range[1], (byte)i);
  1246. }
  1247. // (category 0 is special-- it's the hiding place for the ignore
  1248. // characters, whose real category number in the CompactArray is
  1249. // -1 [this is because category 0 contains all characters not
  1250. // specifically mentioned anywhere in the rules] )
  1251. else {
  1252. charCategoryTable.setElementAt(range[0], range[1], IGNORE);
  1253. }
  1254. }
  1255. }
  1256. // once we've populated the CompactArray, compact it
  1257. charCategoryTable.compact();
  1258. // initialize numCategories
  1259. numCategories = categories.size();
  1260. }
  1261. protected void mungeExpressionList(Hashtable expressions) {
  1262. // empty in the parent class. This function provides a hook for subclasses
  1263. // to mess with the character category table.
  1264. }
  1265. /**
  1266. * This is the function that builds the forward state table. Most of the real
  1267. * work is done in parseRule(), which is called once for each rule in the
  1268. * description.
  1269. */
  1270. private void buildStateTable(Vector tempRuleList) {
  1271. // initialize our temporary state table, and fill it with two states:
  1272. // state 0 is a dummy state that allows state 1 to be the starting state
  1273. // and 0 to represent "stop". State 1 is added here to seed things
  1274. // before we start parsing
  1275. tempStateTable = new Vector();
  1276. tempStateTable.addElement(new short[numCategories + 1]);
  1277. tempStateTable.addElement(new short[numCategories + 1]);
  1278. // call parseRule() for every rule in the rule list (except those which
  1279. // start with !, which are actually backwards-iteration rules)
  1280. for (int i = 0; i < tempRuleList.size(); i++) {
  1281. String rule = (String)tempRuleList.elementAt(i);
  1282. if (rule.charAt(0) != '!') {
  1283. parseRule(rule, true);
  1284. }
  1285. }
  1286. // finally, use finishBuildingStateTable() to minimize the number of
  1287. // states in the table and perform some other cleanup work
  1288. finishBuildingStateTable(true);
  1289. }
  1290. /**
  1291. * This is where most of the work really happens. This routine parses a single
  1292. * rule in the rule description, adding and modifying states in the state
  1293. * table according to the new expression. The state table is kept deterministic
  1294. * throughout the whole operation, although some ugly postprocessing is needed
  1295. * to handle the *? token.
  1296. */
  1297. private void parseRule(String rule, boolean forward) {
  1298. // algorithm notes:
  1299. // - The basic idea here is to read successive character-category groups
  1300. // from the input string. For each group, you create a state and point
  1301. // the appropriate entries in the previous state to it. This produces a
  1302. // straight line from the start state to the end state. The {}, *, and (|)
  1303. // idioms produce branches in this straight line. These branches (states
  1304. // that can transition to more than one other state) are called "decision
  1305. // points." A list of decision points is kept. This contains a list of
  1306. // all states that can transition to the next state to be created. For a
  1307. // straight line progression, the only thing in the decision-point list is
  1308. // the current state. But if there's a branch, the decision-point list
  1309. // will contain all of the beginning points of the branch when the next
  1310. // state to be created represents the end point of the branch. A stack is
  1311. // used to save decision point lists in the presence of nested parentheses
  1312. // and the like. For example, when a { is encountered, the current decision
  1313. // point list is saved on the stack and restored when the corresponding }
  1314. // is encountered. This way, after the } is read, the decision point list
  1315. // will contain both the state right before the } _and_ the state before
  1316. // the whole {} expression. Both of these states can transition to the next
  1317. // state after the {} expression.
  1318. // - one complication arises when we have to stamp a transition value into
  1319. // an array cell that already contains one. The updateStateTable() and
  1320. // mergeStates() functions handle this case. Their basic approach is to
  1321. // create a new state that combines the two states that conflict and point
  1322. // at it when necessary. This happens recursively, so if the merged states
  1323. // also conflict, they're resolved in the same way, and so on. There are
  1324. // a number of tests aimed at preventing infinite recursion.
  1325. // - another complication arises with repeating characters. It's somewhat
  1326. // ambiguous whether the user wants a greedy or non-greedy match in these cases.
  1327. // (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in
  1328. // "abc" or the LONGEST sequence of letters ending in "abc". We've adopted
  1329. // the *? to mean "shortest" and * by itself to mean "longest". (You get the
  1330. // same result with both if there's no overlap between the repeating character
  1331. // group and the group immediately following it.) Handling the *? token is
  1332. // rather complicated and involves keeping track of whether a state needs to
  1333. // be merged (as described above) or merely overwritten when you update one of
  1334. // its cells, and copying the contents of a state that loops with a *? token
  1335. // into some of the states that follow it after the rest of the table-building
  1336. // process is complete ("backfilling").
  1337. // implementation notes:
  1338. // - This function assumes syntax checking has been performed on the input string
  1339. // prior to its being passed in here. It assumes that parentheses are
  1340. // balanced, all literal characters are enclosed in [] and turned into category
  1341. // numbers, that there are no illegal characters or character sequences, and so
  1342. // on. Violation of these invariants will lead to undefined behavior.
  1343. // - It'd probably be better to use linked lists rather than Vector and Stack
  1344. // to maintain the decision point list and stack. I went for simplicity in
  1345. // this initial implementation. If performance is critical enough, we can go
  1346. // back and fix this later.
  1347. // -There are a number of important limitations on the *? token. It does not work
  1348. // right when followed by a repeating character sequence (e.g., ".*?(abc)*")
  1349. // (although it does work right when followed by a single repeating character).
  1350. // It will not always work right when nested in parentheses or braces (although
  1351. // sometimes it will). It also will not work right if the group of repeating
  1352. // characters and the group of characters that follows overlap partially
  1353. // (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for
  1354. // describing breaking rules we know about, so we left them out for
  1355. // expeditiousness.
  1356. // - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"--
  1357. // that is, if the string ends in "aaaabc", the break will go before the first
  1358. // "a" rather than the last one. Both of these are limitations in the design
  1359. // of RuleBasedBreakIterator and not limitations of the rule parser.
  1360. int p = 0;
  1361. int currentState = 1; // don't use state number 0; 0 means "stop"
  1362. int lastState = currentState;
  1363. String pendingChars = "";
  1364. decisionPointStack = new Stack();
  1365. decisionPointList = new Vector();
  1366. loopingStates = new Vector();
  1367. statesToBackfill = new Vector();
  1368. short[] state;
  1369. boolean sawEarlyBreak = false;
  1370. // if we're adding rules to the backward state table, mark the initial state
  1371. // as a looping state
  1372. if (!forward) {
  1373. loopingStates.addElement(new Integer(1));
  1374. }
  1375. // put the current state on the decision point list before we start
  1376. decisionPointList.addElement(new Integer(currentState)); // we want currentState to
  1377. // be 1 here...
  1378. currentState = tempStateTable.size() - 1; // but after that, we want it to be
  1379. // 1 less than the state number of the next state
  1380. while (p < rule.length()) {
  1381. char c = rule.charAt(p);
  1382. clearLoopingStates = false;
  1383. // this section handles literal characters, escaped characters (which are
  1384. // effectively literal characters too), the . token, and [] expressions
  1385. if (c == '['
  1386. || c == '\\'
  1387. || Character.isLetter(c)
  1388. || Character.isDigit(c)
  1389. || c < ' '
  1390. || c == '.'
  1391. || c >= '\u007f') {
  1392. // if we're not on a period, isolate the expression and look up
  1393. // the corresponding category list
  1394. if (c != '.') {
  1395. int q = p;
  1396. // if we're on a backslash, the expression is the character
  1397. // after the backslash
  1398. if (c == '\\') {
  1399. q = p + 2;
  1400. ++p;
  1401. }
  1402. // if we're on an opening bracket, scan to the closing bracket
  1403. // to isolate the expression
  1404. else if (c == '[') {
  1405. int bracketLevel = 1;
  1406. while (bracketLevel > 0) {
  1407. ++q;
  1408. c = rule.charAt(q);
  1409. if (c == '[') {
  1410. ++bracketLevel;
  1411. }
  1412. else if (c == ']') {
  1413. --bracketLevel;
  1414. }
  1415. else if (c == '\\') {
  1416. ++q;
  1417. }
  1418. }
  1419. ++q;
  1420. }
  1421. // otherwise, the expression is just the character itself
  1422. else {
  1423. q = p + 1;
  1424. }
  1425. // look up the category list for the expression and store it
  1426. // in pendingChars
  1427. pendingChars = (String)expressions.get(rule.substring(p, q));
  1428. // advance the current position past the expression
  1429. p = q - 1;
  1430. }
  1431. // if the character we're on is a period, we end up down here
  1432. else {
  1433. int rowNum = ((Integer)decisionPointList.lastElement()).intValue();
  1434. state = (short[])tempStateTable.elementAt(rowNum);
  1435. // if the period is followed by an asterisk, then just set the current
  1436. // state to loop back on itself
  1437. if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
  1438. decisionPointList.addElement(new Integer(state[0]));
  1439. pendingChars = "";
  1440. ++p;
  1441. }
  1442. // otherwise, fabricate a category list ("pendingChars") with
  1443. // every category in it
  1444. else {
  1445. StringBuffer temp = new StringBuffer();
  1446. for (int i = 0; i < numCategories; i++)
  1447. temp.append((char)(i + 0x100));
  1448. pendingChars = temp.toString();
  1449. }
  1450. }
  1451. // we'll end up in here for all expressions except for .*, which is
  1452. // special-cased above
  1453. if (pendingChars.length() != 0) {
  1454. // if the expression is followed by an asterisk, then push a copy
  1455. // of the current desicion point list onto the stack (this is
  1456. // the same thing we do on an opening brace)
  1457. if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') {
  1458. decisionPointStack.push(decisionPointList.clone());
  1459. }
  1460. // create a new state, add it to the list of states to backfill
  1461. // if we have looping states to worry about, set its "don't make
  1462. // me an accepting state" flag if we've seen a slash, and add
  1463. // it to the end of the state table
  1464. int newState = tempStateTable.size();
  1465. if (loopingStates.size() != 0) {
  1466. statesToBackfill.addElement(new Integer(newState));
  1467. }
  1468. state = new short[numCategories + 1];
  1469. if (sawEarlyBreak) {
  1470. state[numCategories] = DONT_LOOP_FLAG;
  1471. }
  1472. tempStateTable.addElement(state);
  1473. // update everybody in the decision point list to point to
  1474. // the new state (this also performs all the reconciliation
  1475. // needed to make the table deterministic), then clear the
  1476. // decision point list
  1477. updateStateTable(decisionPointList, pendingChars, (short)newState);
  1478. decisionPointList.removeAllElements();
  1479. // add all states created since the last literal character we've
  1480. // seen to the decision point list
  1481. lastState = currentState;
  1482. do {
  1483. ++currentState;
  1484. decisionPointList.addElement(new Integer(currentState));
  1485. } while (currentState + 1 < tempStateTable.size());
  1486. }
  1487. }
  1488. // a { marks the beginning of an optional run of characters. Push a
  1489. // copy of the current decision point list onto the stack. This saves
  1490. // it, preventing it from being affected by whatever's inside the parentheses.
  1491. // This decision point list is restored when a } is encountered.
  1492. else if (c == '{') {
  1493. decisionPointStack.push(decisionPointList.clone());
  1494. }
  1495. // a } marks the end of an optional run of characters. Pop the last decision
  1496. // point list off the stack and merge it with the current decision point list.
  1497. // a * denotes a repeating character or group (* after () is handled separately
  1498. // below). In addition to restoring the decision point list, modify the
  1499. // current state to point to itself on the appropriate character categories.
  1500. else if (c == '}' || c == '*') {
  1501. // when there's a *, update the current state to loop back on itself
  1502. // on the character categories that caused us to enter this state
  1503. if (c == '*') {
  1504. for (int i = lastState + 1; i < tempStateTable.size(); i++) {
  1505. Vector temp = new Vector();
  1506. temp.addElement(new Integer(i));
  1507. updateStateTable(temp, pendingChars, (short)(lastState + 1));
  1508. }
  1509. }
  1510. // pop the top element off the decision point stack and merge
  1511. // it with the current decision point list (this causes the divergent
  1512. // paths through the state table to come together again on the next
  1513. // new state)
  1514. Vector temp = (Vector)decisionPointStack.pop();
  1515. for (int i = 0; i < decisionPointList.size(); i++)
  1516. temp.addElement(decisionPointList.elementAt(i));
  1517. decisionPointList = temp;
  1518. }
  1519. // a ? after a * modifies the behavior of * in cases where there is overlap
  1520. // between the set of characters that repeat and the characters which follow.
  1521. // Without the ?, all states following the repeating state, up to a state which
  1522. // is reached by a character that doesn't overlap, will loop back into the
  1523. // repeating state. With the ?, the mark states following the *? DON'T loop
  1524. // back into the repeating state. Thus, "[a-z]*xyz" will match the longest
  1525. // sequence of letters that ends in "xyz," while "[a-z]*? will match the
  1526. // _shortest_ sequence of letters that ends in "xyz".
  1527. // We use extra bookkeeping to achieve this effect, since everything else works
  1528. // according to the "longest possible match" principle. The basic principle
  1529. // is that transitions out of a looping state are written in over the looping
  1530. // value instead of being reconciled, and that we copy the contents of the
  1531. // looping state into empty cells of all non-terminal states that follow the
  1532. // looping state.
  1533. else if (c == '?') {
  1534. setLoopingStates(decisionPointList, decisionPointList);
  1535. }
  1536. // a ( marks the beginning of a sequence of characters. Parentheses can either
  1537. // contain several alternative character sequences (i.e., "(ab|cd|ef)"), or
  1538. // they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus,
  1539. // A () group can have multiple entry and exit points. To keep track of this,
  1540. // we reserve TWO spots on the decision-point stack. The top of the stack is
  1541. // the list of exit points, which becomes the current decision point list when
  1542. // the ) is reached. The next entry down is the decision point list at the
  1543. // beginning of the (), which becomes the current decision point list at every
  1544. // entry point.
  1545. // In addition to keeping track of the exit points and the active decision
  1546. // points before the ( (i.e., the places from which the () can be entered),
  1547. // we need to keep track of the entry points in case the expression loops
  1548. // (i.e., is followed by *). We do that by creating a dummy state in the
  1549. // state table and adding it to the decision point list (BEFORE it's duplicated
  1550. // on the stack). Nobody points to this state, so it'll get optimized out
  1551. // at the end. It exists only to hold the entry points in case the ()
  1552. // expression loops.
  1553. else if (c == '(') {
  1554. // add a new state to the state table to hold the entry points into
  1555. // the () expression
  1556. tempStateTable.addElement(new short[numCategories + 1]);
  1557. // we have to adjust lastState and currentState to account for the
  1558. // new dummy state
  1559. lastState = currentState;
  1560. ++currentState;
  1561. // add the current state to the decision point list (add it at the
  1562. // BEGINNING so we can find it later)
  1563. decisionPointList.insertElementAt(new Integer(currentState), 0);
  1564. // finally, push a copy of the current decision point list onto the
  1565. // stack (this keeps track of the active decision point list before
  1566. // the () expression), followed by an empty decision point list
  1567. // (this will hold the exit points)
  1568. decisionPointStack.push(decisionPointList.clone());
  1569. decisionPointStack.push(new Vector());
  1570. }
  1571. // a | separates alternative character sequences in a () expression. When
  1572. // a | is encountered, we add the current decision point list to the exit-point
  1573. // list, and restore the decision point list to its state prior to the (.
  1574. else if (c == '|') {
  1575. // pick out the top two decision point lists on the stack
  1576. Vector oneDown = (Vector)decisionPointStack.pop();
  1577. Vector twoDown = (Vector)decisionPointStack.peek();
  1578. decisionPointStack.push(oneDown);
  1579. // append the current decision point list to the list below it
  1580. // on the stack (the list of exit points), and restore the
  1581. // current decision point list to its state before the () expression
  1582. for (int i = 0; i < decisionPointList.size(); i++)
  1583. oneDown.addElement(decisionPointList.elementAt(i));
  1584. decisionPointList = (Vector)twoDown.clone();
  1585. }
  1586. // a ) marks the end of a sequence of characters. We do one of two things
  1587. // depending on whether the sequence repeats (i.e., whether the ) is followed
  1588. // by *): If the sequence doesn't repeat, then the exit-point list is merged
  1589. // with the current decision point list and the decision point list from before
  1590. // the () is thrown away. If the sequence does repeat, then we fish out the
  1591. // state we were in before the ( and copy all of its forward transitions
  1592. // (i.e., every transition added by the () expression) into every state in the
  1593. // exit-point list and the current decision point list. The current decision
  1594. // point list is then merged with both the exit-point list AND the saved version
  1595. // of the decision point list from before the (). Then we throw out the *.
  1596. else if (c == ')') {
  1597. // pull the exit point list off the stack, merge it with the current
  1598. // decision point list, and make the merged version the current
  1599. // decision point list
  1600. Vector exitPoints = (Vector)decisionPointStack.pop();
  1601. for (int i = 0; i < decisionPointList.size(); i++)
  1602. exitPoints.addElement(decisionPointList.elementAt(i));
  1603. decisionPointList = exitPoints;
  1604. // if the ) isn't followed by a *, then all we have to do is throw
  1605. // away the other list on the decision point stack, and we're done
  1606. if (p + 1 >= rule.length() || rule.charAt(p + 1) != '*') {
  1607. decisionPointStack.pop();
  1608. }
  1609. // but if the sequence repeats, we have a lot more work to do...
  1610. else {
  1611. // now exitPoints and decisionPointList have to point to equivalent
  1612. // vectors, but not the SAME vector
  1613. exitPoints = (Vector)decisionPointList.clone();
  1614. // pop the original decision point list off the stack
  1615. Vector temp = (Vector)decisionPointStack.pop();
  1616. // we squirreled away the row number of our entry point list
  1617. // at the beginning of the original decision point list. Fish
  1618. // that state number out and retrieve the entry point list
  1619. int tempStateNum = ((Integer)temp.firstElement()).intValue();
  1620. short[] tempState = (short[])tempStateTable.elementAt(tempStateNum);
  1621. // merge the original decision point list with the current
  1622. // decision point list
  1623. for (int i = 0; i < decisionPointList.size(); i++)
  1624. temp.addElement(decisionPointList.elementAt(i));
  1625. decisionPointList = temp;
  1626. // finally, copy every forward reference from the entry point
  1627. // list into every state in the new decision point list
  1628. for (int i = 0; i < tempState.length; i++) {
  1629. if (tempState[i] > tempStateNum) {
  1630. updateStateTable(exitPoints,
  1631. new Character((char)(i + 0x100)).toString(),
  1632. tempState[i]);
  1633. }
  1634. }
  1635. // update lastState and currentState, and throw away the *
  1636. lastState = currentState;
  1637. currentState = tempStateTable.size() - 1;
  1638. ++p;
  1639. }
  1640. }
  1641. // a / marks the position where the break is to go if the character sequence
  1642. // matches this rule. We update the flag word of every state on the decision
  1643. // point list to mark them as ending states, and take note of the fact that
  1644. // we've seen the slash
  1645. else if (c == '/') {
  1646. sawEarlyBreak = true;
  1647. for (int i = 0; i < decisionPointList.size(); i++) {
  1648. state = (short[])tempStateTable.elementAt(((Integer)decisionPointList.
  1649. elementAt(i)).intValue());
  1650. state[numCategories] |= LOOKAHEAD_STATE_FLAG;
  1651. }
  1652. }
  1653. // if we get here without executing any of the above clauses, we have a
  1654. // syntax error. However, for now we just ignore the offending character
  1655. // and move on
  1656. // clearLoopingStates is a signal back from updateStateTable() that we've
  1657. // transitioned to a state that won't loop back to the current looping
  1658. // state. (In other words, we've gotten to a point where we can no longer
  1659. // go back into a *? we saw earlier.) Clear out the list of looping states
  1660. // and backfill any states that need to be backfilled.
  1661. if (clearLoopingStates) {
  1662. setLoopingStates(null, decisionPointList);
  1663. }
  1664. // advance to the next character, now that we've processed the current
  1665. // character
  1666. ++p;
  1667. }
  1668. // this takes care of backfilling any states that still need to be backfilled
  1669. setLoopingStates(null, decisionPointList);
  1670. // when we reach the end of the string, we do a postprocessing step to mark the
  1671. // end states. The decision point list contains every state that can transition
  1672. // to the end state-- that is, every state that is the last state in a sequence
  1673. // that matches the rule. All of these states are considered "mark states"
  1674. // or "accepting states"-- that is, states that cause the position returned from
  1675. // next() to be updated. A mark state represents a possible break position.
  1676. // This allows us to look ahead and remember how far the rule matched
  1677. // before following the new branch (see next() for more information).
  1678. // The temporary state table has an extra "flag column" at the end where this
  1679. // information is stored. We mark the end states by setting a flag in their
  1680. // flag column.
  1681. // Now if we saw the / in the rule, then everything after it is lookahead
  1682. // material and the break really goes where the slash is. In this case,
  1683. // we mark these states as BOTH accepting states and lookahead states. This
  1684. // signals that these states cause the break position to be updated to the
  1685. // position of the slash rather than the current break position.
  1686. for (int i = 0; i < decisionPointList.size(); i++) {
  1687. int rowNum = ((Integer)decisionPointList.elementAt(i)).intValue();
  1688. state = (short[])tempStateTable.elementAt(rowNum);
  1689. state[numCategories] |= END_STATE_FLAG;
  1690. if (sawEarlyBreak) {
  1691. state[numCategories] |= LOOKAHEAD_STATE_FLAG;
  1692. }
  1693. }
  1694. }
  1695. /**
  1696. * Update entries in the state table, and merge states when necessary to keep
  1697. * the table deterministic.
  1698. * @param rows The list of rows that need updating (the decision point list)
  1699. * @param pendingChars A character category list, encoded in a String. This is the
  1700. * list of the columns that need updating.
  1701. * @param newValue Update the cells specfied above to contain this value
  1702. */
  1703. private void updateStateTable(Vector rows,
  1704. String pendingChars,
  1705. short newValue) {
  1706. // create a dummy state that has the specified row number (newValue) in
  1707. // the cells that need to be updated (those specified by pendingChars)
  1708. // and 0 in the other cells
  1709. short[] newValues = new short[numCategories + 1];
  1710. for (int i = 0; i < pendingChars.length(); i++)
  1711. newValues[(int)(pendingChars.charAt(i)) - 0x100] = newValue;
  1712. // go through the list of rows to update, and update them by calling
  1713. // mergeStates() to merge them the the dummy state we created
  1714. for (int i = 0; i < rows.size(); i++) {
  1715. mergeStates(((Integer)rows.elementAt(i)).intValue(), newValues, rows);
  1716. }
  1717. }
  1718. /**
  1719. * The real work of making the state table deterministic happens here. This function
  1720. * merges a state in the state table (specified by rowNum) with a state that is
  1721. * passed in (newValues). The basic process is to copy the nonzero cells in newStates
  1722. * into the state in the state table (we'll call that oldValues). If there's a
  1723. * collision (i.e., if the same cell has a nonzero value in both states, and it's
  1724. * not the SAME value), then we have to reconcile the collision. We do this by
  1725. * creating a new state, adding it to the end of the state table, and using this
  1726. * function recursively to merge the original two states into a single, combined
  1727. * state. This process may happen recursively (i.e., each successive level may
  1728. * involve collisions). To prevent infinite recursion, we keep a log of merge
  1729. * operations. Any time we're merging two states we've merged before, we can just
  1730. * supply the row number for the result of that merge operation rather than creating
  1731. * a new state just like it.
  1732. * @param rowNum The row number in the state table of the state to be updated
  1733. * @param newValues The state to merge it with.
  1734. * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
  1735. * (itself a copy of the decision point list from parseRule()). Newly-created
  1736. * states get added to the decision point list if their "parents" were on it.
  1737. */
  1738. private void mergeStates(int rowNum,
  1739. short[] newValues,
  1740. Vector rowsBeingUpdated) {
  1741. short[] oldValues = (short[])(tempStateTable.elementAt(rowNum));
  1742. boolean isLoopingState = loopingStates.contains(new Integer(rowNum));
  1743. // for each of the cells in the rows we're reconciling, do...
  1744. for (int i = 0; i < oldValues.length; i++) {
  1745. // if they contain the same value, we don't have to do anything
  1746. if (oldValues[i] == newValues[i]) {
  1747. continue;
  1748. }
  1749. // if oldValues is a looping state and the state the current cell points to
  1750. // is too, then we can just stomp over the current value of that cell (and
  1751. // set the clear-looping-states flag if necessary)
  1752. else if (isLoopingState && loopingStates.contains(new Integer(oldValues[i]))) {
  1753. if (newValues[i] != 0) {
  1754. if (oldValues[i] == 0) {
  1755. clearLoopingStates = true;
  1756. }
  1757. oldValues[i] = newValues[i];
  1758. }
  1759. }
  1760. // if the current cell in oldValues is 0, copy in the corresponding value
  1761. // from newValues
  1762. else if (oldValues[i] == 0) {
  1763. oldValues[i] = newValues[i];
  1764. }
  1765. // the last column of each row is the flag column. Take care to merge the
  1766. // flag words correctly
  1767. else if (i == numCategories) {
  1768. oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
  1769. }
  1770. // if both newValues and oldValues have a nonzero value in the current
  1771. // cell, and it isn't the same value both places...
  1772. else if (oldValues[i] != 0 && newValues[i] != 0) {
  1773. // look up this pair of cell values in the merge list. If it's
  1774. // found, update the cell in oldValues to point to the merged state
  1775. int combinedRowNum = searchMergeList(oldValues[i], newValues[i]);
  1776. if (combinedRowNum != 0) {
  1777. oldValues[i] = (short)combinedRowNum;
  1778. }
  1779. // otherwise, we have to reconcile them...
  1780. else {
  1781. // copy our row numbers into variables to make things easier
  1782. int oldRowNum = oldValues[i];
  1783. int newRowNum = newValues[i];
  1784. combinedRowNum = tempStateTable.size();
  1785. // add this pair of row numbers to the merge list (create it first
  1786. // if we haven't created the merge list yet)
  1787. if (mergeList == null) {
  1788. mergeList = new Vector();
  1789. }
  1790. mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
  1791. // create a new row to represent the merged state, and copy the
  1792. // contents of oldRow into it, then add it to the end of the
  1793. // state table and update the original row (oldValues) to point
  1794. // to the new, merged, state
  1795. short[] newRow = new short[numCategories + 1];
  1796. short[] oldRow = (short[])(tempStateTable.elementAt(oldRowNum));
  1797. System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
  1798. tempStateTable.addElement(newRow);
  1799. oldValues[i] = (short)combinedRowNum;
  1800. // if the decision point list contains either of the parent rows,
  1801. // update it to include the new row as well
  1802. if ((decisionPointList.contains(new Integer(oldRowNum))
  1803. || decisionPointList.contains(new Integer(newRowNum)))
  1804. && !decisionPointList.contains(new Integer(combinedRowNum))
  1805. ) {
  1806. decisionPointList.addElement(new Integer(combinedRowNum));
  1807. }
  1808. // do the same thing with the list of rows being updated
  1809. if ((rowsBeingUpdated.contains(new Integer(oldRowNum))
  1810. || rowsBeingUpdated.contains(new Integer(newRowNum)))
  1811. && !rowsBeingUpdated.contains(new Integer(combinedRowNum))
  1812. ) {
  1813. decisionPointList.addElement(new Integer(combinedRowNum));
  1814. }
  1815. // now (groan) do the same thing for all the entries on the
  1816. // decision point stack
  1817. for (int k = 0; k < decisionPointStack.size(); k++) {
  1818. Vector dpl = (Vector)decisionPointStack.elementAt(k);
  1819. if ((dpl.contains(new Integer(oldRowNum))
  1820. || dpl.contains(new Integer(newRowNum)))
  1821. && !dpl.contains(new Integer(combinedRowNum))
  1822. ) {
  1823. dpl.addElement(new Integer(combinedRowNum));
  1824. }
  1825. }
  1826. // FINALLY (puff puff puff), call mergeStates() recursively to copy
  1827. // the row referred to by newValues into the new row and resolve any
  1828. // conflicts that come up at that level
  1829. mergeStates(combinedRowNum, (short[])(tempStateTable.elementAt(
  1830. newValues[i])), rowsBeingUpdated);
  1831. }
  1832. }
  1833. }
  1834. return;
  1835. }
  1836. /**
  1837. * The merge list is a list of pairs of rows that have been merged somewhere in
  1838. * the process of building this state table, along with the row number of the
  1839. * row containing the merged state. This function looks up a pair of row numbers
  1840. * and returns the row number of the row they combine into. (It returns 0 if
  1841. * this pair of rows isn't in the merge list.)
  1842. */
  1843. private int searchMergeList(int a, int b) {
  1844. // if there is no merge list, there obviously isn't anything in it
  1845. if (mergeList == null) {
  1846. return 0;
  1847. }
  1848. // otherwise, for each element in the merge list...
  1849. else {
  1850. int[] entry;
  1851. for (int i = 0; i < mergeList.size(); i++) {
  1852. entry = (int[])(mergeList.elementAt(i));
  1853. // we have a hit if the two row numbers match the two row numbers
  1854. // in the beginning of the entry (the two that combine), in either
  1855. // order
  1856. if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) {
  1857. return entry[2];
  1858. }
  1859. // we also have a hit if one of the two row numbers matches the marged
  1860. // row number and the other one matches one of the original row numbers
  1861. if ((entry[2] == a && (entry[0] == b || entry[1] == b))) {
  1862. return entry[2];
  1863. }
  1864. if ((entry[2] == b && (entry[0] == a || entry[1] == a))) {
  1865. return entry[2];
  1866. }
  1867. }
  1868. return 0;
  1869. }
  1870. }
  1871. /**
  1872. * This function is used to update the list of current loooping states (i.e.,
  1873. * states that are controlled by a *? construct). It backfills values from
  1874. * the looping states into unpopulated cells of the states that are currently
  1875. * marked for backfilling, and then updates the list of looping states to be
  1876. * the new list
  1877. * @param newLoopingStates The list of new looping states
  1878. * @param endStates The list of states to treat as end states (states that
  1879. * can exit the loop).
  1880. */
  1881. private void setLoopingStates(Vector newLoopingStates, Vector endStates) {
  1882. // if the current list of looping states isn't empty, we have to backfill
  1883. // values from the looping states into the states that are waiting to be
  1884. // backfilled
  1885. if (!loopingStates.isEmpty()) {
  1886. int loopingState = ((Integer)loopingStates.lastElement()).intValue();
  1887. int rowNum;
  1888. // don't backfill into an end state OR any state reachable from an end state
  1889. // (since the search for reachable states is recursive, it's split out into
  1890. // a separate function, eliminateBackfillStates(), below)
  1891. for (int i = 0; i < endStates.size(); i++) {
  1892. eliminateBackfillStates(((Integer)endStates.elementAt(i)).intValue());
  1893. }
  1894. // we DON'T actually backfill the states that need to be backfilled here.
  1895. // Instead, we MARK them for backfilling. The reason for this is that if
  1896. // there are multiple rules in the state-table description, the looping
  1897. // states may have some of their values changed by a succeeding rule, and
  1898. // this wouldn't be reflected in the backfilled states. We mark a state
  1899. // for backfilling by putting the row number of the state to copy from
  1900. // into the flag cell at the end of the row
  1901. for (int i = 0; i < statesToBackfill.size(); i++) {
  1902. rowNum = ((Integer)statesToBackfill.elementAt(i)).intValue();
  1903. short[] state = (short[])tempStateTable.elementAt(rowNum);
  1904. state[numCategories] =
  1905. (short)((state[numCategories] & ALL_FLAGS) | loopingState);
  1906. }
  1907. statesToBackfill.removeAllElements();
  1908. loopingStates.removeAllElements();
  1909. }
  1910. if (newLoopingStates != null) {
  1911. loopingStates = (Vector)newLoopingStates.clone();
  1912. }
  1913. }
  1914. /**
  1915. * This removes "ending states" and states reachable from them from the
  1916. * list of states to backfill.
  1917. * @param The row number of the state to remove from the backfill list
  1918. */
  1919. private void eliminateBackfillStates(int baseState) {
  1920. // don't do anything unless this state is actually in the backfill list...
  1921. if (statesToBackfill.contains(new Integer(baseState))) {
  1922. // if it is, take it out
  1923. statesToBackfill.removeElement(new Integer(baseState));
  1924. // then go through and recursively call this function for every
  1925. // state that the base state points to
  1926. short[] state = (short[])tempStateTable.elementAt(baseState);
  1927. for (int i = 0; i < numCategories; i++) {
  1928. if (state[i] != 0) {
  1929. eliminateBackfillStates(state[i]);
  1930. }
  1931. }
  1932. }
  1933. }
  1934. /**
  1935. * This function completes the backfilling process by actually doing the
  1936. * backfilling on the states that are marked for it
  1937. */
  1938. private void backfillLoopingStates() {
  1939. short[] state;
  1940. short[] loopingState = null;
  1941. int loopingStateRowNum = 0;
  1942. int fromState;
  1943. // for each state in the state table...
  1944. for (int i = 0; i < tempStateTable.size(); i++) {
  1945. state = (short[])tempStateTable.elementAt(i);
  1946. // check the state's flag word to see if it's marked for backfilling
  1947. // (it's marked for backfilling if any bits other than the two high-order
  1948. // bits are set-- if they are, then the flag word, minus the two high bits,
  1949. // is the row number to copy from)
  1950. fromState = state[numCategories] & ~ALL_FLAGS;
  1951. if (fromState > 0) {
  1952. // load up the state to copy from (if we haven't already)
  1953. if (fromState != loopingStateRowNum) {
  1954. loopingStateRowNum = fromState;
  1955. loopingState = (short[])tempStateTable.elementAt(loopingStateRowNum);
  1956. }
  1957. // clear out the backfill part of the flag word
  1958. state[numCategories] &= ALL_FLAGS;
  1959. // then fill all zero cells in the current state with values
  1960. // from the corresponding cells of the fromState
  1961. for (int j = 0; j < state.length; j++) {
  1962. if (state[j] == 0) {
  1963. state[j] = loopingState[j];
  1964. }
  1965. else if (state[j] == DONT_LOOP_FLAG) {
  1966. state[j] = 0;
  1967. }
  1968. }
  1969. }
  1970. }
  1971. }
  1972. /**
  1973. * This function completes the state-table-building process by doing several
  1974. * postprocessing steps and copying everything into its final resting place
  1975. * in the iterator itself
  1976. * @param forward True if we're working on the forward state table
  1977. */
  1978. private void finishBuildingStateTable(boolean forward) {
  1979. // start by backfilling the looping states
  1980. backfillLoopingStates();
  1981. int[] rowNumMap = new int[tempStateTable.size()];
  1982. Stack rowsToFollow = new Stack();
  1983. rowsToFollow.push(new Integer(1));
  1984. rowNumMap[1] = 1;
  1985. // determine which states are no longer reachable from the start state
  1986. // (the reachable states will have their row numbers in the row number
  1987. // map, and the nonreachable states will have zero in the row number map)
  1988. while (rowsToFollow.size() != 0) {
  1989. int rowNum = ((Integer)rowsToFollow.pop()).intValue();
  1990. short[] row = (short[])(tempStateTable.elementAt(rowNum));
  1991. for (int i = 0; i < numCategories; i++) {
  1992. if (row[i] != 0) {
  1993. if (rowNumMap[row[i]] == 0) {
  1994. rowNumMap[row[i]] = row[i];
  1995. rowsToFollow.push(new Integer(row[i]));
  1996. }
  1997. }
  1998. }
  1999. }
  2000. boolean madeChange;
  2001. int newRowNum;
  2002. // algorithm for minimizing the number of states in the table adapted from
  2003. // Aho & Ullman, "Principles of Compiler Design"
  2004. // The basic idea here is to organize the states into classes. When we're done,
  2005. // all states in the same class can be considered identical and all but one eliminated.
  2006. // initially assign states to classes based on the number of populated cells they
  2007. // contain (the class number is the number of populated cells)
  2008. int[] stateClasses = new int[tempStateTable.size()];
  2009. int nextClass = numCategories + 1;
  2010. short[] state1, state2;
  2011. for (int i = 1; i < stateClasses.length; i++) {
  2012. if (rowNumMap[i] == 0) {
  2013. continue;
  2014. }
  2015. state1 = (short[])tempStateTable.elementAt(i);
  2016. for (int j = 0; j < numCategories; j++) {
  2017. if (state1[j] != 0) {
  2018. ++stateClasses[i];
  2019. }
  2020. }
  2021. if (stateClasses[i] == 0) {
  2022. stateClasses[i] = nextClass;
  2023. }
  2024. }
  2025. ++nextClass;
  2026. // then, for each class, elect the first member of that class as that class's
  2027. // "representative". For each member of the class, compare it to the "representative."
  2028. // If there's a column position where the state being tested transitions to a
  2029. // state in a DIFFERENT class from the class where the "representative" transitions,
  2030. // then move the state into a new class. Repeat this process until no new classes
  2031. // are created.
  2032. int currentClass;
  2033. int lastClass;
  2034. boolean split;
  2035. do {
  2036. currentClass = 1;
  2037. lastClass = nextClass;
  2038. while (currentClass < nextClass) {
  2039. split = false;
  2040. state1 = state2 = null;
  2041. for (int i = 0; i < stateClasses.length; i++) {
  2042. if (stateClasses[i] == currentClass) {
  2043. if (state1 == null) {
  2044. state1 = (short[])tempStateTable.elementAt(i);
  2045. }
  2046. else {
  2047. state2 = (short[])tempStateTable.elementAt(i);
  2048. for (int j = 0; j < state2.length; j++) {
  2049. if ((j == numCategories && state1[j] != state2[j] && forward)
  2050. || (j != numCategories && stateClasses[state1[j]]
  2051. != stateClasses[state2[j]])) {
  2052. stateClasses[i] = nextClass;
  2053. split = true;
  2054. break;
  2055. }
  2056. }
  2057. }
  2058. }
  2059. }
  2060. if (split) {
  2061. ++nextClass;
  2062. }
  2063. ++currentClass;
  2064. }
  2065. } while (lastClass != nextClass);
  2066. // at this point, all of the states in a class except the first one (the
  2067. //"representative") can be eliminated, so update the row-number map accordingly
  2068. int[] representatives = new int[nextClass];
  2069. for (int i = 1; i < stateClasses.length; i++)
  2070. if (representatives[stateClasses[i]] == 0) {
  2071. representatives[stateClasses[i]] = i;
  2072. }
  2073. else {
  2074. rowNumMap[i] = representatives[stateClasses[i]];
  2075. }
  2076. // renumber all remaining rows...
  2077. // first drop all that are either unreferenced or not a class representative
  2078. for (int i = 1; i < rowNumMap.length; i++) {
  2079. if (rowNumMap[i] != i) {
  2080. tempStateTable.setElementAt(null, i);
  2081. }
  2082. }
  2083. // then calculate everybody's new row number and update the row
  2084. // number map appropriately (the first pass updates the row numbers
  2085. // of all the class representatives [the rows we're keeping], and the
  2086. // second pass updates the cross references for all the rows that
  2087. // are being deleted)
  2088. newRowNum = 1;
  2089. for (int i = 1; i < rowNumMap.length; i++) {
  2090. if (tempStateTable.elementAt(i) != null) {
  2091. rowNumMap[i] = newRowNum++;
  2092. }
  2093. }
  2094. for (int i = 1; i < rowNumMap.length; i++) {
  2095. if (tempStateTable.elementAt(i) == null) {
  2096. rowNumMap[i] = rowNumMap[rowNumMap[i]];
  2097. }
  2098. }
  2099. // allocate the permanent state table, and copy the remaining rows into it
  2100. // (adjusting all the cell values, of course)
  2101. // this section does that for the forward state table
  2102. if (forward) {
  2103. endStates = new boolean[newRowNum];
  2104. lookaheadStates = new boolean[newRowNum];
  2105. stateTable = new short[newRowNum * numCategories];
  2106. int p = 0;
  2107. int p2 = 0;
  2108. for (int i = 0; i < tempStateTable.size(); i++) {
  2109. short[] row = (short[])(tempStateTable.elementAt(i));
  2110. if (row == null) {
  2111. continue;
  2112. }
  2113. for (int j = 0; j < numCategories; j++) {
  2114. stateTable[p] = (short)(rowNumMap[row[j]]);
  2115. ++p;
  2116. }
  2117. endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0);
  2118. lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0);
  2119. ++p2;
  2120. }
  2121. }
  2122. // and this section does it for the backward state table
  2123. else {
  2124. backwardsStateTable = new short[newRowNum * numCategories];
  2125. int p = 0;
  2126. for (int i = 0; i < tempStateTable.size(); i++) {
  2127. short[] row = (short[])(tempStateTable.elementAt(i));
  2128. if (row == null) {
  2129. continue;
  2130. }
  2131. for (int j = 0; j < numCategories; j++) {
  2132. backwardsStateTable[p] = (short)(rowNumMap[row[j]]);
  2133. ++p;
  2134. }
  2135. }
  2136. }
  2137. }
  2138. /**
  2139. * This function builds the backward state table from the forward state
  2140. * table and any additional rules (identified by the ! on the front)
  2141. * supplied in the description
  2142. */
  2143. private void buildBackwardsStateTable(Vector tempRuleList) {
  2144. // create the temporary state table and seed it with two rows (row 0
  2145. // isn't used for anything, and we have to create row 1 (the initial
  2146. // state) before we can do anything else
  2147. tempStateTable = new Vector();
  2148. tempStateTable.addElement(new short[numCategories + 1]);
  2149. tempStateTable.addElement(new short[numCategories + 1]);
  2150. // although the backwards state table is built automatically from the forward
  2151. // state table, there are some situations (the default sentence-break rules,
  2152. // for example) where this doesn't yield enough stop states, causing a dramatic
  2153. // drop in performance. To help with these cases, the user may supply
  2154. // supplemental rules that are added to the backward state table. These have
  2155. // the same syntax as the normal break rules, but begin with '!' to distinguish
  2156. // them from normal break rules
  2157. for (int i = 0; i < tempRuleList.size(); i++) {
  2158. String rule = (String)tempRuleList.elementAt(i);
  2159. if (rule.charAt(0) == '!') {
  2160. parseRule(rule.substring(1), false);
  2161. }
  2162. }
  2163. backfillLoopingStates();
  2164. // Backwards iteration is qualitatively different from forwards iteration.
  2165. // This is because backwards iteration has to be made to operate from no context
  2166. // at all-- the user should be able to ask BreakIterator for the break position
  2167. // immediately on either side of some arbitrary offset in the text. The
  2168. // forward iteration table doesn't let us do that-- it assumes complete
  2169. // information on the context, which means starting from the beginning of the
  2170. // document.
  2171. // The way we do backward and random-access iteration is to back up from the
  2172. // current (or user-specified) position until we see something we're sure is
  2173. // a break position (it may not be the last break position immediately
  2174. // preceding our starting point, however). Then we roll forward from there to
  2175. // locate the actual break position we're after.
  2176. // This means that the backwards state table doesn't have to identify every
  2177. // break position, allowing the building algorithm to be much simpler. Here,
  2178. // we use a "pairs" approach, scanning the forward-iteration state table for
  2179. // pairs of character categories we ALWAYS break between, and building a state
  2180. // table from that information. No context is required-- all this state table
  2181. // looks at is a pair of adjacent characters.
  2182. // It's possible that the user has supplied supplementary rules (see above).
  2183. // This has to be done first to keep parseRule() and friends from becoming
  2184. // EVEN MORE complicated. The automatically-generated states are appended
  2185. // onto the end of the state table, and then the two sets of rules are
  2186. // stitched together at the end. Take note of the row number of the
  2187. // first row of the auromatically-generated part.
  2188. int backTableOffset = tempStateTable.size();
  2189. if (backTableOffset > 2) {
  2190. ++backTableOffset;
  2191. }
  2192. // the automatically-generated part of the table models a two-dimensional
  2193. // array where the two dimensions represent the two characters we're currently
  2194. // looking at. To model this as a state table, we actually need one additional
  2195. // row to represent the initial state. It gets populated with the row numbers
  2196. // of the other rows (in order).
  2197. for (int i = 0; i < numCategories + 1; i++)
  2198. tempStateTable.addElement(new short[numCategories + 1]);
  2199. short[] state = (short[])tempStateTable.elementAt(backTableOffset - 1);
  2200. for (int i = 0; i < numCategories; i++)
  2201. state[i] = (short)(i + backTableOffset);
  2202. // scavenge the forward state table for pairs of character categories
  2203. // that always have a break between them. The algorithm is as follows:
  2204. // Look down each column in the state table. For each nonzero cell in
  2205. // that column, look up the row it points to. For each nonzero cell in
  2206. // that row, populate a cell in the backwards state table: the row number
  2207. // of that cell is the number of the column we were scanning (plus the
  2208. // offset that locates this sub-table), and the column number of that cell
  2209. // is the column number of the nonzero cell we just found. This cell is
  2210. // populated with its own column number (adjusted according to the actual
  2211. // location of the sub-table). This process will produce a state table
  2212. // whose behavior is the same as looking up successive pairs of characters
  2213. // in an array of Booleans to determine whether there is a break.
  2214. int numRows = stateTable.length / numCategories;
  2215. for (int column = 0; column < numCategories; column++) {
  2216. for (int row = 0; row < numRows; row++) {
  2217. int nextRow = lookupState(row, column);
  2218. if (nextRow != 0) {
  2219. for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) {
  2220. int cellValue = lookupState(nextRow, nextColumn);
  2221. if (cellValue != 0) {
  2222. state = (short[])tempStateTable.elementAt(nextColumn +
  2223. backTableOffset);
  2224. state[column] = (short)(column + backTableOffset);
  2225. }
  2226. }
  2227. }
  2228. }
  2229. }
  2230. // if the user specified some backward-iteration rules with the ! token,
  2231. // we have to merge the resulting state table with the auto-generated one
  2232. // above. First copy the populated cells from row 1 over the populated
  2233. // cells in the auto-generated table. Then copy values from row 1 of the
  2234. // auto-generated table into all of the the unpopulated cells of the
  2235. // rule-based table.
  2236. if (backTableOffset > 1) {
  2237. // for every row in the auto-generated sub-table, if a cell is
  2238. // populated that is also populated in row 1 of the rule-based
  2239. // sub-table, copy the value from row 1 over the value in the
  2240. // auto-generated sub-table
  2241. state = (short[])tempStateTable.elementAt(1);
  2242. for (int i = backTableOffset - 1; i < tempStateTable.size(); i++) {
  2243. short[] state2 = (short[])tempStateTable.elementAt(i);
  2244. for (int j = 0; j < numCategories; j++) {
  2245. if (state[j] != 0 && state2[j] != 0) {
  2246. state2[j] = state[j];
  2247. }
  2248. }
  2249. }
  2250. // now, for every row in the rule-based sub-table that is not
  2251. // an end state, fill in all unpopulated cells with the values
  2252. // of the corresponding cells in the first row of the auto-
  2253. // generated sub-table.
  2254. state = (short[])tempStateTable.elementAt(backTableOffset - 1);
  2255. for (int i = 1; i < backTableOffset - 1; i++) {
  2256. short[] state2 = (short[])tempStateTable.elementAt(i);
  2257. if ((state2[numCategories] & END_STATE_FLAG) == 0) {
  2258. for (int j = 0; j < numCategories; j++) {
  2259. if (state2[j] == 0) {
  2260. state2[j] = state[j];
  2261. }
  2262. }
  2263. }
  2264. }
  2265. }
  2266. // finally, clean everything up and copy it into the actual BreakIterator
  2267. // by calling finishBuildingStateTable()
  2268. finishBuildingStateTable(false);
  2269. }
  2270. /**
  2271. * Throws an IllegalArgumentException representing a syntax error in the rule
  2272. * description. The exception's message contains some debugging information.
  2273. * @param message A message describing the problem
  2274. * @param position The position in the description where the problem was
  2275. * discovered
  2276. * @param context The string containing the error
  2277. */
  2278. protected void error(String message, int position, String context) {
  2279. throw new IllegalArgumentException("Parse error at position (" + position + "): " + message + "\n" +
  2280. context.substring(0, position) + " -here- " + context.substring(position));
  2281. }
  2282. }
  2283. /*
  2284. * This class exists to work around a bug in incorrect implementations
  2285. * of CharacterIterator, which incorrectly handle setIndex(endIndex).
  2286. * This iterator relies only on base.setIndex(n) where n is less than
  2287. * endIndex.
  2288. *
  2289. * One caveat: if the base iterator's begin and end indices change
  2290. * the change will not be reflected by this wrapper. Does that matter?
  2291. */
  2292. private static final class SafeCharIterator implements CharacterIterator,
  2293. Cloneable {
  2294. private CharacterIterator base;
  2295. private int rangeStart;
  2296. private int rangeLimit;
  2297. private int currentIndex;
  2298. SafeCharIterator(CharacterIterator base) {
  2299. this.base = base;
  2300. this.rangeStart = base.getBeginIndex();
  2301. this.rangeLimit = base.getEndIndex();
  2302. this.currentIndex = base.getIndex();
  2303. }
  2304. public char first() {
  2305. return setIndex(rangeStart);
  2306. }
  2307. public char last() {
  2308. return setIndex(rangeLimit - 1);
  2309. }
  2310. public char current() {
  2311. if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
  2312. return DONE;
  2313. }
  2314. else {
  2315. return base.setIndex(currentIndex);
  2316. }
  2317. }
  2318. public char next() {
  2319. currentIndex++;
  2320. if (currentIndex >= rangeLimit) {
  2321. currentIndex = rangeLimit;
  2322. return DONE;
  2323. }
  2324. else {
  2325. return base.setIndex(currentIndex);
  2326. }
  2327. }
  2328. public char previous() {
  2329. currentIndex--;
  2330. if (currentIndex < rangeStart) {
  2331. currentIndex = rangeStart;
  2332. return DONE;
  2333. }
  2334. else {
  2335. return base.setIndex(currentIndex);
  2336. }
  2337. }
  2338. public char setIndex(int i) {
  2339. if (i < rangeStart || i > rangeLimit) {
  2340. throw new IllegalArgumentException("Invalid position");
  2341. }
  2342. currentIndex = i;
  2343. return current();
  2344. }
  2345. public int getBeginIndex() {
  2346. return rangeStart;
  2347. }
  2348. public int getEndIndex() {
  2349. return rangeLimit;
  2350. }
  2351. public int getIndex() {
  2352. return currentIndex;
  2353. }
  2354. public Object clone() {
  2355. SafeCharIterator copy = null;
  2356. try {
  2357. copy = (SafeCharIterator) super.clone();
  2358. }
  2359. catch(CloneNotSupportedException e) {
  2360. throw new Error("Clone not supported: " + e);
  2361. }
  2362. CharacterIterator copyOfBase = (CharacterIterator) base.clone();
  2363. copy.base = copyOfBase;
  2364. return copy;
  2365. }
  2366. }
  2367. }