1. /*
  2. * @(#)DictionaryBasedBreakIterator.java 1.10 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)DictionaryBasedBreakIterator.java 1.3 99/05/03
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.util.Vector;
  24. import java.util.Stack;
  25. import java.util.Hashtable;
  26. import java.text.CharacterIterator;
  27. import java.io.InputStream;
  28. import java.io.IOException;
  29. /**
  30. * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
  31. * to further subdivide ranges of text beyond what is possible using just the
  32. * state-table-based algorithm. This is necessary, for example, to handle
  33. * word and line breaking in Thai, which doesn't use spaces between words. The
  34. * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
  35. * up text as far as possible, and then contiguous ranges of letters are
  36. * repeatedly compared against a list of known words (i.e., the dictionary)
  37. * to divide them up into words.
  38. *
  39. * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
  40. * but adds one more special substitution name: <dictionary>. This substitution
  41. * name is used to identify characters in words in the dictionary. The idea is that
  42. * if the iterator passes over a chunk of text that includes two or more characters
  43. * in a row that are included in <dictionary>, it goes back through that range and
  44. * derives additional break positions (if possible) using the dictionary.
  45. *
  46. * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
  47. * file. It follows a prescribed search path to locate the dictionary (right now,
  48. * it looks for it in /com/ibm/text/resources in each directory in the classpath,
  49. * and won't find it in JAR files, but this location is likely to change). The
  50. * dictionary file is in a serialized binary format. We have a very primitive (and
  51. * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
  52. * currently making it public. Contact us for help.
  53. */
  54. class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
  55. /**
  56. * a list of known words that is used to divide up contiguous ranges of letters,
  57. * stored in a compressed, indexed, format that offers fast access
  58. */
  59. private BreakDictionary dictionary;
  60. /**
  61. * a list of flags indicating which character categories are contained in
  62. * the dictionary file (this is used to determine which ranges of characters
  63. * to apply the dictionary to)
  64. */
  65. private boolean[] categoryFlags;
  66. /**
  67. * a temporary hiding place for the number of dictionary characters in the
  68. * last range passed over by next()
  69. */
  70. private int dictionaryCharCount;
  71. /**
  72. * when a range of characters is divided up using the dictionary, the break
  73. * positions that are discovered are stored here, preventing us from having
  74. * to use either the dictionary or the state table again until the iterator
  75. * leaves this range of text
  76. */
  77. private int[] cachedBreakPositions;
  78. /**
  79. * if cachedBreakPositions is not null, this indicates which item in the
  80. * cache the current iteration position refers to
  81. */
  82. private int positionInCache;
  83. /**
  84. * Constructs a DictionaryBasedBreakIterator.
  85. * @param description Same as the description parameter on RuleBasedBreakIterator,
  86. * except for the special meaning of "<dictionary>". This parameter is just
  87. * passed through to RuleBasedBreakIterator's constructor.
  88. * @param dictionaryFilename The filename of the dictionary file to use
  89. */
  90. public DictionaryBasedBreakIterator(String description,
  91. InputStream dictionaryStream) throws IOException {
  92. super(description);
  93. dictionary = new BreakDictionary(dictionaryStream);
  94. }
  95. /**
  96. * Returns a Builder that is customized to build a DictionaryBasedBreakIterator.
  97. * This is the same as RuleBasedBreakIterator.Builder, except for the extra code
  98. * to handle the <dictionary> tag.
  99. */
  100. protected RuleBasedBreakIterator.Builder makeBuilder() {
  101. return new Builder();
  102. }
  103. public void setText(CharacterIterator newText) {
  104. super.setText(newText);
  105. cachedBreakPositions = null;
  106. dictionaryCharCount = 0;
  107. positionInCache = 0;
  108. }
  109. /**
  110. * Sets the current iteration position to the beginning of the text.
  111. * (i.e., the CharacterIterator's starting offset).
  112. * @return The offset of the beginning of the text.
  113. */
  114. public int first() {
  115. cachedBreakPositions = null;
  116. dictionaryCharCount = 0;
  117. positionInCache = 0;
  118. return super.first();
  119. }
  120. /**
  121. * Sets the current iteration position to the end of the text.
  122. * (i.e., the CharacterIterator's ending offset).
  123. * @return The text's past-the-end offset.
  124. */
  125. public int last() {
  126. cachedBreakPositions = null;
  127. dictionaryCharCount = 0;
  128. positionInCache = 0;
  129. return super.last();
  130. }
  131. /**
  132. * Advances the iterator one step backwards.
  133. * @return The position of the last boundary position before the
  134. * current iteration position
  135. */
  136. public int previous() {
  137. CharacterIterator text = getText();
  138. // if we have cached break positions and we're still in the range
  139. // covered by them, just move one step backward in the cache
  140. if (cachedBreakPositions != null && positionInCache > 0) {
  141. --positionInCache;
  142. text.setIndex(cachedBreakPositions[positionInCache]);
  143. return cachedBreakPositions[positionInCache];
  144. }
  145. // otherwise, dump the cache and use the inherited previous() method to move
  146. // backward. This may fill up the cache with new break positions, in which
  147. // case we have to mark our position in the cache
  148. else {
  149. cachedBreakPositions = null;
  150. int result = super.previous();
  151. if (cachedBreakPositions != null)
  152. positionInCache = cachedBreakPositions.length - 2;
  153. return result;
  154. }
  155. }
  156. /**
  157. * Sets the current iteration position to the last boundary position
  158. * before the specified position.
  159. * @param offset The position to begin searching from
  160. * @return The position of the last boundary before "offset"
  161. */
  162. public int preceding(int offset) {
  163. CharacterIterator text = getText();
  164. checkOffset(offset, text);
  165. // if we have no cached break positions, or "offset" is outside the
  166. // range covered by the cache, we can just call the inherited routine
  167. // (which will eventually call other routines in this class that may
  168. // refresh the cache)
  169. if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
  170. offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
  171. cachedBreakPositions = null;
  172. return super.preceding(offset);
  173. }
  174. // on the other hand, if "offset" is within the range covered by the cache,
  175. // then all we have to do is search the cache for the last break position
  176. // before "offset"
  177. else {
  178. positionInCache = 0;
  179. while (positionInCache < cachedBreakPositions.length
  180. && offset > cachedBreakPositions[positionInCache])
  181. ++positionInCache;
  182. --positionInCache;
  183. text.setIndex(cachedBreakPositions[positionInCache]);
  184. return text.getIndex();
  185. }
  186. }
  187. /**
  188. * Sets the current iteration position to the first boundary position after
  189. * the specified position.
  190. * @param offset The position to begin searching forward from
  191. * @return The position of the first boundary after "offset"
  192. */
  193. public int following(int offset) {
  194. CharacterIterator text = getText();
  195. checkOffset(offset, text);
  196. // if we have no cached break positions, or if "offset" is outside the
  197. // range covered by the cache, then dump the cache and call our
  198. // inherited following() method. This will call other methods in this
  199. // class that may refresh the cache.
  200. if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
  201. offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
  202. cachedBreakPositions = null;
  203. return super.following(offset);
  204. }
  205. // on the other hand, if "offset" is within the range covered by the
  206. // cache, then just search the cache for the first break position
  207. // after "offset"
  208. else {
  209. positionInCache = 0;
  210. while (positionInCache < cachedBreakPositions.length
  211. && offset >= cachedBreakPositions[positionInCache])
  212. ++positionInCache;
  213. text.setIndex(cachedBreakPositions[positionInCache]);
  214. return text.getIndex();
  215. }
  216. }
  217. /**
  218. * This is the implementation function for next().
  219. */
  220. protected int handleNext() {
  221. CharacterIterator text = getText();
  222. // if there are no cached break positions, or if we've just moved
  223. // off the end of the range covered by the cache, we have to dump
  224. // and possibly regenerate the cache
  225. if (cachedBreakPositions == null ||
  226. positionInCache == cachedBreakPositions.length - 1) {
  227. // start by using the inherited handleNext() to find a tentative return
  228. // value. dictionaryCharCount tells us how many dictionary characters
  229. // we passed over on our way to the tentative return value
  230. int startPos = text.getIndex();
  231. dictionaryCharCount = 0;
  232. int result = super.handleNext();
  233. // if we passed over more than one dictionary character, then we use
  234. // divideUpDictionaryRange() to regenerate the cached break positions
  235. // for the new range
  236. if (dictionaryCharCount > 1 && result - startPos > 1) {
  237. divideUpDictionaryRange(startPos, result);
  238. }
  239. // otherwise, the value we got back from the inherited fuction
  240. // is our return value, and we can dump the cache
  241. else {
  242. cachedBreakPositions = null;
  243. return result;
  244. }
  245. }
  246. // if the cache of break positions has been regenerated (or existed all
  247. // along), then just advance to the next break position in the cache
  248. // and return it
  249. if (cachedBreakPositions != null) {
  250. ++positionInCache;
  251. text.setIndex(cachedBreakPositions[positionInCache]);
  252. return cachedBreakPositions[positionInCache];
  253. }
  254. return -9999; // SHOULD NEVER GET HERE!
  255. }
  256. /**
  257. * Looks up a character category for a character.
  258. */
  259. protected int lookupCategory(char c) {
  260. // this override of lookupCategory() exists only to keep track of whether we've
  261. // passed over any dictionary characters. It calls the inherited lookupCategory()
  262. // to do the real work, and then checks whether its return value is one of the
  263. // categories represented in the dictionary. If it is, bump the dictionary-
  264. // character count.
  265. int result = super.lookupCategory(c);
  266. if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
  267. ++dictionaryCharCount;
  268. }
  269. return result;
  270. }
  271. /**
  272. * This is the function that actually implements the dictionary-based
  273. * algorithm. Given the endpoints of a range of text, it uses the
  274. * dictionary to determine the positions of any boundaries in this
  275. * range. It stores all the boundary positions it discovers in
  276. * cachedBreakPositions so that we only have to do this work once
  277. * for each time we enter the range.
  278. */
  279. private void divideUpDictionaryRange(int startPos, int endPos) {
  280. CharacterIterator text = getText();
  281. // the range we're dividing may begin or end with non-dictionary characters
  282. // (i.e., for line breaking, we may have leading or trailing punctuation
  283. // that needs to be kept with the word). Seek from the beginning of the
  284. // range to the first dictionary character
  285. text.setIndex(startPos);
  286. char c = text.current();
  287. int category = lookupCategory(c);
  288. while (category == IGNORE || !categoryFlags[category]) {
  289. c = text.next();
  290. category = lookupCategory(c);
  291. }
  292. // initialize. We maintain two stacks: currentBreakPositions contains
  293. // the list of break positions that will be returned if we successfully
  294. // finish traversing the whole range now. possibleBreakPositions lists
  295. // all other possible word ends we've passed along the way. (Whenever
  296. // we reach an error [a sequence of characters that can't begin any word
  297. // in the dictionary], we back up, possibly delete some breaks from
  298. // currentBreakPositions, move a break from possibleBreakPositions
  299. // to currentBreakPositions, and start over from there. This process
  300. // continues in this way until we either successfully make it all the way
  301. // across the range, or exhaust all of our combinations of break
  302. // positions.)
  303. Stack currentBreakPositions = new Stack();
  304. Stack possibleBreakPositions = new Stack();
  305. Vector wrongBreakPositions = new Vector();
  306. // the dictionary is implemented as a trie, which is treated as a state
  307. // machine. -1 represents the end of a legal word. Every word in the
  308. // dictionary is represented by a path from the root node to -1. A path
  309. // that ends in state 0 is an illegal combination of characters.
  310. int state = 0;
  311. // these two variables are used for error handling. We keep track of the
  312. // farthest we've gotten through the range being divided, and the combination
  313. // of breaks that got us that far. If we use up all possible break
  314. // combinations, the text contains an error or a word that's not in the
  315. // dictionary. In this case, we "bless" the break positions that got us the
  316. // farthest as real break positions, and then start over from scratch with
  317. // the character where the error occurred.
  318. int farthestEndPoint = text.getIndex();
  319. Stack bestBreakPositions = null;
  320. // initialize (we always exit the loop with a break statement)
  321. c = text.current();
  322. while (true) {
  323. // if we can transition to state "-1" from our current state, we're
  324. // on the last character of a legal word. Push that position onto
  325. // the possible-break-positions stack
  326. if (dictionary.at(state, 0) == -1) {
  327. possibleBreakPositions.push(new Integer(text.getIndex()));
  328. }
  329. // look up the new state to transition to in the dictionary
  330. state = dictionary.at(state, c);
  331. // if the character we're sitting on causes us to transition to
  332. // the "end of word" state, then it was a non-dictionary character
  333. // and we've successfully traversed the whole range. Drop out
  334. // of the loop.
  335. if (state == -1) {
  336. currentBreakPositions.push(new Integer(text.getIndex()));
  337. break;
  338. }
  339. // if the character we're sitting on causes us to transition to
  340. // the error state, or if we've gone off the end of the range
  341. // without transitioning to the "end of word" state, we've hit
  342. // an error...
  343. else if (state == 0 || text.getIndex() >= endPos) {
  344. // if this is the farthest we've gotten, take note of it in
  345. // case there's an error in the text
  346. if (text.getIndex() > farthestEndPoint) {
  347. farthestEndPoint = text.getIndex();
  348. bestBreakPositions = (Stack)(currentBreakPositions.clone());
  349. }
  350. // wrongBreakPositions is a list of all break positions
  351. // we've tried starting that didn't allow us to traverse
  352. // all the way through the text. Every time we pop a
  353. //break position off of currentBreakPositions, we put it
  354. // into wrongBreakPositions to avoid trying it again later.
  355. // If we make it to this spot, we're either going to back
  356. // up to a break in possibleBreakPositions and try starting
  357. // over from there, or we've exhausted all possible break
  358. // positions and are going to do the fallback procedure.
  359. // This loop prevents us from messing with anything in
  360. // possibleBreakPositions that didn't work as a starting
  361. // point the last time we tried it (this is to prevent a bunch of
  362. // repetitive checks from slowing down some extreme cases)
  363. Integer newStartingSpot = null;
  364. while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
  365. possibleBreakPositions.peek())) {
  366. possibleBreakPositions.pop();
  367. }
  368. // if we've used up all possible break-position combinations, there's
  369. // an error or an unknown word in the text. In this case, we start
  370. // over, treating the farthest character we've reached as the beginning
  371. // of the range, and "blessing" the break positions that got us that
  372. // far as real break positions
  373. if (possibleBreakPositions.isEmpty()) {
  374. if (bestBreakPositions != null) {
  375. currentBreakPositions = bestBreakPositions;
  376. if (farthestEndPoint < endPos) {
  377. text.setIndex(farthestEndPoint + 1);
  378. }
  379. else {
  380. break;
  381. }
  382. }
  383. else {
  384. if ((currentBreakPositions.size() == 0 ||
  385. ((Integer)(currentBreakPositions.peek())).intValue() != text.getIndex())
  386. && text.getIndex() != startPos) {
  387. currentBreakPositions.push(new Integer(text.getIndex()));
  388. }
  389. text.next();
  390. currentBreakPositions.push(new Integer(text.getIndex()));
  391. }
  392. }
  393. // if we still have more break positions we can try, then promote the
  394. // last break in possibleBreakPositions into currentBreakPositions,
  395. // and get rid of all entries in currentBreakPositions that come after
  396. // it. Then back up to that position and start over from there (i.e.,
  397. // treat that position as the beginning of a new word)
  398. else {
  399. Integer temp = (Integer)possibleBreakPositions.pop();
  400. Object temp2 = null;
  401. while (!currentBreakPositions.isEmpty() && temp.intValue() <
  402. ((Integer)currentBreakPositions.peek()).intValue()) {
  403. temp2 = currentBreakPositions.pop();
  404. wrongBreakPositions.addElement(temp2);
  405. }
  406. currentBreakPositions.push(temp);
  407. text.setIndex(((Integer)currentBreakPositions.peek()).intValue());
  408. }
  409. // re-sync "c" for the next go-round, and drop out of the loop if
  410. // we've made it off the end of the range
  411. c = text.current();
  412. if (text.getIndex() >= endPos) {
  413. break;
  414. }
  415. }
  416. // if we didn't hit any exceptional conditions on this last iteration,
  417. // just advance to the next character and loop
  418. else {
  419. c = text.next();
  420. }
  421. }
  422. // dump the last break position in the list, and replace it with the actual
  423. // end of the range (which may be the same character, or may be further on
  424. // because the range actually ended with non-dictionary characters we want to
  425. // keep with the word)
  426. if (!currentBreakPositions.isEmpty()) {
  427. currentBreakPositions.pop();
  428. }
  429. currentBreakPositions.push(new Integer(endPos));
  430. // create a regular array to hold the break positions and copy
  431. // the break positions from the stack to the array (in addition,
  432. // our starting position goes into this array as a break position).
  433. // This array becomes the cache of break positions used by next()
  434. // and previous(), so this is where we actually refresh the cache.
  435. cachedBreakPositions = new int[currentBreakPositions.size() + 1];
  436. cachedBreakPositions[0] = startPos;
  437. for (int i = 0; i < currentBreakPositions.size(); i++) {
  438. cachedBreakPositions[i + 1] = ((Integer)currentBreakPositions.elementAt(i)).intValue();
  439. }
  440. positionInCache = 0;
  441. }
  442. /**
  443. * The Builder class for DictionaryBasedBreakIterator inherits almost all of
  444. * its functionality from the Builder class for RuleBasedBreakIterator, but
  445. * extends it with extra logic to handle the "<dictionary>" token
  446. */
  447. protected class Builder extends RuleBasedBreakIterator.Builder {
  448. /**
  449. * A CharSet that contains all the characters represented in the dictionary
  450. */
  451. private CharSet dictionaryChars = new CharSet();
  452. private String dictionaryExpression = "";
  453. /**
  454. * No special initialization
  455. */
  456. public Builder() {
  457. DictionaryBasedBreakIterator.this.super();
  458. }
  459. /**
  460. * We override handleSpecialSubstitution() to add logic to handle
  461. * the <dictionary> tag. If we see a substitution named "<dictionary>",
  462. * parse the substitution expression and store the result in
  463. * dictionaryChars.
  464. */
  465. protected void handleSpecialSubstitution(String replace, String replaceWith,
  466. int startPos, String description) {
  467. super.handleSpecialSubstitution(replace, replaceWith, startPos, description);
  468. if (replace.equals("<dictionary>")) {
  469. if (replaceWith.charAt(0) == '(') {
  470. error("Dictionary group can't be enclosed in (", startPos, description);
  471. }
  472. dictionaryExpression = replaceWith;
  473. dictionaryChars = CharSet.parseString(replaceWith);
  474. }
  475. }
  476. /**
  477. * The other half of the logic to handle the dictionary characters happens here.
  478. * After the inherited builder has derived the real character categories, we
  479. * set up the categoryFlags array in the iterator. This array contains "true"
  480. * for every character category that includes a dictionary character.
  481. */
  482. protected void buildCharCategories(Vector tempRuleList) {
  483. super.buildCharCategories(tempRuleList);
  484. categoryFlags = new boolean[categories.size()];
  485. for (int i = 0; i < categories.size(); i++) {
  486. CharSet cs = (CharSet)categories.elementAt(i);
  487. if (!(cs.intersection(dictionaryChars).empty())) {
  488. categoryFlags[i] = true;
  489. }
  490. }
  491. }
  492. // This function is actually called by
  493. // RuleBasedBreakIterator.buildCharCategories(), which is called
  494. // by the function above. This gives us a way to create a separate
  495. // character category for the dictionary characters even when
  496. // RuleBasedBreakIterator isn't making a distinction.
  497. protected void mungeExpressionList(Hashtable expressions) {
  498. expressions.put(dictionaryExpression, dictionaryChars);
  499. }
  500. }
  501. }