1. /*
  2. * @(#)DictionaryBasedBreakIterator.java 1.13 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)DictionaryBasedBreakIterator.java 1.3 99/05/03
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.util.Vector;
  24. import java.util.Stack;
  25. import java.util.Hashtable;
  26. import java.text.CharacterIterator;
  27. import java.io.InputStream;
  28. import java.io.IOException;
  29. /**
  30. * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
  31. * to further subdivide ranges of text beyond what is possible using just the
  32. * state-table-based algorithm. This is necessary, for example, to handle
  33. * word and line breaking in Thai, which doesn't use spaces between words. The
  34. * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
  35. * up text as far as possible, and then contiguous ranges of letters are
  36. * repeatedly compared against a list of known words (i.e., the dictionary)
  37. * to divide them up into words.
  38. *
  39. * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
  40. * but adds one more special substitution name: <dictionary>. This substitution
  41. * name is used to identify characters in words in the dictionary. The idea is that
  42. * if the iterator passes over a chunk of text that includes two or more characters
  43. * in a row that are included in <dictionary>, it goes back through that range and
  44. * derives additional break positions (if possible) using the dictionary.
  45. *
  46. * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
  47. * file. It follows a prescribed search path to locate the dictionary (right now,
  48. * it looks for it in /com/ibm/text/resources in each directory in the classpath,
  49. * and won't find it in JAR files, but this location is likely to change). The
  50. * dictionary file is in a serialized binary format. We have a very primitive (and
  51. * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
  52. * currently making it public. Contact us for help.
  53. */
  54. class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
  55. /**
  56. * a list of known words that is used to divide up contiguous ranges of letters,
  57. * stored in a compressed, indexed, format that offers fast access
  58. */
  59. private BreakDictionary dictionary;
  60. /**
  61. * a list of flags indicating which character categories are contained in
  62. * the dictionary file (this is used to determine which ranges of characters
  63. * to apply the dictionary to)
  64. */
  65. private boolean[] categoryFlags;
  66. /**
  67. * a temporary hiding place for the number of dictionary characters in the
  68. * last range passed over by next()
  69. */
  70. private int dictionaryCharCount;
  71. /**
  72. * when a range of characters is divided up using the dictionary, the break
  73. * positions that are discovered are stored here, preventing us from having
  74. * to use either the dictionary or the state table again until the iterator
  75. * leaves this range of text
  76. */
  77. private int[] cachedBreakPositions;
  78. /**
  79. * if cachedBreakPositions is not null, this indicates which item in the
  80. * cache the current iteration position refers to
  81. */
  82. private int positionInCache;
  83. /**
  84. * Constructs a DictionaryBasedBreakIterator.
  85. * @param description Same as the description parameter on RuleBasedBreakIterator,
  86. * except for the special meaning of "<dictionary>". This parameter is just
  87. * passed through to RuleBasedBreakIterator's constructor.
  88. * @param dictionaryFilename The filename of the dictionary file to use
  89. */
  90. public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
  91. throws IOException {
  92. super(dataFile);
  93. byte[] tmp = super.getAdditionalData();
  94. if (tmp != null) {
  95. prepareCategoryFlags(tmp);
  96. super.setAdditionalData(null);
  97. }
  98. dictionary = new BreakDictionary(dictionaryFile);
  99. }
  100. private void prepareCategoryFlags(byte[] data) {
  101. categoryFlags = new boolean[data.length];
  102. for (int i = 0; i < data.length; i++) {
  103. categoryFlags[i] = (data[i] == (byte)1) ? true : false;
  104. }
  105. }
  106. public void setText(CharacterIterator newText) {
  107. super.setText(newText);
  108. cachedBreakPositions = null;
  109. dictionaryCharCount = 0;
  110. positionInCache = 0;
  111. }
  112. /**
  113. * Sets the current iteration position to the beginning of the text.
  114. * (i.e., the CharacterIterator's starting offset).
  115. * @return The offset of the beginning of the text.
  116. */
  117. public int first() {
  118. cachedBreakPositions = null;
  119. dictionaryCharCount = 0;
  120. positionInCache = 0;
  121. return super.first();
  122. }
  123. /**
  124. * Sets the current iteration position to the end of the text.
  125. * (i.e., the CharacterIterator's ending offset).
  126. * @return The text's past-the-end offset.
  127. */
  128. public int last() {
  129. cachedBreakPositions = null;
  130. dictionaryCharCount = 0;
  131. positionInCache = 0;
  132. return super.last();
  133. }
  134. /**
  135. * Advances the iterator one step backwards.
  136. * @return The position of the last boundary position before the
  137. * current iteration position
  138. */
  139. public int previous() {
  140. CharacterIterator text = getText();
  141. // if we have cached break positions and we're still in the range
  142. // covered by them, just move one step backward in the cache
  143. if (cachedBreakPositions != null && positionInCache > 0) {
  144. --positionInCache;
  145. text.setIndex(cachedBreakPositions[positionInCache]);
  146. return cachedBreakPositions[positionInCache];
  147. }
  148. // otherwise, dump the cache and use the inherited previous() method to move
  149. // backward. This may fill up the cache with new break positions, in which
  150. // case we have to mark our position in the cache
  151. else {
  152. cachedBreakPositions = null;
  153. int result = super.previous();
  154. if (cachedBreakPositions != null) {
  155. positionInCache = cachedBreakPositions.length - 2;
  156. }
  157. return result;
  158. }
  159. }
  160. /**
  161. * Sets the current iteration position to the last boundary position
  162. * before the specified position.
  163. * @param offset The position to begin searching from
  164. * @return The position of the last boundary before "offset"
  165. */
  166. public int preceding(int offset) {
  167. CharacterIterator text = getText();
  168. checkOffset(offset, text);
  169. // if we have no cached break positions, or "offset" is outside the
  170. // range covered by the cache, we can just call the inherited routine
  171. // (which will eventually call other routines in this class that may
  172. // refresh the cache)
  173. if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
  174. offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
  175. cachedBreakPositions = null;
  176. return super.preceding(offset);
  177. }
  178. // on the other hand, if "offset" is within the range covered by the cache,
  179. // then all we have to do is search the cache for the last break position
  180. // before "offset"
  181. else {
  182. positionInCache = 0;
  183. while (positionInCache < cachedBreakPositions.length
  184. && offset > cachedBreakPositions[positionInCache]) {
  185. ++positionInCache;
  186. }
  187. --positionInCache;
  188. text.setIndex(cachedBreakPositions[positionInCache]);
  189. return text.getIndex();
  190. }
  191. }
  192. /**
  193. * Sets the current iteration position to the first boundary position after
  194. * the specified position.
  195. * @param offset The position to begin searching forward from
  196. * @return The position of the first boundary after "offset"
  197. */
  198. public int following(int offset) {
  199. CharacterIterator text = getText();
  200. checkOffset(offset, text);
  201. // if we have no cached break positions, or if "offset" is outside the
  202. // range covered by the cache, then dump the cache and call our
  203. // inherited following() method. This will call other methods in this
  204. // class that may refresh the cache.
  205. if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
  206. offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
  207. cachedBreakPositions = null;
  208. return super.following(offset);
  209. }
  210. // on the other hand, if "offset" is within the range covered by the
  211. // cache, then just search the cache for the first break position
  212. // after "offset"
  213. else {
  214. positionInCache = 0;
  215. while (positionInCache < cachedBreakPositions.length
  216. && offset >= cachedBreakPositions[positionInCache]) {
  217. ++positionInCache;
  218. }
  219. text.setIndex(cachedBreakPositions[positionInCache]);
  220. return text.getIndex();
  221. }
  222. }
  223. /**
  224. * This is the implementation function for next().
  225. */
  226. protected int handleNext() {
  227. CharacterIterator text = getText();
  228. // if there are no cached break positions, or if we've just moved
  229. // off the end of the range covered by the cache, we have to dump
  230. // and possibly regenerate the cache
  231. if (cachedBreakPositions == null ||
  232. positionInCache == cachedBreakPositions.length - 1) {
  233. // start by using the inherited handleNext() to find a tentative return
  234. // value. dictionaryCharCount tells us how many dictionary characters
  235. // we passed over on our way to the tentative return value
  236. int startPos = text.getIndex();
  237. dictionaryCharCount = 0;
  238. int result = super.handleNext();
  239. // if we passed over more than one dictionary character, then we use
  240. // divideUpDictionaryRange() to regenerate the cached break positions
  241. // for the new range
  242. if (dictionaryCharCount > 1 && result - startPos > 1) {
  243. divideUpDictionaryRange(startPos, result);
  244. }
  245. // otherwise, the value we got back from the inherited fuction
  246. // is our return value, and we can dump the cache
  247. else {
  248. cachedBreakPositions = null;
  249. return result;
  250. }
  251. }
  252. // if the cache of break positions has been regenerated (or existed all
  253. // along), then just advance to the next break position in the cache
  254. // and return it
  255. if (cachedBreakPositions != null) {
  256. ++positionInCache;
  257. text.setIndex(cachedBreakPositions[positionInCache]);
  258. return cachedBreakPositions[positionInCache];
  259. }
  260. return -9999; // SHOULD NEVER GET HERE!
  261. }
  262. /**
  263. * Looks up a character category for a character.
  264. */
  265. protected int lookupCategory(int c) {
  266. // this override of lookupCategory() exists only to keep track of whether we've
  267. // passed over any dictionary characters. It calls the inherited lookupCategory()
  268. // to do the real work, and then checks whether its return value is one of the
  269. // categories represented in the dictionary. If it is, bump the dictionary-
  270. // character count.
  271. int result = super.lookupCategory(c);
  272. if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
  273. ++dictionaryCharCount;
  274. }
  275. return result;
  276. }
  277. /**
  278. * This is the function that actually implements the dictionary-based
  279. * algorithm. Given the endpoints of a range of text, it uses the
  280. * dictionary to determine the positions of any boundaries in this
  281. * range. It stores all the boundary positions it discovers in
  282. * cachedBreakPositions so that we only have to do this work once
  283. * for each time we enter the range.
  284. */
  285. private void divideUpDictionaryRange(int startPos, int endPos) {
  286. CharacterIterator text = getText();
  287. // the range we're dividing may begin or end with non-dictionary characters
  288. // (i.e., for line breaking, we may have leading or trailing punctuation
  289. // that needs to be kept with the word). Seek from the beginning of the
  290. // range to the first dictionary character
  291. text.setIndex(startPos);
  292. int c = getCurrent();
  293. int category = lookupCategory(c);
  294. while (category == IGNORE || !categoryFlags[category]) {
  295. c = getNext();
  296. category = lookupCategory(c);
  297. }
  298. // initialize. We maintain two stacks: currentBreakPositions contains
  299. // the list of break positions that will be returned if we successfully
  300. // finish traversing the whole range now. possibleBreakPositions lists
  301. // all other possible word ends we've passed along the way. (Whenever
  302. // we reach an error [a sequence of characters that can't begin any word
  303. // in the dictionary], we back up, possibly delete some breaks from
  304. // currentBreakPositions, move a break from possibleBreakPositions
  305. // to currentBreakPositions, and start over from there. This process
  306. // continues in this way until we either successfully make it all the way
  307. // across the range, or exhaust all of our combinations of break
  308. // positions.)
  309. Stack currentBreakPositions = new Stack();
  310. Stack possibleBreakPositions = new Stack();
  311. Vector wrongBreakPositions = new Vector();
  312. // the dictionary is implemented as a trie, which is treated as a state
  313. // machine. -1 represents the end of a legal word. Every word in the
  314. // dictionary is represented by a path from the root node to -1. A path
  315. // that ends in state 0 is an illegal combination of characters.
  316. int state = 0;
  317. // these two variables are used for error handling. We keep track of the
  318. // farthest we've gotten through the range being divided, and the combination
  319. // of breaks that got us that far. If we use up all possible break
  320. // combinations, the text contains an error or a word that's not in the
  321. // dictionary. In this case, we "bless" the break positions that got us the
  322. // farthest as real break positions, and then start over from scratch with
  323. // the character where the error occurred.
  324. int farthestEndPoint = text.getIndex();
  325. Stack bestBreakPositions = null;
  326. // initialize (we always exit the loop with a break statement)
  327. c = getCurrent();
  328. while (true) {
  329. // if we can transition to state "-1" from our current state, we're
  330. // on the last character of a legal word. Push that position onto
  331. // the possible-break-positions stack
  332. if (dictionary.getNextState(state, 0) == -1) {
  333. possibleBreakPositions.push(new Integer(text.getIndex()));
  334. }
  335. // look up the new state to transition to in the dictionary
  336. state = dictionary.getNextStateFromCharacter(state, c);
  337. // if the character we're sitting on causes us to transition to
  338. // the "end of word" state, then it was a non-dictionary character
  339. // and we've successfully traversed the whole range. Drop out
  340. // of the loop.
  341. if (state == -1) {
  342. currentBreakPositions.push(new Integer(text.getIndex()));
  343. break;
  344. }
  345. // if the character we're sitting on causes us to transition to
  346. // the error state, or if we've gone off the end of the range
  347. // without transitioning to the "end of word" state, we've hit
  348. // an error...
  349. else if (state == 0 || text.getIndex() >= endPos) {
  350. // if this is the farthest we've gotten, take note of it in
  351. // case there's an error in the text
  352. if (text.getIndex() > farthestEndPoint) {
  353. farthestEndPoint = text.getIndex();
  354. bestBreakPositions = (Stack)(currentBreakPositions.clone());
  355. }
  356. // wrongBreakPositions is a list of all break positions
  357. // we've tried starting that didn't allow us to traverse
  358. // all the way through the text. Every time we pop a
  359. //break position off of currentBreakPositions, we put it
  360. // into wrongBreakPositions to avoid trying it again later.
  361. // If we make it to this spot, we're either going to back
  362. // up to a break in possibleBreakPositions and try starting
  363. // over from there, or we've exhausted all possible break
  364. // positions and are going to do the fallback procedure.
  365. // This loop prevents us from messing with anything in
  366. // possibleBreakPositions that didn't work as a starting
  367. // point the last time we tried it (this is to prevent a bunch of
  368. // repetitive checks from slowing down some extreme cases)
  369. Integer newStartingSpot = null;
  370. while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
  371. possibleBreakPositions.peek())) {
  372. possibleBreakPositions.pop();
  373. }
  374. // if we've used up all possible break-position combinations, there's
  375. // an error or an unknown word in the text. In this case, we start
  376. // over, treating the farthest character we've reached as the beginning
  377. // of the range, and "blessing" the break positions that got us that
  378. // far as real break positions
  379. if (possibleBreakPositions.isEmpty()) {
  380. if (bestBreakPositions != null) {
  381. currentBreakPositions = bestBreakPositions;
  382. if (farthestEndPoint < endPos) {
  383. text.setIndex(farthestEndPoint + 1);
  384. }
  385. else {
  386. break;
  387. }
  388. }
  389. else {
  390. if ((currentBreakPositions.size() == 0 ||
  391. ((Integer)(currentBreakPositions.peek())).intValue() != text.getIndex())
  392. && text.getIndex() != startPos) {
  393. currentBreakPositions.push(new Integer(text.getIndex()));
  394. }
  395. getNext();
  396. currentBreakPositions.push(new Integer(text.getIndex()));
  397. }
  398. }
  399. // if we still have more break positions we can try, then promote the
  400. // last break in possibleBreakPositions into currentBreakPositions,
  401. // and get rid of all entries in currentBreakPositions that come after
  402. // it. Then back up to that position and start over from there (i.e.,
  403. // treat that position as the beginning of a new word)
  404. else {
  405. Integer temp = (Integer)possibleBreakPositions.pop();
  406. Object temp2 = null;
  407. while (!currentBreakPositions.isEmpty() && temp.intValue() <
  408. ((Integer)currentBreakPositions.peek()).intValue()) {
  409. temp2 = currentBreakPositions.pop();
  410. wrongBreakPositions.addElement(temp2);
  411. }
  412. currentBreakPositions.push(temp);
  413. text.setIndex(((Integer)currentBreakPositions.peek()).intValue());
  414. }
  415. // re-sync "c" for the next go-round, and drop out of the loop if
  416. // we've made it off the end of the range
  417. c = getCurrent();
  418. if (text.getIndex() >= endPos) {
  419. break;
  420. }
  421. }
  422. // if we didn't hit any exceptional conditions on this last iteration,
  423. // just advance to the next character and loop
  424. else {
  425. c = getNext();
  426. }
  427. }
  428. // dump the last break position in the list, and replace it with the actual
  429. // end of the range (which may be the same character, or may be further on
  430. // because the range actually ended with non-dictionary characters we want to
  431. // keep with the word)
  432. if (!currentBreakPositions.isEmpty()) {
  433. currentBreakPositions.pop();
  434. }
  435. currentBreakPositions.push(new Integer(endPos));
  436. // create a regular array to hold the break positions and copy
  437. // the break positions from the stack to the array (in addition,
  438. // our starting position goes into this array as a break position).
  439. // This array becomes the cache of break positions used by next()
  440. // and previous(), so this is where we actually refresh the cache.
  441. cachedBreakPositions = new int[currentBreakPositions.size() + 1];
  442. cachedBreakPositions[0] = startPos;
  443. for (int i = 0; i < currentBreakPositions.size(); i++) {
  444. cachedBreakPositions[i + 1] = ((Integer)currentBreakPositions.elementAt(i)).intValue();
  445. }
  446. positionInCache = 0;
  447. }
  448. }