1. /*
  2. * @(#)BreakDictionary.java 1.14 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)BreakDictionary.java 1.3 99/04/07
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.io.*;
  24. import java.security.AccessController;
  25. import java.security.PrivilegedActionException;
  26. import java.security.PrivilegedExceptionAction;
  27. import java.util.MissingResourceException;
  28. import sun.text.CompactByteArray;
  29. import sun.text.SupplementaryCharacterData;
  30. /**
  31. * This is the class that represents the list of known words used by
  32. * DictionaryBasedBreakIterator. The conceptual data structure used
  33. * here is a trie: there is a node hanging off the root node for every
  34. * letter that can start a word. Each of these nodes has a node hanging
  35. * off of it for every letter that can be the second letter of a word
  36. * if this node is the first letter, and so on. The trie is represented
  37. * as a two-dimensional array that can be treated as a table of state
  38. * transitions. Indexes are used to compress this array, taking
  39. * advantage of the fact that this array will always be very sparse.
  40. */
  41. class BreakDictionary {
  42. //=========================================================================
  43. // data members
  44. //=========================================================================
  45. /**
  46. * The version of the dictionary that was read in.
  47. */
  48. private static int supportedVersion = 1;
  49. /**
  50. * Maps from characters to column numbers. The main use of this is to
  51. * avoid making room in the array for empty columns.
  52. */
  53. private CompactByteArray columnMap = null;
  54. private SupplementaryCharacterData supplementaryCharColumnMap = null;
  55. /**
  56. * The number of actual columns in the table
  57. */
  58. private int numCols;
  59. /**
  60. * Columns are organized into groups of 32. This says how many
  61. * column groups. (We could calculate this, but we store the
  62. * value to avoid having to repeatedly calculate it.)
  63. */
  64. private int numColGroups;
  65. /**
  66. * The actual compressed state table. Each conceptual row represents
  67. * a state, and the cells in it contain the row numbers of the states
  68. * to transition to for each possible letter. 0 is used to indicate
  69. * an illegal combination of letters (i.e., the error state). The
  70. * table is compressed by eliminating all the unpopulated (i.e., zero)
  71. * cells. Multiple conceptual rows can then be doubled up in a single
  72. * physical row by sliding them up and possibly shifting them to one
  73. * side or the other so the populated cells don't collide. Indexes
  74. * are used to identify unpopulated cells and to locate populated cells.
  75. */
  76. private short[] table = null;
  77. /**
  78. * This index maps logical row numbers to physical row numbers
  79. */
  80. private short[] rowIndex = null;
  81. /**
  82. * A bitmap is used to tell which cells in the comceptual table are
  83. * populated. This array contains all the unique bit combinations
  84. * in that bitmap. If the table is more than 32 columns wide,
  85. * successive entries in this array are used for a single row.
  86. */
  87. private int[] rowIndexFlags = null;
  88. /**
  89. * This index maps from a logical row number into the bitmap table above.
  90. * (This keeps us from storing duplicate bitmap combinations.) Since there
  91. * are a lot of rows with only one populated cell, instead of wasting space
  92. * in the bitmap table, we just store a negative number in this index for
  93. * rows with one populated cell. The absolute value of that number is
  94. * the column number of the populated cell.
  95. */
  96. private short[] rowIndexFlagsIndex = null;
  97. /**
  98. * For each logical row, this index contains a constant that is added to
  99. * the logical column number to get the physical column number
  100. */
  101. private byte[] rowIndexShifts = null;
  102. //=========================================================================
  103. // deserialization
  104. //=========================================================================
  105. public BreakDictionary(String dictionaryName)
  106. throws IOException, MissingResourceException {
  107. readDictionaryFile(dictionaryName);
  108. }
  109. private void readDictionaryFile(final String dictionaryName)
  110. throws IOException, MissingResourceException {
  111. BufferedInputStream in;
  112. try {
  113. in = (BufferedInputStream)AccessController.doPrivileged(
  114. new PrivilegedExceptionAction() {
  115. public Object run() throws Exception {
  116. return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + dictionaryName));
  117. }
  118. }
  119. );
  120. }
  121. catch (PrivilegedActionException e) {
  122. throw new InternalError(e.toString());
  123. }
  124. byte[] buf = new byte[8];
  125. if (in.read(buf) != 8) {
  126. throw new MissingResourceException("Wrong data length",
  127. dictionaryName, "");
  128. }
  129. // check vesion
  130. int version = BreakIterator.getInt(buf, 0);
  131. if (version != supportedVersion) {
  132. throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
  133. dictionaryName, "");
  134. }
  135. // get data size
  136. int len = BreakIterator.getInt(buf, 4);
  137. buf = new byte[len];
  138. if (in.read(buf) != len) {
  139. throw new MissingResourceException("Wrong data length",
  140. dictionaryName, "");
  141. }
  142. // close the stream
  143. in.close();
  144. int l;
  145. int offset = 0;
  146. // read in the column map for BMP characteres (this is serialized in
  147. // its internal form: an index array followed by a data array)
  148. l = BreakIterator.getInt(buf, offset);
  149. offset += 4;
  150. short[] temp = new short[l];
  151. for (int i = 0; i < l; i++, offset+=2) {
  152. temp[i] = BreakIterator.getShort(buf, offset);
  153. }
  154. l = BreakIterator.getInt(buf, offset);
  155. offset += 4;
  156. byte[] temp2 = new byte[l];
  157. for (int i = 0; i < l; i++, offset++) {
  158. temp2[i] = buf[offset];
  159. }
  160. columnMap = new CompactByteArray(temp, temp2);
  161. // read in numCols and numColGroups
  162. numCols = BreakIterator.getInt(buf, offset);
  163. offset += 4;
  164. numColGroups = BreakIterator.getInt(buf, offset);
  165. offset += 4;
  166. // read in the row-number index
  167. l = BreakIterator.getInt(buf, offset);
  168. offset += 4;
  169. rowIndex = new short[l];
  170. for (int i = 0; i < l; i++, offset+=2) {
  171. rowIndex[i] = BreakIterator.getShort(buf, offset);
  172. }
  173. // load in the populated-cells bitmap: index first, then bitmap list
  174. l = BreakIterator.getInt(buf, offset);
  175. offset += 4;
  176. rowIndexFlagsIndex = new short[l];
  177. for (int i = 0; i < l; i++, offset+=2) {
  178. rowIndexFlagsIndex[i] = BreakIterator.getShort(buf, offset);
  179. }
  180. l = BreakIterator.getInt(buf, offset);
  181. offset += 4;
  182. rowIndexFlags = new int[l];
  183. for (int i = 0; i < l; i++, offset+=4) {
  184. rowIndexFlags[i] = BreakIterator.getInt(buf, offset);
  185. }
  186. // load in the row-shift index
  187. l = BreakIterator.getInt(buf, offset);
  188. offset += 4;
  189. rowIndexShifts = new byte[l];
  190. for (int i = 0; i < l; i++, offset++) {
  191. rowIndexShifts[i] = buf[offset];
  192. }
  193. // load in the actual state table
  194. l = BreakIterator.getInt(buf, offset);
  195. offset += 4;
  196. table = new short[l];
  197. for (int i = 0; i < l; i++, offset+=2) {
  198. table[i] = BreakIterator.getShort(buf, offset);
  199. }
  200. // finally, prepare the column map for supplementary characters
  201. l = BreakIterator.getInt(buf, offset);
  202. offset += 4;
  203. int[] temp3 = new int[l];
  204. for (int i = 0; i < l; i++, offset+=4) {
  205. temp3[i] = BreakIterator.getInt(buf, offset);
  206. }
  207. supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
  208. }
  209. //=========================================================================
  210. // access to the words
  211. //=========================================================================
  212. /**
  213. * Uses the column map to map the character to a column number, then
  214. * passes the row and column number to getNextState()
  215. * @param row The current state
  216. * @param ch The character whose column we're interested in
  217. * @return The new state to transition to
  218. */
  219. public final short getNextStateFromCharacter(int row, int ch) {
  220. int col;
  221. if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
  222. col = columnMap.elementAt((char)ch);
  223. } else {
  224. col = supplementaryCharColumnMap.getValue(ch);
  225. }
  226. return getNextState(row, col);
  227. }
  228. /**
  229. * Returns the value in the cell with the specified (logical) row and
  230. * column numbers. In DictionaryBasedBreakIterator, the row number is
  231. * a state number, the column number is an input, and the return value
  232. * is the row number of the new state to transition to. (0 is the
  233. * "error" state, and -1 is the "end of word" state in a dictionary)
  234. * @param row The row number of the current state
  235. * @param col The column number of the input character (0 means "not a
  236. * dictionary character")
  237. * @return The row number of the new state to transition to
  238. */
  239. public final short getNextState(int row, int col) {
  240. if (cellIsPopulated(row, col)) {
  241. // we map from logical to physical row number by looking up the
  242. // mapping in rowIndex; we map from logical column number to
  243. // physical column number by looking up a shift value for this
  244. // logical row and offsetting the logical column number by
  245. // the shift amount. Then we can use internalAt() to actually
  246. // get the value out of the table.
  247. return internalAt(rowIndex[row], col + rowIndexShifts[row]);
  248. }
  249. else {
  250. return 0;
  251. }
  252. }
  253. /**
  254. * Given (logical) row and column numbers, returns true if the
  255. * cell in that position is populated
  256. */
  257. private final boolean cellIsPopulated(int row, int col) {
  258. // look up the entry in the bitmap index for the specified row.
  259. // If it's a negative number, it's the column number of the only
  260. // populated cell in the row
  261. if (rowIndexFlagsIndex[row] < 0) {
  262. return col == -rowIndexFlagsIndex[row];
  263. }
  264. // if it's a positive number, it's the offset of an entry in the bitmap
  265. // list. If the table is more than 32 columns wide, the bitmap is stored
  266. // successive entries in the bitmap list, so we have to divide the column
  267. // number by 32 and offset the number we got out of the index by the result.
  268. // Once we have the appropriate piece of the bitmap, test the appropriate
  269. // bit and return the result.
  270. else {
  271. int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
  272. return (flags & (1 << (col & 0x1f))) != 0;
  273. }
  274. }
  275. /**
  276. * Implementation of getNextState() when we know the specified cell is
  277. * populated.
  278. * @param row The PHYSICAL row number of the cell
  279. * @param col The PHYSICAL column number of the cell
  280. * @return The value stored in the cell
  281. */
  282. private final short internalAt(int row, int col) {
  283. // the table is a one-dimensional array, so this just does the math necessary
  284. // to treat it as a two-dimensional array (we don't just use a two-dimensional
  285. // array because two-dimensional arrays are inefficient in Java)
  286. return table[row * numCols + col];
  287. }
  288. }