1. /*
  2. * @(#)BreakDictionary.java 1.12 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)BreakDictionary.java 1.3 99/04/07
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  12. *
  13. * The original version of this source code and documentation
  14. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  15. * subsidiary of IBM. These materials are provided under terms
  16. * of a License Agreement between Taligent and Sun. This technology
  17. * is protected by multiple US and International patents.
  18. *
  19. * This notice and attribution to Taligent may not be removed.
  20. * Taligent is a registered trademark of Taligent, Inc.
  21. */
  22. package java.text;
  23. import java.io.*;
  24. import java.util.MissingResourceException;
  25. import sun.text.CompactByteArray;
  26. /**
  27. * This is the class that represents the list of known words used by
  28. * DictionaryBasedBreakIterator. The conceptual data structure used
  29. * here is a trie: there is a node hanging off the root node for every
  30. * letter that can start a word. Each of these nodes has a node hanging
  31. * off of it for every letter that can be the second letter of a word
  32. * if this node is the first letter, and so on. The trie is represented
  33. * as a two-dimensional array that can be treated as a table of state
  34. * transitions. Indexes are used to compress this array, taking
  35. * advantage of the fact that this array will always be very sparse.
  36. */
  37. class BreakDictionary {
  38. //=================================================================================
  39. // testing and debugging
  40. //=================================================================================
  41. public static void main(String args[])
  42. throws FileNotFoundException, UnsupportedEncodingException, IOException {
  43. String filename = args[0];
  44. BreakDictionary dictionary = new BreakDictionary(new FileInputStream(filename));
  45. String outputFile = "";
  46. if(args.length >= 2) {
  47. outputFile = args[1];
  48. }
  49. PrintWriter out = null;
  50. if (outputFile.length() != 0) {
  51. out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UnicodeLittle"));
  52. }
  53. dictionary.printWordList("", 0, out);
  54. }
  55. public void printWordList(String partialWord, int state, PrintWriter out)
  56. throws IOException {
  57. if (state == -1) {
  58. System.out.println(partialWord);
  59. if (out != null) {
  60. out.println(partialWord);
  61. }
  62. }
  63. else {
  64. for (int i = 0; i < numCols; i++) {
  65. if (at(state, i) != 0) {
  66. printWordList(partialWord + reverseColumnMap[i], at(state, i), out);
  67. }
  68. }
  69. }
  70. }
  71. /**
  72. * A map used to go from column numbers to characters. Used only
  73. * for debugging right now.
  74. */
  75. private char[] reverseColumnMap = null;
  76. //=================================================================================
  77. // data members
  78. //=================================================================================
  79. /**
  80. * The version of the dictionary that was read in.
  81. */
  82. private static int supportedVersion = 0;
  83. private int version;
  84. /**
  85. * Maps from characters to column numbers. The main use of this is to
  86. * avoid making room in the array for empty columns.
  87. */
  88. private CompactByteArray columnMap = null;
  89. /**
  90. * The number of actual columns in the table
  91. */
  92. private int numCols;
  93. /**
  94. * Columns are organized into groups of 32. This says how many
  95. * column groups. (We could calculate this, but we store the
  96. * value to avoid having to repeatedly calculate it.)
  97. */
  98. private int numColGroups;
  99. /**
  100. * The actual compressed state table. Each conceptual row represents
  101. * a state, and the cells in it contain the row numbers of the states
  102. * to transition to for each possible letter. 0 is used to indicate
  103. * an illegal combination of letters (i.e., the error state). The
  104. * table is compressed by eliminating all the unpopulated (i.e., zero)
  105. * cells. Multiple conceptual rows can then be doubled up in a single
  106. * physical row by sliding them up and possibly shifting them to one
  107. * side or the other so the populated cells don't collide. Indexes
  108. * are used to identify unpopulated cells and to locate populated cells.
  109. */
  110. private short[] table = null;
  111. /**
  112. * This index maps logical row numbers to physical row numbers
  113. */
  114. private short[] rowIndex = null;
  115. /**
  116. * A bitmap is used to tell which cells in the comceptual table are
  117. * populated. This array contains all the unique bit combinations
  118. * in that bitmap. If the table is more than 32 columns wide,
  119. * successive entries in this array are used for a single row.
  120. */
  121. private int[] rowIndexFlags = null;
  122. /**
  123. * This index maps from a logical row number into the bitmap table above.
  124. * (This keeps us from storing duplicate bitmap combinations.) Since there
  125. * are a lot of rows with only one populated cell, instead of wasting space
  126. * in the bitmap table, we just store a negative number in this index for
  127. * rows with one populated cell. The absolute value of that number is
  128. * the column number of the populated cell.
  129. */
  130. private short[] rowIndexFlagsIndex = null;
  131. /**
  132. * For each logical row, this index contains a constant that is added to
  133. * the logical column number to get the physical column number
  134. */
  135. private byte[] rowIndexShifts = null;
  136. //=================================================================================
  137. // deserialization
  138. //=================================================================================
  139. public BreakDictionary(InputStream dictionaryStream) throws IOException {
  140. readDictionaryFile(new DataInputStream(dictionaryStream));
  141. }
  142. public void readDictionaryFile(DataInputStream in) throws IOException {
  143. version = in.readInt();
  144. if (version != supportedVersion) {
  145. throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
  146. in.toString(), "");
  147. }
  148. int l;
  149. // read in the column map (this is serialized in its internal form:
  150. // an index array followed by a data array)
  151. l = in.readInt();
  152. short[] temp = new short[l];
  153. for (int i = 0; i < temp.length; i++)
  154. temp[i] = in.readShort();
  155. l = in.readInt();
  156. byte[] temp2 = new byte[l];
  157. for (int i = 0; i < temp2.length; i++)
  158. temp2[i] = in.readByte();
  159. columnMap = new CompactByteArray(temp, temp2);
  160. // read in numCols and numColGroups
  161. numCols = in.readInt();
  162. numColGroups = in.readInt();
  163. // read in the row-number index
  164. l = in.readInt();
  165. rowIndex = new short[l];
  166. for (int i = 0; i < rowIndex.length; i++)
  167. rowIndex[i] = in.readShort();
  168. // load in the populated-cells bitmap: index first, then bitmap list
  169. l = in.readInt();
  170. rowIndexFlagsIndex = new short[l];
  171. for (int i = 0; i < rowIndexFlagsIndex.length; i++)
  172. rowIndexFlagsIndex[i] = in.readShort();
  173. l = in.readInt();
  174. rowIndexFlags = new int[l];
  175. for (int i = 0; i < rowIndexFlags.length; i++)
  176. rowIndexFlags[i] = in.readInt();
  177. // load in the row-shift index
  178. l = in.readInt();
  179. rowIndexShifts = new byte[l];
  180. for (int i = 0; i < rowIndexShifts.length; i++)
  181. rowIndexShifts[i] = in.readByte();
  182. // finally, load in the actual state table
  183. l = in.readInt();
  184. table = new short[l];
  185. for (int i = 0; i < table.length; i++)
  186. table[i] = in.readShort();
  187. // this data structure is only necessary for testing and debugging purposes
  188. reverseColumnMap = new char[numCols];
  189. for (char c = 0; c < 0xffff; c++) {
  190. int col = columnMap.elementAt(c);
  191. if (col != 0) {
  192. reverseColumnMap[col] = c;
  193. }
  194. }
  195. // close the stream
  196. in.close();
  197. }
  198. //=================================================================================
  199. // access to the words
  200. //=================================================================================
  201. /**
  202. * Uses the column map to map the character to a column number, then
  203. * passes the row and column number to the other version of at()
  204. * @param row The current state
  205. * @param ch The character whose column we're interested in
  206. * @return The new state to transition to
  207. */
  208. public final short at(int row, char ch) {
  209. int col = columnMap.elementAt(ch);
  210. return at(row, col);
  211. }
  212. /**
  213. * Returns the value in the cell with the specified (logical) row and
  214. * column numbers. In DictionaryBasedBreakIterator, the row number is
  215. * a state number, the column number is an input, and the return value
  216. * is the row number of the new state to transition to. (0 is the
  217. * "error" state, and -1 is the "end of word" state in a dictionary)
  218. * @param row The row number of the current state
  219. * @param col The column number of the input character (0 means "not a
  220. * dictionary character")
  221. * @return The row number of the new state to transition to
  222. */
  223. public final short at(int row, int col) {
  224. if (cellIsPopulated(row, col)) {
  225. // we map from logical to physical row number by looking up the
  226. // mapping in rowIndex; we map from logical column number to
  227. // physical column number by looking up a shift value for this
  228. // logical row and offsetting the logical column number by
  229. // the shift amount. Then we can use internalAt() to actually
  230. // get the value out of the table.
  231. return internalAt(rowIndex[row], col + rowIndexShifts[row]);
  232. }
  233. else {
  234. return 0;
  235. }
  236. }
  237. /**
  238. * Given (logical) row and column numbers, returns true if the
  239. * cell in that position is populated
  240. */
  241. private final boolean cellIsPopulated(int row, int col) {
  242. // look up the entry in the bitmap index for the specified row.
  243. // If it's a negative number, it's the column number of the only
  244. // populated cell in the row
  245. if (rowIndexFlagsIndex[row] < 0) {
  246. return col == -rowIndexFlagsIndex[row];
  247. }
  248. // if it's a positive number, it's the offset of an entry in the bitmap
  249. // list. If the table is more than 32 columns wide, the bitmap is stored
  250. // successive entries in the bitmap list, so we have to divide the column
  251. // number by 32 and offset the number we got out of the index by the result.
  252. // Once we have the appropriate piece of the bitmap, test the appropriate
  253. // bit and return the result.
  254. else {
  255. int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
  256. return (flags & (1 << (col & 0x1f))) != 0;
  257. }
  258. }
  259. /**
  260. * Implementation of at() when we know the specified cell is populated.
  261. * @param row The PHYSICAL row number of the cell
  262. * @param col The PHYSICAL column number of the cell
  263. * @return The value stored in the cell
  264. */
  265. private final short internalAt(int row, int col) {
  266. // the table is a one-dimensional array, so this just does the math necessary
  267. // to treat it as a two-dimensional array (we don't just use a two-dimensional
  268. // array because two-dimensional arrays are inefficient in Java)
  269. return table[row * numCols + col];
  270. }
  271. }