1. /*
  2. * @(#)RBTableBuilder.java 1.9 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  9. * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
  10. *
  11. * The original version of this source code and documentation is copyrighted
  12. * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
  13. * materials are provided under terms of a License Agreement between Taligent
  14. * and Sun. This technology is protected by multiple US and International
  15. * patents. This notice and attribution to Taligent may not be removed.
  16. * Taligent is a registered trademark of Taligent, Inc.
  17. *
  18. */
  19. package java.text;
  20. import java.util.Vector;
  21. import sun.text.CompactIntArray;
  22. import sun.text.IntHashtable;
  23. import sun.text.Normalizer;
  24. import sun.text.ComposedCharIter;
  25. import sun.text.NormalizerUtilities;
  26. /**
  27. * This class contains all the code to parse a RuleBasedCollator pattern
  28. * and build a RBCollationTables object from it. A particular instance
  29. * of tis class exists only during the actual build process-- once an
  30. * RBCollationTables object has been built, the RBTableBuilder object
  31. * goes away. This object carries all of the state which is only needed
  32. * during the build process, plus a "shadow" copy of all of the state
  33. * that will go into the tables object itself. This object communicates
  34. * with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
  35. * this is an inner class of RBCollationTables and provides a separate
  36. * private API for communication with RBTableBuilder.
  37. * This class isn't just an inner class of RBCollationTables itself because
  38. * of its large size. For source-code readability, it seemed better for the
  39. * builder to have its own source file.
  40. */
  41. final class RBTableBuilder {
  42. public RBTableBuilder(RBCollationTables.BuildAPI tables) {
  43. this.tables = tables;
  44. }
  45. /**
  46. * Create a table-based collation object with the given rules.
  47. * This is the main function that actually builds the tables and
  48. * stores them back in the RBCollationTables object. It is called
  49. * ONLY by the RBCollationTables constructor.
  50. * @see java.util.RuleBasedCollator#RuleBasedCollator
  51. * @exception ParseException If the rules format is incorrect.
  52. */
  53. public void build(String pattern, int decmp) throws ParseException
  54. {
  55. boolean isSource = true;
  56. int i = 0;
  57. String expChars;
  58. String groupChars;
  59. if (pattern.length() == 0)
  60. throw new ParseException("Build rules empty.", 0);
  61. // This array maps Unicode characters to their collation ordering
  62. mapping = new CompactIntArray((int)RBCollationTables.UNMAPPED);
  63. // Normalize the build rules. Find occurances of all decomposed characters
  64. // and normalize the rules before feeding into the builder. By "normalize",
  65. // we mean that all precomposed Unicode characters must be converted into
  66. // a base character and one or more combining characters (such as accents).
  67. // When there are multiple combining characters attached to a base character,
  68. // the combining characters must be in their canonical order
  69. //
  70. Normalizer.Mode mode = NormalizerUtilities.toNormalizerMode(decmp);
  71. pattern = Normalizer.normalize(pattern, mode, 0, true);
  72. // Build the merged collation entries
  73. // Since rules can be specified in any order in the string
  74. // (e.g. "c , C < d , D < e , E .... C < CH")
  75. // this splits all of the rules in the string out into separate
  76. // objects and then sorts them. In the above example, it merges the
  77. // "C < CH" rule in just before the "C < D" rule.
  78. //
  79. mPattern = new MergeCollation(pattern);
  80. int order = 0;
  81. // Now walk though each entry and add it to my own tables
  82. for (i = 0; i < mPattern.getCount(); ++i)
  83. {
  84. PatternEntry entry = mPattern.getItemAt(i);
  85. if (entry != null) {
  86. groupChars = entry.getChars();
  87. if (groupChars.length() > 1) {
  88. switch(groupChars.charAt(groupChars.length()-1)) {
  89. case '@':
  90. frenchSec = true;
  91. groupChars = groupChars.substring(0, groupChars.length()-1);
  92. break;
  93. case '!':
  94. seAsianSwapping = true;
  95. groupChars = groupChars.substring(0, groupChars.length()-1);
  96. break;
  97. }
  98. }
  99. order = increment(entry.getStrength(), order);
  100. expChars = entry.getExtension();
  101. if (expChars.length() != 0) {
  102. addExpandOrder(groupChars, expChars, order);
  103. } else if (groupChars.length() > 1) {
  104. addContractOrder(groupChars, order);
  105. } else {
  106. char ch = groupChars.charAt(0);
  107. addOrder(ch, order);
  108. }
  109. }
  110. }
  111. addComposedChars();
  112. commit();
  113. mapping.compact();
  114. tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
  115. contractFlags, maxSecOrder, maxTerOrder);
  116. }
  117. /** Add expanding entries for pre-composed unicode characters so that this
  118. * collator can be used reasonably well with decomposition turned off.
  119. */
  120. private void addComposedChars() throws ParseException {
  121. StringBuffer buf = new StringBuffer(1);
  122. // Iterate through all of the pre-composed characters in Unicode
  123. ComposedCharIter iter = new ComposedCharIter(false, Normalizer.IGNORE_HANGUL);
  124. while (iter.hasNext()) {
  125. char c = iter.next();
  126. if (getCharOrder(c) == RBCollationTables.UNMAPPED) {
  127. //
  128. // We don't already have an ordering for this pre-composed character.
  129. //
  130. // First, see if the decomposed string is already in our
  131. // tables as a single contracting-string ordering.
  132. // If so, just map the precomposed character to that order.
  133. //
  134. // TODO: What we should really be doing here is trying to find the
  135. // longest initial substring of the decomposition that is present
  136. // in the tables as a contracting character sequence, and find its
  137. // ordering. Then do this recursively with the remaining chars
  138. // so that we build a list of orderings, and add that list to
  139. // the expansion table.
  140. // That would be more correct but also significantly slower, so
  141. // I'm not totally sure it's worth doing.
  142. //
  143. String s = iter.decomposition();
  144. int contractOrder = getContractOrder(s);
  145. if (contractOrder != RBCollationTables.UNMAPPED) {
  146. addOrder(c, contractOrder);
  147. } else {
  148. //
  149. // We don't have a contracting ordering for the entire string
  150. // that results from the decomposition, but if we have orders
  151. // for each individual character, we can add an expanding
  152. // table entry for the pre-composed character
  153. //
  154. boolean allThere = true;
  155. for (int i = 0; i < s.length(); i++) {
  156. if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {
  157. allThere = false;
  158. break;
  159. }
  160. }
  161. if (allThere) {
  162. buf.setLength(0);
  163. buf.append(c);
  164. addExpandOrder(buf.toString(), s, RBCollationTables.UNMAPPED);
  165. }
  166. }
  167. }
  168. }
  169. }
  170. /**
  171. * Look up for unmapped values in the expanded character table.
  172. *
  173. * When the expanding character tables are built by addExpandOrder,
  174. * it doesn't know what the final ordering of each character
  175. * in the expansion will be. Instead, it just puts the raw character
  176. * code into the table, adding CHARINDEX as a flag. Now that we've
  177. * finished building the mapping table, we can go back and look up
  178. * that character to see what its real collation order is and
  179. * stick that into the expansion table. That lets us avoid doing
  180. * a two-stage lookup later.
  181. */
  182. private final void commit()
  183. {
  184. if (expandTable != null) {
  185. for (int i = 0; i < expandTable.size(); i++) {
  186. int[] valueList = (int [])expandTable.elementAt(i);
  187. for (int j = 0; j < valueList.length; j++) {
  188. int order = valueList[j];
  189. if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {
  190. // found a expanding character that isn't filled in yet
  191. char ch = (char)(order - CHARINDEX);
  192. // Get the real values for the non-filled entry
  193. int realValue = getCharOrder(ch);
  194. if (realValue == RBCollationTables.UNMAPPED) {
  195. // The real value is still unmapped, maybe it's ignorable
  196. valueList[j] = IGNORABLEMASK & ch;
  197. } else {
  198. // just fill in the value
  199. valueList[j] = realValue;
  200. }
  201. }
  202. }
  203. }
  204. }
  205. }
  206. /**
  207. * Increment of the last order based on the comparison level.
  208. */
  209. private final int increment(int aStrength, int lastValue)
  210. {
  211. switch(aStrength)
  212. {
  213. case Collator.PRIMARY:
  214. // increment priamry order and mask off secondary and tertiary difference
  215. lastValue += PRIMARYORDERINCREMENT;
  216. lastValue &= RBCollationTables.PRIMARYORDERMASK;
  217. isOverIgnore = true;
  218. break;
  219. case Collator.SECONDARY:
  220. // increment secondary order and mask off tertiary difference
  221. lastValue += SECONDARYORDERINCREMENT;
  222. lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;
  223. // record max # of ignorable chars with secondary difference
  224. if (!isOverIgnore)
  225. maxSecOrder++;
  226. break;
  227. case Collator.TERTIARY:
  228. // increment tertiary order
  229. lastValue += TERTIARYORDERINCREMENT;
  230. // record max # of ignorable chars with tertiary difference
  231. if (!isOverIgnore)
  232. maxTerOrder++;
  233. break;
  234. }
  235. return lastValue;
  236. }
  237. /**
  238. * Adds a character and its designated order into the collation table.
  239. */
  240. private final void addOrder(char ch,
  241. int anOrder)
  242. {
  243. // See if the char already has an order in the mapping table
  244. int order = mapping.elementAt(ch);
  245. if (order >= RBCollationTables.CONTRACTCHARINDEX) {
  246. // There's already an entry for this character that points to a contracting
  247. // character table. Instead of adding the character directly to the mapping
  248. // table, we must add it to the contract table instead.
  249. key.setLength(0);
  250. key.append(ch);
  251. addContractOrder(key.toString(), anOrder);
  252. } else {
  253. // add the entry to the mapping table,
  254. // the same later entry replaces the previous one
  255. mapping.setElementAt(ch, anOrder);
  256. }
  257. }
  258. private final void addContractOrder(String groupChars, int anOrder) {
  259. addContractOrder(groupChars, anOrder, true);
  260. }
  261. /**
  262. * Adds the contracting string into the collation table.
  263. */
  264. private final void addContractOrder(String groupChars, int anOrder,
  265. boolean fwd)
  266. {
  267. if (contractTable == null) {
  268. contractTable = new Vector(INITIALTABLESIZE);
  269. }
  270. // See if the initial character of the string already has a contract table.
  271. int entry = mapping.elementAt(groupChars.charAt(0));
  272. Vector entryTable = getContractValues(entry - RBCollationTables.CONTRACTCHARINDEX);
  273. if (entryTable == null) {
  274. // We need to create a new table of contract entries for this base char
  275. int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
  276. entryTable = new Vector(INITIALTABLESIZE);
  277. contractTable.addElement(entryTable);
  278. // Add the initial character's current ordering first. then
  279. // update its mapping to point to this contract table
  280. entryTable.addElement(new EntryPair(groupChars.substring(0,1), entry));
  281. mapping.setElementAt(groupChars.charAt(0), tableIndex);
  282. }
  283. // Now add (or replace) this string in the table
  284. int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
  285. if (index != RBCollationTables.UNMAPPED) {
  286. EntryPair pair = (EntryPair) entryTable.elementAt(index);
  287. pair.value = anOrder;
  288. } else {
  289. EntryPair pair = (EntryPair)entryTable.lastElement();
  290. // NOTE: This little bit of logic is here to speed CollationElementIterator
  291. // .nextContractChar(). This code ensures that the longest sequence in
  292. // this list is always the _last_ one in the list. This keeps
  293. // nextContractChar() from having to search the entire list for the longest
  294. // sequence.
  295. if (groupChars.length() > pair.entryName.length()) {
  296. entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));
  297. } else {
  298. entryTable.insertElementAt(new EntryPair(groupChars, anOrder,
  299. fwd), entryTable.size() - 1);
  300. }
  301. }
  302. // If this was a forward mapping for a contracting string, also add a
  303. // reverse mapping for it, so that CollationElementIterator.previous
  304. // can work right
  305. if (fwd && groupChars.length() > 1) {
  306. addContractFlags(groupChars);
  307. addContractOrder(new StringBuffer(groupChars).reverse().toString(),
  308. anOrder, false);
  309. }
  310. }
  311. /**
  312. * If the given string has been specified as a contracting string
  313. * in this collation table, return its ordering.
  314. * Otherwise return UNMAPPED.
  315. */
  316. private int getContractOrder(String groupChars)
  317. {
  318. int result = RBCollationTables.UNMAPPED;
  319. if (contractTable != null) {
  320. Vector entryTable = getContractValues(groupChars.charAt(0));
  321. if (entryTable != null) {
  322. int index = RBCollationTables.getEntry(entryTable, groupChars, true);
  323. if (index != RBCollationTables.UNMAPPED) {
  324. EntryPair pair = (EntryPair) entryTable.elementAt(index);
  325. result = pair.value;
  326. }
  327. }
  328. }
  329. return result;
  330. }
  331. private final int getCharOrder(char ch) {
  332. int order = mapping.elementAt(ch);
  333. if (order >= RBCollationTables.CONTRACTCHARINDEX) {
  334. Vector groupList = getContractValues(order - RBCollationTables.CONTRACTCHARINDEX);
  335. EntryPair pair = (EntryPair)groupList.firstElement();
  336. order = pair.value;
  337. }
  338. return order;
  339. }
  340. /**
  341. * Get the entry of hash table of the contracting string in the collation
  342. * table.
  343. * @param ch the starting character of the contracting string
  344. */
  345. Vector getContractValues(char ch)
  346. {
  347. int index = mapping.elementAt(ch);
  348. return getContractValues(index - RBCollationTables.CONTRACTCHARINDEX);
  349. }
  350. Vector getContractValues(int index)
  351. {
  352. if (index >= 0)
  353. {
  354. return (Vector)contractTable.elementAt(index);
  355. }
  356. else // not found
  357. {
  358. return null;
  359. }
  360. }
  361. /**
  362. * Adds the expanding string into the collation table.
  363. */
  364. private final void addExpandOrder(String contractChars,
  365. String expandChars,
  366. int anOrder) throws ParseException
  367. {
  368. // Create an expansion table entry
  369. int tableIndex = addExpansion(anOrder, expandChars);
  370. // And add its index into the main mapping table
  371. if (contractChars.length() > 1) {
  372. addContractOrder(contractChars, tableIndex);
  373. } else {
  374. addOrder(contractChars.charAt(0), tableIndex);
  375. }
  376. }
  377. /**
  378. * Create a new entry in the expansion table that contains the orderings
  379. * for the given characers. If anOrder is valid, it is added to the
  380. * beginning of the expanded list of orders.
  381. */
  382. private int addExpansion(int anOrder, String expandChars) {
  383. if (expandTable == null) {
  384. expandTable = new Vector(INITIALTABLESIZE);
  385. }
  386. // If anOrder is valid, we want to add it at the beginning of the list
  387. int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;
  388. int[] valueList = new int[expandChars.length() + offset];
  389. if (offset == 1) {
  390. valueList[0] = anOrder;
  391. }
  392. for (int i = 0; i < expandChars.length(); i++) {
  393. char ch = expandChars.charAt(i);
  394. int mapValue = getCharOrder(ch);
  395. if (mapValue != RBCollationTables.UNMAPPED) {
  396. valueList[i+offset] = mapValue;
  397. } else {
  398. // can't find it in the table, will be filled in by commit().
  399. valueList[i+offset] = CHARINDEX + (int)ch;
  400. }
  401. }
  402. // Add the expanding char list into the expansion table.
  403. int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
  404. expandTable.addElement(valueList);
  405. return tableIndex;
  406. }
  407. private void addContractFlags(String chars) {
  408. char c;
  409. int len = chars.length();
  410. for (int i = 0; i < len; i++) {
  411. c = chars.charAt(i);
  412. contractFlags.put(c, 1);
  413. }
  414. }
  415. // ==============================================================
  416. // constants
  417. // ==============================================================
  418. final static int CHARINDEX = 0x70000000; // need look up in .commit()
  419. private final static int IGNORABLEMASK = 0x0000ffff;
  420. private final static int PRIMARYORDERINCREMENT = 0x00010000;
  421. private final static int SECONDARYORDERINCREMENT = 0x00000100;
  422. private final static int TERTIARYORDERINCREMENT = 0x00000001;
  423. private final static int INITIALTABLESIZE = 20;
  424. private final static int MAXKEYSIZE = 5;
  425. // ==============================================================
  426. // instance variables
  427. // ==============================================================
  428. // variables used by the build process
  429. private RBCollationTables.BuildAPI tables = null;
  430. private MergeCollation mPattern = null;
  431. private boolean isOverIgnore = false;
  432. private StringBuffer key = new StringBuffer(MAXKEYSIZE);
  433. private IntHashtable contractFlags = new IntHashtable(100);
  434. // "shadow" copies of the instance variables in RBCollationTables
  435. // (the values in these variables are copied back into RBCollationTables
  436. // at the end of the build process)
  437. private boolean frenchSec = false;
  438. private boolean seAsianSwapping = false;
  439. private CompactIntArray mapping = null;
  440. private Vector contractTable = null;
  441. private Vector expandTable = null;
  442. private short maxSecOrder = 0;
  443. private short maxTerOrder = 0;
  444. }