1. /*
  2. * @(#)CollationElementIterator.java 1.30 00/01/19
  3. *
  4. * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. /*
  11. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  12. * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
  13. *
  14. * The original version of this source code and documentation is copyrighted
  15. * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
  16. * materials are provided under terms of a License Agreement between Taligent
  17. * and Sun. This technology is protected by multiple US and International
  18. * patents. This notice and attribution to Taligent may not be removed.
  19. * Taligent is a registered trademark of Taligent, Inc.
  20. *
  21. */
  22. package java.text;
  23. import java.lang.Character;
  24. import java.util.Vector;
  25. /**
  26. * The <code>CollationElementIterator</code> class is used as an iterator
  27. * to walk through each character of an international string. Use the iterator
  28. * to return the ordering priority of the positioned character. The ordering
  29. * priority of a character, which we refer to as a key, defines how a character
  30. * is collated in the given collation object.
  31. *
  32. * <p>
  33. * For example, consider the following in Spanish:
  34. * <blockquote>
  35. * <pre>
  36. * "ca" -> the first key is key('c') and second key is key('a').
  37. * "cha" -> the first key is key('ch') and second key is key('a').
  38. * </pre>
  39. * </blockquote>
  40. * And in German,
  41. * <blockquote>
  42. * <pre>
  43. * "\u00e4b"-> the first key is key('a'), the second key is key('e'), and
  44. * the third key is key('b').
  45. * </pre>
  46. * </blockquote>
  47. * The key of a character is an integer composed of primary order(short),
  48. * secondary order(byte), and tertiary order(byte). Java strictly defines
  49. * the size and signedness of its primitive data types. Therefore, the static
  50. * functions <code>primaryOrder</code>, <code>secondaryOrder</code>, and
  51. * <code>tertiaryOrder</code> return <code>int</code>, <code>short</code>,
  52. * and <code>short</code> respectively to ensure the correctness of the key
  53. * value.
  54. *
  55. * <p>
  56. * Example of the iterator usage,
  57. * <blockquote>
  58. * <pre>
  59. * // get the first key of the string
  60. * String str = "This is a test";
  61. * CollationElementIterator c =
  62. * new CollationElementIterator(str, 0, str.length(),
  63. * Collator.getInstance());
  64. * int primaryOrder = CollationElementIterator.primaryOrder(c->next());
  65. * </pre>
  66. * </blockquote>
  67. *
  68. * <p>
  69. * <code>CollationElementIterator.next</code> returns the collation order
  70. * of the next character. A collation order consists of primary order,
  71. * secondary order and tertiary order. The data type of the collation
  72. * order is <strong>int</strong>. The first 16 bits of a collation order
  73. * is its primary order; the next 8 bits is the secondary order and the
  74. * last 8 bits is the tertiary order.
  75. *
  76. * @see Collator
  77. * @see RuleBasedCollator
  78. * @version 1.24 07/27/98
  79. * @author Helena Shih, Laura Werner, Richard Gillam
  80. */
  81. public final class CollationElementIterator
  82. {
  83. /**
  84. * Null order which indicates the end of string is reached by the
  85. * cursor.
  86. */
  87. public final static int NULLORDER = 0xffffffff;
  88. /**
  89. * CollationElementIterator constructor. This takes the source string and
  90. * the collation object. The cursor will walk thru the source string based
  91. * on the predefined collation rules. If the source string is empty,
  92. * NULLORDER will be returned on the calls to next().
  93. * @param sourceText the source string.
  94. * @param order the collation object.
  95. */
  96. CollationElementIterator(String sourceText, RuleBasedCollator owner) {
  97. this.owner = owner;
  98. ordering = owner.getTables();
  99. if ( sourceText.length() != 0 ) {
  100. text = new Normalizer(sourceText, owner.getDecomposition());
  101. }
  102. }
  103. /**
  104. * CollationElementIterator constructor. This takes the source string and
  105. * the collation object. The cursor will walk thru the source string based
  106. * on the predefined collation rules. If the source string is empty,
  107. * NULLORDER will be returned on the calls to next().
  108. * @param sourceText the source string.
  109. * @param order the collation object.
  110. */
  111. CollationElementIterator(CharacterIterator sourceText, RuleBasedCollator owner) {
  112. this.owner = owner;
  113. ordering = owner.getTables();
  114. text = new Normalizer(sourceText, owner.getDecomposition());
  115. }
  116. /**
  117. * Resets the cursor to the beginning of the string. The next call
  118. * to next() will return the first collation element in the string.
  119. */
  120. public void reset()
  121. {
  122. if (text != null) {
  123. text.reset();
  124. text.setDecomposition(owner.getDecomposition());
  125. }
  126. buffer = null;
  127. expIndex = 0;
  128. swapOrder = 0;
  129. }
  130. /**
  131. * Get the next collation element in the string. <p>This iterator iterates
  132. * over a sequence of collation elements that were built from the string.
  133. * Because there isn't necessarily a one-to-one mapping from characters to
  134. * collation elements, this doesn't mean the same thing as "return the
  135. * collation element [or ordering priority] of the next character in the
  136. * string".</p>
  137. * <p>This function returns the collation element that the iterator is currently
  138. * pointing to and then updates the internal pointer to point to the next element.
  139. * previous() updates the pointer first and then returns the element. This
  140. * means that when you change direction while iterating (i.e., call next() and
  141. * then call previous(), or call previous() and then call next()), you'll get
  142. * back the same element twice.</p>
  143. */
  144. public int next()
  145. {
  146. if (text == null) {
  147. return NULLORDER;
  148. } else if (text.getDecomposition() != owner.getDecomposition()) {
  149. text.setDecomposition(owner.getDecomposition());
  150. }
  151. if (buffer != null) {
  152. if (expIndex < buffer.length) {
  153. return strengthOrder(buffer[expIndex++]);
  154. } else {
  155. buffer = null;
  156. expIndex = 0;
  157. }
  158. } else if (swapOrder != 0) {
  159. int order = swapOrder << 16;
  160. swapOrder = 0;
  161. return order;
  162. }
  163. char ch = text.next();
  164. if (ch == Normalizer.DONE) {
  165. return NULLORDER;
  166. }
  167. int value = ordering.getUnicodeOrder(ch);
  168. if (value == RuleBasedCollator.UNMAPPED) {
  169. swapOrder = ch;
  170. return UNMAPPEDCHARVALUE;
  171. }
  172. else if (value >= RuleBasedCollator.CONTRACTCHARINDEX) {
  173. value = nextContractChar(ch);
  174. }
  175. if (value >= RuleBasedCollator.EXPANDCHARINDEX) {
  176. buffer = ordering.getExpandValueList(value);
  177. expIndex = 0;
  178. value = buffer[expIndex++];
  179. }
  180. return strengthOrder(value);
  181. }
  182. /**
  183. * Get the previous collation element in the string. <p>This iterator iterates
  184. * over a sequence of collation elements that were built from the string.
  185. * Because there isn't necessarily a one-to-one mapping from characters to
  186. * collation elements, this doesn't mean the same thing as "return the
  187. * collation element [or ordering priority] of the previous character in the
  188. * string".</p>
  189. * <p>This function updates the iterator's internal pointer to point to the
  190. * collation element preceding the one it's currently pointing to and then
  191. * returns that element, while next() returns the current element and then
  192. * updates the pointer. This means that when you change direction while
  193. * iterating (i.e., call next() and then call previous(), or call previous()
  194. * and then call next()), you'll get back the same element twice.</p>
  195. */
  196. public int previous()
  197. {
  198. if (text == null) {
  199. return NULLORDER;
  200. } else if (text.getDecomposition() != owner.getDecomposition()) {
  201. text.setDecomposition(owner.getDecomposition());
  202. }
  203. if (buffer != null) {
  204. if (expIndex > 0) {
  205. return strengthOrder(buffer[--expIndex]);
  206. } else {
  207. buffer = null;
  208. expIndex = 0;
  209. }
  210. } else if (swapOrder != 0) {
  211. int order = swapOrder << 16;
  212. swapOrder = 0;
  213. return order;
  214. }
  215. char ch = text.previous();
  216. if (ch == Normalizer.DONE) {
  217. return NULLORDER;
  218. }
  219. int value = ordering.getUnicodeOrder(ch);
  220. if (value == RuleBasedCollator.UNMAPPED) {
  221. swapOrder = UNMAPPEDCHARVALUE;
  222. return ch;
  223. } else if (value >= RuleBasedCollator.CONTRACTCHARINDEX) {
  224. value = prevContractChar(ch);
  225. }
  226. if (value >= RuleBasedCollator.EXPANDCHARINDEX) {
  227. buffer = ordering.getExpandValueList(value);
  228. expIndex = buffer.length;
  229. value = buffer[--expIndex];
  230. }
  231. return strengthOrder(value);
  232. }
  233. /**
  234. * Return the primary component of a collation element.
  235. * @param order the collation element
  236. * @return the element's primary component
  237. */
  238. public final static int primaryOrder(int order)
  239. {
  240. order &= RBCollationTables.PRIMARYORDERMASK;
  241. return (order >>> RBCollationTables.PRIMARYORDERSHIFT);
  242. }
  243. /**
  244. * Return the secondary component of a collation element.
  245. * @param order the collation element
  246. * @return the element's secondary component
  247. */
  248. public final static short secondaryOrder(int order)
  249. {
  250. order = order & RBCollationTables.SECONDARYORDERMASK;
  251. return ((short)(order >> RBCollationTables.SECONDARYORDERSHIFT));
  252. }
  253. /**
  254. * Return the tertiary component of a collation element.
  255. * @param order the collation element
  256. * @return the element's tertiary component
  257. */
  258. public final static short tertiaryOrder(int order)
  259. {
  260. return ((short)(order &= RBCollationTables.TERTIARYORDERMASK));
  261. }
  262. /**
  263. * Get the comparison order in the desired strength. Ignore the other
  264. * differences.
  265. * @param order The order value
  266. */
  267. final int strengthOrder(int order)
  268. {
  269. int s = owner.getStrength();
  270. if (s == Collator.PRIMARY)
  271. {
  272. order &= RBCollationTables.PRIMARYDIFFERENCEONLY;
  273. } else if (s == Collator.SECONDARY)
  274. {
  275. order &= RBCollationTables.SECONDARYDIFFERENCEONLY;
  276. }
  277. return order;
  278. }
  279. /**
  280. * Sets the iterator to point to the collation element corresponding to
  281. * the specified character (the parameter is a CHARACTER offset in the
  282. * original string, not an offset into its corresponding sequence of
  283. * collation elements). The value returned by the next call to next()
  284. * will be the collation element corresponding to the specified position
  285. * in the text. If that position is in the middle of a contracting
  286. * character sequence, the result of the next call to next() is the
  287. * collation element for that sequence. This means that getOffset()
  288. * is not guaranteed to return the same value as was passed to a preceding
  289. * call to setOffset().
  290. *
  291. * @param newOffset The new character offset into the original text.
  292. */
  293. public void setOffset(int newOffset)
  294. {
  295. if (text != null) {
  296. if (newOffset < text.getText().getBeginIndex()
  297. || newOffset >= text.getText().getEndIndex()) {
  298. text.setOffset(newOffset);
  299. } else {
  300. text.setOffset(newOffset + 1);
  301. // if the desired character isn't used in a contracting character
  302. // sequence, bypass all the backing-up logic-- we're sitting on
  303. // the right character already
  304. char c = text.previous();
  305. if (ordering.usedInContractSeq(c)) {
  306. // walk backwards through the string until we see a character
  307. // that DOESN'T participate in a contracting character sequence
  308. while (ordering.usedInContractSeq(c)) {
  309. c = text.previous();
  310. }
  311. // now walk forward using this object's next() method until
  312. // we pass the starting point and set our current position
  313. // to the beginning of the last "character" before or at
  314. // our starting position
  315. int last = text.getOffset();
  316. while (text.getOffset() <= newOffset) {
  317. last = text.getOffset();
  318. next();
  319. }
  320. text.setOffset(last);
  321. }
  322. }
  323. }
  324. buffer = null;
  325. expIndex = 0;
  326. swapOrder = 0;
  327. }
  328. /**
  329. * Returns the character offset in the original text corresponding to the next
  330. * collation element. (That is, getOffset() returns the position in the text
  331. * corresponding to the collation element that will be returned by the next
  332. * call to next().) This value will always be the index of the FIRST character
  333. * corresponding to the collation element (a contracting character sequence is
  334. * when two or more characters all correspond to the same collation element).
  335. * This means if you do setOffset(x) followed immediately by getOffset(), getOffset()
  336. * won't necessarily return x.
  337. *
  338. * @return The character offset in the original text corresponding to the collation
  339. * element that will be returned by the next call to next().
  340. */
  341. public int getOffset()
  342. {
  343. return (text != null) ? text.getOffset() : 0;
  344. }
  345. /**
  346. * Return the maximum length of any expansion sequences that end
  347. * with the specified comparison order.
  348. * @param order a collation order returned by previous or next.
  349. * @return the maximum length of any expansion sequences ending
  350. * with the specified order.
  351. */
  352. public int getMaxExpansion(int order)
  353. {
  354. return ordering.getMaxExpansion(order);
  355. }
  356. //============================================================
  357. // Package-visible methods that should eventually be made public
  358. //============================================================
  359. /**
  360. * Set a new string over which to iterate.
  361. *
  362. * @param str the new source text.
  363. */
  364. public void setText(String source)
  365. {
  366. buffer = null;
  367. swapOrder = 0;
  368. expIndex = 0;
  369. if (text == null) {
  370. text = new Normalizer(source, owner.getDecomposition());
  371. } else {
  372. text.setDecomposition(owner.getDecomposition());
  373. text.setText(source);
  374. }
  375. }
  376. /**
  377. * Set a new string over which to iterate.
  378. *
  379. * @param str the new source text.
  380. */
  381. public void setText(CharacterIterator source)
  382. {
  383. buffer = null;
  384. swapOrder = 0;
  385. expIndex = 0;
  386. if (text == null) {
  387. text = new Normalizer(source, owner.getDecomposition());
  388. } else {
  389. text.setDecomposition(owner.getDecomposition());
  390. text.setText(source);
  391. }
  392. }
  393. //============================================================
  394. // privates
  395. //============================================================
  396. /**
  397. * Check if a comparison order is ignorable.
  398. * @return true if a character is ignorable, false otherwise.
  399. */
  400. final static boolean isIgnorable(int order)
  401. {
  402. return ((primaryOrder(order) == 0) ? true : false);
  403. }
  404. /**
  405. * Get the ordering priority of the next contracting character in the
  406. * string.
  407. * @param ch the starting character of a contracting character token
  408. * @return the next contracting character's ordering. Returns NULLORDER
  409. * if the end of string is reached.
  410. */
  411. private int nextContractChar(char ch)
  412. {
  413. // First get the ordering of this single character,
  414. // which is always the first element in the list
  415. Vector list = ordering.getContractValues(ch);
  416. EntryPair pair = (EntryPair)list.firstElement();
  417. int order = pair.value;
  418. // find out the length of the longest contracting character sequence in the list.
  419. // There's logic in the builder code to make sure the longest sequence is always
  420. // the last.
  421. pair = (EntryPair)list.lastElement();
  422. int maxLength = pair.entryName.length();
  423. // (the Normalizer is cloned here so that the seeking we do in the next loop
  424. // won't affect our real position in the text)
  425. Normalizer tempText = (Normalizer)text.clone();
  426. // extract the next maxLength characters in the string (we have to do this using the
  427. // Normalizer to ensure that our offsets correspond to those the rest of the
  428. // iterator is using) and store it in "fragment".
  429. tempText.previous();
  430. key.setLength(0);
  431. char c = tempText.next();
  432. while (maxLength > 0 && c != Normalizer.DONE) {
  433. key.append(c);
  434. --maxLength;
  435. c = tempText.next();
  436. }
  437. String fragment = key.toString();
  438. // now that we have that fragment, iterate through this list looking for the
  439. // longest sequence that matches the characters in the actual text. (maxLength
  440. // is used here to keep track of the length of the longest sequence)
  441. // Upon exit from this loop, maxLength will contain the length of the matching
  442. // sequence and order will contain the collation-element value corresponding
  443. // to this sequence
  444. maxLength = 1;
  445. for (int i = list.size() - 1; i > 0; i--) {
  446. pair = (EntryPair)list.elementAt(i);
  447. if (!pair.fwd)
  448. continue;
  449. if (fragment.startsWith(pair.entryName) && pair.entryName.length()
  450. > maxLength) {
  451. maxLength = pair.entryName.length();
  452. order = pair.value;
  453. }
  454. }
  455. // seek our current iteration position to the end of the matching sequence
  456. // and return the appropriate collation-element value (if there was no matching
  457. // sequence, we're already seeked to the right position and order already contains
  458. // the correct collation-element value for the single character)
  459. while (maxLength > 1) {
  460. text.next();
  461. --maxLength;
  462. }
  463. return order;
  464. }
  465. /**
  466. * Get the ordering priority of the previous contracting character in the
  467. * string.
  468. * @param ch the starting character of a contracting character token
  469. * @return the next contracting character's ordering. Returns NULLORDER
  470. * if the end of string is reached.
  471. */
  472. private int prevContractChar(char ch)
  473. {
  474. // This function is identical to nextContractChar(), except that we've
  475. // switched things so that the next() and previous() calls on the Normalizer
  476. // are switched and so that we skip entry pairs with the fwd flag turned on
  477. // rather than off. Notice that we still use append() and startsWith() when
  478. // working on the fragment. This is because the entry pairs that are used
  479. // in reverse iteration have their names reversed already.
  480. Vector list = ordering.getContractValues(ch);
  481. EntryPair pair = (EntryPair)list.firstElement();
  482. int order = pair.value;
  483. pair = (EntryPair)list.lastElement();
  484. int maxLength = pair.entryName.length();
  485. Normalizer tempText = (Normalizer)text.clone();
  486. tempText.next();
  487. key.setLength(0);
  488. char c = tempText.previous();
  489. while (maxLength > 0 && c != Normalizer.DONE) {
  490. key.append(c);
  491. --maxLength;
  492. c = tempText.previous();
  493. }
  494. String fragment = key.toString();
  495. maxLength = 1;
  496. for (int i = list.size() - 1; i > 0; i--) {
  497. pair = (EntryPair)list.elementAt(i);
  498. if (pair.fwd)
  499. continue;
  500. if (fragment.startsWith(pair.entryName) && pair.entryName.length()
  501. > maxLength) {
  502. maxLength = pair.entryName.length();
  503. order = pair.value;
  504. }
  505. }
  506. while (maxLength > 1) {
  507. text.previous();
  508. --maxLength;
  509. }
  510. return order;
  511. }
  512. final static int UNMAPPEDCHARVALUE = 0x7FFF0000;
  513. private Normalizer text = null;
  514. private int[] buffer = null;
  515. private int expIndex = 0;
  516. private StringBuffer key = new StringBuffer(5);
  517. private int swapOrder = 0;
  518. private RBCollationTables ordering;
  519. private RuleBasedCollator owner;
  520. }