1. /*
  2. * @(#)BreakIterator.java 1.35 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  9. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  10. *
  11. * The original version of this source code and documentation
  12. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  13. * subsidiary of IBM. These materials are provided under terms
  14. * of a License Agreement between Taligent and Sun. This technology
  15. * is protected by multiple US and International patents.
  16. *
  17. * This notice and attribution to Taligent may not be removed.
  18. * Taligent is a registered trademark of Taligent, Inc.
  19. *
  20. */
  21. package java.text;
  22. import java.util.Vector;
  23. import java.util.Locale;
  24. import java.util.ResourceBundle;
  25. import java.util.MissingResourceException;
  26. import sun.text.resources.LocaleData;
  27. import java.text.CharacterIterator;
  28. import java.text.StringCharacterIterator;
  29. import java.net.URL;
  30. import java.io.InputStream;
  31. import java.io.IOException;
  32. import java.lang.ref.SoftReference;
  33. import java.security.AccessController;
  34. import java.security.PrivilegedAction;
  35. /**
  36. * The <code>BreakIterator</code> class implements methods for finding
  37. * the location of boundaries in text. Instances of <code>BreakIterator</code>
  38. * maintain a current position and scan over text
  39. * returning the index of characters where boundaries occur.
  40. * Internally, <code>BreakIterator</code> scans text using a
  41. * <code>CharacterIterator</code>, and is thus able to scan text held
  42. * by any object implementing that protocol. A <code>StringCharacterIterator</code>
  43. * is used to scan <code>String</code> objects passed to <code>setText</code>.
  44. *
  45. * <p>
  46. * You use the factory methods provided by this class to create
  47. * instances of various types of break iterators. In particular,
  48. * use <code>getWordInstance</code>, <code>getLineInstance</code>,
  49. * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
  50. * to create <code>BreakIterator</code>s that perform
  51. * word, line, sentence, and character boundary analysis respectively.
  52. * A single <code>BreakIterator</code> can work only on one unit
  53. * (word, line, sentence, and so on). You must use a different iterator
  54. * for each unit boundary analysis you wish to perform.
  55. *
  56. * <p>
  57. * Line boundary analysis determines where a text string can be
  58. * broken when line-wrapping. The mechanism correctly handles
  59. * punctuation and hyphenated words.
  60. *
  61. * <p>
  62. * Sentence boundary analysis allows selection with correct interpretation
  63. * of periods within numbers and abbreviations, and trailing punctuation
  64. * marks such as quotation marks and parentheses.
  65. *
  66. * <p>
  67. * Word boundary analysis is used by search and replace functions, as
  68. * well as within text editing applications that allow the user to
  69. * select words with a double click. Word selection provides correct
  70. * interpretation of punctuation marks within and following
  71. * words. Characters that are not part of a word, such as symbols
  72. * or punctuation marks, have word-breaks on both sides.
  73. *
  74. * <p>
  75. * Character boundary analysis allows users to interact with characters
  76. * as they expect to, for example, when moving the cursor through a text
  77. * string. Character boundary analysis provides correct navigation
  78. * through character strings, regardless of how the character is stored.
  79. * For example, an accented character might be stored as a base character
  80. * and a diacritical mark. What users consider to be a character can
  81. * differ between languages.
  82. *
  83. * <p>
  84. * <code>BreakIterator</code> is intended for use with natural
  85. * languages only. Do not use this class to tokenize a programming language.
  86. *
  87. * <P>
  88. * <strong>Examples</strong>:<P>
  89. * Creating and using text boundaries
  90. * <blockquote>
  91. * <pre>
  92. * public static void main(String args[]) {
  93. * if (args.length == 1) {
  94. * String stringToExamine = args[0];
  95. * //print each word in order
  96. * BreakIterator boundary = BreakIterator.getWordInstance();
  97. * boundary.setText(stringToExamine);
  98. * printEachForward(boundary, stringToExamine);
  99. * //print each sentence in reverse order
  100. * boundary = BreakIterator.getSentenceInstance(Locale.US);
  101. * boundary.setText(stringToExamine);
  102. * printEachBackward(boundary, stringToExamine);
  103. * printFirst(boundary, stringToExamine);
  104. * printLast(boundary, stringToExamine);
  105. * }
  106. * }
  107. * </pre>
  108. * </blockquote>
  109. *
  110. * Print each element in order
  111. * <blockquote>
  112. * <pre>
  113. * public static void printEachForward(BreakIterator boundary, String source) {
  114. * int start = boundary.first();
  115. * for (int end = boundary.next();
  116. * end != BreakIterator.DONE;
  117. * start = end, end = boundary.next()) {
  118. * System.out.println(source.substring(start,end));
  119. * }
  120. * }
  121. * </pre>
  122. * </blockquote>
  123. *
  124. * Print each element in reverse order
  125. * <blockquote>
  126. * <pre>
  127. * public static void printEachBackward(BreakIterator boundary, String source) {
  128. * int end = boundary.last();
  129. * for (int start = boundary.previous();
  130. * start != BreakIterator.DONE;
  131. * end = start, start = boundary.previous()) {
  132. * System.out.println(source.substring(start,end));
  133. * }
  134. * }
  135. * </pre>
  136. * </blockquote>
  137. *
  138. * Print first element
  139. * <blockquote>
  140. * <pre>
  141. * public static void printFirst(BreakIterator boundary, String source) {
  142. * int start = boundary.first();
  143. * int end = boundary.next();
  144. * System.out.println(source.substring(start,end));
  145. * }
  146. * </pre>
  147. * </blockquote>
  148. *
  149. * Print last element
  150. * <blockquote>
  151. * <pre>
  152. * public static void printLast(BreakIterator boundary, String source) {
  153. * int end = boundary.last();
  154. * int start = boundary.previous();
  155. * System.out.println(source.substring(start,end));
  156. * }
  157. * </pre>
  158. * </blockquote>
  159. *
  160. * Print the element at a specified position
  161. * <blockquote>
  162. * <pre>
  163. * public static void printAt(BreakIterator boundary, int pos, String source) {
  164. * int end = boundary.following(pos);
  165. * int start = boundary.previous();
  166. * System.out.println(source.substring(start,end));
  167. * }
  168. * </pre>
  169. * </blockquote>
  170. *
  171. * Find the next word
  172. * <blockquote>
  173. * <pre>
  174. * public static int nextWordStartAfter(int pos, String text) {
  175. * BreakIterator wb = BreakIterator.getWordInstance();
  176. * wb.setText(text);
  177. * int last = wb.following(pos);
  178. * int current = wb.next();
  179. * while (current != BreakIterator.DONE) {
  180. * for (int p = last; p < current; p++) {
  181. * if (Character.isLetter(text.codePointAt(p)))
  182. * return last;
  183. * }
  184. * last = current;
  185. * current = wb.next();
  186. * }
  187. * return BreakIterator.DONE;
  188. * }
  189. * </pre>
  190. * (The iterator returned by BreakIterator.getWordInstance() is unique in that
  191. * the break positions it returns don't represent both the start and end of the
  192. * thing being iterated over. That is, a sentence-break iterator returns breaks
  193. * that each represent the end of one sentence and the beginning of the next.
  194. * With the word-break iterator, the characters between two boundaries might be a
  195. * word, or they might be the punctuation or whitespace between two words. The
  196. * above code uses a simple heuristic to determine which boundary is the beginning
  197. * of a word: If the characters between this boundary and the next boundary
  198. * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
  199. * a Hangul syllable, a Kana character, etc.), then the text between this boundary
  200. * and the next is a word; otherwise, it's the material between words.)
  201. * </blockquote>
  202. *
  203. * @see CharacterIterator
  204. *
  205. */
  206. public abstract class BreakIterator implements Cloneable
  207. {
  208. /**
  209. * Constructor. BreakIterator is stateless and has no default behavior.
  210. */
  211. protected BreakIterator()
  212. {
  213. }
  214. /**
  215. * Create a copy of this iterator
  216. * @return A copy of this
  217. */
  218. public Object clone()
  219. {
  220. try {
  221. return super.clone();
  222. }
  223. catch (CloneNotSupportedException e) {
  224. throw new InternalError();
  225. }
  226. }
  227. /**
  228. * DONE is returned by previous() and next() after all valid
  229. * boundaries have been returned.
  230. */
  231. public static final int DONE = -1;
  232. /**
  233. * Return the first boundary. The iterator's current position is set
  234. * to the first boundary.
  235. * @return The character index of the first text boundary.
  236. */
  237. public abstract int first();
  238. /**
  239. * Return the last boundary. The iterator's current position is set
  240. * to the last boundary.
  241. * @return The character index of the last text boundary.
  242. */
  243. public abstract int last();
  244. /**
  245. * Return the nth boundary from the current boundary
  246. * @param n which boundary to return. A value of 0
  247. * does nothing. Negative values move to previous boundaries
  248. * and positive values move to later boundaries.
  249. * @return The index of the nth boundary from the current position.
  250. */
  251. public abstract int next(int n);
  252. /**
  253. * Return the boundary following the current boundary.
  254. * @return The character index of the next text boundary or DONE if all
  255. * boundaries have been returned. Equivalent to next(1).
  256. */
  257. public abstract int next();
  258. /**
  259. * Return the boundary preceding the current boundary.
  260. * @return The character index of the previous text boundary or DONE if all
  261. * boundaries have been returned.
  262. */
  263. public abstract int previous();
  264. /**
  265. * Return the first boundary following the specified offset.
  266. * The value returned is always greater than the offset or
  267. * the value BreakIterator.DONE
  268. * @param offset the offset to begin scanning. Valid values
  269. * are determined by the CharacterIterator passed to
  270. * setText(). Invalid values cause
  271. * an IllegalArgumentException to be thrown.
  272. * @return The first boundary after the specified offset.
  273. */
  274. public abstract int following(int offset);
  275. /**
  276. * Return the last boundary preceding the specfied offset.
  277. * The value returned is always less than the offset or the value
  278. * BreakIterator.DONE.
  279. * @param offset the offset to begin scanning. Valid values are
  280. * determined by the CharacterIterator passed to setText().
  281. * Invalid values cause an IllegalArgumentException to be thrown.
  282. * @return The last boundary before the specified offset.
  283. * @since 1.2
  284. */
  285. public int preceding(int offset) {
  286. // NOTE: This implementation is here solely because we can't add new
  287. // abstract methods to an existing class. There is almost ALWAYS a
  288. // better, faster way to do this.
  289. int pos = following(offset);
  290. while (pos >= offset && pos != DONE)
  291. pos = previous();
  292. return pos;
  293. }
  294. /**
  295. * Return true if the specified position is a boundary position.
  296. * @param offset the offset to check.
  297. * @return True if "offset" is a boundary position.
  298. * @since 1.2
  299. */
  300. public boolean isBoundary(int offset) {
  301. // NOTE: This implementation probably is wrong for most situations
  302. // because it fails to take into account the possibility that a
  303. // CharacterIterator passed to setText() may not have a begin offset
  304. // of 0. But since the abstract BreakIterator doesn't have that
  305. // knowledge, it assumes the begin offset is 0. If you subclass
  306. // BreakIterator, copy the SimpleTextBoundary implementation of this
  307. // function into your subclass. [This should have been abstract at
  308. // this level, but it's too late to fix that now.]
  309. if (offset == 0)
  310. return true;
  311. else
  312. return following(offset - 1) == offset;
  313. }
  314. /**
  315. * Return character index of the text boundary that was most recently
  316. * returned by next(), previous(), first(), or last()
  317. * @return The boundary most recently returned.
  318. */
  319. public abstract int current();
  320. /**
  321. * Get the text being scanned
  322. * @return the text being scanned
  323. */
  324. public abstract CharacterIterator getText();
  325. /**
  326. * Set a new text string to be scanned. The current scan
  327. * position is reset to first().
  328. * @param newText new text to scan.
  329. */
  330. public void setText(String newText)
  331. {
  332. setText(new StringCharacterIterator(newText));
  333. }
  334. /**
  335. * Set a new text for scanning. The current scan
  336. * position is reset to first().
  337. * @param newText new text to scan.
  338. */
  339. public abstract void setText(CharacterIterator newText);
  340. private static final int CHARACTER_INDEX = 0;
  341. private static final int WORD_INDEX = 1;
  342. private static final int LINE_INDEX = 2;
  343. private static final int SENTENCE_INDEX = 3;
  344. private static final SoftReference[] iterCache = new SoftReference[4];
  345. /**
  346. * Create BreakIterator for word-breaks using default locale.
  347. * Returns an instance of a BreakIterator implementing word breaks.
  348. * WordBreak is usefull for word selection (ex. double click)
  349. * @return A BreakIterator for word-breaks
  350. * @see java.util.Locale#getDefault
  351. */
  352. public static BreakIterator getWordInstance()
  353. {
  354. return getWordInstance(Locale.getDefault());
  355. }
  356. /**
  357. * Create BreakIterator for word-breaks using specified locale.
  358. * Returns an instance of a BreakIterator implementing word breaks.
  359. * WordBreak is usefull for word selection (ex. double click)
  360. * @param where the local. If a specific WordBreak is not
  361. * avaliable for the specified locale, a default WordBreak is returned.
  362. * @return A BreakIterator for word-breaks
  363. */
  364. public static BreakIterator getWordInstance(Locale where)
  365. {
  366. return getBreakInstance(where,
  367. WORD_INDEX,
  368. "WordData",
  369. "WordDictionary");
  370. }
  371. /**
  372. * Create BreakIterator for line-breaks using default locale.
  373. * Returns an instance of a BreakIterator implementing line breaks. Line
  374. * breaks are logically possible line breaks, actual line breaks are
  375. * usually determined based on display width.
  376. * LineBreak is useful for word wrapping text.
  377. * @return A BreakIterator for line-breaks
  378. * @see java.util.Locale#getDefault
  379. */
  380. public static BreakIterator getLineInstance()
  381. {
  382. return getLineInstance(Locale.getDefault());
  383. }
  384. /**
  385. * Create BreakIterator for line-breaks using specified locale.
  386. * Returns an instance of a BreakIterator implementing line breaks. Line
  387. * breaks are logically possible line breaks, actual line breaks are
  388. * usually determined based on display width.
  389. * LineBreak is useful for word wrapping text.
  390. * @param where the local. If a specific LineBreak is not
  391. * avaliable for the specified locale, a default LineBreak is returned.
  392. * @return A BreakIterator for line-breaks
  393. */
  394. public static BreakIterator getLineInstance(Locale where)
  395. {
  396. return getBreakInstance(where,
  397. LINE_INDEX,
  398. "LineData",
  399. "LineDictionary");
  400. }
  401. /**
  402. * Create BreakIterator for character-breaks using default locale
  403. * Returns an instance of a BreakIterator implementing character breaks.
  404. * Character breaks are boundaries of combining character sequences.
  405. * @return A BreakIterator for character-breaks
  406. * @see Locale#getDefault
  407. */
  408. public static BreakIterator getCharacterInstance()
  409. {
  410. return getCharacterInstance(Locale.getDefault());
  411. }
  412. /**
  413. * Create BreakIterator for character-breaks using specified locale
  414. * Returns an instance of a BreakIterator implementing character breaks.
  415. * Character breaks are boundaries of combining character sequences.
  416. * @param where the local. If a specific character break is not
  417. * avaliable for the specified local, a default character break is returned.
  418. * @return A BreakIterator for character-breaks
  419. */
  420. public static BreakIterator getCharacterInstance(Locale where)
  421. {
  422. return getBreakInstance(where,
  423. CHARACTER_INDEX,
  424. "CharacterData",
  425. "CharacterDictionary");
  426. }
  427. /**
  428. * Create BreakIterator for sentence-breaks using default locale
  429. * Returns an instance of a BreakIterator implementing sentence breaks.
  430. * @return A BreakIterator for sentence-breaks
  431. * @see java.util.Locale#getDefault
  432. */
  433. public static BreakIterator getSentenceInstance()
  434. {
  435. return getSentenceInstance(Locale.getDefault());
  436. }
  437. /**
  438. * Create BreakIterator for sentence-breaks using specified locale
  439. * Returns an instance of a BreakIterator implementing sentence breaks.
  440. * @param where the local. If a specific SentenceBreak is not
  441. * avaliable for the specified local, a default SentenceBreak is returned.
  442. * @return A BreakIterator for sentence-breaks
  443. */
  444. public static BreakIterator getSentenceInstance(Locale where)
  445. {
  446. return getBreakInstance(where,
  447. SENTENCE_INDEX,
  448. "SentenceData",
  449. "SentenceDictionary");
  450. }
  451. private static BreakIterator getBreakInstance(Locale where,
  452. int type,
  453. String dataName,
  454. String dictionaryName) {
  455. if (iterCache[type] != null) {
  456. BreakIteratorCache cache = (BreakIteratorCache) iterCache[type].get();
  457. if (cache != null) {
  458. if (cache.getLocale().equals(where)) {
  459. return cache.createBreakInstance();
  460. }
  461. }
  462. }
  463. BreakIterator result = createBreakInstance(where,
  464. type,
  465. dataName,
  466. dictionaryName);
  467. BreakIteratorCache cache = new BreakIteratorCache(where, result);
  468. iterCache[type] = new SoftReference(cache);
  469. return result;
  470. }
  471. private static ResourceBundle getBundle(final String baseName, final Locale locale) {
  472. return (ResourceBundle) AccessController.doPrivileged(new PrivilegedAction() {
  473. public Object run() {
  474. return ResourceBundle.getBundle(baseName, locale);
  475. }
  476. });
  477. }
  478. private static BreakIterator createBreakInstance(Locale where,
  479. int type,
  480. String dataName,
  481. String dictionaryName) {
  482. ResourceBundle bundle = getBundle(
  483. "sun.text.resources.BreakIteratorInfo", where);
  484. String[] classNames = bundle.getStringArray("BreakIteratorClasses");
  485. String dataFile = bundle.getString(dataName);
  486. try {
  487. if (classNames[type].equals("RuleBasedBreakIterator")) {
  488. return new RuleBasedBreakIterator(dataFile);
  489. }
  490. else if (classNames[type].equals("DictionaryBasedBreakIterator")) {
  491. String dictionaryFile = bundle.getString(dictionaryName);
  492. return new DictionaryBasedBreakIterator(dataFile, dictionaryFile);
  493. }
  494. else {
  495. throw new IllegalArgumentException("Invalid break iterator class \"" +
  496. classNames[type] + "\"");
  497. }
  498. }
  499. catch (Exception e) {
  500. throw new InternalError(e.toString());
  501. }
  502. }
  503. /**
  504. * Returns an array of all locales for which the
  505. * <code>get*Instance</code> methods of this class can return
  506. * localized instances.
  507. * The array returned must contain at least a <code>Locale</code>
  508. * instance equal to {@link java.util.Locale#US Locale.US}.
  509. *
  510. * @return An array of locales for which localized
  511. * <code>BreakIterator</code> instances are available.
  512. */
  513. public static synchronized Locale[] getAvailableLocales()
  514. {
  515. //FIX ME - this is a known bug. It should return
  516. //all locales.
  517. return LocaleData.getAvailableLocales("NumberPatterns");
  518. }
  519. private static final class BreakIteratorCache {
  520. private BreakIterator iter;
  521. private Locale where;
  522. BreakIteratorCache(Locale where, BreakIterator iter) {
  523. this.where = where;
  524. this.iter = (BreakIterator) iter.clone();
  525. }
  526. Locale getLocale() {
  527. return where;
  528. }
  529. BreakIterator createBreakInstance() {
  530. return (BreakIterator) iter.clone();
  531. }
  532. }
  533. protected static long getLong(byte[] buf, int offset) {
  534. long num = buf[offset]&0xFF;
  535. for (int i = 1; i < 8; i++) {
  536. num = num<<8 | (buf[offset+i]&0xFF);
  537. }
  538. return num;
  539. }
  540. protected static int getInt(byte[] buf, int offset) {
  541. int num = buf[offset]&0xFF;
  542. for (int i = 1; i < 4; i++) {
  543. num = num<<8 | (buf[offset+i]&0xFF);
  544. }
  545. return num;
  546. }
  547. protected static short getShort(byte[] buf, int offset) {
  548. short num = (short)(buf[offset]&0xFF);
  549. num = (short)(num<<8 | (buf[offset+1]&0xFF));
  550. return num;
  551. }
  552. }