1. /*
  2. * @(#)BreakIterator.java 1.25 01/11/29
  3. *
  4. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)BreakIterator.java 1.22 98/07/24
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  12. *
  13. * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  14. * All Rights Reserved.
  15. *
  16. * The original version of this source code and documentation
  17. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  18. * subsidiary of IBM. These materials are provided under terms
  19. * of a License Agreement between Taligent and Sun. This technology
  20. * is protected by multiple US and International patents.
  21. *
  22. * This notice and attribution to Taligent may not be removed.
  23. * Taligent is a registered trademark of Taligent, Inc.
  24. *
  25. * Permission to use, copy, modify, and distribute this software
  26. * and its documentation for NON-COMMERCIAL purposes and without
  27. * fee is hereby granted provided that this copyright notice
  28. * appears in all copies. Please refer to the file "copyright.html"
  29. * for further important copyright and licensing information.
  30. *
  31. * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  32. * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  33. * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  34. * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  35. * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  36. * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  37. *
  38. */
  39. package java.text;
  40. import java.util.Vector;
  41. import java.util.Locale;
  42. import java.text.resources.*;
  43. /**
  44. * The <code>BreakIterator</code> class implements methods for finding
  45. * the location of boundaries in text. Instances of <code>BreakIterator</code>
  46. * maintain a current position and scan over text
  47. * returning the index of characters where boundaries occur.
  48. * Internally, <code>BreakIterator</code> scans text using a
  49. * <code>CharacterIterator</code>, and is thus able to scan text held
  50. * by any object implementing that protocol. A <code>StringCharacterIterator</code>
  51. * is used to scan <code>String</code> objects passed to <code>setText</code>.
  52. *
  53. * <p>
  54. * You use the factory methods provided by this class to create
  55. * instances of various types of break iterators. In particular,
  56. * use <code>getWordInstance</code>, <code>getLineInstance</code>,
  57. * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
  58. * to create <code>BreakIterator</code>s that perform
  59. * word, line, sentence, and character boundary analysis respectively.
  60. * A single <code>BreakIterator</code> can work only on one unit
  61. * (word, line, sentence, and so on). You must use a different iterator
  62. * for each unit boundary analysis you wish to perform.
  63. *
  64. * <p>
  65. * Line boundary analysis determines where a text string can be
  66. * broken when line-wrapping. The mechanism correctly handles
  67. * punctuation and hyphenated words.
  68. *
  69. * <p>
  70. * Sentence boundary analysis allows selection with correct interpretation
  71. * of periods within numbers and abbreviations, and trailing punctuation
  72. * marks such as quotation marks and parentheses.
  73. *
  74. * <p>
  75. * Word boundary analysis is used by search and replace functions, as
  76. * well as within text editing applications that allow the user to
  77. * select words with a double click. Word selection provides correct
  78. * interpretation of punctuation marks within and following
  79. * words. Characters that are not part of a word, such as symbols
  80. * or punctuation marks, have word-breaks on both sides.
  81. *
  82. * <p>
  83. * Character boundary analysis allows users to interact with characters
  84. * as they expect to, for example, when moving the cursor through a text
  85. * string. Character boundary analysis provides correct navigation
  86. * through character strings, regardless of how the character is stored.
  87. * For example, an accented character might be stored as a base character
  88. * and a diacritical mark. What users consider to be a character can
  89. * differ between languages.
  90. *
  91. * <p>
  92. * <code>BreakIterator</code> is intended for use with natural
  93. * languages only. Do not use this class to tokenize a programming language.
  94. *
  95. * <P>
  96. * <strong>Examples</strong>:<P>
  97. * Creating and using text boundaries
  98. * <blockquote>
  99. * <pre>
  100. * public static void main(String args[]) {
  101. * if (args.length == 1) {
  102. * String stringToExamine = args[0];
  103. * //print each word in order
  104. * BreakIterator boundary = BreakIterator.getWordInstance();
  105. * boundary.setText(stringToExamine);
  106. * printEachForward(boundary, stringToExamine);
  107. * //print each sentence in reverse order
  108. * boundary = BreakIterator.getSentenceInstance(Locale.US);
  109. * boundary.setText(stringToExamine);
  110. * printEachBackward(boundary, stringToExamine);
  111. * printFirst(boundary, stringToExamine);
  112. * printLast(boundary, stringToExamine);
  113. * }
  114. * }
  115. * </pre>
  116. * </blockquote>
  117. *
  118. * Print each element in order
  119. * <blockquote>
  120. * <pre>
  121. * public static void printEachForward(BreakIterator boundary, String source) {
  122. * int start = boundary.first();
  123. * for (int end = boundary.next();
  124. * end != BreakIterator.DONE;
  125. * start = end, end = boundary.next()) {
  126. * System.out.println(source.substring(start,end));
  127. * }
  128. * }
  129. * </pre>
  130. * </blockquote>
  131. *
  132. * Print each element in reverse order
  133. * <blockquote>
  134. * <pre>
  135. * public static void printEachBackward(BreakIterator boundary, String source) {
  136. * int end = boundary.last();
  137. * for (int start = boundary.previous();
  138. * start != BreakIterator.DONE;
  139. * end = start, start = boundary.previous()) {
  140. * System.out.println(source.substring(start,end));
  141. * }
  142. * }
  143. * </pre>
  144. * </blockquote>
  145. *
  146. * Print first element
  147. * <blockquote>
  148. * <pre>
  149. * public static void printFirst(BreakIterator boundary, String source) {
  150. * int start = boundary.first();
  151. * int end = boundary.next();
  152. * System.out.println(source.substring(start,end));
  153. * }
  154. * </pre>
  155. * </blockquote>
  156. *
  157. * Print last element
  158. * <blockquote>
  159. * <pre>
  160. * public static void printLast(BreakIterator boundary, String source) {
  161. * int end = boundary.last();
  162. * int start = boundary.previous();
  163. * System.out.println(source.substring(start,end));
  164. * }
  165. * </pre>
  166. * </blockquote>
  167. *
  168. * Print the element at a specified position
  169. * <blockquote>
  170. * <pre>
  171. * public static void printAt(BreakIterator boundary, int pos, String source) {
  172. * int end = boundary.following(pos);
  173. * int start = boundary.previous();
  174. * System.out.println(source.substring(start,end));
  175. * }
  176. * </pre>
  177. * </blockquote>
  178. *
  179. * Find the next word
  180. * <blockquote>
  181. * <pre>
  182. * public static int nextWordStartAfter(int pos, String text) {
  183. * BreakIterator wb = BreakIterator.getWordInstance();
  184. * wb.setText(text);
  185. * int last = wb.following(pos);
  186. * int current = wb.next();
  187. * while (current != BreakIterator.DONE) {
  188. * for (int p = last; p < current; p++) {
  189. * if (Character.isLetter(text.charAt(p)))
  190. * return last;
  191. * }
  192. * last = current;
  193. * current = wb.next();
  194. * }
  195. * return BreakIterator.DONE;
  196. * }
  197. * </pre>
  198. * (The iterator returned by BreakIterator.getWordInstance() is unique in that
  199. * the break positions it returns don't represent both the start and end of the
  200. * thing being iterated over. That is, a sentence-break iterator returns breaks
  201. * that each represent the end of one sentence and the beginning of the next.
  202. * With the word-break iterator, the characters between two boundaries might be a
  203. * word, or they might be the punctuation or whitespace between two words. The
  204. * above code uses a simple heuristic to determine which boundary is the beginning
  205. * of a word: If the characters between this boundary and the next boundary
  206. * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
  207. * a Hangul syllable, a Kana character, etc.), then the text between this boundary
  208. * and the next is a word; otherwise, it's the material between words.)
  209. * </blockquote>
  210. *
  211. * @see CharacterIterator
  212. *
  213. */
  214. public abstract class BreakIterator implements Cloneable
  215. {
  216. /**
  217. * Constructor. BreakIterator is stateless and has no default behavior.
  218. */
  219. protected BreakIterator()
  220. {
  221. }
  222. /**
  223. * Create a copy of this iterator
  224. * @return A copy of this
  225. */
  226. public Object clone()
  227. {
  228. try {
  229. return super.clone();
  230. }
  231. catch (CloneNotSupportedException e) {
  232. throw new InternalError();
  233. }
  234. }
  235. /**
  236. * DONE is returned by previous() and next() after all valid
  237. * boundaries have been returned.
  238. */
  239. public static final int DONE = -1;
  240. /**
  241. * Return the first boundary. The iterator's current position is set
  242. * to the first boundary.
  243. * @return The character index of the first text boundary.
  244. */
  245. public abstract int first()
  246. ;
  247. /**
  248. * Return the last boundary. The iterator's current position is set
  249. * to the last boundary.
  250. * @return The character index of the last text boundary.
  251. */
  252. public abstract int last();
  253. /**
  254. * Return the nth boundary from the current boundary
  255. * @param n which boundary to return. A value of 0
  256. * does nothing. Negative values move to previous boundaries
  257. * and positive values move to later boundaries.
  258. * @return The index of the nth boundary from the current position.
  259. */
  260. public abstract int next(int n);
  261. /**
  262. * Return the boundary following the current boundary.
  263. * @return The character index of the next text boundary or DONE if all
  264. * boundaries have been returned. Equivalent to next(1).
  265. */
  266. public abstract int next();
  267. /**
  268. * Return the boundary preceding the current boundary.
  269. * @return The character index of the previous text boundary or DONE if all
  270. * boundaries have been returned.
  271. */
  272. public abstract int previous();
  273. /**
  274. * Return the first boundary following the specified offset.
  275. * The value returned is always greater than the offset or
  276. * the value BreakIterator.DONE
  277. * @param offset the offset to begin scanning. Valid values
  278. * are determined by the CharacterIterator passed to
  279. * setText(). Invalid values cause
  280. * an IllegalArgumentException to be thrown.
  281. * @return The first boundary after the specified offset.
  282. */
  283. public abstract int following(int offset);
  284. /**
  285. * Return the last boundary preceding the specfied offset.
  286. * The value returned is always less than the offset or the value
  287. * BreakIterator.DONE.
  288. * @param offset the offset to begin scanning. Valid values are
  289. * determined by the CharacterIterator passed to setText().
  290. * Invalid values cause an IllegalArgumentException to be thrown.
  291. * @return The last boundary before the specified offset.
  292. */
  293. public int preceding(int offset) {
  294. // NOTE: This implementation is here solely because we can't add new
  295. // abstract methods to an existing class. There is almost ALWAYS a
  296. // better, faster way to do this.
  297. int pos = following(offset);
  298. while (pos >= offset && pos != DONE)
  299. pos = previous();
  300. return pos;
  301. }
  302. /**
  303. * Return true if the specified position is a boundary position.
  304. * @param offset the offset to check.
  305. * @return True if "offset" is a boundary position.
  306. */
  307. public boolean isBoundary(int offset) {
  308. // NOTE: This implementation probably is wrong for most situations
  309. // because it fails to take into account the possibility that a
  310. // CharacterIterator passed to setText() may not have a begin offset
  311. // of 0. But since the abstract BreakIterator doesn't have that
  312. // knowledge, it assumes the begin offset is 0. If you subclass
  313. // BreakIterator, copy the SimpleTextBoundary implementation of this
  314. // function into your subclass. [This should have been abstract at
  315. // this level, but it's too late to fix that now.]
  316. if (offset == 0)
  317. return true;
  318. else
  319. return following(offset - 1) == offset;
  320. }
  321. /**
  322. * Return character index of the text boundary that was most recently
  323. * returned by next(), previous(), first(), or last()
  324. * @return The boundary most recently returned.
  325. */
  326. public abstract int current();
  327. /**
  328. * Get the text being scanned
  329. * @return the text being scanned
  330. */
  331. public abstract CharacterIterator getText();
  332. /**
  333. * Set a new text string to be scanned. The current scan
  334. * position is reset to first().
  335. * @param newText new text to scan.
  336. */
  337. public void setText(String newText)
  338. {
  339. setText(new StringCharacterIterator(newText));
  340. }
  341. /**
  342. * Set a new text for scanning. The current scan
  343. * position is reset to first().
  344. * @param newText new text to scan.
  345. */
  346. public abstract void setText(CharacterIterator newText);
  347. /**
  348. * Create BreakIterator for word-breaks using default locale.
  349. * Returns an instance of a BreakIterator implementing word breaks.
  350. * WordBreak is usefull for word selection (ex. double click)
  351. * @return A BreakIterator for word-breaks
  352. * @see java.util.Locale#getDefault
  353. */
  354. public static BreakIterator getWordInstance()
  355. {
  356. return getWordInstance(Locale.getDefault());
  357. }
  358. /**
  359. * Create BreakIterator for word-breaks using specified locale.
  360. * Returns an instance of a BreakIterator implementing word breaks.
  361. * WordBreak is usefull for word selection (ex. double click)
  362. * @param where the local. If a specific WordBreak is not
  363. * avaliable for the specified locale, a default WordBreak is returned.
  364. * @return A BreakIterator for word-breaks
  365. */
  366. public static BreakIterator getWordInstance(Locale where)
  367. {
  368. return new SimpleTextBoundary(new WordBreakData());
  369. }
  370. /**
  371. * Create BreakIterator for line-breaks using default locale.
  372. * Returns an instance of a BreakIterator implementing line breaks. Line
  373. * breaks are logically possible line breaks, actual line breaks are
  374. * usually determined based on display width.
  375. * LineBreak is useful for word wrapping text.
  376. * @return A BreakIterator for line-breaks
  377. * @see java.util.Locale#getDefault
  378. */
  379. public static BreakIterator getLineInstance()
  380. {
  381. return getLineInstance(Locale.getDefault());
  382. }
  383. /**
  384. * Create BreakIterator for line-breaks using specified locale.
  385. * Returns an instance of a BreakIterator implementing line breaks. Line
  386. * breaks are logically possible line breaks, actual line breaks are
  387. * usually determined based on display width.
  388. * LineBreak is useful for word wrapping text.
  389. * @param where the local. If a specific LineBreak is not
  390. * avaliable for the specified locale, a default LineBreak is returned.
  391. * @return A BreakIterator for line-breaks
  392. */
  393. public static BreakIterator getLineInstance(Locale where)
  394. {
  395. return new SimpleTextBoundary(new LineBreakData());
  396. }
  397. /**
  398. * Create BreakIterator for character-breaks using default locale
  399. * Returns an instance of a BreakIterator implementing character breaks.
  400. * Character breaks are boundaries of combining character sequences.
  401. * @return A BreakIterator for character-breaks
  402. * @see Locale#getDefault
  403. */
  404. public static BreakIterator getCharacterInstance()
  405. {
  406. return getCharacterInstance(Locale.getDefault());
  407. }
  408. /**
  409. * Create BreakIterator for character-breaks using specified locale
  410. * Returns an instance of a BreakIterator implementing character breaks.
  411. * Character breaks are boundaries of combining character sequences.
  412. * @param where the local. If a specific character break is not
  413. * avaliable for the specified local, a default character break is returned.
  414. * @return A BreakIterator for character-breaks
  415. */
  416. public static BreakIterator getCharacterInstance(Locale where)
  417. {
  418. return new SimpleTextBoundary(new CharacterBreakData());
  419. }
  420. /**
  421. * Create BreakIterator for sentence-breaks using default locale
  422. * Returns an instance of a BreakIterator implementing sentence breaks.
  423. * @return A BreakIterator for sentence-breaks
  424. * @see java.util.Locale#getDefault
  425. */
  426. public static BreakIterator getSentenceInstance()
  427. {
  428. return getSentenceInstance(Locale.getDefault());
  429. }
  430. /**
  431. * Create BreakIterator for sentence-breaks using specified locale
  432. * Returns an instance of a BreakIterator implementing sentence breaks.
  433. * @param where the local. If a specific SentenceBreak is not
  434. * avaliable for the specified local, a default SentenceBreak is returned.
  435. * @return A BreakIterator for sentence-breaks
  436. */
  437. public static BreakIterator getSentenceInstance(Locale where)
  438. {
  439. return new SimpleTextBoundary(new SentenceBreakData());
  440. }
  441. /**
  442. * Get the set of Locales for which BreakIterators are installed
  443. * @return available locales
  444. */
  445. public static synchronized Locale[] getAvailableLocales()
  446. {
  447. //FIX ME - this is a known bug. It should return
  448. //all locales.
  449. return LocaleData.getAvailableLocales("NumberPatterns");
  450. }
  451. }