1. /*
  2. * @(#)BreakIterator.java 1.26 00/01/19
  3. *
  4. * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. /*
  11. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  12. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  13. *
  14. * The original version of this source code and documentation
  15. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  16. * subsidiary of IBM. These materials are provided under terms
  17. * of a License Agreement between Taligent and Sun. This technology
  18. * is protected by multiple US and International patents.
  19. *
  20. * This notice and attribution to Taligent may not be removed.
  21. * Taligent is a registered trademark of Taligent, Inc.
  22. *
  23. */
  24. package java.text;
  25. import java.util.Vector;
  26. import java.util.Locale;
  27. import java.text.resources.*;
  28. /**
  29. * The <code>BreakIterator</code> class implements methods for finding
  30. * the location of boundaries in text. Instances of <code>BreakIterator</code>
  31. * maintain a current position and scan over text
  32. * returning the index of characters where boundaries occur.
  33. * Internally, <code>BreakIterator</code> scans text using a
  34. * <code>CharacterIterator</code>, and is thus able to scan text held
  35. * by any object implementing that protocol. A <code>StringCharacterIterator</code>
  36. * is used to scan <code>String</code> objects passed to <code>setText</code>.
  37. *
  38. * <p>
  39. * You use the factory methods provided by this class to create
  40. * instances of various types of break iterators. In particular,
  41. * use <code>getWordInstance</code>, <code>getLineInstance</code>,
  42. * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
  43. * to create <code>BreakIterator</code>s that perform
  44. * word, line, sentence, and character boundary analysis respectively.
  45. * A single <code>BreakIterator</code> can work only on one unit
  46. * (word, line, sentence, and so on). You must use a different iterator
  47. * for each unit boundary analysis you wish to perform.
  48. *
  49. * <p>
  50. * Line boundary analysis determines where a text string can be
  51. * broken when line-wrapping. The mechanism correctly handles
  52. * punctuation and hyphenated words.
  53. *
  54. * <p>
  55. * Sentence boundary analysis allows selection with correct interpretation
  56. * of periods within numbers and abbreviations, and trailing punctuation
  57. * marks such as quotation marks and parentheses.
  58. *
  59. * <p>
  60. * Word boundary analysis is used by search and replace functions, as
  61. * well as within text editing applications that allow the user to
  62. * select words with a double click. Word selection provides correct
  63. * interpretation of punctuation marks within and following
  64. * words. Characters that are not part of a word, such as symbols
  65. * or punctuation marks, have word-breaks on both sides.
  66. *
  67. * <p>
  68. * Character boundary analysis allows users to interact with characters
  69. * as they expect to, for example, when moving the cursor through a text
  70. * string. Character boundary analysis provides correct navigation
  71. * through character strings, regardless of how the character is stored.
  72. * For example, an accented character might be stored as a base character
  73. * and a diacritical mark. What users consider to be a character can
  74. * differ between languages.
  75. *
  76. * <p>
  77. * <code>BreakIterator</code> is intended for use with natural
  78. * languages only. Do not use this class to tokenize a programming language.
  79. *
  80. * <P>
  81. * <strong>Examples</strong>:<P>
  82. * Creating and using text boundaries
  83. * <blockquote>
  84. * <pre>
  85. * public static void main(String args[]) {
  86. * if (args.length == 1) {
  87. * String stringToExamine = args[0];
  88. * //print each word in order
  89. * BreakIterator boundary = BreakIterator.getWordInstance();
  90. * boundary.setText(stringToExamine);
  91. * printEachForward(boundary, stringToExamine);
  92. * //print each sentence in reverse order
  93. * boundary = BreakIterator.getSentenceInstance(Locale.US);
  94. * boundary.setText(stringToExamine);
  95. * printEachBackward(boundary, stringToExamine);
  96. * printFirst(boundary, stringToExamine);
  97. * printLast(boundary, stringToExamine);
  98. * }
  99. * }
  100. * </pre>
  101. * </blockquote>
  102. *
  103. * Print each element in order
  104. * <blockquote>
  105. * <pre>
  106. * public static void printEachForward(BreakIterator boundary, String source) {
  107. * int start = boundary.first();
  108. * for (int end = boundary.next();
  109. * end != BreakIterator.DONE;
  110. * start = end, end = boundary.next()) {
  111. * System.out.println(source.substring(start,end));
  112. * }
  113. * }
  114. * </pre>
  115. * </blockquote>
  116. *
  117. * Print each element in reverse order
  118. * <blockquote>
  119. * <pre>
  120. * public static void printEachBackward(BreakIterator boundary, String source) {
  121. * int end = boundary.last();
  122. * for (int start = boundary.previous();
  123. * start != BreakIterator.DONE;
  124. * end = start, start = boundary.previous()) {
  125. * System.out.println(source.substring(start,end));
  126. * }
  127. * }
  128. * </pre>
  129. * </blockquote>
  130. *
  131. * Print first element
  132. * <blockquote>
  133. * <pre>
  134. * public static void printFirst(BreakIterator boundary, String source) {
  135. * int start = boundary.first();
  136. * int end = boundary.next();
  137. * System.out.println(source.substring(start,end));
  138. * }
  139. * </pre>
  140. * </blockquote>
  141. *
  142. * Print last element
  143. * <blockquote>
  144. * <pre>
  145. * public static void printLast(BreakIterator boundary, String source) {
  146. * int end = boundary.last();
  147. * int start = boundary.previous();
  148. * System.out.println(source.substring(start,end));
  149. * }
  150. * </pre>
  151. * </blockquote>
  152. *
  153. * Print the element at a specified position
  154. * <blockquote>
  155. * <pre>
  156. * public static void printAt(BreakIterator boundary, int pos, String source) {
  157. * int end = boundary.following(pos);
  158. * int start = boundary.previous();
  159. * System.out.println(source.substring(start,end));
  160. * }
  161. * </pre>
  162. * </blockquote>
  163. *
  164. * Find the next word
  165. * <blockquote>
  166. * <pre>
  167. * public static int nextWordStartAfter(int pos, String text) {
  168. * BreakIterator wb = BreakIterator.getWordInstance();
  169. * wb.setText(text);
  170. * int last = wb.following(pos);
  171. * int current = wb.next();
  172. * while (current != BreakIterator.DONE) {
  173. * for (int p = last; p < current; p++) {
  174. * if (Character.isLetter(text.charAt(p)))
  175. * return last;
  176. * }
  177. * last = current;
  178. * current = wb.next();
  179. * }
  180. * return BreakIterator.DONE;
  181. * }
  182. * </pre>
  183. * (The iterator returned by BreakIterator.getWordInstance() is unique in that
  184. * the break positions it returns don't represent both the start and end of the
  185. * thing being iterated over. That is, a sentence-break iterator returns breaks
  186. * that each represent the end of one sentence and the beginning of the next.
  187. * With the word-break iterator, the characters between two boundaries might be a
  188. * word, or they might be the punctuation or whitespace between two words. The
  189. * above code uses a simple heuristic to determine which boundary is the beginning
  190. * of a word: If the characters between this boundary and the next boundary
  191. * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
  192. * a Hangul syllable, a Kana character, etc.), then the text between this boundary
  193. * and the next is a word; otherwise, it's the material between words.)
  194. * </blockquote>
  195. *
  196. * @see CharacterIterator
  197. *
  198. */
  199. public abstract class BreakIterator implements Cloneable
  200. {
  201. /**
  202. * Constructor. BreakIterator is stateless and has no default behavior.
  203. */
  204. protected BreakIterator()
  205. {
  206. }
  207. /**
  208. * Create a copy of this iterator
  209. * @return A copy of this
  210. */
  211. public Object clone()
  212. {
  213. try {
  214. return super.clone();
  215. }
  216. catch (CloneNotSupportedException e) {
  217. throw new InternalError();
  218. }
  219. }
  220. /**
  221. * DONE is returned by previous() and next() after all valid
  222. * boundaries have been returned.
  223. */
  224. public static final int DONE = -1;
  225. /**
  226. * Return the first boundary. The iterator's current position is set
  227. * to the first boundary.
  228. * @return The character index of the first text boundary.
  229. */
  230. public abstract int first()
  231. ;
  232. /**
  233. * Return the last boundary. The iterator's current position is set
  234. * to the last boundary.
  235. * @return The character index of the last text boundary.
  236. */
  237. public abstract int last();
  238. /**
  239. * Return the nth boundary from the current boundary
  240. * @param n which boundary to return. A value of 0
  241. * does nothing. Negative values move to previous boundaries
  242. * and positive values move to later boundaries.
  243. * @return The index of the nth boundary from the current position.
  244. */
  245. public abstract int next(int n);
  246. /**
  247. * Return the boundary following the current boundary.
  248. * @return The character index of the next text boundary or DONE if all
  249. * boundaries have been returned. Equivalent to next(1).
  250. */
  251. public abstract int next();
  252. /**
  253. * Return the boundary preceding the current boundary.
  254. * @return The character index of the previous text boundary or DONE if all
  255. * boundaries have been returned.
  256. */
  257. public abstract int previous();
  258. /**
  259. * Return the first boundary following the specified offset.
  260. * The value returned is always greater than the offset or
  261. * the value BreakIterator.DONE
  262. * @param offset the offset to begin scanning. Valid values
  263. * are determined by the CharacterIterator passed to
  264. * setText(). Invalid values cause
  265. * an IllegalArgumentException to be thrown.
  266. * @return The first boundary after the specified offset.
  267. */
  268. public abstract int following(int offset);
  269. /**
  270. * Return the last boundary preceding the specfied offset.
  271. * The value returned is always less than the offset or the value
  272. * BreakIterator.DONE.
  273. * @param offset the offset to begin scanning. Valid values are
  274. * determined by the CharacterIterator passed to setText().
  275. * Invalid values cause an IllegalArgumentException to be thrown.
  276. * @return The last boundary before the specified offset.
  277. */
  278. public int preceding(int offset) {
  279. // NOTE: This implementation is here solely because we can't add new
  280. // abstract methods to an existing class. There is almost ALWAYS a
  281. // better, faster way to do this.
  282. int pos = following(offset);
  283. while (pos >= offset && pos != DONE)
  284. pos = previous();
  285. return pos;
  286. }
  287. /**
  288. * Return true if the specified position is a boundary position.
  289. * @param offset the offset to check.
  290. * @return True if "offset" is a boundary position.
  291. */
  292. public boolean isBoundary(int offset) {
  293. // NOTE: This implementation probably is wrong for most situations
  294. // because it fails to take into account the possibility that a
  295. // CharacterIterator passed to setText() may not have a begin offset
  296. // of 0. But since the abstract BreakIterator doesn't have that
  297. // knowledge, it assumes the begin offset is 0. If you subclass
  298. // BreakIterator, copy the SimpleTextBoundary implementation of this
  299. // function into your subclass. [This should have been abstract at
  300. // this level, but it's too late to fix that now.]
  301. if (offset == 0)
  302. return true;
  303. else
  304. return following(offset - 1) == offset;
  305. }
  306. /**
  307. * Return character index of the text boundary that was most recently
  308. * returned by next(), previous(), first(), or last()
  309. * @return The boundary most recently returned.
  310. */
  311. public abstract int current();
  312. /**
  313. * Get the text being scanned
  314. * @return the text being scanned
  315. */
  316. public abstract CharacterIterator getText();
  317. /**
  318. * Set a new text string to be scanned. The current scan
  319. * position is reset to first().
  320. * @param newText new text to scan.
  321. */
  322. public void setText(String newText)
  323. {
  324. setText(new StringCharacterIterator(newText));
  325. }
  326. /**
  327. * Set a new text for scanning. The current scan
  328. * position is reset to first().
  329. * @param newText new text to scan.
  330. */
  331. public abstract void setText(CharacterIterator newText);
  332. /**
  333. * Create BreakIterator for word-breaks using default locale.
  334. * Returns an instance of a BreakIterator implementing word breaks.
  335. * WordBreak is usefull for word selection (ex. double click)
  336. * @return A BreakIterator for word-breaks
  337. * @see java.util.Locale#getDefault
  338. */
  339. public static BreakIterator getWordInstance()
  340. {
  341. return getWordInstance(Locale.getDefault());
  342. }
  343. /**
  344. * Create BreakIterator for word-breaks using specified locale.
  345. * Returns an instance of a BreakIterator implementing word breaks.
  346. * WordBreak is usefull for word selection (ex. double click)
  347. * @param where the local. If a specific WordBreak is not
  348. * avaliable for the specified locale, a default WordBreak is returned.
  349. * @return A BreakIterator for word-breaks
  350. */
  351. public static BreakIterator getWordInstance(Locale where)
  352. {
  353. return new SimpleTextBoundary(new WordBreakData());
  354. }
  355. /**
  356. * Create BreakIterator for line-breaks using default locale.
  357. * Returns an instance of a BreakIterator implementing line breaks. Line
  358. * breaks are logically possible line breaks, actual line breaks are
  359. * usually determined based on display width.
  360. * LineBreak is useful for word wrapping text.
  361. * @return A BreakIterator for line-breaks
  362. * @see java.util.Locale#getDefault
  363. */
  364. public static BreakIterator getLineInstance()
  365. {
  366. return getLineInstance(Locale.getDefault());
  367. }
  368. /**
  369. * Create BreakIterator for line-breaks using specified locale.
  370. * Returns an instance of a BreakIterator implementing line breaks. Line
  371. * breaks are logically possible line breaks, actual line breaks are
  372. * usually determined based on display width.
  373. * LineBreak is useful for word wrapping text.
  374. * @param where the local. If a specific LineBreak is not
  375. * avaliable for the specified locale, a default LineBreak is returned.
  376. * @return A BreakIterator for line-breaks
  377. */
  378. public static BreakIterator getLineInstance(Locale where)
  379. {
  380. return new SimpleTextBoundary(new LineBreakData());
  381. }
  382. /**
  383. * Create BreakIterator for character-breaks using default locale
  384. * Returns an instance of a BreakIterator implementing character breaks.
  385. * Character breaks are boundaries of combining character sequences.
  386. * @return A BreakIterator for character-breaks
  387. * @see Locale#getDefault
  388. */
  389. public static BreakIterator getCharacterInstance()
  390. {
  391. return getCharacterInstance(Locale.getDefault());
  392. }
  393. /**
  394. * Create BreakIterator for character-breaks using specified locale
  395. * Returns an instance of a BreakIterator implementing character breaks.
  396. * Character breaks are boundaries of combining character sequences.
  397. * @param where the local. If a specific character break is not
  398. * avaliable for the specified local, a default character break is returned.
  399. * @return A BreakIterator for character-breaks
  400. */
  401. public static BreakIterator getCharacterInstance(Locale where)
  402. {
  403. return new SimpleTextBoundary(new CharacterBreakData());
  404. }
  405. /**
  406. * Create BreakIterator for sentence-breaks using default locale
  407. * Returns an instance of a BreakIterator implementing sentence breaks.
  408. * @return A BreakIterator for sentence-breaks
  409. * @see java.util.Locale#getDefault
  410. */
  411. public static BreakIterator getSentenceInstance()
  412. {
  413. return getSentenceInstance(Locale.getDefault());
  414. }
  415. /**
  416. * Create BreakIterator for sentence-breaks using specified locale
  417. * Returns an instance of a BreakIterator implementing sentence breaks.
  418. * @param where the local. If a specific SentenceBreak is not
  419. * avaliable for the specified local, a default SentenceBreak is returned.
  420. * @return A BreakIterator for sentence-breaks
  421. */
  422. public static BreakIterator getSentenceInstance(Locale where)
  423. {
  424. return new SimpleTextBoundary(new SentenceBreakData());
  425. }
  426. /**
  427. * Get the set of Locales for which BreakIterators are installed
  428. * @return available locales
  429. */
  430. public static synchronized Locale[] getAvailableLocales()
  431. {
  432. //FIX ME - this is a known bug. It should return
  433. //all locales.
  434. return LocaleData.getAvailableLocales("NumberPatterns");
  435. }
  436. }