/*
 * @(#)SentenceBreakData.java 1.23 03/01/23
 *
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */
package java.text;
  22. /**
  23. * The SentenceBreakData contains data used by SimpleTextBoundary
  24. * to determine sentence breaks.
  25. * @see #BreakIterator
  26. */
  27. final class SentenceBreakData extends TextBoundaryData
  28. {
  29. // THEORY OF OPERATION: This class contains all the tables necessary to do
  30. // character-break iteration. This class descends from TextBoundaryData, which
  31. // is abstract. This class doesn't define any non-static members; it inherits the
  32. // non-static members from TextBoundaryData and fills them in with pointers to
  33. // the static members defined here.
  34. // There are two main parts to a TextBoundaryData object: the state-transition
  35. // tables and the character-mapping tables. The forward state table defines the
  36. // transitions for a deterministic finite state machine that locates character
  37. // boundaries. The rows are the states and the columns are character categories.
  38. // The cell values consist of two parts: The first is the row number of the next
  39. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  40. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  41. // fill it with STOP values by convention.) The second part is a flag indicating
  42. // whether the iterator should update its break position on this transition. When
  43. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  44. // the flag bit being turned on-- we do it this way rather than just using negative
  45. // numbers because we still need to see the SI flag when the value of the transition
  46. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  47. // is 1.
  48. // The backward state table works the same way as the forward state table, but is
  49. // usually simplified. The iterator uses the backward state table only to find a
  50. // "safe place" to start iterating forward. It then seeks forward from the "safe
  51. // place" to the actual break position using the forward table. A "safe place" is
  52. // a spot in the text that is guaranteed to be a break position.
  53. // The character-category mapping tables are split into several pieces, one for
  54. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  55. // character categories to the character categories used by this break iterator.
  56. // The index of the array is the Unicode category number as returned by
  57. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  58. // indicating whether all the characters in the Unicode category have the
  59. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  60. // an entry is true, then we find the right category using... 3) The kExceptionChar
  61. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  62. // a range of contiguous characters that share the same category and the category
  63. // number. This list is binary-searched to find an entry corresponding to the
  64. // charactre being mapped. Only characters whose breaking category is different from
  65. // the raw-mapping value (the breaking category for their Unicode category) are
  66. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  67. // in the Latin1 range. This table maps straight from a character value to a
  68. // category number, bypassing all the other tables. The programmer must take care
  69. // that all of the different category-mapping tables are consistent.
  70. // In the current implementation, all of these tables are created and maintained
  71. // by hand, not using a tool.
  72. private static final byte other = 0; // characters not otherwise mentioned
  73. private static final byte space = 1; // whitespace
  74. private static final byte terminator = 2; // characters that always mark the end of a
  75. // sentence (? ! etc.)
  76. private static final byte ambiguosTerm = 3; // characters that may mark the end of a
  77. // sentence (periods)
  78. private static final byte openBracket = 4; // Opening punctuation that may occur before
  79. // the beginning of a sentence
  80. private static final byte closeBracket = 5; // Closing punctuation that may occur after
  81. // the end of a sentence
  82. private static final byte cjk = 6; // Characters where the previous sentence
  83. // does not have a space after a terminator.
  84. // Common in Japanese, Chinese, and Korean
  85. private static final byte paragraphBreak = 7;
  86. // the Unicode paragraph-break character
  87. private static final byte lowerCase = 8; // lower-case letters
  88. private static final byte upperCase = 9; // upper-case letters
  89. private static final byte number = 10; // digits
  90. private static final byte quote = 11; // the ASCII quote mark, which may be
  91. // either opening or closing punctuation
  92. private static final byte nsm = 12; // Unicode non-spacing marks
  93. private static final byte EOS = 13; // end of string
  94. private static final int COL_COUNT = 14; // number of categories
  95. private static final byte SI = (byte)0x80;
  96. private static final byte STOP = (byte) 0;
  97. private static final byte SI_STOP = (byte)SI + STOP;
  98. public SentenceBreakData() {
  99. super(kSentenceForward, kSentenceBackward, kSentenceMap);
  100. }
  101. // This table implements a relative simple heuristic for locating sentence
  102. // boundaries. It doesn't always work right (one common case is "Mr. Smith",
  103. // where it'll break between "Mr." and "Smith"), but is a pretty close
  104. // approximation.
  105. // The table implements these rules:
  106. // 1) Unless otherwise mentioned, don't break between characters. (state 1)
  107. // 2) If you see an unambiguous sentence terminator, continue seeking past more
  108. // terminators (if there are any), closing punctuation (if any), whitespace
  109. // (if any), and one paragraph separator (if any), in that order. The first
  110. // time you see an unexpected character, that's where the break goes.
  111. // (states 2 and 3)
  112. // 3) If you see a period followed by a Kanji character, there's a sentence break
  113. // after the period. If you see a period followed by whitespace or opening
  114. // punctuation, there's a break after the whitespace or before the opening
  115. // punctuation unless the next character is a lower-case letter,
  116. // a digit, closing punctuation, or a paragraph separator. If you see a
  117. // period followed by whitespace, followed by opening punctuation, there's a
  118. // break after the whitespace if the first character after the opening punctuation
  119. // is a capital letter, and a break after the opening punctuation if the next
  120. // character is anything other than a lower-case letter. (states 5, 6, and 7)
  121. // 4) There is ALWAYS a sentence break after a paragraph separator. (state 4)
  122. // 5) Non-spacing marks are transparent to the algorithm. (the nsm column)
  123. private static final byte kSentenceForwardData[] =
  124. {
  125. // other space terminator ambTerm
  126. // open close CJK PB
  127. // lower upper digit Quote
  128. // nsm EOS
  129. // 0 - dummy state
  130. STOP, STOP, STOP, STOP,
  131. STOP, STOP, STOP, STOP,
  132. STOP, STOP, STOP, STOP,
  133. STOP, STOP,
  134. // 1 - this is the main state, which just eats characters
  135. // until it sees a paragraph break or a sentence-terminating
  136. // character (all states loop back to here if they
  137. // don't see the right sequence of things that denotes the
  138. // end of a sentence).
  139. (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+5),
  140. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+4),
  141. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+1),
  142. (byte)(SI+1), SI_STOP,
  143. // 2 - This state is triggered when we pass an unambiguous
  144. // sentence terminator. It eats terminating characters
  145. // and closing punctuation, passes whitespace and paragraph
  146. // separators, switches to state 5 on periods, and stops
  147. // on everything else.
  148. SI_STOP, (byte)(SI+3), (byte)(SI+2), (byte)(SI+5),
  149. SI_STOP, (byte)(SI+2), SI_STOP, (byte)(SI+4),
  150. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+2),
  151. (byte)(SI+2), SI_STOP,
  152. // 3 - This state eats trailing whitespace after a sentence.
  153. // It passes paragraph separators, but stops on anything else.
  154. SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  155. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+4),
  156. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  157. (byte)(SI+3), SI_STOP,
  158. // 4 - This state handles paragraph separators by eating them
  159. // and then stopping.
  160. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  161. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  162. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  163. SI_STOP, SI_STOP,
  164. // 5 - This state handles periods and other ambiguous sentence
  165. // terminators. It'll go back to state 2 on an unambiguous
  166. // terminator. It'll eat trailing punctuation and additional
  167. // periods. It stops on Kanji (a sentence in Kanji doesn't
  168. // have to be followed by whitespace), advances to state 6
  169. // on whitespace, and loops back to the starting state
  170. // on anything else (i.e., this wasn't actually the end
  171. // of a sentence).
  172. (byte)(SI+1), (byte)(SI+6), (byte)(SI+2), (byte)(SI+5),
  173. (byte)(SI+7), (byte)(SI+5), SI_STOP, (byte)(SI+4),
  174. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+5),
  175. (byte)(SI+5), SI_STOP,
  176. // 6 - This state handles whitespace after a period. It eats
  177. // any additional whitespace and passes paragraph breaks.
  178. // It'll loop back on lower-case letters and digits (not the
  179. // end of a sentence) and stop (yes the end of a sentence)
  180. // on most other characters. Opening punctuation requires
  181. // more lookahead and transitions to state 7.
  182. SI_STOP, (byte)(SI+6), SI_STOP, SI_STOP,
  183. (byte)(SI+7), (byte)(SI+1), SI_STOP, (byte)(SI+4),
  184. (byte)(SI+1), SI_STOP, (byte)(SI+1), SI_STOP,
  185. (byte)(SI+6), SI_STOP,
  186. // 7 - This state handles opening punctuation after whitespace
  187. // after a period. It stops unless the next character is a
  188. // lower-case letter (it rewinds back to before the sequence
  189. // opening punctuation and THEN stops if the character is an
  190. // upper-case letter). It loops (without advancing the break
  191. // position while eating additional opening punctuation.
  192. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  193. (byte)(7), SI_STOP, SI_STOP, SI_STOP,
  194. (byte)(SI+1), STOP, SI_STOP, SI_STOP,
  195. (byte)(SI+7), SI_STOP
  196. };
  197. private static final WordBreakTable kSentenceForward
  198. = new WordBreakTable(COL_COUNT, kSentenceForwardData);
  199. // This table locates a safe place for backward or random-access iterator
  200. // to turn around and seek forward.
  201. // 1) There is never a safe place to turn around before a non-spacing
  202. // mark. (state 1)
  203. // 2) There is always a sentence break after a paragraph separator.
  204. // (the PB column)
  205. // 3) If you see a closing punctuation mark or a Kanji character preceded
  206. // by whitespace, we can turn around and seek forward when we see a
  207. // sentence terminator.
  208. private static final byte kSentenceBackwardData[] =
  209. {
  210. // other space terminator ambTerm
  211. // open close CJK PB
  212. // lower upper digit quote
  213. // nsm EOS
  214. // 0
  215. STOP, STOP, STOP, STOP,
  216. STOP, STOP, STOP, STOP,
  217. STOP, STOP, STOP, STOP,
  218. STOP, STOP,
  219. // 1
  220. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  221. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  222. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  223. (byte)(SI+1), STOP,
  224. // 2
  225. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  226. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  227. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  228. (byte)(SI+2), STOP,
  229. // 3
  230. (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+2),
  231. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  232. (byte)(SI+3), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  233. (byte)(SI+3), STOP,
  234. // 4
  235. (byte)(SI+2), (byte)(SI+4), SI_STOP, SI_STOP,
  236. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  237. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  238. (byte)(SI+4), STOP
  239. };
  240. private static final WordBreakTable kSentenceBackward
  241. = new WordBreakTable(COL_COUNT, kSentenceBackwardData);
  242. private static final int kRawMapping[] =
  243. {
  244. other, // UNASSIGNED = 0,
  245. upperCase, // UPPERCASE_LETTER = 1,
  246. lowerCase, // LOWERCASE_LETTER = 2,
  247. other, // TITLECASE_LETTER = 3,
  248. other, // MODIFIER_LETTER = 4,
  249. other, // OTHER_LETTER = 5,
  250. nsm, // NON_SPACING_MARK = 6,
  251. nsm, // ENCLOSING_MARK = 7,
  252. other, // COMBINING_SPACING_MARK = 8,
  253. number, // DECIMAL_DIGIT_NUMBER = 9,
  254. number, // LETTER_NUMBER = 10,
  255. number, // OTHER_NUMBER = 11,
  256. space, // SPACE_SEPARATOR = 12,
  257. space, // LINE_SEPARATOR = 13,
  258. space, // PARAGRAPH_SEPARATOR = 14, ???????
  259. other, // CONTROL = 15,
  260. other, // PRIVATE_USE = 16,
  261. other, // FORMAT = 17,
  262. other, // ???? = 18,
  263. other, // SURROGATE = 19,
  264. other, // DASH_PUNCTUATION = 20,
  265. openBracket, // START_PUNCTUATION = 21,
  266. closeBracket, // END_PUNCTUATION = 22,
  267. other, // CONNECTOR_PUNCTUATION = 23,
  268. other, // OTHER_PUNCTUATION = 24,
  269. other, // MATH_SYMBOL = 25,
  270. other, // CURRENCY_SYMBOL = 26,
  271. other, // MODIFIER_SYMBOL = 27,
  272. other, // OTHER_SYMBOL = 28;
  273. openBracket, // INITIAL_QUOTE_PUNCTUATION = 29,
  274. closeBracket, // FINAL_QUOTE_PUNCTUATION = 30,
  275. };
  276. private static final SpecialMapping kExceptionChar[] =
  277. {
  278. //note: the ranges in this table must be sorted in ascending order
  279. //as required by the UnicodeClassMapping class.
  280. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, space),
  281. new SpecialMapping(ASCII_LINEFEED, space),
  282. new SpecialMapping(ASCII_FORM_FEED, terminator),
  283. new SpecialMapping(ASCII_CARRIAGE_RETURN, space),
  284. new SpecialMapping(ASCII_EXCLAMATION_MARK, terminator),
  285. new SpecialMapping(ASCII_QUOTATION_MARK, quote),
  286. new SpecialMapping(ASCII_APOSTROPHE, quote),
  287. new SpecialMapping(ASCII_FULL_STOP, ambiguosTerm),
  288. new SpecialMapping(ASCII_QUESTION_MARK, terminator),
  289. new SpecialMapping(ASCII_NONBREAKING_SPACE, other),
  290. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, space),
  291. new SpecialMapping(PUNCTUATION_PARAGRAPH_SEPARATOR, paragraphBreak),
  292. new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_FULL_STOP, terminator),
  293. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, cjk),
  294. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  295. HIRAGANA_SEMIVOICED_SOUND_MARK, cjk), // cjk
  296. new SpecialMapping(KATAKANA_LETTER_SMALL_A, KATAKANA_LETTER_SMALL_KE,
  297. cjk), // cjk
  298. new SpecialMapping(UNICODE_LOW_BOUND_HAN, UNICODE_HIGH_BOUND_HAN, cjk),
  299. new SpecialMapping(CJK_COMPATIBILITY_F900, CJK_COMPATIBILITY_FA2D,cjk),
  300. new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, other),
  301. new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, terminator),
  302. new SpecialMapping(FULLWIDTH_FULL_STOP, terminator),
  303. new SpecialMapping(FULLWIDTH_QUESTION_MARK, terminator),
  304. new SpecialMapping(END_OF_STRING, EOS)
  305. };
  306. private static final boolean SentenceExceptionFlags[] = {
  307. false, // kNonCharacter = 0,
  308. false, // kUppercaseLetter = 1,
  309. false, // kLowercaseLetter = 2,
  310. false, // kTitlecaseLetter = 3,
  311. false, // kModifierLetter = 4,
  312. true, // kOtherLetter = 5,
  313. true, // kNonSpacingMark = 6,
  314. false, // kEnclosingMark = 7,
  315. false, // kCombiningSpacingMark = 8,
  316. false, // kDecimalNumber = 9,
  317. false, // kLetterNumber = 10,
  318. false, // kOtherNumber = 11,
  319. true, // kSpaceSeparator = 12,
  320. true, // kLineSeparator = 13,
  321. true, // kParagraphSeparator = 14,
  322. true, // kControlCharacter = 15,
  323. true, // kFormatCharacter = 16,
  324. false, // UNDEFINED = 17,
  325. false, // kPrivateUseCharacter = 18,
  326. false, // kSurrogate = 19,
  327. false, // kDashPunctuation = 20,
  328. false, // kOpenPunctuation = 21,
  329. false, // kClosePunctuation = 22,
  330. false, // kConnectorPunctuation = 23,
  331. true, // kOtherPunctuation = 24,
  332. false, // kMathSymbol = 25,
  333. false, // kCurrencySymbol = 26,
  334. false, // kModifierSymbol = 27,
  335. false, // kOtherSymbol = 28,
  336. false, // kInitialQuotePunctuation = 29,
  337. false, // kFinalQuotePunctuation = 30,
  338. };
  339. private static final int kSentenceAsciiValues[] = {
  340. // null soh stx etx eot enq ask bell
  341. other, other, other, other, other, other, other, other,
  342. // bs ht lf vt ff cr so si
  343. other, space, space, other, terminator, space, other, other,
  344. // dle dc1 dc2 dc3 dc4 nak syn etb
  345. other, other, other, other, other, other, other, other,
  346. // can em sub esc fs gs rs us
  347. other, other, other, other, other, other, other, other,
  348. // sp ! " # $ % & '
  349. space, terminator, quote, other, other, other, other, quote,
  350. // ( ) * + , - . /
  351. openBracket, closeBracket, other, other, other, other, ambiguosTerm, other,
  352. // 0 1 2 3 4 5 6 7
  353. number, number, number, number, number, number, number, number,
  354. // 8 9 : ; < = > ?
  355. number, number, other, other, other, other, other, terminator,
  356. // @ A B C D E F G
  357. other, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  358. // H I J K L M N O
  359. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  360. // P Q R S T U V W
  361. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  362. // X Y Z [ \ ] ^ _
  363. upperCase, upperCase, upperCase, openBracket, other, closeBracket, other, other,
  364. // ` a b c d e f g
  365. other, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  366. // h i j k l m n o
  367. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  368. // p q r s t u v w
  369. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  370. // x y z { | } ~ del
  371. lowerCase, lowerCase, lowerCase, openBracket, other, closeBracket, other, other,
  372. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  373. other, other, other, other, other, other, other, other,
  374. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  375. other, other, other, other, other, other, other, other,
  376. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  377. other, other, other, other, other, other, other, other,
  378. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  379. other, other, other, other, other, other, other, other,
  380. // nbsp inv-! cents pounds currency yen broken-bar section
  381. other, other, other, other, other, other, other, other,
  382. // umlaut copyright super-a gui-left not soft-hyph registered macron
  383. other, other, lowerCase, openBracket, other, other, other, other,
  384. // degree +/- super-2 super-3 acute micro paragraph bullet
  385. other, other, number, number, other, lowerCase, other, other,
  386. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  387. other, lowerCase, other, closeBracket, number, number, number, other,
  388. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  389. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  390. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  391. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  392. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  393. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, other,
  394. // O=slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  395. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, lowerCase,
  396. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  397. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  398. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  399. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  400. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  401. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, other,
  402. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y=umlaut
  403. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase
  404. };
  405. private static final UnicodeClassMapping kSentenceMap
  406. = new UnicodeClassMapping(kRawMapping, kExceptionChar, SentenceExceptionFlags,
  407. kSentenceAsciiValues);
  408. }