/*
 * @(#)SentenceBreakData.java 1.20 00/01/19
 *
 * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
 *
 * This software is the proprietary information of Sun Microsystems, Inc.
 * Use is subject to license terms.
 *
 */
/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */
package java.text;
  25. /**
  26. * The SentenceBreakData contains data used by SimpleTextBoundary
  27. * to determine sentence breaks.
  28. * @see #BreakIterator
  29. */
  30. final class SentenceBreakData extends TextBoundaryData
  31. {
  32. // THEORY OF OPERATION: This class contains all the tables necessary to do
  33. // character-break iteration. This class descends from TextBoundaryData, which
  34. // is abstract. This class doesn't define any non-static members; it inherits the
  35. // non-static members from TextBoundaryData and fills them in with pointers to
  36. // the static members defined here.
  37. // There are two main parts to a TextBoundaryData object: the state-transition
  38. // tables and the character-mapping tables. The forward state table defines the
  39. // transitions for a deterministic finite state machine that locates character
  40. // boundaries. The rows are the states and the columns are character categories.
  41. // The cell values consist of two parts: The first is the row number of the next
  42. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  43. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  44. // fill it with STOP values by convention.) The second part is a flag indicating
  45. // whether the iterator should update its break position on this transition. When
  46. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  47. // the flag bit being turned on-- we do it this way rather than just using negative
  48. // numbers because we still need to see the SI flag when the value of the transition
  49. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  50. // is 1.
  51. // The backward state table works the same way as the forward state table, but is
  52. // usually simplified. The iterator uses the backward state table only to find a
  53. // "safe place" to start iterating forward. It then seeks forward from the "safe
  54. // place" to the actual break position using the forward table. A "safe place" is
  55. // a spot in the text that is guaranteed to be a break position.
  56. // The character-category mapping tables are split into several pieces, one for
  57. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  58. // character categories to the character categories used by this break iterator.
  59. // The index of the array is the Unicode category number as returned by
  60. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  61. // indicating whether all the characters in the Unicode category have the
  62. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  63. // an entry is true, then we find the right category using... 3) The kExceptionChar
  64. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  65. // a range of contiguous characters that share the same category and the category
  66. // number. This list is binary-searched to find an entry corresponding to the
  67. // charactre being mapped. Only characters whose breaking category is different from
  68. // the raw-mapping value (the breaking category for their Unicode category) are
  69. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  70. // in the Latin1 range. This table maps straight from a character value to a
  71. // category number, bypassing all the other tables. The programmer must take care
  72. // that all of the different category-mapping tables are consistent.
  73. // In the current implementation, all of these tables are created and maintained
  74. // by hand, not using a tool.
  75. private static final byte other = 0; // characters not otherwise mentioned
  76. private static final byte space = 1; // whitespace
  77. private static final byte terminator = 2; // characters that always mark the end of a
  78. // sentence (? ! etc.)
  79. private static final byte ambiguosTerm = 3; // characters that may mark the end of a
  80. // sentence (periods)
  81. private static final byte openBracket = 4; // Opening punctuation that may occur before
  82. // the beginning of a sentence
  83. private static final byte closeBracket = 5; // Closing punctuation that may occur after
  84. // the end of a sentence
  85. private static final byte cjk = 6; // Characters where the previous sentence
  86. // does not have a space after a terminator.
  87. // Common in Japanese, Chinese, and Korean
  88. private static final byte paragraphBreak = 7;
  89. // the Unicode paragraph-break character
  90. private static final byte lowerCase = 8; // lower-case letters
  91. private static final byte upperCase = 9; // upper-case letters
  92. private static final byte number = 10; // digits
  93. private static final byte quote = 11; // the ASCII quote mark, which may be
  94. // either opening or closing punctuation
  95. private static final byte nsm = 12; // Unicode non-spacing marks
  96. private static final byte EOS = 13; // end of string
  97. private static final int COL_COUNT = 14; // number of categories
  98. private static final byte SI = (byte)0x80;
  99. private static final byte STOP = (byte) 0;
  100. private static final byte SI_STOP = (byte)SI + STOP;
  101. public SentenceBreakData() {
  102. super(kSentenceForward, kSentenceBackward, kSentenceMap);
  103. }
  104. // This table implements a relative simple heuristic for locating sentence
  105. // boundaries. It doesn't always work right (one common case is "Mr. Smith",
  106. // where it'll break between "Mr." and "Smith"), but is a pretty close
  107. // approximation.
  108. // The table implements these rules:
  109. // 1) Unless otherwise mentioned, don't break between characters. (state 1)
  110. // 2) If you see an unambiguous sentence terminator, continue seeking past more
  111. // terminators (if there are any), closing punctuation (if any), whitespace
  112. // (if any), and one paragraph separator (if any), in that order. The first
  113. // time you see an unexpected character, that's where the break goes.
  114. // (states 2 and 3)
  115. // 3) If you see a period followed by a Kanji character, there's a sentence break
  116. // after the period. If you see a period followed by whitespace or opening
  117. // punctuation, there's a break after the whitespace or before the opening
  118. // punctuation unless the next character is a lower-case letter,
  119. // a digit, closing punctuation, or a paragraph separator. If you see a
  120. // period followed by whitespace, followed by opening punctuation, there's a
  121. // break after the whitespace if the first character after the opening punctuation
  122. // is a capital letter, and a break after the opening punctuation if the next
  123. // character is anything other than a lower-case letter. (states 5, 6, and 7)
  124. // 4) There is ALWAYS a sentence break after a paragraph separator. (state 4)
  125. // 5) Non-spacing marks are transparent to the algorithm. (the nsm column)
  126. private static final byte kSentenceForwardData[] =
  127. {
  128. // other space terminator ambTerm
  129. // open close CJK PB
  130. // lower upper digit Quote
  131. // nsm EOS
  132. // 0 - dummy state
  133. STOP, STOP, STOP, STOP,
  134. STOP, STOP, STOP, STOP,
  135. STOP, STOP, STOP, STOP,
  136. STOP, STOP,
  137. // 1 - this is the main state, which just eats characters
  138. // until it sees a paragraph break or a sentence-terminating
  139. // character (all states loop back to here if they
  140. // don't see the right sequence of things that denotes the
  141. // end of a sentence).
  142. (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+5),
  143. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+4),
  144. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+1),
  145. (byte)(SI+1), SI_STOP,
  146. // 2 - This state is triggered when we pass an unambiguous
  147. // sentence terminator. It eats terminating characters
  148. // and closing punctuation, passes whitespace and paragraph
  149. // separators, switches to state 5 on periods, and stops
  150. // on everything else.
  151. SI_STOP, (byte)(SI+3), (byte)(SI+2), (byte)(SI+5),
  152. SI_STOP, (byte)(SI+2), SI_STOP, (byte)(SI+4),
  153. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+2),
  154. (byte)(SI+2), SI_STOP,
  155. // 3 - This state eats trailing whitespace after a sentence.
  156. // It passes paragraph separators, but stops on anything else.
  157. SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  158. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+4),
  159. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  160. (byte)(SI+3), SI_STOP,
  161. // 4 - This state handles paragraph separators by eating them
  162. // and then stopping.
  163. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  164. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  165. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  166. SI_STOP, SI_STOP,
  167. // 5 - This state handles periods and other ambiguous sentence
  168. // terminators. It'll go back to state 2 on an unambiguous
  169. // terminator. It'll eat trailing punctuation and additional
  170. // periods. It stops on Kanji (a sentence in Kanji doesn't
  171. // have to be followed by whitespace), advances to state 6
  172. // on whitespace, and loops back to the starting state
  173. // on anything else (i.e., this wasn't actually the end
  174. // of a sentence).
  175. (byte)(SI+1), (byte)(SI+6), (byte)(SI+2), (byte)(SI+5),
  176. (byte)(SI+7), (byte)(SI+5), SI_STOP, (byte)(SI+4),
  177. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+5),
  178. (byte)(SI+5), SI_STOP,
  179. // 6 - This state handles whitespace after a period. It eats
  180. // any additional whitespace and passes paragraph breaks.
  181. // It'll loop back on lower-case letters and digits (not the
  182. // end of a sentence) and stop (yes the end of a sentence)
  183. // on most other characters. Opening punctuation requires
  184. // more lookahead and transitions to state 7.
  185. SI_STOP, (byte)(SI+6), SI_STOP, SI_STOP,
  186. (byte)(SI+7), (byte)(SI+1), SI_STOP, (byte)(SI+4),
  187. (byte)(SI+1), SI_STOP, (byte)(SI+1), SI_STOP,
  188. (byte)(SI+6), SI_STOP,
  189. // 7 - This state handles opening punctuation after whitespace
  190. // after a period. It stops unless the next character is a
  191. // lower-case letter (it rewinds back to before the sequence
  192. // opening punctuation and THEN stops if the character is an
  193. // upper-case letter). It loops (without advancing the break
  194. // position while eating additional opening punctuation.
  195. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  196. (byte)(7), SI_STOP, SI_STOP, SI_STOP,
  197. (byte)(SI+1), STOP, SI_STOP, SI_STOP,
  198. (byte)(SI+7), SI_STOP
  199. };
  200. private static final WordBreakTable kSentenceForward
  201. = new WordBreakTable(COL_COUNT, kSentenceForwardData);
  202. // This table locates a safe place for backward or random-access iterator
  203. // to turn around and seek forward.
  204. // 1) There is never a safe place to turn around before a non-spacing
  205. // mark. (state 1)
  206. // 2) There is always a sentence break after a paragraph separator.
  207. // (the PB column)
  208. // 3) If you see a closing punctuation mark or a Kanji character preceded
  209. // by whitespace, we can turn around and seek forward when we see a
  210. // sentence terminator.
  211. private static final byte kSentenceBackwardData[] =
  212. {
  213. // other space terminator ambTerm
  214. // open close CJK PB
  215. // lower upper digit quote
  216. // nsm EOS
  217. // 0
  218. STOP, STOP, STOP, STOP,
  219. STOP, STOP, STOP, STOP,
  220. STOP, STOP, STOP, STOP,
  221. STOP, STOP,
  222. // 1
  223. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  224. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  225. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  226. (byte)(SI+1), STOP,
  227. // 2
  228. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  229. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  230. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  231. (byte)(SI+2), STOP,
  232. // 3
  233. (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+2),
  234. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  235. (byte)(SI+3), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  236. (byte)(SI+3), STOP,
  237. // 4
  238. (byte)(SI+2), (byte)(SI+4), SI_STOP, SI_STOP,
  239. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  240. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  241. (byte)(SI+4), STOP
  242. };
  243. private static final WordBreakTable kSentenceBackward
  244. = new WordBreakTable(COL_COUNT, kSentenceBackwardData);
  245. private static final int kRawMapping[] =
  246. {
  247. other, // UNASSIGNED = 0,
  248. upperCase, // UPPERCASE_LETTER = 1,
  249. lowerCase, // LOWERCASE_LETTER = 2,
  250. other, // TITLECASE_LETTER = 3,
  251. other, // MODIFIER_LETTER = 4,
  252. other, // OTHER_LETTER = 5,
  253. nsm, // NON_SPACING_MARK = 6,
  254. nsm, // ENCLOSING_MARK = 7,
  255. other, // COMBINING_SPACING_MARK = 8,
  256. number, // DECIMAL_DIGIT_NUMBER = 9,
  257. number, // LETTER_NUMBER = 10,
  258. number, // OTHER_NUMBER = 11,
  259. space, // SPACE_SEPARATOR = 12,
  260. space, // LINE_SEPARATOR = 13,
  261. space, // PARAGRAPH_SEPARATOR = 14, ???????
  262. other, // CONTROL = 15,
  263. other, // PRIVATE_USE = 16,
  264. other, // FORMAT = 17,
  265. other, // ???? = 18,
  266. other, // SURROGATE = 19,
  267. other, // DASH_PUNCTUATION = 20,
  268. openBracket, // START_PUNCTUATION = 21,
  269. closeBracket, // END_PUNCTUATION = 22,
  270. other, // CONNECTOR_PUNCTUATION = 23,
  271. other, // OTHER_PUNCTUATION = 24,
  272. other, // MATH_SYMBOL = 25,
  273. other, // CURRENCY_SYMBOL = 26,
  274. other, // MODIFIER_SYMBOL = 27,
  275. other, // OTHER_SYMBOL = 28;
  276. };
  277. private static final SpecialMapping kExceptionChar[] =
  278. {
  279. //note: the ranges in this table must be sorted in ascending order
  280. //as required by the UnicodeClassMapping class.
  281. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, space),
  282. new SpecialMapping(ASCII_LINEFEED, space),
  283. new SpecialMapping(ASCII_FORM_FEED, terminator),
  284. new SpecialMapping(ASCII_CARRIAGE_RETURN, space),
  285. new SpecialMapping(ASCII_EXCLAMATION_MARK, terminator),
  286. new SpecialMapping(ASCII_QUOTATION_MARK, quote),
  287. new SpecialMapping(ASCII_APOSTROPHE, quote),
  288. new SpecialMapping(ASCII_FULL_STOP, ambiguosTerm),
  289. new SpecialMapping(ASCII_QUESTION_MARK, terminator),
  290. new SpecialMapping(ASCII_NONBREAKING_SPACE, other),
  291. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, space),
  292. new SpecialMapping(PUNCTUATION_PARAGRAPH_SEPARATOR, paragraphBreak),
  293. new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_FULL_STOP, terminator),
  294. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, cjk),
  295. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  296. HIRAGANA_SEMIVOICED_SOUND_MARK, cjk), // cjk
  297. new SpecialMapping(KATAKANA_LETTER_SMALL_A, KATAKANA_LETTER_SMALL_KE,
  298. cjk), // cjk
  299. new SpecialMapping(UNICODE_LOW_BOUND_HAN, UNICODE_HIGH_BOUND_HAN, cjk),
  300. new SpecialMapping(CJK_COMPATIBILITY_F900, CJK_COMPATIBILITY_FA2D,cjk),
  301. new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, other),
  302. new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, terminator),
  303. new SpecialMapping(FULLWIDTH_FULL_STOP, terminator),
  304. new SpecialMapping(FULLWIDTH_QUESTION_MARK, terminator),
  305. new SpecialMapping(END_OF_STRING, EOS)
  306. };
  307. private static final boolean SentenceExceptionFlags[] = {
  308. false, // kNonCharacter = 0,
  309. false, // kUppercaseLetter = 1,
  310. false, // kLowercaseLetter = 2,
  311. false, // kTitlecaseLetter = 3,
  312. false, // kModifierLetter = 4,
  313. true, // kOtherLetter = 5,
  314. true, // kNonSpacingMark = 6,
  315. false, // kEnclosingMark = 7,
  316. false, // kCombiningSpacingMark = 8,
  317. false, // kDecimalNumber = 9,
  318. false, // kLetterNumber = 10,
  319. false, // kOtherNumber = 11,
  320. true, // kSpaceSeparator = 12,
  321. true, // kLineSeparator = 13,
  322. true, // kParagraphSeparator = 14,
  323. true, // kControlCharacter = 15,
  324. true, // kFormatCharacter = 16,
  325. false, // UNDEFINED = 17,
  326. false, // kPrivateUseCharacter = 18,
  327. false, // kSurrogate = 19,
  328. false, // kDashPunctuation = 20,
  329. false, // kOpenPunctuation = 21,
  330. false, // kClosePunctuation = 22,
  331. false, // kConnectorPunctuation = 23,
  332. true, // kOtherPunctuation = 24,
  333. false, // kMathSymbol = 25,
  334. false, // kCurrencySymbol = 26,
  335. false, // kModifierSymbol = 27,
  336. false // kOtherSymbol = 28
  337. };
  338. private static final int kSentenceAsciiValues[] = {
  339. // null soh stx etx eot enq ask bell
  340. other, other, other, other, other, other, other, other,
  341. // bs ht lf vt ff cr so si
  342. other, space, space, other, terminator, space, other, other,
  343. // dle dc1 dc2 dc3 dc4 nak syn etb
  344. other, other, other, other, other, other, other, other,
  345. // can em sub esc fs gs rs us
  346. other, other, other, other, other, other, other, other,
  347. // sp ! " # $ % & '
  348. space, terminator, quote, other, other, other, other, quote,
  349. // ( ) * + , - . /
  350. openBracket, closeBracket, other, other, other, other, ambiguosTerm, other,
  351. // 0 1 2 3 4 5 6 7
  352. number, number, number, number, number, number, number, number,
  353. // 8 9 : ; < = > ?
  354. number, number, other, other, other, other, other, terminator,
  355. // @ A B C D E F G
  356. other, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  357. // H I J K L M N O
  358. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  359. // P Q R S T U V W
  360. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  361. // X Y Z [ \ ] ^ _
  362. upperCase, upperCase, upperCase, openBracket, other, closeBracket, other, other,
  363. // ` a b c d e f g
  364. other, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  365. // h i j k l m n o
  366. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  367. // p q r s t u v w
  368. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  369. // x y z { | } ~ del
  370. lowerCase, lowerCase, lowerCase, openBracket, other, closeBracket, other, other,
  371. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  372. other, other, other, other, other, other, other, other,
  373. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  374. other, other, other, other, other, other, other, other,
  375. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  376. other, other, other, other, other, other, other, other,
  377. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  378. other, other, other, other, other, other, other, other,
  379. // nbsp inv-! cents pounds currency yen broken-bar section
  380. other, other, other, other, other, other, other, other,
  381. // umlaut copyright super-a gui-left not soft-hyph registered macron
  382. other, other, lowerCase, openBracket, other, other, other, other,
  383. // degree +/- super-2 super-3 acute micro paragraph bullet
  384. other, other, number, number, other, lowerCase, other, other,
  385. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  386. other, lowerCase, other, closeBracket, number, number, number, other,
  387. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  388. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  389. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  390. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  391. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  392. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, other,
  393. // O=slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  394. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, lowerCase,
  395. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  396. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  397. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  398. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  399. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  400. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, other,
  401. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y=umlaut
  402. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase
  403. };
  404. private static final UnicodeClassMapping kSentenceMap
  405. = new UnicodeClassMapping(kRawMapping, kExceptionChar, SentenceExceptionFlags,
  406. kSentenceAsciiValues);
  407. }