1. /*
  2. * @(#)SentenceBreakData.java 1.18 01/11/29
  3. *
  4. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)SentenceBreakData.java 1.18 01/11/29
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  12. *
  13. * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  14. * All Rights Reserved.
  15. *
  16. * The original version of this source code and documentation
  17. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  18. * subsidiary of IBM. These materials are provided under terms
  19. * of a License Agreement between Taligent and Sun. This technology
  20. * is protected by multiple US and International patents.
  21. *
  22. * This notice and attribution to Taligent may not be removed.
  23. * Taligent is a registered trademark of Taligent, Inc.
  24. *
  25. * Permission to use, copy, modify, and distribute this software
  26. * and its documentation for NON-COMMERCIAL purposes and without
  27. * fee is hereby granted provided that this copyright notice
  28. * appears in all copies. Please refer to the file "copyright.html"
  29. * for further important copyright and licensing information.
  30. *
  31. * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  32. * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  33. * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  34. * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  35. * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  36. * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  37. *
  38. */
  39. package java.text;
  40. /**
  41. * The SentenceBreakData contains data used by SimpleTextBoundary
  42. * to determine sentence breaks.
  43. * @see #BreakIterator
  44. */
  45. final class SentenceBreakData extends TextBoundaryData
  46. {
  47. private static final byte other = 0;
  48. // lower case letters, digits...
  49. private static final byte space = 1;
  50. // spaces...
  51. private static final byte terminator = 2;
  52. // period, questionmark...
  53. private static final byte ambiguosTerm = 3;
  54. // Ambiguos terminator
  55. private static final byte openBracket = 4;
  56. // open brackets
  57. private static final byte closeBracket = 5;
  58. // close brackets
  59. private static final byte cjk = 6;
  60. // Characters where the previous sentence does not have a space
  61. // after a terminator. Common in Japanese, Chinese, and Korean
  62. private static final byte paragraphBreak = 7;
  63. // Paragraph break
  64. private static final byte lowerCase = 8;
  65. // Lower case
  66. private static final byte upperCase = 9;
  67. private static final byte number = 10;
  68. private static final byte quote = 11;
  69. private static final byte nsm = 12;
  70. private static final byte EOS = 13;
  71. // digit
  72. private static final int COL_COUNT = 14;
  73. private static final byte SI = (byte)0x80;
  74. private static final byte STOP = (byte) 0;
  75. private static final byte SI_STOP = (byte)SI + STOP;
  76. public SentenceBreakData() {
  77. super(kSentenceForward, kSentenceBackward, kSentenceMap);
  78. }
  79. private static final byte kSentenceForwardData[] =
  80. {
  81. // other space terminator ambTerm
  82. // open close CJK PB
  83. // lower upper digit Quote
  84. // nsm EOS
  85. // 0
  86. STOP, STOP, STOP, STOP,
  87. STOP, STOP, STOP, STOP,
  88. STOP, STOP, STOP, STOP,
  89. STOP, STOP,
  90. // 1
  91. (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+5),
  92. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+4),
  93. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+1),
  94. (byte)(SI+1), SI_STOP,
  95. // 2
  96. SI_STOP, (byte)(SI+3), (byte)(SI+2), (byte)(SI+5),
  97. SI_STOP, (byte)(SI+2), SI_STOP, (byte)(SI+4),
  98. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+2),
  99. (byte)(SI+2), SI_STOP,
  100. // 3
  101. SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  102. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+4),
  103. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  104. (byte)(SI+3), SI_STOP,
  105. // 4
  106. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  107. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  108. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  109. SI_STOP, SI_STOP,
  110. // 5
  111. (byte)(SI+1), (byte)(SI+6), (byte)(SI+2), (byte)(SI+5),
  112. (byte)(SI+1), (byte)(SI+5), SI_STOP, (byte)(SI+4),
  113. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+5),
  114. (byte)(SI+5), SI_STOP,
  115. // 6
  116. SI_STOP, (byte)(SI+6), SI_STOP, SI_STOP,
  117. (byte)(SI+7), (byte)(SI+1), SI_STOP, (byte)(SI+4),
  118. (byte)(SI+1), SI_STOP, (byte)(SI+1), SI_STOP,
  119. (byte)(SI+6), SI_STOP,
  120. // 7
  121. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  122. (byte)(7), SI_STOP, SI_STOP, SI_STOP,
  123. (byte)(SI+1), STOP, SI_STOP, SI_STOP,
  124. (byte)(SI+7), SI_STOP,
  125. // 8
  126. (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+8),
  127. (byte)(SI+1), (byte)(SI+5), (byte)(SI+1), (byte)(SI+4),
  128. (byte)(SI+1), (byte)(SI+8), (byte)(SI+9), (byte)(SI+5),
  129. (byte)(SI+8), SI_STOP,
  130. // 9
  131. (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+9),
  132. (byte)(SI+1), (byte)(SI+5), (byte)(SI+1), (byte)(SI+4),
  133. (byte)(SI+1), (byte)(SI+1), (byte)(SI+9), (byte)(SI+5),
  134. (byte)(SI+9), SI_STOP
  135. };
  136. private static final WordBreakTable kSentenceForward
  137. = new WordBreakTable(COL_COUNT, kSentenceForwardData);
  138. private static final byte kSentenceBackwardData[] =
  139. {
  140. // other space terminator ambTerm
  141. // open close CJK PB
  142. // lower upper digit quote
  143. // nsm EOS
  144. // 0
  145. STOP, STOP, STOP, STOP,
  146. STOP, STOP, STOP, STOP,
  147. STOP, STOP, STOP, STOP,
  148. STOP, STOP,
  149. // 1
  150. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  151. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  152. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  153. (byte)(SI+1), STOP,
  154. // 2
  155. (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  156. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  157. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  158. (byte)(SI+2), STOP,
  159. // 3
  160. (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+2),
  161. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  162. (byte)(SI+3), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
  163. (byte)(SI+3), STOP,
  164. // 4
  165. (byte)(SI+2), (byte)(SI+4), SI_STOP, SI_STOP,
  166. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
  167. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  168. (byte)(SI+4), STOP
  169. };
  170. private static final WordBreakTable kSentenceBackward
  171. = new WordBreakTable(COL_COUNT, kSentenceBackwardData);
  172. private static final int kRawMapping[] =
  173. {
  174. other, // UNASSIGNED = 0,
  175. upperCase, // UPPERCASE_LETTER = 1,
  176. lowerCase, // LOWERCASE_LETTER = 2,
  177. other, // TITLECASE_LETTER = 3,
  178. other, // MODIFIER_LETTER = 4,
  179. other, // OTHER_LETTER = 5,
  180. nsm, // NON_SPACING_MARK = 6,
  181. nsm, // ENCLOSING_MARK = 7,
  182. other, // COMBINING_SPACING_MARK = 8,
  183. number, // DECIMAL_DIGIT_NUMBER = 9,
  184. number, // LETTER_NUMBER = 10,
  185. number, // OTHER_NUMBER = 11,
  186. space, // SPACE_SEPARATOR = 12,
  187. space, // LINE_SEPARATOR = 13,
  188. space, // PARAGRAPH_SEPARATOR = 14, ???????
  189. other, // CONTROL = 15,
  190. other, // PRIVATE_USE = 16,
  191. other, // FORMAT = 17,
  192. other, // ???? = 18,
  193. other, // SURROGATE = 19,
  194. other, // DASH_PUNCTUATION = 20,
  195. openBracket, // START_PUNCTUATION = 21,
  196. closeBracket, // END_PUNCTUATION = 22,
  197. other, // CONNECTOR_PUNCTUATION = 23,
  198. other, // OTHER_PUNCTUATION = 24,
  199. other, // MATH_SYMBOL = 25,
  200. other, // CURRENCY_SYMBOL = 26,
  201. other, // MODIFIER_SYMBOL = 27,
  202. other, // OTHER_SYMBOL = 28;
  203. };
  204. private static final SpecialMapping kExceptionChar[] =
  205. {
  206. //note: the ranges in this table must be sorted in ascending order
  207. //as required by the UnicodeClassMapping class.
  208. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, space),
  209. new SpecialMapping(ASCII_LINEFEED, space),
  210. new SpecialMapping(ASCII_FORM_FEED, terminator),
  211. new SpecialMapping(ASCII_CARRIAGE_RETURN, space),
  212. new SpecialMapping(ASCII_EXCLAMATION_MARK, terminator),
  213. new SpecialMapping(ASCII_QUOTATION_MARK, quote),
  214. new SpecialMapping(ASCII_APOSTROPHE, quote),
  215. new SpecialMapping(ASCII_FULL_STOP, ambiguosTerm),
  216. new SpecialMapping(ASCII_QUESTION_MARK, terminator),
  217. new SpecialMapping(ASCII_NONBREAKING_SPACE, other),
  218. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, space),
  219. new SpecialMapping(PUNCTUATION_PARAGRAPH_SEPARATOR, paragraphBreak),
  220. new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_FULL_STOP, terminator),
  221. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, cjk),
  222. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  223. HIRAGANA_SEMIVOICED_SOUND_MARK, cjk), // cjk
  224. new SpecialMapping(KATAKANA_LETTER_SMALL_A, KATAKANA_LETTER_SMALL_KE,
  225. cjk), // cjk
  226. new SpecialMapping(UNICODE_LOW_BOUND_HAN, UNICODE_HIGH_BOUND_HAN, cjk),
  227. new SpecialMapping(CJK_COMPATIBILITY_F900, CJK_COMPATIBILITY_FA2D,cjk),
  228. new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, other),
  229. new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, terminator),
  230. new SpecialMapping(FULLWIDTH_FULL_STOP, ambiguosTerm),
  231. new SpecialMapping(FULLWIDTH_QUESTION_MARK, terminator),
  232. new SpecialMapping(END_OF_STRING, EOS)
  233. };
  234. private static final boolean SentenceExceptionFlags[] = {
  235. false, // kNonCharacter = 0,
  236. false, // kUppercaseLetter = 1,
  237. false, // kLowercaseLetter = 2,
  238. false, // kTitlecaseLetter = 3,
  239. false, // kModifierLetter = 4,
  240. true, // kOtherLetter = 5,
  241. true, // kNonSpacingMark = 6,
  242. false, // kEnclosingMark = 7,
  243. false, // kCombiningSpacingMark = 8,
  244. false, // kDecimalNumber = 9,
  245. false, // kLetterNumber = 10,
  246. false, // kOtherNumber = 11,
  247. true, // kSpaceSeparator = 12,
  248. true, // kLineSeparator = 13,
  249. true, // kParagraphSeparator = 14,
  250. true, // kControlCharacter = 15,
  251. true, // kFormatCharacter = 16,
  252. false, // UNDEFINED = 17,
  253. false, // kPrivateUseCharacter = 18,
  254. false, // kSurrogate = 19,
  255. false, // kDashPunctuation = 20,
  256. false, // kOpenPunctuation = 21,
  257. false, // kClosePunctuation = 22,
  258. false, // kConnectorPunctuation = 23,
  259. true, // kOtherPunctuation = 24,
  260. false, // kMathSymbol = 25,
  261. false, // kCurrencySymbol = 26,
  262. false, // kModifierSymbol = 27,
  263. false // kOtherSymbol = 28
  264. };
  265. private static final int kSentenceAsciiValues[] = {
  266. // null soh stx etx eot enq ask bell
  267. other, other, other, other, other, other, other, other,
  268. // bs ht lf vt ff cr so si
  269. other, space, space, other, terminator, space, other, other,
  270. // dle dc1 dc2 dc3 dc4 nak syn etb
  271. other, other, other, other, other, other, other, other,
  272. // can em sub esc fs gs rs us
  273. other, other, other, other, other, other, other, other,
  274. // sp ! " # $ % & '
  275. space, terminator, quote, other, other, other, other, quote,
  276. // ( ) * + , - . /
  277. openBracket, closeBracket, other, other, other, other, ambiguosTerm, other,
  278. // 0 1 2 3 4 5 6 7
  279. number, number, number, number, number, number, number, number,
  280. // 8 9 : ; < = > ?
  281. number, number, other, other, other, other, other, terminator,
  282. // @ A B C D E F G
  283. other, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  284. // H I J K L M N O
  285. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  286. // P Q R S T U V W
  287. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  288. // X Y Z [ \ ] ^ _
  289. upperCase, upperCase, upperCase, openBracket, other, closeBracket, other, other,
  290. // ` a b c d e f g
  291. other, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  292. // h i j k l m n o
  293. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  294. // p q r s t u v w
  295. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  296. // x y z { | } ~ del
  297. lowerCase, lowerCase, lowerCase, openBracket, other, closeBracket, other, other,
  298. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  299. other, other, other, other, other, other, other, other,
  300. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  301. other, other, other, other, other, other, other, other,
  302. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  303. other, other, other, other, other, other, other, other,
  304. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  305. other, other, other, other, other, other, other, other,
  306. // nbsp inv-! cents pounds currency yen broken-bar section
  307. other, other, other, other, other, other, other, other,
  308. // umlaut copyright super-a gui-left not soft-hyph registered macron
  309. other, other, lowerCase, openBracket, other, other, other, other,
  310. // degree +/- super-2 super-3 acute micro paragraph bullet
  311. other, other, number, number, other, lowerCase, other, other,
  312. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  313. other, lowerCase, other, closeBracket, number, number, number, other,
  314. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  315. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  316. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  317. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
  318. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  319. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, other,
  320. // O=slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  321. upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, lowerCase,
  322. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  323. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  324. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  325. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
  326. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  327. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, other,
  328. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y=umlaut
  329. lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase
  330. };
  331. private static final UnicodeClassMapping kSentenceMap
  332. = new UnicodeClassMapping(kRawMapping, kExceptionChar, SentenceExceptionFlags,
  333. kSentenceAsciiValues);
  334. }