1. /*
  2. * @(#)TextBoundaryData.java 1.11 01/11/29
  3. *
  4. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)TextBoundaryData.java 1.11 01/11/29
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  12. *
  13. * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  14. * All Rights Reserved.
  15. *
  16. * The original version of this source code and documentation
  17. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  18. * subsidiary of IBM. These materials are provided under terms
  19. * of a License Agreement between Taligent and Sun. This technology
  20. * is protected by multiple US and International patents.
  21. *
  22. * This notice and attribution to Taligent may not be removed.
  23. * Taligent is a registered trademark of Taligent, Inc.
  24. *
  25. * Permission to use, copy, modify, and distribute this software
  26. * and its documentation for NON-COMMERCIAL purposes and without
  27. * fee is hereby granted provided that this copyright notice
  28. * appears in all copies. Please refer to the file "copyright.html"
  29. * for further important copyright and licensing information.
  30. *
  31. * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  32. * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  33. * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  34. * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  35. * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  36. * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  37. *
  38. */
  39. package java.text;
  40. /**
  41. * This class wraps up the data tables needed for SimpleTextBoundary.
  42. * It is subclassed for each type of text boundary.
  43. */
  44. abstract class TextBoundaryData
  45. {
  46. private WordBreakTable forwardStateTable = null;
  47. private WordBreakTable backwardStateTable = null;
  48. private UnicodeClassMapping mappingTable = null;
  49. protected TextBoundaryData(WordBreakTable fwd, WordBreakTable bwd, UnicodeClassMapping map) {
  50. forwardStateTable = fwd;
  51. backwardStateTable = bwd;
  52. mappingTable = map;
  53. }
  54. public WordBreakTable forward() {
  55. return forwardStateTable;
  56. }
  57. public WordBreakTable backward() {
  58. return backwardStateTable;
  59. }
  60. public UnicodeClassMapping map() {
  61. return mappingTable;
  62. }
  63. // useful Unicode constants
  64. protected static final char ASCII_END_OF_TEXT
  65. = '\u0003';
  66. protected static final char ASCII_HORIZONTAL_TABULATION
  67. = '\u0009';
  68. protected static final char ASCII_LINEFEED
  69. = (char)0x000A;
  70. protected static final char ASCII_VERTICAL_TABULATION
  71. = '\u000B';
  72. protected static final char ASCII_FORM_FEED
  73. = '\u000C';
  74. protected static final char ASCII_CARRIAGE_RETURN
  75. = (char)0x000D;
  76. protected static final char ASCII_SPACE
  77. = '\u0020';
  78. protected static final char ASCII_EXCLAMATION_MARK
  79. = '\u0021';
  80. protected static final char ASCII_QUOTATION_MARK
  81. = '\u0022';
  82. protected static final char ASCII_NUMBER_SIGN
  83. = '\u0023';
  84. protected static final char ASCII_DOLLAR_SIGN
  85. = '\u0024';
  86. protected static final char ASCII_PERCENT
  87. = '\u0025';
  88. protected static final char ASCII_AMPERSAND
  89. = '\u0026';
  90. protected static final char ASCII_APOSTROPHE
  91. = (char)0x0027;
  92. protected static final char ASCII_COMMA
  93. = '\u002C';
  94. protected static final char ASCII_FULL_STOP
  95. = '\u002E';
  96. protected static final char ASCII_COLON
  97. = '\u003A';
  98. protected static final char ASCII_SEMICOLON
  99. = '\u003B';
  100. protected static final char ASCII_QUESTION_MARK
  101. = '\u003F';
  102. protected static final char ASCII_NONBREAKING_SPACE
  103. = '\u00A0';
  104. protected static final char ASCII_CENT_SIGN
  105. = '\u00A2';
  106. protected static final char ASCII_POUND_SIGN
  107. = '\u00a3';
  108. protected static final char ASCII_YEN_SIGN
  109. = '\u00a5';
  110. protected static final char LATIN1_SOFTHYPHEN
  111. = '\u00AD';
  112. protected static final char LATIN1_DEGREE_SIGN
  113. = '\u00B0';
  114. protected static final char ARABIC_PERCENT_SIGN
  115. = '\u066A';
  116. protected static final char ARABIC_DECIMAL_SEPARATOR
  117. = '\u066B';
  118. protected static final char HANGUL_CHOSEONG_LOW
  119. = '\u1100';
  120. protected static final char HANGUL_CHOSEONG_HIGH
  121. = '\u115f';
  122. protected static final char HANGUL_JUNGSEONG_LOW
  123. = '\u1160';
  124. protected static final char HANGUL_JUNGSEONG_HIGH
  125. = '\u11A7';
  126. protected static final char HANGUL_JONGSEONG_LOW
  127. = '\u11A8';
  128. protected static final char HANGUL_JONGSEONG_HIGH
  129. = '\u11FF';
  130. protected static final char FIGURE_SPACE
  131. = '\u2007';
  132. protected static final char NONBREAKING_HYPHEN
  133. = '\u2011';
  134. protected static final char PUNCTUATION_HYPHENATION_POINT
  135. = '\u2027';
  136. protected static final char PUNCTUATION_LINE_SEPARATOR
  137. = '\u2028';
  138. protected static final char PUNCTUATION_PARAGRAPH_SEPARATOR
  139. = '\u2029';
  140. protected static final char PER_MILLE_SIGN
  141. = '\u2030';
  142. protected static final char PER_TEN_THOUSAND_SIGN
  143. = '\u2031';
  144. protected static final char PRIME
  145. = '\u2032';
  146. protected static final char DOUBLE_PRIME
  147. = '\u2033';
  148. protected static final char TRIPLE_PRIME
  149. = '\u2034';
  150. protected static final char DEGREE_CELSIUS
  151. = '\u2103';
  152. protected static final char DEGREE_FAHRENHEIT
  153. = '\u2109';
  154. protected static final char PUNCTUATION_IDEOGRAPHIC_COMMA
  155. = '\u3001';
  156. protected static final char PUNCTUATION_IDEOGRAPHIC_FULL_STOP
  157. = '\u3002';
  158. protected static final char IDEOGRAPHIC_ITERATION_MARK
  159. = '\u3005';
  160. protected static final char HIRAGANA_LETTER_SMALL_A
  161. = '\u3041';
  162. protected static final char HIRAGANA_LETTER_A
  163. = '\u3042';
  164. protected static final char HIRAGANA_LETTER_SMALL_I
  165. = '\u3043';
  166. protected static final char HIRAGANA_LETTER_I
  167. = '\u3044';
  168. protected static final char HIRAGANA_LETTER_SMALL_U
  169. = '\u3045';
  170. protected static final char HIRAGANA_LETTER_U
  171. = '\u3046';
  172. protected static final char HIRAGANA_LETTER_SMALL_E
  173. = '\u3047';
  174. protected static final char HIRAGANA_LETTER_E
  175. = '\u3048';
  176. protected static final char HIRAGANA_LETTER_SMALL_O
  177. = '\u3049';
  178. protected static final char HIRAGANA_LETTER_O
  179. = '\u304A';
  180. protected static final char HIRAGANA_LETTER_DI
  181. = '\u3062';
  182. protected static final char HIRAGANA_LETTER_SMALL_TU
  183. = '\u3063';
  184. protected static final char HIRAGANA_LETTER_TU
  185. = '\u3064';
  186. protected static final char HIRAGANA_LETTER_MO
  187. = '\u3082';
  188. protected static final char HIRAGANA_LETTER_SMALL_YA
  189. = '\u3083';
  190. protected static final char HIRAGANA_LETTER_YA
  191. = '\u3084';
  192. protected static final char HIRAGANA_LETTER_SMALL_YU
  193. = '\u3085';
  194. protected static final char HIRAGANA_LETTER_YU
  195. = '\u3086';
  196. protected static final char HIRAGANA_LETTER_SMALL_YO
  197. = '\u3087';
  198. protected static final char HIRAGANA_LETTER_YO
  199. = '\u3088';
  200. protected static final char HIRAGANA_LETTER_RO
  201. = '\u308D';
  202. protected static final char HIRAGANA_LETTER_SMALL_WA
  203. = '\u308E';
  204. protected static final char HIRAGANA_LETTER_WA
  205. = '\u308F';
  206. protected static final char HIRAGANA_LETTER_VU
  207. = '\u3094';
  208. protected static final char COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK
  209. = '\u3099';
  210. protected static final char HIRAGANA_SEMIVOICED_SOUND_MARK
  211. = '\u309C';
  212. protected static final char HIRAGANA_ITERATION_MARK
  213. = '\u309D';
  214. protected static final char HIRAGANA_VOICED_ITERATION_MARK
  215. = '\u309E';
  216. protected static final char KATAKANA_LETTER_SMALL_A
  217. = '\u30A1';
  218. protected static final char KATAKANA_LETTER_A
  219. = '\u30A2';
  220. protected static final char KATAKANA_LETTER_SMALL_I
  221. = '\u30A3';
  222. protected static final char KATAKANA_LETTER_I
  223. = '\u30A4';
  224. protected static final char KATAKANA_LETTER_SMALL_U
  225. = '\u30A5';
  226. protected static final char KATAKANA_LETTER_U
  227. = '\u30A6';
  228. protected static final char KATAKANA_LETTER_SMALL_E
  229. = '\u30A7';
  230. protected static final char KATAKANA_LETTER_E
  231. = '\u30A8';
  232. protected static final char KATAKANA_LETTER_SMALL_O
  233. = '\u30A9';
  234. protected static final char KATAKANA_LETTER_O
  235. = '\u30AA';
  236. protected static final char KATAKANA_LETTER_DI
  237. = '\u30C2';
  238. protected static final char KATAKANA_LETTER_SMALL_TU
  239. = '\u30C3';
  240. protected static final char KATAKANA_LETTER_TU
  241. = '\u30C4';
  242. protected static final char KATAKANA_LETTER_MO
  243. = '\u30E2';
  244. protected static final char KATAKANA_LETTER_SMALL_YA
  245. = '\u30E3';
  246. protected static final char KATAKANA_LETTER_YA
  247. = '\u30E4';
  248. protected static final char KATAKANA_LETTER_SMALL_YU
  249. = '\u30E5';
  250. protected static final char KATAKANA_LETTER_YU
  251. = '\u30E6';
  252. protected static final char KATAKANA_LETTER_SMALL_YO
  253. = '\u30E7';
  254. protected static final char KATAKANA_LETTER_YO
  255. = '\u30E8';
  256. protected static final char KATAKANA_LETTER_RO
  257. = '\u30ED';
  258. protected static final char KATAKANA_LETTER_SMALL_WA
  259. = '\u30EE';
  260. protected static final char KATAKANA_LETTER_WA
  261. = '\u30EF';
  262. protected static final char KATAKANA_LETTER_VU
  263. = '\u30F4';
  264. protected static final char KATAKANA_LETTER_SMALL_KA
  265. = '\u30F5';
  266. protected static final char KATAKANA_LETTER_SMALL_KE
  267. = '\u30F6';
  268. protected static final char KATAKANA_LETTER_VA
  269. = '\u30F7';
  270. protected static final char KATAKANA_LETTER_VO
  271. = '\u30FA';
  272. protected static final char KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK
  273. = '\u30FC';
  274. protected static final char KATAKANA_ITERATION_MARK
  275. = '\u30FD';
  276. protected static final char KATAKANA_VOICED_ITERATION_MARK
  277. = '\u30FE';
  278. protected static final char UNICODE_LOW_BOUND_HAN
  279. = '\u4E00';
  280. protected static final char UNICODE_HIGH_BOUND_HAN
  281. = '\u9FA5';
  282. protected static final char HANGUL_SYL_LOW
  283. = '\uAC00';
  284. protected static final char HANGUL_SYL_HIGH
  285. = '\uD7A3';
  286. protected static final char CJK_COMPATIBILITY_F900
  287. = '\uF900';
  288. protected static final char CJK_COMPATIBILITY_FA2D
  289. = '\uFA2D';
  290. protected static final char UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE
  291. = '\uFEFF';
  292. protected static final char FULLWIDTH_EXCLAMATION_MARK
  293. = '\uFF01';
  294. protected static final char FULLWIDTH_FULL_STOP
  295. = '\uFF0E';
  296. protected static final char FULLWIDTH_QUESTION_MARK
  297. = '\uFF1F';
  298. // SimpleTextBoundary has an internal convention that the not-a-Unicode value
  299. // $FFFF is used to signify the end of the string when looking up a proper state
  300. // transition for the end of the string
  301. protected static final char END_OF_STRING
  302. = '\uFFFF';
  303. }