1. /*
  2. * @(#)WordBreakData.java 1.14 01/11/29
  3. *
  4. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)WordBreakData.java 1.14 01/11/29
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  12. *
  13. * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  14. * All Rights Reserved.
  15. *
  16. * The original version of this source code and documentation
  17. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  18. * subsidiary of IBM. These materials are provided under terms
  19. * of a License Agreement between Taligent and Sun. This technology
  20. * is protected by multiple US and International patents.
  21. *
  22. * This notice and attribution to Taligent may not be removed.
  23. * Taligent is a registered trademark of Taligent, Inc.
  24. *
  25. * Permission to use, copy, modify, and distribute this software
  26. * and its documentation for NON-COMMERCIAL purposes and without
  27. * fee is hereby granted provided that this copyright notice
  28. * appears in all copies. Please refer to the file "copyright.html"
  29. * for further important copyright and licensing information.
  30. *
  31. * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  32. * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  33. * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  34. * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  35. * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  36. * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  37. *
  38. */
  39. package java.text;
  40. /**
  41. * The WordBreakData contains data used by SimpleTextBoundary
  42. * to determine word breaks.
  43. * @see #BreakIterator
  44. */
  45. final class WordBreakData extends TextBoundaryData
  46. {
  47. private static final byte BREAK = 0;
  48. private static final byte letter = 1;
  49. private static final byte number = 2;
  50. private static final byte midLetter = 3;
  51. private static final byte midLetNum = 4;
  52. private static final byte preNum = 5;
  53. private static final byte postNum = 6;
  54. private static final byte midNum = 7;
  55. private static final byte preMidNum = 8;
  56. private static final byte blank = 9;
  57. private static final byte lf = 10;
  58. private static final byte kata = 11;
  59. private static final byte hira = 12;
  60. private static final byte kanji = 13;
  61. private static final byte diacrit = 14;
  62. private static final byte cr = 15;
  63. private static final byte nsm = 16;
  64. private static final byte EOS = 17;
  65. private static final int COL_COUNT = 18;
  66. private static final byte SI = (byte)0x80;
  67. private static final byte STOP = (byte) 0;
  68. private static final byte SI_STOP = (byte)SI + STOP;
  69. public WordBreakData() {
  70. super(kWordForward, kWordBackward, kWordMap);
  71. }
  72. private static final byte kWordForwardData[] =
  73. {
  74. // brk let num mLe mLN
  75. // prN poN mNu pMN blk
  76. // lf kat hir kan dia
  77. // cr nsm EOS
  78. // 0
  79. STOP, STOP, STOP, STOP, STOP,
  80. STOP, STOP, STOP, STOP, STOP,
  81. STOP, STOP, STOP, STOP, STOP,
  82. STOP, STOP, STOP,
  83. // 1
  84. (byte)(SI+14), (byte)(SI+2), (byte)(SI+3), (byte)(SI+14), (byte)(SI+14),
  85. (byte)(SI+5), (byte)(SI+14), (byte)(SI+14), (byte)(SI+5), (byte)(SI+6),
  86. (byte)(SI+4), (byte)(SI+10), (byte)(SI+11), (byte)(SI+12), (byte)(SI+9),
  87. (byte)(SI+13), (byte)(1), SI_STOP,
  88. // 2
  89. SI_STOP, (byte)(SI+2), (byte)(SI+3), (byte)(SI+7), (byte)(SI+7),
  90. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7), SI_STOP,
  91. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  92. SI_STOP, (byte)(2), SI_STOP,
  93. // 3
  94. SI_STOP, (byte)(SI+2), (byte)(SI+3), SI_STOP, (byte)(SI+8),
  95. SI_STOP, (byte)(SI+14), (byte)(SI+8), (byte)(SI+8), SI_STOP,
  96. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  97. SI_STOP, (byte)(3), SI_STOP,
  98. // 4
  99. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  100. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  101. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  102. SI_STOP, SI_STOP, SI_STOP,
  103. // 5
  104. SI_STOP, SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  105. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  106. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  107. SI_STOP, (byte)(5), SI_STOP,
  108. // 6
  109. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  110. SI_STOP, SI_STOP, SI_STOP, SI_STOP, (byte)(SI+6),
  111. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  112. (byte)(SI+13), (byte)(6), SI_STOP,
  113. // 7
  114. STOP, (byte)(SI+2), STOP, STOP, STOP,
  115. STOP, STOP, STOP, STOP, STOP,
  116. STOP, STOP, STOP, STOP, STOP,
  117. STOP, (byte)(7), STOP,
  118. // 8
  119. STOP, STOP, (byte)(SI+3), STOP, STOP,
  120. STOP, STOP, STOP, STOP, STOP,
  121. STOP, STOP, STOP, STOP, STOP,
  122. STOP, (byte)(8), STOP,
  123. // 9
  124. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  125. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  126. SI_STOP, (byte)(SI+10), (byte)(SI+11), SI_STOP, (byte)(SI+9),
  127. SI_STOP, (byte)(9), SI_STOP,
  128. // 10
  129. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  130. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  131. SI_STOP, (byte)(SI+10), SI_STOP, SI_STOP, (byte)(SI+10),
  132. SI_STOP, (byte)(10), SI_STOP,
  133. // 11
  134. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  135. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  136. SI_STOP, SI_STOP, (byte)(SI+11), SI_STOP, (byte)(SI+11),
  137. SI_STOP, (byte)(11), SI_STOP,
  138. // 12
  139. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  140. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  141. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+12), SI_STOP,
  142. SI_STOP, (byte)(12), SI_STOP,
  143. // 13
  144. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  145. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  146. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  147. SI_STOP, SI_STOP, SI_STOP,
  148. // 14
  149. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  150. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  151. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  152. SI_STOP, (byte)(14), SI_STOP
  153. };
  154. private static final WordBreakTable kWordForward =
  155. new WordBreakTable(COL_COUNT, kWordForwardData);
  156. private static final byte kWordBackwardData[] =
  157. {
  158. // brk let num mLe mLN
  159. // prN poN mNu pMN blk
  160. // lf kat hir kan dia
  161. // cr nsm EOS
  162. // 0
  163. STOP, STOP, STOP, STOP, STOP,
  164. STOP, STOP, STOP, STOP, STOP,
  165. STOP, STOP, STOP, STOP, STOP,
  166. STOP, STOP, STOP,
  167. // 1
  168. (byte)(SI+6), (byte)(SI+2), (byte)(SI+3), (byte)(SI+4), (byte)(SI+5),
  169. (byte)(SI+6), (byte)(SI+7), (byte)(SI+7), (byte)(SI+5), (byte)(SI+8),
  170. (byte)(SI+8), (byte)(SI+9), (byte)(SI+10), (byte)(SI+12), (byte)(SI+11),
  171. (byte)(SI+8), (byte)(1), STOP,
  172. // 2
  173. STOP, (byte)(SI+2), (byte)(SI+3), (byte)(4), (byte)(4),
  174. STOP, STOP, STOP, (byte)(4), STOP,
  175. STOP, STOP, STOP, STOP, STOP,
  176. STOP, (byte)(2), STOP,
  177. // 3
  178. STOP, (byte)(SI+2), (byte)(SI+3), STOP, (byte)(7),
  179. SI_STOP, STOP, (byte)(7), (byte)(SI+7), STOP,
  180. STOP, STOP, STOP, STOP, STOP,
  181. STOP, (byte)(3), STOP,
  182. // 4
  183. STOP, (byte)(SI+2), STOP, STOP, STOP,
  184. STOP, STOP, STOP, STOP, STOP,
  185. STOP, STOP, STOP, STOP, STOP,
  186. STOP, (byte)(4), STOP,
  187. // 5
  188. STOP, (byte)(SI+2), (byte)(SI+3), STOP, STOP,
  189. STOP, STOP, STOP, STOP, STOP,
  190. STOP, STOP, STOP, STOP, STOP,
  191. STOP, (byte)(5), STOP,
  192. // 6
  193. STOP, STOP, STOP, STOP, STOP,
  194. STOP, STOP, STOP, STOP, STOP,
  195. STOP, STOP, STOP, STOP, STOP,
  196. STOP, (byte)(6), STOP,
  197. // 7
  198. STOP, STOP, (byte)(SI+3), STOP, STOP,
  199. STOP, STOP, STOP, STOP, STOP,
  200. STOP, STOP, STOP, STOP, STOP,
  201. STOP, (byte)(7), STOP,
  202. // 8
  203. STOP, STOP, STOP, STOP, STOP,
  204. STOP, STOP, STOP, STOP, (byte)(SI+8),
  205. (byte)(SI+8), STOP, STOP, STOP, STOP,
  206. (byte)(SI+8), (byte)(8), STOP,
  207. // 9
  208. STOP, STOP, STOP, STOP, STOP,
  209. STOP, STOP, STOP, STOP, STOP,
  210. STOP, (byte)(SI+9), STOP, STOP, (byte)(9),
  211. STOP, (byte)(9), STOP,
  212. // 10
  213. STOP, STOP, STOP, STOP, STOP,
  214. STOP, STOP, STOP, STOP, STOP,
  215. STOP, STOP, (byte)(SI+10),STOP, (byte)(10),
  216. STOP, (byte)(10), STOP,
  217. // 11
  218. STOP, STOP, STOP, STOP, STOP,
  219. STOP, STOP, STOP, STOP, STOP,
  220. STOP, (byte)(SI+9), (byte)(SI+10), STOP, (byte)(SI+11),
  221. STOP, (byte)(11), STOP,
  222. // 12
  223. STOP, STOP, STOP, STOP, STOP,
  224. STOP, STOP, STOP, STOP, STOP,
  225. STOP, STOP, STOP, (byte)(SI+12), STOP,
  226. STOP, (byte)(12), STOP
  227. };
  228. private static final WordBreakTable kWordBackward =
  229. new WordBreakTable(COL_COUNT, kWordBackwardData);
  230. private static final int kRawMapping[] =
  231. {
  232. BREAK, // UNASSIGNED = 0,
  233. letter, // UPPERCASE_LETTER = 1,
  234. letter, // LOWERCASE_LETTER = 2,
  235. letter, // TITLECASE_LETTER = 3,
  236. letter, // MODIFIER_LETTER = 4,
  237. letter, // OTHER_LETTER = 5,
  238. nsm, // NON_SPACING_MARK = 6,
  239. nsm, // ENCLOSING_MARK = 7,
  240. BREAK, // COMBINING_SPACING_MARK = 8,
  241. number, // DECIMAL_DIGIT_NUMBER = 9,
  242. letter, // LETTER_NUMBER = 10,
  243. number, // OTHER_NUMBER = 11,
  244. blank, // SPACE_SEPARATOR = 12,
  245. BREAK, // LINE_SEPARATOR = 13,
  246. BREAK, // PARAGRAPH_SEPARATOR = 14,
  247. BREAK, // CONTROL = 15,
  248. BREAK, // FORMAT = 16
  249. BREAK, // ???? = 17,
  250. BREAK, // PRIVATE_USE = 18,
  251. BREAK, // SURROGATE = 19,
  252. midLetter, // DASH_PUNCTUATION = 20,
  253. BREAK, // START_PUNCTUATION = 21,
  254. BREAK, // END_PUNCTUATION = 22,
  255. BREAK, // CONNECTOR_PUNCTUATION = 23,
  256. BREAK, // OTHER_PUNCTUATION = 24,
  257. BREAK, // MATH_SYMBOL = 25,
  258. preNum, // CURRENCY_SYMBOL = 26,
  259. BREAK, // MODIFIER_SYMBOL = 27,
  260. BREAK // OTHER_SYMBOL = 28
  261. };
  262. private static final SpecialMapping kExceptionChar[] =
  263. {
  264. //note: the ranges in this table must be sorted in ascending order
  265. //as required by the UnicodeClassMapping class.
  266. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, blank),
  267. new SpecialMapping(ASCII_LINEFEED, lf),
  268. new SpecialMapping(ASCII_FORM_FEED, lf),
  269. new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
  270. new SpecialMapping(ASCII_QUOTATION_MARK, midLetNum),
  271. new SpecialMapping(ASCII_NUMBER_SIGN, preNum),
  272. new SpecialMapping(ASCII_PERCENT, postNum),
  273. new SpecialMapping(ASCII_AMPERSAND, postNum),
  274. new SpecialMapping(ASCII_APOSTROPHE, midLetNum),
  275. new SpecialMapping(ASCII_COMMA, midNum),
  276. new SpecialMapping(ASCII_FULL_STOP, preMidNum),
  277. new SpecialMapping(ASCII_CENT_SIGN, postNum),
  278. new SpecialMapping(LATIN1_SOFTHYPHEN, midLetter),
  279. new SpecialMapping(ARABIC_PERCENT_SIGN, postNum),
  280. new SpecialMapping(ARABIC_DECIMAL_SEPARATOR, midNum),
  281. new SpecialMapping(PUNCTUATION_HYPHENATION_POINT, midLetter),
  282. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
  283. PUNCTUATION_PARAGRAPH_SEPARATOR, lf),
  284. new SpecialMapping(PER_MILLE_SIGN, postNum),
  285. new SpecialMapping(PER_TEN_THOUSAND_SIGN, postNum),
  286. new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, kanji),
  287. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, hira),
  288. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  289. HIRAGANA_SEMIVOICED_SOUND_MARK, diacrit),
  290. new SpecialMapping(KATAKANA_LETTER_SMALL_A,
  291. KATAKANA_LETTER_SMALL_KE, kata),
  292. new SpecialMapping(UNICODE_LOW_BOUND_HAN,
  293. UNICODE_HIGH_BOUND_HAN, kanji),
  294. new SpecialMapping(HANGUL_SYL_LOW, HANGUL_SYL_HIGH, letter),
  295. new SpecialMapping(CJK_COMPATIBILITY_F900,
  296. CJK_COMPATIBILITY_FA2D, kanji),
  297. new SpecialMapping(END_OF_STRING, EOS)
  298. };
  299. private static final boolean WordExceptionFlags[] = {
  300. false, // kNonCharacter = 0,
  301. false, // kUppercaseLetter = 1,
  302. false, // kLowercaseLetter = 2,
  303. false, // kTitlecaseLetter = 3,
  304. true, // kModifierLetter = 4,
  305. true, // kOtherLetter = 5,
  306. true, // kNonSpacingMark = 6,
  307. false, // kEnclosingMark = 7,
  308. false, // kCombiningSpacingMark = 8,
  309. false, // kDecimalNumber = 9,
  310. false, // kLetterNumber = 10,
  311. false, // kOtherNumber = 11,
  312. false, // kSpaceSeparator = 12,
  313. true, // kLineSeparator = 13,
  314. true, // kParagraphSeparator = 14,
  315. true, // kControlCharacter = 15,
  316. false, // kFormatCharacter = 16,
  317. false, // UNDEFINED = 17,
  318. false, // kPrivateUseCharacter = 18,
  319. false, // kSurrogate = 19,
  320. true, // kDashPunctuation = 20,
  321. false, // kOpenPunctuation = 21,
  322. false, // kClosePunctuation = 22,
  323. false, // kConnectorPunctuation = 23,
  324. true, // kOtherPunctuation = 24,
  325. false, // kMathSymbol = 25,
  326. true, // kCurrencySymbol = 26,
  327. false, // kModifierSymbol = 27,
  328. false // kOtherSymbol = 28
  329. };
  330. private static final int kWordAsciiValues[] = {
  331. // null soh stx etx eot enq ask bell
  332. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  333. // bs ht lf vt ff cr so si
  334. BREAK, blank, lf, BREAK, lf, cr, BREAK, BREAK,
  335. // dle dc1 dc2 dc3 dc4 nak syn etb
  336. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  337. // can em sub esc fs gs rs us
  338. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  339. // sp ! " # $ % & '
  340. blank, BREAK, midLetNum, preNum, preNum, postNum, postNum, midLetNum,
  341. // ( ) * + , - . /
  342. BREAK, BREAK, BREAK, BREAK, midNum, midLetter, preMidNum, BREAK,
  343. // 0 1 2 3 4 5 6 7
  344. number, number, number, number, number, number, number, number,
  345. // 8 9 : ; < = > ?
  346. number, number, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  347. // @ A B C D E F G
  348. BREAK, letter, letter, letter, letter, letter, letter, letter,
  349. // H I J K L M N O
  350. letter, letter, letter, letter, letter, letter, letter, letter,
  351. // P Q R S T U V W
  352. letter, letter, letter, letter, letter, letter, letter, letter,
  353. // X Y Z [ \ ] ^ _
  354. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  355. // ` a b c d e f g
  356. BREAK, letter, letter, letter, letter, letter, letter, letter,
  357. // h i j k l m n o
  358. letter, letter, letter, letter, letter, letter, letter, letter,
  359. // p q r s t u v w
  360. letter, letter, letter, letter, letter, letter, letter, letter,
  361. // x y z { | } ~ del
  362. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  363. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  364. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  365. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  366. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  367. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  368. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  369. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  370. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  371. // nbsp inv-! cents pounds currency yen broken-bar section
  372. blank, BREAK, postNum, preNum, preNum, preNum, BREAK, BREAK,
  373. // umlaut copyright super-a gui-left not soft-hyph registered macron
  374. BREAK, BREAK, letter, BREAK, BREAK, midLetter, BREAK, BREAK,
  375. // degree +/- super-2 super-3 acute micro paragraph bullet
  376. BREAK, BREAK, number, number, BREAK, letter, BREAK, BREAK,
  377. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  378. BREAK, letter, BREAK, BREAK, number, number, number, BREAK,
  379. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  380. letter, letter, letter, letter, letter, letter, letter, letter,
  381. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  382. letter, letter, letter, letter, letter, letter, letter, letter,
  383. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  384. letter, letter, letter, letter, letter, letter, letter, BREAK,
  385. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  386. letter, letter, letter, letter, letter, letter, letter, letter,
  387. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  388. letter, letter, letter, letter, letter, letter, letter, letter,
  389. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  390. letter, letter, letter, letter, letter, letter, letter, letter,
  391. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  392. letter, letter, letter, letter, letter, letter, letter, BREAK,
  393. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  394. letter, letter, letter, letter, letter, letter, letter, letter
  395. };
  396. private static final UnicodeClassMapping kWordMap
  397. = new UnicodeClassMapping(kRawMapping, kExceptionChar, WordExceptionFlags,
  398. kWordAsciiValues);
  399. }