/*
 * @(#)WordBreakData.java 1.19 03/01/23
 *
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */
package java.text;
  22. /**
  23. * The WordBreakData contains data used by SimpleTextBoundary
  24. * to determine word breaks.
  25. * @see #BreakIterator
  26. */
  27. final class WordBreakData extends TextBoundaryData
  28. {
  29. // THEORY OF OPERATION: This class contains all the tables necessary to do
  30. // character-break iteration. This class descends from TextBoundaryData, which
  31. // is abstract. This class doesn't define any non-static members; it inherits the
  32. // non-static members from TextBoundaryData and fills them in with pointers to
  33. // the static members defined here.
  34. // There are two main parts to a TextBoundaryData object: the state-transition
  35. // tables and the character-mapping tables. The forward state table defines the
  36. // transitions for a deterministic finite state machine that locates character
  37. // boundaries. The rows are the states and the columns are character categories.
  38. // The cell values consist of two parts: The first is the row number of the next
  39. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  40. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  41. // fill it with STOP values by convention.) The second part is a flag indicating
  42. // whether the iterator should update its break position on this transition. When
  43. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  44. // the flag bit being turned on-- we do it this way rather than just using negative
  45. // numbers because we still need to see the SI flag when the value of the transition
  46. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  47. // is 1.
  48. // The backward state table works the same way as the forward state table, but is
  49. // usually simplified. The iterator uses the backward state table only to find a
  50. // "safe place" to start iterating forward. It then seeks forward from the "safe
  51. // place" to the actual break position using the forward table. A "safe place" is
  52. // a spot in the text that is guaranteed to be a break position.
  53. // The character-category mapping tables are split into several pieces, one for
  54. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  55. // character categories to the character categories used by this break iterator.
  56. // The index of the array is the Unicode category number as returned by
  57. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  58. // indicating whether all the characters in the Unicode category have the
  59. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  60. // an entry is true, then we find the right category using... 3) The kExceptionChar
  61. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  62. // a range of contiguous characters that share the same category and the category
  63. // number. This list is binary-searched to find an entry corresponding to the
  64. // charactre being mapped. Only characters whose breaking category is different from
  65. // the raw-mapping value (the breaking category for their Unicode category) are
  66. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  67. // in the Latin1 range. This table maps straight from a character value to a
  68. // category number, bypassing all the other tables. The programmer must take care
  69. // that all of the different category-mapping tables are consistent.
  70. // In the current implementation, all of these tables are created and maintained
  71. // by hand, not using a tool.
  72. private static final byte BREAK = 0; // characters not listed in any other category
  73. private static final byte letter = 1; // letters
  74. private static final byte number = 2; // digits
  75. private static final byte midLetter = 3;// punctuation that can occur within a word
  76. private static final byte midLetNum = 4;// punctuation that can occur inside a wors or a number
  77. private static final byte preNum = 5; // characters that may serve as a prefix to a number
  78. private static final byte postNum = 6; // characters that may serve as a suffix to a number
  79. private static final byte midNum = 7; // punctuation that can occur inside a number
  80. private static final byte preMidNum = 8;// punctuation that can occur either at the beginning
  81. // of or inside a number
  82. private static final byte blank = 9; // white space (other than always-break characters)
  83. private static final byte lf = 10; // the ASCII LF character
  84. private static final byte kata = 11; // Katakana
  85. private static final byte hira = 12; // Hiragana
  86. private static final byte kanji = 13; // all CJK ideographs
  87. private static final byte diacrit = 14; // CJK diacriticals
  88. private static final byte cr = 15; // the ASCII CR character
  89. private static final byte nsm = 16; // Unicode non-spacing marks
  90. private static final byte EOS = 17; // end of string
  91. private static final int COL_COUNT = 18;// number of categories
  92. private static final byte SI = (byte)0x80;
  93. private static final byte STOP = (byte) 0;
  94. private static final byte SI_STOP = (byte)SI + STOP;
  95. public WordBreakData() {
  96. super(kWordForward, kWordBackward, kWordMap);
  97. }
  98. // This table locates word boundaries, as this is defined for "find whole words"
  99. // searches and often for double-click selection. In this case, "words" are kept
  100. // separate from whitespace and punctuation.
  101. // The rules implemented here are as follows:
  102. // 1) Unless mentioned below, all characters are treated as "words" unto themselves
  103. // and have break positions on both sides (state 14)
  104. // 2) A "word" is kept together, and consists of a sequence of letters. Certain
  105. // punctuation marks, such as apostrophes and hyphens, are allowed inside a "word"
  106. // without causing a break, but only if they're flanked on both sides by letters.
  107. // (states 2 and 7)
  108. // 3) A "number" is kept together, and consists of an optional prefix character (such
  109. // as a minus, decimal point, or currency symbol), followed by a sequence of digits,
  110. // followed by an optional suffix character (such as a percent sign). The sequence
  111. // of digits may contain certain punctuation characters (such as commas and periods),
  112. // but only if they're flanked on both sides by digits. (states 3, 8, and 14)
  113. // 4) If a "number" and "word" occur in succession without any intervening characters,
  114. // they are kept together. This allows sequences like "$30F3" or "ascii2ebcdic" to
  115. // be treated as single units. (transitions between states 2 and 3)
  116. // 5) Sequences of whitespace are kept together. (state 6)
  117. // 6) The CR-LF sequence is kept together. (states 4 and 13)
  118. // 7) A sequence of Kanji is kept together. (state 12)
  119. // 8) Sequences of Hiragana and Katakana are kept together, and may include their
  120. // common diacritical marks. (states 10 and 11)
  121. // [The logic for Kanji and Kana characters is an approximation. There is no way
  122. // to detect real Japanese word boundaries without a dictionary.]
  123. // 9) Unicode non-spacing marks are completely transparent to the algorithm.
  124. // (see the "nsm" column)
  125. private static final byte kWordForwardData[] =
  126. {
  127. // brk let num mLe mLN
  128. // prN poN mNu pMN blk
  129. // lf kat hir kan dia
  130. // cr nsm EOS
  131. // 0 - dummy state
  132. STOP, STOP, STOP, STOP, STOP,
  133. STOP, STOP, STOP, STOP, STOP,
  134. STOP, STOP, STOP, STOP, STOP,
  135. STOP, STOP, STOP,
  136. // 1 - main dispatch state
  137. (byte)(SI+14), (byte)(SI+2), (byte)(SI+3), (byte)(SI+14), (byte)(SI+14),
  138. (byte)(SI+5), (byte)(SI+14), (byte)(SI+14), (byte)(SI+5), (byte)(SI+6),
  139. (byte)(SI+4), (byte)(SI+10), (byte)(SI+11), (byte)(SI+12), (byte)(SI+9),
  140. (byte)(SI+13), (byte)(1), SI_STOP,
  141. // 2 - This state eats letters, advances to state 3 for numbers, and
  142. // goes to state 7 for mid-word punctuation.
  143. SI_STOP, (byte)(SI+2), (byte)(SI+3), (byte)(SI+7), (byte)(SI+7),
  144. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7), SI_STOP,
  145. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  146. SI_STOP, (byte)(2), SI_STOP,
  147. // 3 - This state eats digits, advances to state 2 for letters, uses
  148. // state 8 to handle mid-number punctuation, and goes to state 14 for
  149. // number-suffix characters.
  150. SI_STOP, (byte)(SI+2), (byte)(SI+3), SI_STOP, (byte)(SI+8),
  151. SI_STOP, (byte)(SI+14), (byte)(SI+8), (byte)(SI+8), SI_STOP,
  152. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  153. SI_STOP, (byte)(3), SI_STOP,
  154. // 4 - This state handles LFs by eating the LF and stopping.
  155. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  156. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  157. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  158. SI_STOP, SI_STOP, SI_STOP,
  159. // 5 - This state handles number-prefix characters. If the next character
  160. // is a digit, it goes to state 3; otherwise, it stops (the character is
  161. // a "word" by itself).
  162. SI_STOP, SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  163. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  164. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  165. SI_STOP, (byte)(5), SI_STOP,
  166. // 6 - This state eats whitespace and stops on everything else.
  167. // (Except for CRs and LFs, which are kept together with the whitespace.)
  168. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  169. SI_STOP, SI_STOP, SI_STOP, SI_STOP, (byte)(SI+6),
  170. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  171. (byte)(SI+13), (byte)(6), SI_STOP,
  172. // 7 - This state handles mid-word punctuation: If the next character is a
  173. // letter, we're still in the word and we keep going. Otherwise, we stop,
  174. // and the break was actually before this character.
  175. STOP, (byte)(SI+2), STOP, STOP, STOP,
  176. STOP, STOP, STOP, STOP, STOP,
  177. STOP, STOP, STOP, STOP, STOP,
  178. STOP, (byte)(7), STOP,
  179. // 8 - This state handles mid-number punctuation: If the next character is a
  180. // digit, we're still in the word and we keep going. Otherwise, we stop,
  181. // and the break position is actually before this character.
  182. STOP, STOP, (byte)(SI+3), STOP, STOP,
  183. STOP, STOP, STOP, STOP, STOP,
  184. STOP, STOP, STOP, STOP, STOP,
  185. STOP, (byte)(8), STOP,
  186. // 9 - This state handles CJK diacritics. It'll keep going if the next
  187. // character is CJK; otherwise, it stops.
  188. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  189. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  190. SI_STOP, (byte)(SI+10), (byte)(SI+11), SI_STOP, (byte)(SI+9),
  191. SI_STOP, (byte)(9), SI_STOP,
  192. // 10 - This state eats Katakana and CJK discritics.
  193. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  194. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  195. SI_STOP, (byte)(SI+10), SI_STOP, SI_STOP, (byte)(SI+10),
  196. SI_STOP, (byte)(10), SI_STOP,
  197. // 11 - This state eats Hiragana and CJK diacritics.
  198. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  199. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  200. SI_STOP, SI_STOP, (byte)(SI+11), SI_STOP, (byte)(SI+11),
  201. SI_STOP, (byte)(11), SI_STOP,
  202. // 12 - This state eats Kanji.
  203. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  204. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  205. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+12), SI_STOP,
  206. SI_STOP, (byte)(12), SI_STOP,
  207. // 13 - This state handles CRs, which are "words" unto themselves (or
  208. // with preceding whitespace) unless followed by an LFs.
  209. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  210. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  211. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  212. SI_STOP, SI_STOP, SI_STOP,
  213. // 14 - This state handles LFs and number-suffix characters (when they
  214. // actually end a number) by eating the character and stopping.
  215. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  216. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  217. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  218. SI_STOP, (byte)(14), SI_STOP
  219. };
  220. private static final WordBreakTable kWordForward =
  221. new WordBreakTable(COL_COUNT, kWordForwardData);
  222. // This table is a completely-reversed version of the forward table.
  223. private static final byte kWordBackwardData[] =
  224. {
  225. // brk let num mLe mLN
  226. // prN poN mNu pMN blk
  227. // lf kat hir kan dia
  228. // cr nsm EOS
  229. // 0
  230. STOP, STOP, STOP, STOP, STOP,
  231. STOP, STOP, STOP, STOP, STOP,
  232. STOP, STOP, STOP, STOP, STOP,
  233. STOP, STOP, STOP,
  234. // 1
  235. (byte)(SI+6), (byte)(SI+2), (byte)(SI+3), (byte)(SI+4), (byte)(SI+5),
  236. (byte)(SI+6), (byte)(SI+7), (byte)(SI+7), (byte)(SI+5), (byte)(SI+8),
  237. (byte)(SI+8), (byte)(SI+9), (byte)(SI+10), (byte)(SI+12), (byte)(SI+11),
  238. (byte)(SI+8), (byte)(1), STOP,
  239. // 2
  240. STOP, (byte)(SI+2), (byte)(SI+3), (byte)(4), (byte)(4),
  241. STOP, STOP, STOP, (byte)(4), STOP,
  242. STOP, STOP, STOP, STOP, STOP,
  243. STOP, (byte)(2), STOP,
  244. // 3
  245. STOP, (byte)(SI+2), (byte)(SI+3), STOP, (byte)(7),
  246. SI_STOP, STOP, (byte)(7), (byte)(SI+7), STOP,
  247. STOP, STOP, STOP, STOP, STOP,
  248. STOP, (byte)(3), STOP,
  249. // 4
  250. STOP, (byte)(SI+2), STOP, STOP, STOP,
  251. STOP, STOP, STOP, STOP, STOP,
  252. STOP, STOP, STOP, STOP, STOP,
  253. STOP, (byte)(4), STOP,
  254. // 5
  255. STOP, (byte)(SI+2), (byte)(SI+3), STOP, STOP,
  256. STOP, STOP, STOP, STOP, STOP,
  257. STOP, STOP, STOP, STOP, STOP,
  258. STOP, (byte)(5), STOP,
  259. // 6
  260. STOP, STOP, STOP, STOP, STOP,
  261. STOP, STOP, STOP, STOP, STOP,
  262. STOP, STOP, STOP, STOP, STOP,
  263. STOP, (byte)(6), STOP,
  264. // 7
  265. STOP, STOP, (byte)(SI+3), STOP, STOP,
  266. STOP, STOP, STOP, STOP, STOP,
  267. STOP, STOP, STOP, STOP, STOP,
  268. STOP, (byte)(7), STOP,
  269. // 8
  270. STOP, STOP, STOP, STOP, STOP,
  271. STOP, STOP, STOP, STOP, (byte)(SI+8),
  272. (byte)(SI+8), STOP, STOP, STOP, STOP,
  273. (byte)(SI+8), (byte)(8), STOP,
  274. // 9
  275. STOP, STOP, STOP, STOP, STOP,
  276. STOP, STOP, STOP, STOP, STOP,
  277. STOP, (byte)(SI+9), STOP, STOP, (byte)(9),
  278. STOP, (byte)(9), STOP,
  279. // 10
  280. STOP, STOP, STOP, STOP, STOP,
  281. STOP, STOP, STOP, STOP, STOP,
  282. STOP, STOP, (byte)(SI+10),STOP, (byte)(10),
  283. STOP, (byte)(10), STOP,
  284. // 11
  285. STOP, STOP, STOP, STOP, STOP,
  286. STOP, STOP, STOP, STOP, STOP,
  287. STOP, (byte)(SI+9), (byte)(SI+10), STOP, (byte)(SI+11),
  288. STOP, (byte)(11), STOP,
  289. // 12
  290. STOP, STOP, STOP, STOP, STOP,
  291. STOP, STOP, STOP, STOP, STOP,
  292. STOP, STOP, STOP, (byte)(SI+12), STOP,
  293. STOP, (byte)(12), STOP
  294. };
  295. private static final WordBreakTable kWordBackward =
  296. new WordBreakTable(COL_COUNT, kWordBackwardData);
  297. private static final int kRawMapping[] =
  298. {
  299. BREAK, // UNASSIGNED = 0,
  300. letter, // UPPERCASE_LETTER = 1,
  301. letter, // LOWERCASE_LETTER = 2,
  302. letter, // TITLECASE_LETTER = 3,
  303. letter, // MODIFIER_LETTER = 4,
  304. letter, // OTHER_LETTER = 5,
  305. nsm, // NON_SPACING_MARK = 6,
  306. nsm, // ENCLOSING_MARK = 7,
  307. BREAK, // COMBINING_SPACING_MARK = 8,
  308. number, // DECIMAL_DIGIT_NUMBER = 9,
  309. letter, // LETTER_NUMBER = 10,
  310. number, // OTHER_NUMBER = 11,
  311. blank, // SPACE_SEPARATOR = 12,
  312. BREAK, // LINE_SEPARATOR = 13,
  313. BREAK, // PARAGRAPH_SEPARATOR = 14,
  314. BREAK, // CONTROL = 15,
  315. BREAK, // FORMAT = 16
  316. BREAK, // ???? = 17,
  317. BREAK, // PRIVATE_USE = 18,
  318. BREAK, // SURROGATE = 19,
  319. midLetter, // DASH_PUNCTUATION = 20,
  320. BREAK, // START_PUNCTUATION = 21,
  321. BREAK, // END_PUNCTUATION = 22,
  322. BREAK, // CONNECTOR_PUNCTUATION = 23,
  323. BREAK, // OTHER_PUNCTUATION = 24,
  324. BREAK, // MATH_SYMBOL = 25,
  325. preNum, // CURRENCY_SYMBOL = 26,
  326. BREAK, // MODIFIER_SYMBOL = 27,
  327. BREAK, // OTHER_SYMBOL = 28,
  328. BREAK, // INITIAL_QUOTE_PUNCTUATION = 29,
  329. BREAK, // FINAL_QUOTE_PUNCTUATION = 30,
  330. };
  331. private static final SpecialMapping kExceptionChar[] =
  332. {
  333. //note: the ranges in this table must be sorted in ascending order
  334. //as required by the UnicodeClassMapping class.
  335. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, blank),
  336. new SpecialMapping(ASCII_LINEFEED, lf),
  337. new SpecialMapping(ASCII_FORM_FEED, lf),
  338. new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
  339. new SpecialMapping(ASCII_QUOTATION_MARK, midLetNum),
  340. new SpecialMapping(ASCII_NUMBER_SIGN, preNum),
  341. new SpecialMapping(ASCII_PERCENT, postNum),
  342. new SpecialMapping(ASCII_AMPERSAND, postNum),
  343. new SpecialMapping(ASCII_APOSTROPHE, midLetNum),
  344. new SpecialMapping(ASCII_COMMA, midNum),
  345. new SpecialMapping(ASCII_FULL_STOP, preMidNum),
  346. new SpecialMapping(ASCII_CENT_SIGN, postNum),
  347. new SpecialMapping(LATIN1_SOFTHYPHEN, midLetter),
  348. new SpecialMapping(ARABIC_PERCENT_SIGN, postNum),
  349. new SpecialMapping(ARABIC_DECIMAL_SEPARATOR, midNum),
  350. new SpecialMapping(PUNCTUATION_HYPHENATION_POINT, midLetter),
  351. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
  352. PUNCTUATION_PARAGRAPH_SEPARATOR, lf),
  353. new SpecialMapping(PER_MILLE_SIGN, postNum),
  354. new SpecialMapping(PER_TEN_THOUSAND_SIGN, postNum),
  355. new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, kanji),
  356. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, hira),
  357. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  358. HIRAGANA_SEMIVOICED_SOUND_MARK, diacrit),
  359. new SpecialMapping(HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK, hira),
  360. new SpecialMapping(KATAKANA_LETTER_SMALL_A,
  361. KATAKANA_LETTER_SMALL_KE, kata),
  362. new SpecialMapping(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK, diacrit),
  363. new SpecialMapping(KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK, kata),
  364. new SpecialMapping(UNICODE_LOW_BOUND_HAN,
  365. UNICODE_HIGH_BOUND_HAN, kanji),
  366. new SpecialMapping(HANGUL_SYL_LOW, HANGUL_SYL_HIGH, letter),
  367. new SpecialMapping(CJK_COMPATIBILITY_F900,
  368. CJK_COMPATIBILITY_FA2D, kanji),
  369. new SpecialMapping(END_OF_STRING, EOS)
  370. };
  371. private static final boolean WordExceptionFlags[] = {
  372. false, // kNonCharacter = 0,
  373. false, // kUppercaseLetter = 1,
  374. false, // kLowercaseLetter = 2,
  375. false, // kTitlecaseLetter = 3,
  376. true, // kModifierLetter = 4,
  377. true, // kOtherLetter = 5,
  378. true, // kNonSpacingMark = 6,
  379. false, // kEnclosingMark = 7,
  380. false, // kCombiningSpacingMark = 8,
  381. false, // kDecimalNumber = 9,
  382. false, // kLetterNumber = 10,
  383. false, // kOtherNumber = 11,
  384. false, // kSpaceSeparator = 12,
  385. true, // kLineSeparator = 13,
  386. true, // kParagraphSeparator = 14,
  387. true, // kControlCharacter = 15,
  388. false, // kFormatCharacter = 16,
  389. false, // UNDEFINED = 17,
  390. false, // kPrivateUseCharacter = 18,
  391. false, // kSurrogate = 19,
  392. true, // kDashPunctuation = 20,
  393. false, // kOpenPunctuation = 21,
  394. false, // kClosePunctuation = 22,
  395. false, // kConnectorPunctuation = 23,
  396. true, // kOtherPunctuation = 24,
  397. false, // kMathSymbol = 25,
  398. true, // kCurrencySymbol = 26,
  399. false, // kModifierSymbol = 27,
  400. false, // kOtherSymbol = 28,
  401. false, // kInitialQuotePunctuation = 29,
  402. false, // kFinalQuotePunctuation = 30,
  403. };
  404. private static final int kWordAsciiValues[] = {
  405. // null soh stx etx eot enq ask bell
  406. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  407. // bs ht lf vt ff cr so si
  408. BREAK, blank, lf, BREAK, lf, cr, BREAK, BREAK,
  409. // dle dc1 dc2 dc3 dc4 nak syn etb
  410. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  411. // can em sub esc fs gs rs us
  412. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  413. // sp ! " # $ % & '
  414. blank, BREAK, midLetNum, preNum, preNum, postNum, postNum, midLetNum,
  415. // ( ) * + , - . /
  416. BREAK, BREAK, BREAK, BREAK, midNum, midLetter, preMidNum, BREAK,
  417. // 0 1 2 3 4 5 6 7
  418. number, number, number, number, number, number, number, number,
  419. // 8 9 : ; < = > ?
  420. number, number, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  421. // @ A B C D E F G
  422. BREAK, letter, letter, letter, letter, letter, letter, letter,
  423. // H I J K L M N O
  424. letter, letter, letter, letter, letter, letter, letter, letter,
  425. // P Q R S T U V W
  426. letter, letter, letter, letter, letter, letter, letter, letter,
  427. // X Y Z [ \ ] ^ _
  428. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  429. // ` a b c d e f g
  430. BREAK, letter, letter, letter, letter, letter, letter, letter,
  431. // h i j k l m n o
  432. letter, letter, letter, letter, letter, letter, letter, letter,
  433. // p q r s t u v w
  434. letter, letter, letter, letter, letter, letter, letter, letter,
  435. // x y z { | } ~ del
  436. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  437. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  438. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  439. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  440. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  441. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  442. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  443. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  444. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  445. // nbsp inv-! cents pounds currency yen broken-bar section
  446. blank, BREAK, postNum, preNum, preNum, preNum, BREAK, BREAK,
  447. // umlaut copyright super-a gui-left not soft-hyph registered macron
  448. BREAK, BREAK, letter, BREAK, BREAK, midLetter, BREAK, BREAK,
  449. // degree +/- super-2 super-3 acute micro paragraph bullet
  450. BREAK, BREAK, number, number, BREAK, letter, BREAK, BREAK,
  451. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  452. BREAK, letter, BREAK, BREAK, number, number, number, BREAK,
  453. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  454. letter, letter, letter, letter, letter, letter, letter, letter,
  455. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  456. letter, letter, letter, letter, letter, letter, letter, letter,
  457. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  458. letter, letter, letter, letter, letter, letter, letter, BREAK,
  459. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  460. letter, letter, letter, letter, letter, letter, letter, letter,
  461. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  462. letter, letter, letter, letter, letter, letter, letter, letter,
  463. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  464. letter, letter, letter, letter, letter, letter, letter, letter,
  465. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  466. letter, letter, letter, letter, letter, letter, letter, BREAK,
  467. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  468. letter, letter, letter, letter, letter, letter, letter, letter
  469. };
  470. private static final UnicodeClassMapping kWordMap
  471. = new UnicodeClassMapping(kRawMapping, kExceptionChar, WordExceptionFlags,
  472. kWordAsciiValues);
  473. }