1. /*
  2. * @(#)WordBreakData.java 1.16 00/01/19
  3. *
  4. * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. /*
  11. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  12. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  13. *
  14. * The original version of this source code and documentation
  15. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  16. * subsidiary of IBM. These materials are provided under terms
  17. * of a License Agreement between Taligent and Sun. This technology
  18. * is protected by multiple US and International patents.
  19. *
  20. * This notice and attribution to Taligent may not be removed.
  21. * Taligent is a registered trademark of Taligent, Inc.
  22. *
  23. */
  24. package java.text;
  25. /**
  26. * The WordBreakData contains data used by SimpleTextBoundary
  27. * to determine word breaks.
  28. * @see #BreakIterator
  29. */
  30. final class WordBreakData extends TextBoundaryData
  31. {
  32. // THEORY OF OPERATION: This class contains all the tables necessary to do
  33. // character-break iteration. This class descends from TextBoundaryData, which
  34. // is abstract. This class doesn't define any non-static members; it inherits the
  35. // non-static members from TextBoundaryData and fills them in with pointers to
  36. // the static members defined here.
  37. // There are two main parts to a TextBoundaryData object: the state-transition
  38. // tables and the character-mapping tables. The forward state table defines the
  39. // transitions for a deterministic finite state machine that locates character
  40. // boundaries. The rows are the states and the columns are character categories.
  41. // The cell values consist of two parts: The first is the row number of the next
  42. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  43. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  44. // fill it with STOP values by convention.) The second part is a flag indicating
  45. // whether the iterator should update its break position on this transition. When
  46. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  47. // the flag bit being turned on-- we do it this way rather than just using negative
  48. // numbers because we still need to see the SI flag when the value of the transition
  49. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  50. // is 1.
  51. // The backward state table works the same way as the forward state table, but is
  52. // usually simplified. The iterator uses the backward state table only to find a
  53. // "safe place" to start iterating forward. It then seeks forward from the "safe
  54. // place" to the actual break position using the forward table. A "safe place" is
  55. // a spot in the text that is guaranteed to be a break position.
  56. // The character-category mapping tables are split into several pieces, one for
  57. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  58. // character categories to the character categories used by this break iterator.
  59. // The index of the array is the Unicode category number as returned by
  60. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  61. // indicating whether all the characters in the Unicode category have the
  62. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  63. // an entry is true, then we find the right category using... 3) The kExceptionChar
  64. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  65. // a range of contiguous characters that share the same category and the category
  66. // number. This list is binary-searched to find an entry corresponding to the
  67. // charactre being mapped. Only characters whose breaking category is different from
  68. // the raw-mapping value (the breaking category for their Unicode category) are
  69. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  70. // in the Latin1 range. This table maps straight from a character value to a
  71. // category number, bypassing all the other tables. The programmer must take care
  72. // that all of the different category-mapping tables are consistent.
  73. // In the current implementation, all of these tables are created and maintained
  74. // by hand, not using a tool.
  75. private static final byte BREAK = 0; // characters not listed in any other category
  76. private static final byte letter = 1; // letters
  77. private static final byte number = 2; // digits
  78. private static final byte midLetter = 3;// punctuation that can occur within a word
  79. private static final byte midLetNum = 4;// punctuation that can occur inside a wors or a number
  80. private static final byte preNum = 5; // characters that may serve as a prefix to a number
  81. private static final byte postNum = 6; // characters that may serve as a suffix to a number
  82. private static final byte midNum = 7; // punctuation that can occur inside a number
  83. private static final byte preMidNum = 8;// punctuation that can occur either at the beginning
  84. // of or inside a number
  85. private static final byte blank = 9; // white space (other than always-break characters)
  86. private static final byte lf = 10; // the ASCII LF character
  87. private static final byte kata = 11; // Katakana
  88. private static final byte hira = 12; // Hiragana
  89. private static final byte kanji = 13; // all CJK ideographs
  90. private static final byte diacrit = 14; // CJK diacriticals
  91. private static final byte cr = 15; // the ASCII CR character
  92. private static final byte nsm = 16; // Unicode non-spacing marks
  93. private static final byte EOS = 17; // end of string
  94. private static final int COL_COUNT = 18;// number of categories
  95. private static final byte SI = (byte)0x80;
  96. private static final byte STOP = (byte) 0;
  97. private static final byte SI_STOP = (byte)SI + STOP;
  98. public WordBreakData() {
  99. super(kWordForward, kWordBackward, kWordMap);
  100. }
  101. // This table locates word boundaries, as this is defined for "find whole words"
  102. // searches and often for double-click selection. In this case, "words" are kept
  103. // separate from whitespace and punctuation.
  104. // The rules implemented here are as follows:
  105. // 1) Unless mentioned below, all characters are treated as "words" unto themselves
  106. // and have break positions on both sides (state 14)
  107. // 2) A "word" is kept together, and consists of a sequence of letters. Certain
  108. // punctuation marks, such as apostrophes and hyphens, are allowed inside a "word"
  109. // without causing a break, but only if they're flanked on both sides by letters.
  110. // (states 2 and 7)
  111. // 3) A "number" is kept together, and consists of an optional prefix character (such
  112. // as a minus, decimal point, or currency symbol), followed by a sequence of digits,
  113. // followed by an optional suffix character (such as a percent sign). The sequence
  114. // of digits may contain certain punctuation characters (such as commas and periods),
  115. // but only if they're flanked on both sides by digits. (states 3, 8, and 14)
  116. // 4) If a "number" and "word" occur in succession without any intervening characters,
  117. // they are kept together. This allows sequences like "$30F3" or "ascii2ebcdic" to
  118. // be treated as single units. (transitions between states 2 and 3)
  119. // 5) Sequences of whitespace are kept together. (state 6)
  120. // 6) The CR-LF sequence is kept together. (states 4 and 13)
  121. // 7) A sequence of Kanji is kept together. (state 12)
  122. // 8) Sequences of Hiragana and Katakana are kept together, and may include their
  123. // common diacritical marks. (states 10 and 11)
  124. // [The logic for Kanji and Kana characters is an approximation. There is no way
  125. // to detect real Japanese word boundaries without a dictionary.]
  126. // 9) Unicode non-spacing marks are completely transparent to the algorithm.
  127. // (see the "nsm" column)
  128. private static final byte kWordForwardData[] =
  129. {
  130. // brk let num mLe mLN
  131. // prN poN mNu pMN blk
  132. // lf kat hir kan dia
  133. // cr nsm EOS
  134. // 0 - dummy state
  135. STOP, STOP, STOP, STOP, STOP,
  136. STOP, STOP, STOP, STOP, STOP,
  137. STOP, STOP, STOP, STOP, STOP,
  138. STOP, STOP, STOP,
  139. // 1 - main dispatch state
  140. (byte)(SI+14), (byte)(SI+2), (byte)(SI+3), (byte)(SI+14), (byte)(SI+14),
  141. (byte)(SI+5), (byte)(SI+14), (byte)(SI+14), (byte)(SI+5), (byte)(SI+6),
  142. (byte)(SI+4), (byte)(SI+10), (byte)(SI+11), (byte)(SI+12), (byte)(SI+9),
  143. (byte)(SI+13), (byte)(1), SI_STOP,
  144. // 2 - This state eats letters, advances to state 3 for numbers, and
  145. // goes to state 7 for mid-word punctuation.
  146. SI_STOP, (byte)(SI+2), (byte)(SI+3), (byte)(SI+7), (byte)(SI+7),
  147. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7), SI_STOP,
  148. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  149. SI_STOP, (byte)(2), SI_STOP,
  150. // 3 - This state eats digits, advances to state 2 for letters, uses
  151. // state 8 to handle mid-number punctuation, and goes to state 14 for
  152. // number-suffix characters.
  153. SI_STOP, (byte)(SI+2), (byte)(SI+3), SI_STOP, (byte)(SI+8),
  154. SI_STOP, (byte)(SI+14), (byte)(SI+8), (byte)(SI+8), SI_STOP,
  155. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  156. SI_STOP, (byte)(3), SI_STOP,
  157. // 4 - This state handles LFs by eating the LF and stopping.
  158. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  159. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  160. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  161. SI_STOP, SI_STOP, SI_STOP,
  162. // 5 - This state handles number-prefix characters. If the next character
  163. // is a digit, it goes to state 3; otherwise, it stops (the character is
  164. // a "word" by itself).
  165. SI_STOP, SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
  166. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  167. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  168. SI_STOP, (byte)(5), SI_STOP,
  169. // 6 - This state eats whitespace and stops on everything else.
  170. // (Except for CRs and LFs, which are kept together with the whitespace.)
  171. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  172. SI_STOP, SI_STOP, SI_STOP, SI_STOP, (byte)(SI+6),
  173. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  174. (byte)(SI+13), (byte)(6), SI_STOP,
  175. // 7 - This state handles mid-word punctuation: If the next character is a
  176. // letter, we're still in the word and we keep going. Otherwise, we stop,
  177. // and the break was actually before this character.
  178. STOP, (byte)(SI+2), STOP, STOP, STOP,
  179. STOP, STOP, STOP, STOP, STOP,
  180. STOP, STOP, STOP, STOP, STOP,
  181. STOP, (byte)(7), STOP,
  182. // 8 - This state handles mid-number punctuation: If the next character is a
  183. // digit, we're still in the word and we keep going. Otherwise, we stop,
  184. // and the break position is actually before this character.
  185. STOP, STOP, (byte)(SI+3), STOP, STOP,
  186. STOP, STOP, STOP, STOP, STOP,
  187. STOP, STOP, STOP, STOP, STOP,
  188. STOP, (byte)(8), STOP,
  189. // 9 - This state handles CJK diacritics. It'll keep going if the next
  190. // character is CJK; otherwise, it stops.
  191. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  192. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  193. SI_STOP, (byte)(SI+10), (byte)(SI+11), SI_STOP, (byte)(SI+9),
  194. SI_STOP, (byte)(9), SI_STOP,
  195. // 10 - This state eats Katakana and CJK discritics.
  196. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  197. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  198. SI_STOP, (byte)(SI+10), SI_STOP, SI_STOP, (byte)(SI+10),
  199. SI_STOP, (byte)(10), SI_STOP,
  200. // 11 - This state eats Hiragana and CJK diacritics.
  201. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  202. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  203. SI_STOP, SI_STOP, (byte)(SI+11), SI_STOP, (byte)(SI+11),
  204. SI_STOP, (byte)(11), SI_STOP,
  205. // 12 - This state eats Kanji.
  206. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  207. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  208. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+12), SI_STOP,
  209. SI_STOP, (byte)(12), SI_STOP,
  210. // 13 - This state handles CRs, which are "words" unto themselves (or
  211. // with preceding whitespace) unless followed by an LFs.
  212. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  213. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  214. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  215. SI_STOP, SI_STOP, SI_STOP,
  216. // 14 - This state handles LFs and number-suffix characters (when they
  217. // actually end a number) by eating the character and stopping.
  218. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  219. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  220. SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  221. SI_STOP, (byte)(14), SI_STOP
  222. };
  223. private static final WordBreakTable kWordForward =
  224. new WordBreakTable(COL_COUNT, kWordForwardData);
  225. // This table is a completely-reversed version of the forward table.
  226. private static final byte kWordBackwardData[] =
  227. {
  228. // brk let num mLe mLN
  229. // prN poN mNu pMN blk
  230. // lf kat hir kan dia
  231. // cr nsm EOS
  232. // 0
  233. STOP, STOP, STOP, STOP, STOP,
  234. STOP, STOP, STOP, STOP, STOP,
  235. STOP, STOP, STOP, STOP, STOP,
  236. STOP, STOP, STOP,
  237. // 1
  238. (byte)(SI+6), (byte)(SI+2), (byte)(SI+3), (byte)(SI+4), (byte)(SI+5),
  239. (byte)(SI+6), (byte)(SI+7), (byte)(SI+7), (byte)(SI+5), (byte)(SI+8),
  240. (byte)(SI+8), (byte)(SI+9), (byte)(SI+10), (byte)(SI+12), (byte)(SI+11),
  241. (byte)(SI+8), (byte)(1), STOP,
  242. // 2
  243. STOP, (byte)(SI+2), (byte)(SI+3), (byte)(4), (byte)(4),
  244. STOP, STOP, STOP, (byte)(4), STOP,
  245. STOP, STOP, STOP, STOP, STOP,
  246. STOP, (byte)(2), STOP,
  247. // 3
  248. STOP, (byte)(SI+2), (byte)(SI+3), STOP, (byte)(7),
  249. SI_STOP, STOP, (byte)(7), (byte)(SI+7), STOP,
  250. STOP, STOP, STOP, STOP, STOP,
  251. STOP, (byte)(3), STOP,
  252. // 4
  253. STOP, (byte)(SI+2), STOP, STOP, STOP,
  254. STOP, STOP, STOP, STOP, STOP,
  255. STOP, STOP, STOP, STOP, STOP,
  256. STOP, (byte)(4), STOP,
  257. // 5
  258. STOP, (byte)(SI+2), (byte)(SI+3), STOP, STOP,
  259. STOP, STOP, STOP, STOP, STOP,
  260. STOP, STOP, STOP, STOP, STOP,
  261. STOP, (byte)(5), STOP,
  262. // 6
  263. STOP, STOP, STOP, STOP, STOP,
  264. STOP, STOP, STOP, STOP, STOP,
  265. STOP, STOP, STOP, STOP, STOP,
  266. STOP, (byte)(6), STOP,
  267. // 7
  268. STOP, STOP, (byte)(SI+3), STOP, STOP,
  269. STOP, STOP, STOP, STOP, STOP,
  270. STOP, STOP, STOP, STOP, STOP,
  271. STOP, (byte)(7), STOP,
  272. // 8
  273. STOP, STOP, STOP, STOP, STOP,
  274. STOP, STOP, STOP, STOP, (byte)(SI+8),
  275. (byte)(SI+8), STOP, STOP, STOP, STOP,
  276. (byte)(SI+8), (byte)(8), STOP,
  277. // 9
  278. STOP, STOP, STOP, STOP, STOP,
  279. STOP, STOP, STOP, STOP, STOP,
  280. STOP, (byte)(SI+9), STOP, STOP, (byte)(9),
  281. STOP, (byte)(9), STOP,
  282. // 10
  283. STOP, STOP, STOP, STOP, STOP,
  284. STOP, STOP, STOP, STOP, STOP,
  285. STOP, STOP, (byte)(SI+10),STOP, (byte)(10),
  286. STOP, (byte)(10), STOP,
  287. // 11
  288. STOP, STOP, STOP, STOP, STOP,
  289. STOP, STOP, STOP, STOP, STOP,
  290. STOP, (byte)(SI+9), (byte)(SI+10), STOP, (byte)(SI+11),
  291. STOP, (byte)(11), STOP,
  292. // 12
  293. STOP, STOP, STOP, STOP, STOP,
  294. STOP, STOP, STOP, STOP, STOP,
  295. STOP, STOP, STOP, (byte)(SI+12), STOP,
  296. STOP, (byte)(12), STOP
  297. };
  298. private static final WordBreakTable kWordBackward =
  299. new WordBreakTable(COL_COUNT, kWordBackwardData);
  300. private static final int kRawMapping[] =
  301. {
  302. BREAK, // UNASSIGNED = 0,
  303. letter, // UPPERCASE_LETTER = 1,
  304. letter, // LOWERCASE_LETTER = 2,
  305. letter, // TITLECASE_LETTER = 3,
  306. letter, // MODIFIER_LETTER = 4,
  307. letter, // OTHER_LETTER = 5,
  308. nsm, // NON_SPACING_MARK = 6,
  309. nsm, // ENCLOSING_MARK = 7,
  310. BREAK, // COMBINING_SPACING_MARK = 8,
  311. number, // DECIMAL_DIGIT_NUMBER = 9,
  312. letter, // LETTER_NUMBER = 10,
  313. number, // OTHER_NUMBER = 11,
  314. blank, // SPACE_SEPARATOR = 12,
  315. BREAK, // LINE_SEPARATOR = 13,
  316. BREAK, // PARAGRAPH_SEPARATOR = 14,
  317. BREAK, // CONTROL = 15,
  318. BREAK, // FORMAT = 16
  319. BREAK, // ???? = 17,
  320. BREAK, // PRIVATE_USE = 18,
  321. BREAK, // SURROGATE = 19,
  322. midLetter, // DASH_PUNCTUATION = 20,
  323. BREAK, // START_PUNCTUATION = 21,
  324. BREAK, // END_PUNCTUATION = 22,
  325. BREAK, // CONNECTOR_PUNCTUATION = 23,
  326. BREAK, // OTHER_PUNCTUATION = 24,
  327. BREAK, // MATH_SYMBOL = 25,
  328. preNum, // CURRENCY_SYMBOL = 26,
  329. BREAK, // MODIFIER_SYMBOL = 27,
  330. BREAK // OTHER_SYMBOL = 28
  331. };
  332. private static final SpecialMapping kExceptionChar[] =
  333. {
  334. //note: the ranges in this table must be sorted in ascending order
  335. //as required by the UnicodeClassMapping class.
  336. new SpecialMapping(ASCII_HORIZONTAL_TABULATION, blank),
  337. new SpecialMapping(ASCII_LINEFEED, lf),
  338. new SpecialMapping(ASCII_FORM_FEED, lf),
  339. new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
  340. new SpecialMapping(ASCII_QUOTATION_MARK, midLetNum),
  341. new SpecialMapping(ASCII_NUMBER_SIGN, preNum),
  342. new SpecialMapping(ASCII_PERCENT, postNum),
  343. new SpecialMapping(ASCII_AMPERSAND, postNum),
  344. new SpecialMapping(ASCII_APOSTROPHE, midLetNum),
  345. new SpecialMapping(ASCII_COMMA, midNum),
  346. new SpecialMapping(ASCII_FULL_STOP, preMidNum),
  347. new SpecialMapping(ASCII_CENT_SIGN, postNum),
  348. new SpecialMapping(LATIN1_SOFTHYPHEN, midLetter),
  349. new SpecialMapping(ARABIC_PERCENT_SIGN, postNum),
  350. new SpecialMapping(ARABIC_DECIMAL_SEPARATOR, midNum),
  351. new SpecialMapping(PUNCTUATION_HYPHENATION_POINT, midLetter),
  352. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
  353. PUNCTUATION_PARAGRAPH_SEPARATOR, lf),
  354. new SpecialMapping(PER_MILLE_SIGN, postNum),
  355. new SpecialMapping(PER_TEN_THOUSAND_SIGN, postNum),
  356. new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, kanji),
  357. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, hira),
  358. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  359. HIRAGANA_SEMIVOICED_SOUND_MARK, diacrit),
  360. new SpecialMapping(HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK, hira),
  361. new SpecialMapping(KATAKANA_LETTER_SMALL_A,
  362. KATAKANA_LETTER_SMALL_KE, kata),
  363. new SpecialMapping(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK, diacrit),
  364. new SpecialMapping(KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK, kata),
  365. new SpecialMapping(UNICODE_LOW_BOUND_HAN,
  366. UNICODE_HIGH_BOUND_HAN, kanji),
  367. new SpecialMapping(HANGUL_SYL_LOW, HANGUL_SYL_HIGH, letter),
  368. new SpecialMapping(CJK_COMPATIBILITY_F900,
  369. CJK_COMPATIBILITY_FA2D, kanji),
  370. new SpecialMapping(END_OF_STRING, EOS)
  371. };
  372. private static final boolean WordExceptionFlags[] = {
  373. false, // kNonCharacter = 0,
  374. false, // kUppercaseLetter = 1,
  375. false, // kLowercaseLetter = 2,
  376. false, // kTitlecaseLetter = 3,
  377. true, // kModifierLetter = 4,
  378. true, // kOtherLetter = 5,
  379. true, // kNonSpacingMark = 6,
  380. false, // kEnclosingMark = 7,
  381. false, // kCombiningSpacingMark = 8,
  382. false, // kDecimalNumber = 9,
  383. false, // kLetterNumber = 10,
  384. false, // kOtherNumber = 11,
  385. false, // kSpaceSeparator = 12,
  386. true, // kLineSeparator = 13,
  387. true, // kParagraphSeparator = 14,
  388. true, // kControlCharacter = 15,
  389. false, // kFormatCharacter = 16,
  390. false, // UNDEFINED = 17,
  391. false, // kPrivateUseCharacter = 18,
  392. false, // kSurrogate = 19,
  393. true, // kDashPunctuation = 20,
  394. false, // kOpenPunctuation = 21,
  395. false, // kClosePunctuation = 22,
  396. false, // kConnectorPunctuation = 23,
  397. true, // kOtherPunctuation = 24,
  398. false, // kMathSymbol = 25,
  399. true, // kCurrencySymbol = 26,
  400. false, // kModifierSymbol = 27,
  401. false // kOtherSymbol = 28
  402. };
  403. private static final int kWordAsciiValues[] = {
  404. // null soh stx etx eot enq ask bell
  405. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  406. // bs ht lf vt ff cr so si
  407. BREAK, blank, lf, BREAK, lf, cr, BREAK, BREAK,
  408. // dle dc1 dc2 dc3 dc4 nak syn etb
  409. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  410. // can em sub esc fs gs rs us
  411. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  412. // sp ! " # $ % & '
  413. blank, BREAK, midLetNum, preNum, preNum, postNum, postNum, midLetNum,
  414. // ( ) * + , - . /
  415. BREAK, BREAK, BREAK, BREAK, midNum, midLetter, preMidNum, BREAK,
  416. // 0 1 2 3 4 5 6 7
  417. number, number, number, number, number, number, number, number,
  418. // 8 9 : ; < = > ?
  419. number, number, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  420. // @ A B C D E F G
  421. BREAK, letter, letter, letter, letter, letter, letter, letter,
  422. // H I J K L M N O
  423. letter, letter, letter, letter, letter, letter, letter, letter,
  424. // P Q R S T U V W
  425. letter, letter, letter, letter, letter, letter, letter, letter,
  426. // X Y Z [ \ ] ^ _
  427. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  428. // ` a b c d e f g
  429. BREAK, letter, letter, letter, letter, letter, letter, letter,
  430. // h i j k l m n o
  431. letter, letter, letter, letter, letter, letter, letter, letter,
  432. // p q r s t u v w
  433. letter, letter, letter, letter, letter, letter, letter, letter,
  434. // x y z { | } ~ del
  435. letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
  436. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  437. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  438. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  439. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  440. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  441. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  442. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  443. BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
  444. // nbsp inv-! cents pounds currency yen broken-bar section
  445. blank, BREAK, postNum, preNum, preNum, preNum, BREAK, BREAK,
  446. // umlaut copyright super-a gui-left not soft-hyph registered macron
  447. BREAK, BREAK, letter, BREAK, BREAK, midLetter, BREAK, BREAK,
  448. // degree +/- super-2 super-3 acute micro paragraph bullet
  449. BREAK, BREAK, number, number, BREAK, letter, BREAK, BREAK,
  450. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  451. BREAK, letter, BREAK, BREAK, number, number, number, BREAK,
  452. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  453. letter, letter, letter, letter, letter, letter, letter, letter,
  454. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  455. letter, letter, letter, letter, letter, letter, letter, letter,
  456. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  457. letter, letter, letter, letter, letter, letter, letter, BREAK,
  458. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  459. letter, letter, letter, letter, letter, letter, letter, letter,
  460. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  461. letter, letter, letter, letter, letter, letter, letter, letter,
  462. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  463. letter, letter, letter, letter, letter, letter, letter, letter,
  464. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  465. letter, letter, letter, letter, letter, letter, letter, BREAK,
  466. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  467. letter, letter, letter, letter, letter, letter, letter, letter
  468. };
  469. private static final UnicodeClassMapping kWordMap
  470. = new UnicodeClassMapping(kRawMapping, kExceptionChar, WordExceptionFlags,
  471. kWordAsciiValues);
  472. }