/*
 * @(#)LineBreakData.java 1.17 00/01/19
 *
 * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
 *
 * This software is the proprietary information of Sun Microsystems, Inc.
 * Use is subject to license terms.
 *
 */
/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */
package java.text;
  25. /**
  26. * The LineBreakData contains data used by SimpleTextBoundary
  27. * to determine line breaks.
  28. * @see #BreakIterator
  29. */
  30. final class LineBreakData extends TextBoundaryData
  31. {
  32. // THEORY OF OPERATION: This class contains all the tables necessary to do
  33. // character-break iteration. This class descends from TextBoundaryData, which
  34. // is abstract. This class doesn't define any non-static members; it inherits the
  35. // non-static members from TextBoundaryData and fills them in with pointers to
  36. // the static members defined here.
  37. // There are two main parts to a TextBoundaryData object: the state-transition
  38. // tables and the character-mapping tables. The forward state table defines the
  39. // transitions for a deterministic finite state machine that locates character
  40. // boundaries. The rows are the states and the columns are character categories.
  41. // The cell values consist of two parts: The first is the row number of the next
  42. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  43. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  44. // fill it with STOP values by convention.) The second part is a flag indicating
  45. // whether the iterator should update its break position on this transition. When
  46. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  47. // the flag bit being turned on-- we do it this way rather than just using negative
  48. // numbers because we still need to see the SI flag when the value of the transition
  49. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  50. // is 1.
  51. // The backward state table works the same way as the forward state table, but is
  52. // usually simplified. The iterator uses the backward state table only to find a
  53. // "safe place" to start iterating forward. It then seeks forward from the "safe
  54. // place" to the actual break position using the forward table. A "safe place" is
  55. // a spot in the text that is guaranteed to be a break position.
  56. // The character-category mapping tables are split into several pieces, one for
  57. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  58. // character categories to the character categories used by this break iterator.
  59. // The index of the array is the Unicode category number as returned by
  60. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  61. // indicating whether all the characters in the Unicode category have the
  62. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  63. // an entry is true, then we find the right category using... 3) The kExceptionChar
  64. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  65. // a range of contiguous characters that share the same category and the category
  66. // number. This list is binary-searched to find an entry corresponding to the
  67. // charactre being mapped. Only characters whose breaking category is different from
  68. // the raw-mapping value (the breaking category for their Unicode category) are
  69. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  70. // in the Latin1 range. This table maps straight from a character value to a
  71. // category number, bypassing all the other tables. The programmer must take care
  72. // that all of the different category-mapping tables are consistent.
  73. // In the current implementation, all of these tables are created and maintained
  74. // by hand, not using a tool.
  75. private static final byte BREAK = 0;
  76. //always breaks (must be present as first item)
  77. private static final byte blank = 1;
  78. //spaces, tabs, nulls.
  79. private static final byte cr = 2;
  80. //carriage return
  81. private static final byte nonBlank = 3;
  82. //everything not included elsewhere
  83. private static final byte op = 4;
  84. //hyphens....
  85. private static final byte jwrd = 5;
  86. //hiragana, katakana, and kanji
  87. private static final byte preJwrd = 6;
  88. //characters that bind to the beginning of a Japanese word
  89. private static final byte postJwrd = 7;
  90. //characters that bind to the end of a Japanese word
  91. private static final byte digit = 8;
  92. //digits
  93. private static final byte numPunct = 9;
  94. //punctuation that can appear within a number
  95. private static final byte currency = 10;
  96. //currency symbols that can precede a number
  97. private static final byte quote = 11;
  98. // the ASCII quotation mark
  99. private static final byte nsm = 12;
  100. // non-spacing marks
  101. private static final byte nbsp = 13;
  102. // non-breaking characters
  103. private static final byte EOS = 14;
  104. private static final int COL_COUNT = 15;
  105. private static final byte SI = (byte)0x80;
  106. private static final byte STOP = (byte) 0;
  107. private static final byte SI_STOP = (byte)SI + STOP;
  108. public LineBreakData() {
  109. super(kLineForward, kLineBackward, kLineMap);
  110. }
  111. // This table locates legal line-break positions. i.e., a process that word-wraps a line of
  112. // text can use this version of the BreakIterator to tell it where the legal places for
  113. // breaking a line are.
  114. // The rules implemented here are as follows:
  115. // 1) There is always a legal break position after a line or paragraph separator, but
  116. // one can occur before only when the preceding character is also a line or paragraph
  117. // separator. (The CR-LF sequence is also kept together.) (states 4 and 7)
  118. // 2) There is never a break before a non-spacing mark, unless it's preceded by a line
  119. // or paragraph separator. (the nsm column)
  120. // 3) There is never a break on either side of a non-breaking space (or other non-breaking
  121. // chartacters). (the nbsp column, and state 1)
  122. // 4) There is always a break before and after Kanji and Kana characters, except for certain
  123. // punctuation that must be kept with the following character and certain punctuation
  124. // and diacritic marks that must be kept with the preceding character. (states 5 and 8)
  125. // 5) There is always a legal break position following a dash, except when it is followed
  126. // by a digit, a line/paragraph separator, or whitespace. (state 6)
  127. // 6) There is never a break before a whitespace character. There is a break after a
  128. // whitespace character, except when it's followed by a line/paragraph separator.
  129. // (state 2)
  130. // 7) Breaks don't occur anywhere else. (state 1)
  131. private static final byte kLineForwardData[] =
  132. {
  133. // brk bl cr nBl
  134. // op kan prJ poJ
  135. // dgt np curr quote
  136. // nsm nbsp EOS
  137. // 00 - dummy state
  138. STOP, STOP, STOP, STOP,
  139. STOP, STOP, STOP, STOP,
  140. STOP, STOP, STOP, STOP,
  141. STOP, STOP, STOP,
  142. // 01 - main dispatch state. This state eats pre-Kanji punctuation,
  143. // non-breaking spaces, and non-spacing diacritics without transitioning
  144. // to other states.
  145. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  146. (byte)(SI+6), (byte)(SI+5), (byte)(SI+1), (byte)(SI+8),
  147. (byte)(SI+9), (byte)(SI+8), (byte)(SI+1), (byte)(SI+3),
  148. (byte)(SI+1), (byte)(SI+1), SI_STOP,
  149. // 02 - This state eats whitespce and stops on almost anything else
  150. // (the exceptions are non-breaking spaces, which go back to 1,
  151. // and CRs and LFs)
  152. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  153. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  154. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  155. (byte)(SI+2), (byte)(SI+1), SI_STOP,
  156. // 03 - This state eats non-whitespace characters that aren't
  157. // otherwise accounted for. The only difference between
  158. // this and state 1 is that it stops on Kanji (you can break
  159. // between any two Kanji characters)
  160. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  161. (byte)(SI+6), SI_STOP, (byte)(SI+1), (byte)(SI+8),
  162. (byte)(SI+9), (byte)(SI+8), (byte)(SI+1), (byte)(SI+3),
  163. (byte)(SI+3), (byte)(SI+1), SI_STOP,
  164. // 04 - this is the state you go to when you see a hard line-
  165. // breaking character. It eats that character and stops.
  166. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  167. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  168. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  169. SI_STOP, SI_STOP, SI_STOP,
  170. // 05 - this is the state that handles Kanji. It handles
  171. // post-Kanji punctuation, whitespace, non-breaking spaces,
  172. // and line terminators, but stops on everything else
  173. // (including more Kanji)
  174. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  175. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  176. SI_STOP, (byte)(SI+8), SI_STOP, SI_STOP,
  177. (byte)(SI+5), (byte)(SI+1), SI_STOP,
  178. // 06 - This state handles dashes. It'll continue on
  179. // whitespace, more dashes, line terminators, and digits
  180. // (the dash is a minus sign), but stops on everything else
  181. // (unless there's an nbsp, a dash is always a legal
  182. // break position).
  183. (byte)(SI+4), SI_STOP, (byte)(SI+7), SI_STOP,
  184. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  185. (byte)(SI+9), SI_STOP, (byte)(SI+11), SI_STOP,
  186. (byte)(SI+6), (byte)(SI+1), SI_STOP,
  187. // 07 - This state handles CRs. A CR is a line terminator
  188. // when it appears alone, and considered "half" a line
  189. // terminator when it occurs right before any other line
  190. // terminator (except another CR).
  191. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP,
  192. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  193. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  194. SI_STOP, SI_STOP, SI_STOP,
  195. // 08 - This state eats post-Kanji punctuation, and passes
  196. // whitespace, non-breaking characters, dashes, line terminators,
  197. // etc. It stops on almost everything else.
  198. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  199. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  200. SI_STOP, (byte)(SI+8), SI_STOP, (byte)(SI+3),
  201. (byte)(SI+8), (byte)(SI+1), SI_STOP,
  202. // 09 - This state is the main "number" state. It eats
  203. // digits.
  204. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  205. (byte)(SI+6), SI_STOP, SI_STOP, (byte)(SI+8),
  206. (byte)(SI+9), (byte)(SI+10), (byte)(SI+10), (byte)(SI+3),
  207. (byte)(SI+9), (byte)(SI+1), SI_STOP,
  208. // 10 - This state is the secondary "number" state. It
  209. // easts punctuation that can occur inside a number.
  210. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  211. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  212. (byte)(SI+9), (byte)(SI+8), SI_STOP, SI_STOP,
  213. (byte)(SI+10), (byte)(SI+1), SI_STOP,
  214. // 11 - This state is here to allow a dash to go before a
  215. // currency symbol and still be treated as a minus sign
  216. // (if the character after the currency symbol is a digit).
  217. STOP, STOP, STOP, STOP,
  218. STOP, STOP, STOP, STOP,
  219. (byte)(SI+9), STOP, STOP, STOP,
  220. (byte)(11), (byte)(SI+1), STOP
  221. };
  222. private static final WordBreakTable kLineForward
  223. = new WordBreakTable(COL_COUNT, kLineForwardData);
  224. // This table locates unambiguous break positions when iterating backward.
  225. // It implements the following rules:
  226. // 1) For most characters, there is a break before them if they're preceded
  227. // by whitespace, Kanji, or a line/paragraph separator. (CR-LF is kept together)
  228. // 2) There is a break before a Kanji character, except when it's preceded by
  229. // a Kanji-prefix character. (state 4)
  230. // 3) There is NOT a break before a Kanji-suffix character, except when preceded
  231. // by whitespace, a line/paragraph separator, or a dash. (state 3)
  232. // 4) There is never a break on either side of a non-break character. (the nbsp column)
  233. // 5) There is never a break before a non-spacing mark (the nsm column)
  234. // [In this set of rules, "break" means "unambiguous break position". There may sometimes
  235. // be actual breaks in positions this table always skips.]
  236. private static final byte kLineBackwardData[] =
  237. {
  238. // brk bl cr nBl
  239. // op kan prJ poJ
  240. // dgt np curr quote
  241. // nsm nbsp EOS
  242. /*00*/
  243. STOP, STOP, STOP, STOP,
  244. STOP, STOP, STOP, STOP,
  245. STOP, STOP, STOP, STOP,
  246. STOP, STOP, STOP,
  247. /*01*/
  248. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+2),
  249. (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+3),
  250. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  251. (byte)(SI+1), (byte)(SI+2), STOP,
  252. /*02*/
  253. STOP, STOP, STOP, (byte)(SI+2),
  254. (byte)(SI+2), STOP, (byte)(SI+2), (byte)(SI+3),
  255. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  256. (byte)(SI+2), (byte)(SI+2), STOP,
  257. /*03*/
  258. STOP, STOP, STOP, (byte)(SI+2),
  259. STOP, (byte)(SI+4), (byte)(SI+2), (byte)(SI+3),
  260. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  261. (byte)(SI+3), (byte)(SI+2), STOP,
  262. /*04*/
  263. STOP, STOP, STOP, STOP,
  264. STOP, STOP, (byte)(SI+2), STOP,
  265. STOP, STOP, (byte)(SI+2), STOP,
  266. (byte)(SI+4), (byte)(SI+4), STOP
  267. };
  268. private static final WordBreakTable kLineBackward
  269. = new WordBreakTable(COL_COUNT, kLineBackwardData);
  270. private static final int kRawMapping[] =
  271. {
  272. nonBlank, //UNASSIGNED = 0,
  273. nonBlank, //UPPERCASE_LETTER = 1,
  274. nonBlank, //LOWERCASE_LETTER = 2,
  275. nonBlank, //TITLECASE_LETTER = 3,
  276. nonBlank, //MODIFIER_LETTER = 4,
  277. nonBlank, //OTHER_LETTER = 5,
  278. nsm, //NON_SPACING_MARK = 6,
  279. nsm, //ENCLOSING_MARK = 7,
  280. nonBlank, //COMBINING_SPACING_MARK = 8,
  281. digit, //DECIMAL_DIGIT_NUMBER = 9,
  282. nonBlank, //LETTER_NUMBER = 10,
  283. digit, //OTHER_NUMBER = 11,
  284. blank, //SPACE_SEPARATOR = 12,
  285. blank, //LINE_SEPARATOR = 13,
  286. blank, //PARAGRAPH_SEPARATOR = 14, ???????????
  287. blank, //CONTROL = 15,
  288. nonBlank, //PRIVATE_USE = 16,
  289. nonBlank, //FORMAT = 17
  290. nonBlank, //???? = 18,
  291. nonBlank, //SURROGATE = 19,
  292. op, //DASH_PUNCTUATION = 20,
  293. preJwrd, //START_PUNCTUATION = 21,
  294. postJwrd, //END_PUNCTUATION = 22,
  295. nonBlank, //CONNECTOR_PUNCTUATION = 23,
  296. nonBlank, //OTHER_PUNCTUATION = 24,
  297. nonBlank, //MATH_SYMBOL = 25,
  298. preJwrd, //CURRENCY_SYMBOL = 26,
  299. nonBlank, //MODIFIER_SYMBOL = 27,
  300. nonBlank //OTHER_SYMBOL = 28;
  301. };
  302. private static SpecialMapping kExceptionChar[] =
  303. {
  304. //note: the ranges in this table must be sorted in ascending order as
  305. // required by the UnicodeClassMapping class.
  306. new SpecialMapping(ASCII_END_OF_TEXT, BREAK),
  307. new SpecialMapping(ASCII_HORIZONTAL_TABULATION,
  308. ASCII_FORM_FEED, BREAK),
  309. new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
  310. new SpecialMapping(ASCII_EXCLAMATION_MARK, postJwrd),
  311. new SpecialMapping(ASCII_QUOTATION_MARK, quote),
  312. new SpecialMapping(ASCII_DOLLAR_SIGN, preJwrd),
  313. new SpecialMapping(ASCII_PERCENT, postJwrd),
  314. new SpecialMapping(ASCII_COMMA, numPunct),
  315. new SpecialMapping(ASCII_FULL_STOP, numPunct),
  316. new SpecialMapping(ASCII_COLON, ASCII_SEMICOLON, postJwrd),
  317. new SpecialMapping(ASCII_QUESTION_MARK, postJwrd),
  318. new SpecialMapping(ASCII_NONBREAKING_SPACE, nbsp),
  319. new SpecialMapping(ASCII_CENT_SIGN, postJwrd),
  320. new SpecialMapping(LATIN1_SOFTHYPHEN, op),
  321. new SpecialMapping(LATIN1_DEGREE_SIGN, postJwrd),
  322. new SpecialMapping(ARABIC_PERCENT_SIGN, postJwrd),
  323. new SpecialMapping(FIGURE_SPACE, nbsp),
  324. new SpecialMapping(NONBREAKING_HYPHEN, nbsp),
  325. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
  326. PUNCTUATION_PARAGRAPH_SEPARATOR, BREAK),
  327. new SpecialMapping(PER_MILLE_SIGN, postJwrd),
  328. new SpecialMapping(PER_TEN_THOUSAND_SIGN, postJwrd),
  329. new SpecialMapping(PRIME, TRIPLE_PRIME, postJwrd),
  330. new SpecialMapping(DEGREE_CELSIUS, postJwrd),
  331. new SpecialMapping(DEGREE_FAHRENHEIT, postJwrd),
  332. new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_COMMA,
  333. PUNCTUATION_IDEOGRAPHIC_FULL_STOP, postJwrd),
  334. new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, postJwrd),
  335. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, postJwrd),
  336. new SpecialMapping(HIRAGANA_LETTER_A, jwrd),
  337. new SpecialMapping(HIRAGANA_LETTER_SMALL_I, postJwrd),
  338. new SpecialMapping(HIRAGANA_LETTER_I, jwrd),
  339. new SpecialMapping(HIRAGANA_LETTER_SMALL_U, postJwrd),
  340. new SpecialMapping(HIRAGANA_LETTER_U, jwrd),
  341. new SpecialMapping(HIRAGANA_LETTER_SMALL_E, postJwrd),
  342. new SpecialMapping(HIRAGANA_LETTER_E, jwrd),
  343. new SpecialMapping(HIRAGANA_LETTER_SMALL_O, postJwrd),
  344. new SpecialMapping(HIRAGANA_LETTER_O, HIRAGANA_LETTER_DI, jwrd),
  345. new SpecialMapping(HIRAGANA_LETTER_SMALL_TU, postJwrd),
  346. new SpecialMapping(HIRAGANA_LETTER_TU, HIRAGANA_LETTER_MO, jwrd),
  347. new SpecialMapping(HIRAGANA_LETTER_SMALL_YA, postJwrd),
  348. new SpecialMapping(HIRAGANA_LETTER_YA, jwrd),
  349. new SpecialMapping(HIRAGANA_LETTER_SMALL_YU, postJwrd),
  350. new SpecialMapping(HIRAGANA_LETTER_YU, jwrd),
  351. new SpecialMapping(HIRAGANA_LETTER_SMALL_YO, postJwrd),
  352. new SpecialMapping(HIRAGANA_LETTER_YO, HIRAGANA_LETTER_RO, jwrd),
  353. new SpecialMapping(HIRAGANA_LETTER_SMALL_WA, postJwrd),
  354. new SpecialMapping(HIRAGANA_LETTER_WA, HIRAGANA_LETTER_VU, jwrd),
  355. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  356. HIRAGANA_SEMIVOICED_SOUND_MARK, postJwrd),
  357. new SpecialMapping(HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK, postJwrd),
  358. new SpecialMapping(KATAKANA_LETTER_SMALL_A, postJwrd),
  359. new SpecialMapping(KATAKANA_LETTER_A, jwrd),
  360. new SpecialMapping(KATAKANA_LETTER_SMALL_I, postJwrd),
  361. new SpecialMapping(KATAKANA_LETTER_I, jwrd),
  362. new SpecialMapping(KATAKANA_LETTER_SMALL_U, postJwrd),
  363. new SpecialMapping(KATAKANA_LETTER_U, jwrd),
  364. new SpecialMapping(KATAKANA_LETTER_SMALL_E, postJwrd),
  365. new SpecialMapping(KATAKANA_LETTER_E, jwrd),
  366. new SpecialMapping(KATAKANA_LETTER_SMALL_O, postJwrd),
  367. new SpecialMapping(KATAKANA_LETTER_O, KATAKANA_LETTER_DI, jwrd),
  368. new SpecialMapping(KATAKANA_LETTER_SMALL_TU, postJwrd),
  369. new SpecialMapping(KATAKANA_LETTER_TU, KATAKANA_LETTER_MO, jwrd),
  370. new SpecialMapping(KATAKANA_LETTER_SMALL_YA, postJwrd),
  371. new SpecialMapping(KATAKANA_LETTER_YA, jwrd),
  372. new SpecialMapping(KATAKANA_LETTER_SMALL_YU, postJwrd),
  373. new SpecialMapping(KATAKANA_LETTER_YU, jwrd),
  374. new SpecialMapping(KATAKANA_LETTER_SMALL_YO, postJwrd),
  375. new SpecialMapping(KATAKANA_LETTER_YO, KATAKANA_LETTER_RO, jwrd),
  376. new SpecialMapping(KATAKANA_LETTER_SMALL_WA, postJwrd),
  377. new SpecialMapping(KATAKANA_LETTER_WA, KATAKANA_LETTER_VU, jwrd),
  378. new SpecialMapping(KATAKANA_LETTER_SMALL_KA, KATAKANA_LETTER_SMALL_KE, postJwrd),
  379. new SpecialMapping(KATAKANA_LETTER_VA, KATAKANA_LETTER_VO, jwrd),
  380. new SpecialMapping(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK, postJwrd),
  381. new SpecialMapping(KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK, postJwrd),
  382. new SpecialMapping(UNICODE_LOW_BOUND_HAN,UNICODE_HIGH_BOUND_HAN,jwrd),
  383. new SpecialMapping(CJK_COMPATIBILITY_F900,
  384. CJK_COMPATIBILITY_FA2D, jwrd),
  385. new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, nbsp),
  386. new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, postJwrd),
  387. new SpecialMapping(FULLWIDTH_COMMA, postJwrd),
  388. new SpecialMapping(FULLWIDTH_FULL_STOP, postJwrd),
  389. new SpecialMapping(FULLWIDTH_QUESTION_MARK, postJwrd),
  390. new SpecialMapping(END_OF_STRING, EOS)
  391. };
  392. private static final boolean LineExceptionFlags[] = {
  393. false, // kNonCharacter = 0,
  394. false, // kUppercaseLetter = 1,
  395. false, // kLowercaseLetter = 2,
  396. false, // kTitlecaseLetter = 3,
  397. true, // kModifierLetter = 4,
  398. true, // kOtherLetter = 5,
  399. true, // kNonSpacingMark = 6,
  400. false, // kEnclosingMark = 7,
  401. false, // kCombiningSpacingMark = 8,
  402. false, // kDecimalNumber = 9,
  403. false, // kLetterNumber = 10,
  404. false, // kOtherNumber = 11,
  405. true, // kSpaceSeparator = 12,
  406. true, // kLineSeparator = 13,
  407. true, // kParagraphSeparator = 14,
  408. true, // kControlCharacter = 15,
  409. true, // kFormatCharacter = 16,
  410. false, // UNDEFINED = 17,
  411. false, // kPrivateUseCharacter = 18,
  412. false, // kSurrogate = 19,
  413. true, // kDashPunctuation = 20,
  414. false, // kOpenPunctuation = 21,
  415. false, // kClosePunctuation = 22,
  416. false, // kConnectorPunctuation = 23,
  417. true, // kOtherPunctuation = 24,
  418. false, // kMathSymbol = 25,
  419. true, // kCurrencySymbol = 26,
  420. false, // kModifierSymbol = 27,
  421. true // kOtherSymbol = 28
  422. };
  423. private static final int kLineAsciiValues[] = {
  424. // null soh stx etx eot enq ask bell
  425. blank, blank, blank, BREAK, blank, blank, blank, blank,
  426. // bs ht lf vt ff cr so si
  427. blank, BREAK, BREAK, BREAK, BREAK, cr, blank, blank,
  428. // dle dc1 dc2 dc3 dc4 nak syn etb
  429. blank, blank, blank, blank, blank, blank, blank, blank,
  430. // can em sub esc fs gs rs us
  431. blank, blank, blank, blank, blank, blank, blank, blank,
  432. // sp ! " # $ % & '
  433. blank, postJwrd, quote, nonBlank, currency, postJwrd, nonBlank, nonBlank,
  434. // ( ) * + , - . /
  435. preJwrd, postJwrd, nonBlank, nonBlank, numPunct, op, numPunct, nonBlank,
  436. // 0 1 2 3 4 5 6 7
  437. digit, digit, digit, digit, digit, digit, digit, digit,
  438. // 8 9 : ; < = > ?
  439. digit, digit, postJwrd, postJwrd, nonBlank, nonBlank, nonBlank, postJwrd,
  440. // @ A B C D E F G
  441. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  442. // H I J K L M N O
  443. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  444. // P Q R S T U V W
  445. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  446. // X Y Z [ \ ] ^ _
  447. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, postJwrd, nonBlank, nonBlank,
  448. // ` a b c d e f g
  449. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  450. // h i j k l m n o
  451. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  452. // p q r s t u v w
  453. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  454. // x y z { | } ~ del
  455. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, postJwrd, nonBlank, blank,
  456. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  457. blank, blank, blank, blank, blank, blank, blank, blank,
  458. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  459. blank, blank, blank, blank, blank, blank, blank, blank,
  460. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  461. blank, blank, blank, blank, blank, blank, blank, blank,
  462. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  463. blank, blank, blank, blank, blank, blank, blank, blank,
  464. // nbsp inv-! cents pounds currency yen broken-bar section
  465. nbsp, nonBlank, postJwrd, currency, currency, currency, nonBlank, nonBlank,
  466. // umlaut copyright super-a gui-left not soft-hyph registered macron
  467. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, op, nonBlank, nonBlank,
  468. // degree +/- super-2 super-3 acute micro paragraph bullet
  469. postJwrd, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  470. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  471. nonBlank, nonBlank, nonBlank, postJwrd, digit, digit, digit, nonBlank,
  472. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  473. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  474. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  475. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  476. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  477. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  478. // O=slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  479. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  480. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  481. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  482. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  483. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  484. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  485. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  486. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y=umlaut
  487. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank
  488. };
  489. private static final UnicodeClassMapping kLineMap
  490. = new UnicodeClassMapping(kRawMapping, kExceptionChar, LineExceptionFlags,
  491. kLineAsciiValues);
  492. }