/*
 * @(#)LineBreakData.java 1.20 03/01/23
 *
 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */
package java.text;
  22. /**
  23. * The LineBreakData contains data used by SimpleTextBoundary
  24. * to determine line breaks.
  25. * @see #BreakIterator
  26. */
  27. final class LineBreakData extends TextBoundaryData
  28. {
  29. // THEORY OF OPERATION: This class contains all the tables necessary to do
  30. // character-break iteration. This class descends from TextBoundaryData, which
  31. // is abstract. This class doesn't define any non-static members; it inherits the
  32. // non-static members from TextBoundaryData and fills them in with pointers to
  33. // the static members defined here.
  34. // There are two main parts to a TextBoundaryData object: the state-transition
  35. // tables and the character-mapping tables. The forward state table defines the
  36. // transitions for a deterministic finite state machine that locates character
  37. // boundaries. The rows are the states and the columns are character categories.
  38. // The cell values consist of two parts: The first is the row number of the next
  39. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  40. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  41. // fill it with STOP values by convention.) The second part is a flag indicating
  42. // whether the iterator should update its break position on this transition. When
  43. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  44. // the flag bit being turned on-- we do it this way rather than just using negative
  45. // numbers because we still need to see the SI flag when the value of the transition
  46. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  47. // is 1.
  48. // The backward state table works the same way as the forward state table, but is
  49. // usually simplified. The iterator uses the backward state table only to find a
  50. // "safe place" to start iterating forward. It then seeks forward from the "safe
  51. // place" to the actual break position using the forward table. A "safe place" is
  52. // a spot in the text that is guaranteed to be a break position.
  53. // The character-category mapping tables are split into several pieces, one for
  54. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  55. // character categories to the character categories used by this break iterator.
  56. // The index of the array is the Unicode category number as returned by
  57. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  58. // indicating whether all the characters in the Unicode category have the
  59. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  60. // an entry is true, then we find the right category using... 3) The kExceptionChar
  61. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  62. // a range of contiguous characters that share the same category and the category
  63. // number. This list is binary-searched to find an entry corresponding to the
  64. // charactre being mapped. Only characters whose breaking category is different from
  65. // the raw-mapping value (the breaking category for their Unicode category) are
  66. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  67. // in the Latin1 range. This table maps straight from a character value to a
  68. // category number, bypassing all the other tables. The programmer must take care
  69. // that all of the different category-mapping tables are consistent.
  70. // In the current implementation, all of these tables are created and maintained
  71. // by hand, not using a tool.
  72. private static final byte BREAK = 0;
  73. //always breaks (must be present as first item)
  74. private static final byte blank = 1;
  75. //spaces, tabs, nulls.
  76. private static final byte cr = 2;
  77. //carriage return
  78. private static final byte nonBlank = 3;
  79. //everything not included elsewhere
  80. private static final byte op = 4;
  81. //hyphens....
  82. private static final byte jwrd = 5;
  83. //hiragana, katakana, and kanji
  84. private static final byte preJwrd = 6;
  85. //characters that bind to the beginning of a Japanese word
  86. private static final byte postJwrd = 7;
  87. //characters that bind to the end of a Japanese word
  88. private static final byte digit = 8;
  89. //digits
  90. private static final byte numPunct = 9;
  91. //punctuation that can appear within a number
  92. private static final byte currency = 10;
  93. //currency symbols that can precede a number
  94. private static final byte quote = 11;
  95. // the ASCII quotation mark
  96. private static final byte nsm = 12;
  97. // non-spacing marks
  98. private static final byte nbsp = 13;
  99. // non-breaking characters
  100. private static final byte EOS = 14;
  101. private static final int COL_COUNT = 15;
  102. private static final byte SI = (byte)0x80;
  103. private static final byte STOP = (byte) 0;
  104. private static final byte SI_STOP = (byte)SI + STOP;
  105. public LineBreakData() {
  106. super(kLineForward, kLineBackward, kLineMap);
  107. }
  108. // This table locates legal line-break positions. i.e., a process that word-wraps a line of
  109. // text can use this version of the BreakIterator to tell it where the legal places for
  110. // breaking a line are.
  111. // The rules implemented here are as follows:
  112. // 1) There is always a legal break position after a line or paragraph separator, but
  113. // one can occur before only when the preceding character is also a line or paragraph
  114. // separator. (The CR-LF sequence is also kept together.) (states 4 and 7)
  115. // 2) There is never a break before a non-spacing mark, unless it's preceded by a line
  116. // or paragraph separator. (the nsm column)
  117. // 3) There is never a break on either side of a non-breaking space (or other non-breaking
  118. // chartacters). (the nbsp column, and state 1)
  119. // 4) There is always a break before and after Kanji and Kana characters, except for certain
  120. // punctuation that must be kept with the following character and certain punctuation
  121. // and diacritic marks that must be kept with the preceding character. (states 5 and 8)
  122. // 5) There is always a legal break position following a dash, except when it is followed
  123. // by a digit, a line/paragraph separator, or whitespace. (state 6)
  124. // 6) There is never a break before a whitespace character. There is a break after a
  125. // whitespace character, except when it's followed by a line/paragraph separator.
  126. // (state 2)
  127. // 7) Breaks don't occur anywhere else. (state 1)
  128. private static final byte kLineForwardData[] =
  129. {
  130. // brk bl cr nBl
  131. // op kan prJ poJ
  132. // dgt np curr quote
  133. // nsm nbsp EOS
  134. // 00 - dummy state
  135. STOP, STOP, STOP, STOP,
  136. STOP, STOP, STOP, STOP,
  137. STOP, STOP, STOP, STOP,
  138. STOP, STOP, STOP,
  139. // 01 - main dispatch state. This state eats pre-Kanji punctuation,
  140. // non-breaking spaces, and non-spacing diacritics without transitioning
  141. // to other states.
  142. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  143. (byte)(SI+6), (byte)(SI+5), (byte)(SI+1), (byte)(SI+8),
  144. (byte)(SI+9), (byte)(SI+8), (byte)(SI+1), (byte)(SI+3),
  145. (byte)(SI+1), (byte)(SI+1), SI_STOP,
  146. // 02 - This state eats whitespce and stops on almost anything else
  147. // (the exceptions are non-breaking spaces, which go back to 1,
  148. // and CRs and LFs)
  149. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  150. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  151. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  152. (byte)(SI+2), (byte)(SI+1), SI_STOP,
  153. // 03 - This state eats non-whitespace characters that aren't
  154. // otherwise accounted for. The only difference between
  155. // this and state 1 is that it stops on Kanji (you can break
  156. // between any two Kanji characters)
  157. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  158. (byte)(SI+6), SI_STOP, (byte)(SI+1), (byte)(SI+8),
  159. (byte)(SI+9), (byte)(SI+8), (byte)(SI+1), (byte)(SI+3),
  160. (byte)(SI+3), (byte)(SI+1), SI_STOP,
  161. // 04 - this is the state you go to when you see a hard line-
  162. // breaking character. It eats that character and stops.
  163. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  164. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  165. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  166. SI_STOP, SI_STOP, SI_STOP,
  167. // 05 - this is the state that handles Kanji. It handles
  168. // post-Kanji punctuation, whitespace, non-breaking spaces,
  169. // and line terminators, but stops on everything else
  170. // (including more Kanji)
  171. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  172. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  173. SI_STOP, (byte)(SI+8), SI_STOP, SI_STOP,
  174. (byte)(SI+5), (byte)(SI+1), SI_STOP,
  175. // 06 - This state handles dashes. It'll continue on
  176. // whitespace, more dashes, line terminators, and digits
  177. // (the dash is a minus sign), but stops on everything else
  178. // (unless there's an nbsp, a dash is always a legal
  179. // break position).
  180. (byte)(SI+4), SI_STOP, (byte)(SI+7), SI_STOP,
  181. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  182. (byte)(SI+9), SI_STOP, (byte)(SI+11), SI_STOP,
  183. (byte)(SI+6), (byte)(SI+1), SI_STOP,
  184. // 07 - This state handles CRs. A CR is a line terminator
  185. // when it appears alone, and considered "half" a line
  186. // terminator when it occurs right before any other line
  187. // terminator (except another CR).
  188. (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP,
  189. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  190. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  191. SI_STOP, SI_STOP, SI_STOP,
  192. // 08 - This state eats post-Kanji punctuation, and passes
  193. // whitespace, non-breaking characters, dashes, line terminators,
  194. // etc. It stops on almost everything else.
  195. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  196. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  197. SI_STOP, (byte)(SI+8), SI_STOP, (byte)(SI+3),
  198. (byte)(SI+8), (byte)(SI+1), SI_STOP,
  199. // 09 - This state is the main "number" state. It eats
  200. // digits.
  201. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), (byte)(SI+3),
  202. (byte)(SI+6), SI_STOP, SI_STOP, (byte)(SI+8),
  203. (byte)(SI+9), (byte)(SI+10), (byte)(SI+10), (byte)(SI+3),
  204. (byte)(SI+9), (byte)(SI+1), SI_STOP,
  205. // 10 - This state is the secondary "number" state. It
  206. // easts punctuation that can occur inside a number.
  207. (byte)(SI+4), (byte)(SI+2), (byte)(SI+7), SI_STOP,
  208. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+8),
  209. (byte)(SI+9), (byte)(SI+8), SI_STOP, SI_STOP,
  210. (byte)(SI+10), (byte)(SI+1), SI_STOP,
  211. // 11 - This state is here to allow a dash to go before a
  212. // currency symbol and still be treated as a minus sign
  213. // (if the character after the currency symbol is a digit).
  214. STOP, STOP, STOP, STOP,
  215. STOP, STOP, STOP, STOP,
  216. (byte)(SI+9), STOP, STOP, STOP,
  217. (byte)(11), (byte)(SI+1), STOP
  218. };
  219. private static final WordBreakTable kLineForward
  220. = new WordBreakTable(COL_COUNT, kLineForwardData);
  221. // This table locates unambiguous break positions when iterating backward.
  222. // It implements the following rules:
  223. // 1) For most characters, there is a break before them if they're preceded
  224. // by whitespace, Kanji, or a line/paragraph separator. (CR-LF is kept together)
  225. // 2) There is a break before a Kanji character, except when it's preceded by
  226. // a Kanji-prefix character. (state 4)
  227. // 3) There is NOT a break before a Kanji-suffix character, except when preceded
  228. // by whitespace, a line/paragraph separator, or a dash. (state 3)
  229. // 4) There is never a break on either side of a non-break character. (the nbsp column)
  230. // 5) There is never a break before a non-spacing mark (the nsm column)
  231. // [In this set of rules, "break" means "unambiguous break position". There may sometimes
  232. // be actual breaks in positions this table always skips.]
  233. private static final byte kLineBackwardData[] =
  234. {
  235. // brk bl cr nBl
  236. // op kan prJ poJ
  237. // dgt np curr quote
  238. // nsm nbsp EOS
  239. /*00*/
  240. STOP, STOP, STOP, STOP,
  241. STOP, STOP, STOP, STOP,
  242. STOP, STOP, STOP, STOP,
  243. STOP, STOP, STOP,
  244. /*01*/
  245. (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+2),
  246. (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+3),
  247. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  248. (byte)(SI+1), (byte)(SI+2), STOP,
  249. /*02*/
  250. STOP, STOP, STOP, (byte)(SI+2),
  251. (byte)(SI+2), STOP, (byte)(SI+2), (byte)(SI+3),
  252. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  253. (byte)(SI+2), (byte)(SI+2), STOP,
  254. /*03*/
  255. STOP, STOP, STOP, (byte)(SI+2),
  256. STOP, (byte)(SI+4), (byte)(SI+2), (byte)(SI+3),
  257. (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
  258. (byte)(SI+3), (byte)(SI+2), STOP,
  259. /*04*/
  260. STOP, STOP, STOP, STOP,
  261. STOP, STOP, (byte)(SI+2), STOP,
  262. STOP, STOP, (byte)(SI+2), STOP,
  263. (byte)(SI+4), (byte)(SI+4), STOP
  264. };
  265. private static final WordBreakTable kLineBackward
  266. = new WordBreakTable(COL_COUNT, kLineBackwardData);
  267. private static final int kRawMapping[] =
  268. {
  269. nonBlank, //UNASSIGNED = 0,
  270. nonBlank, //UPPERCASE_LETTER = 1,
  271. nonBlank, //LOWERCASE_LETTER = 2,
  272. nonBlank, //TITLECASE_LETTER = 3,
  273. nonBlank, //MODIFIER_LETTER = 4,
  274. nonBlank, //OTHER_LETTER = 5,
  275. nsm, //NON_SPACING_MARK = 6,
  276. nsm, //ENCLOSING_MARK = 7,
  277. nonBlank, //COMBINING_SPACING_MARK = 8,
  278. digit, //DECIMAL_DIGIT_NUMBER = 9,
  279. nonBlank, //LETTER_NUMBER = 10,
  280. digit, //OTHER_NUMBER = 11,
  281. blank, //SPACE_SEPARATOR = 12,
  282. blank, //LINE_SEPARATOR = 13,
  283. blank, //PARAGRAPH_SEPARATOR = 14, ???????????
  284. blank, //CONTROL = 15,
  285. nonBlank, //PRIVATE_USE = 16,
  286. nonBlank, //FORMAT = 17
  287. nonBlank, //???? = 18,
  288. nonBlank, //SURROGATE = 19,
  289. op, //DASH_PUNCTUATION = 20,
  290. preJwrd, //START_PUNCTUATION = 21,
  291. postJwrd, //END_PUNCTUATION = 22,
  292. nonBlank, //CONNECTOR_PUNCTUATION = 23,
  293. nonBlank, //OTHER_PUNCTUATION = 24,
  294. nonBlank, //MATH_SYMBOL = 25,
  295. preJwrd, //CURRENCY_SYMBOL = 26,
  296. nonBlank, //MODIFIER_SYMBOL = 27,
  297. nonBlank, //OTHER_SYMBOL = 28,
  298. preJwrd, //INITIAL_QUOTE_PUNCTUATION = 29,
  299. postJwrd, //FINAL_QUOTE_PUNCTUATION = 30,
  300. };
  301. private static SpecialMapping kExceptionChar[] =
  302. {
  303. //note: the ranges in this table must be sorted in ascending order as
  304. // required by the UnicodeClassMapping class.
  305. new SpecialMapping(ASCII_END_OF_TEXT, BREAK),
  306. new SpecialMapping(ASCII_HORIZONTAL_TABULATION,
  307. ASCII_FORM_FEED, BREAK),
  308. new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
  309. new SpecialMapping(ASCII_EXCLAMATION_MARK, postJwrd),
  310. new SpecialMapping(ASCII_QUOTATION_MARK, quote),
  311. new SpecialMapping(ASCII_DOLLAR_SIGN, preJwrd),
  312. new SpecialMapping(ASCII_PERCENT, postJwrd),
  313. new SpecialMapping(ASCII_COMMA, numPunct),
  314. new SpecialMapping(ASCII_FULL_STOP, numPunct),
  315. new SpecialMapping(ASCII_COLON, ASCII_SEMICOLON, postJwrd),
  316. new SpecialMapping(ASCII_QUESTION_MARK, postJwrd),
  317. new SpecialMapping(ASCII_NONBREAKING_SPACE, nbsp),
  318. new SpecialMapping(ASCII_CENT_SIGN, postJwrd),
  319. new SpecialMapping(LATIN1_SOFTHYPHEN, op),
  320. new SpecialMapping(LATIN1_DEGREE_SIGN, postJwrd),
  321. new SpecialMapping(ARABIC_PERCENT_SIGN, postJwrd),
  322. new SpecialMapping(FIGURE_SPACE, nbsp),
  323. new SpecialMapping(NONBREAKING_HYPHEN, nbsp),
  324. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
  325. PUNCTUATION_PARAGRAPH_SEPARATOR, BREAK),
  326. new SpecialMapping(PER_MILLE_SIGN, postJwrd),
  327. new SpecialMapping(PER_TEN_THOUSAND_SIGN, postJwrd),
  328. new SpecialMapping(PRIME, TRIPLE_PRIME, postJwrd),
  329. new SpecialMapping(DEGREE_CELSIUS, postJwrd),
  330. new SpecialMapping(DEGREE_FAHRENHEIT, postJwrd),
  331. new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_COMMA,
  332. PUNCTUATION_IDEOGRAPHIC_FULL_STOP, postJwrd),
  333. new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, postJwrd),
  334. new SpecialMapping(HIRAGANA_LETTER_SMALL_A, postJwrd),
  335. new SpecialMapping(HIRAGANA_LETTER_A, jwrd),
  336. new SpecialMapping(HIRAGANA_LETTER_SMALL_I, postJwrd),
  337. new SpecialMapping(HIRAGANA_LETTER_I, jwrd),
  338. new SpecialMapping(HIRAGANA_LETTER_SMALL_U, postJwrd),
  339. new SpecialMapping(HIRAGANA_LETTER_U, jwrd),
  340. new SpecialMapping(HIRAGANA_LETTER_SMALL_E, postJwrd),
  341. new SpecialMapping(HIRAGANA_LETTER_E, jwrd),
  342. new SpecialMapping(HIRAGANA_LETTER_SMALL_O, postJwrd),
  343. new SpecialMapping(HIRAGANA_LETTER_O, HIRAGANA_LETTER_DI, jwrd),
  344. new SpecialMapping(HIRAGANA_LETTER_SMALL_TU, postJwrd),
  345. new SpecialMapping(HIRAGANA_LETTER_TU, HIRAGANA_LETTER_MO, jwrd),
  346. new SpecialMapping(HIRAGANA_LETTER_SMALL_YA, postJwrd),
  347. new SpecialMapping(HIRAGANA_LETTER_YA, jwrd),
  348. new SpecialMapping(HIRAGANA_LETTER_SMALL_YU, postJwrd),
  349. new SpecialMapping(HIRAGANA_LETTER_YU, jwrd),
  350. new SpecialMapping(HIRAGANA_LETTER_SMALL_YO, postJwrd),
  351. new SpecialMapping(HIRAGANA_LETTER_YO, HIRAGANA_LETTER_RO, jwrd),
  352. new SpecialMapping(HIRAGANA_LETTER_SMALL_WA, postJwrd),
  353. new SpecialMapping(HIRAGANA_LETTER_WA, HIRAGANA_LETTER_VU, jwrd),
  354. new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  355. HIRAGANA_SEMIVOICED_SOUND_MARK, postJwrd),
  356. new SpecialMapping(HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK, postJwrd),
  357. new SpecialMapping(KATAKANA_LETTER_SMALL_A, postJwrd),
  358. new SpecialMapping(KATAKANA_LETTER_A, jwrd),
  359. new SpecialMapping(KATAKANA_LETTER_SMALL_I, postJwrd),
  360. new SpecialMapping(KATAKANA_LETTER_I, jwrd),
  361. new SpecialMapping(KATAKANA_LETTER_SMALL_U, postJwrd),
  362. new SpecialMapping(KATAKANA_LETTER_U, jwrd),
  363. new SpecialMapping(KATAKANA_LETTER_SMALL_E, postJwrd),
  364. new SpecialMapping(KATAKANA_LETTER_E, jwrd),
  365. new SpecialMapping(KATAKANA_LETTER_SMALL_O, postJwrd),
  366. new SpecialMapping(KATAKANA_LETTER_O, KATAKANA_LETTER_DI, jwrd),
  367. new SpecialMapping(KATAKANA_LETTER_SMALL_TU, postJwrd),
  368. new SpecialMapping(KATAKANA_LETTER_TU, KATAKANA_LETTER_MO, jwrd),
  369. new SpecialMapping(KATAKANA_LETTER_SMALL_YA, postJwrd),
  370. new SpecialMapping(KATAKANA_LETTER_YA, jwrd),
  371. new SpecialMapping(KATAKANA_LETTER_SMALL_YU, postJwrd),
  372. new SpecialMapping(KATAKANA_LETTER_YU, jwrd),
  373. new SpecialMapping(KATAKANA_LETTER_SMALL_YO, postJwrd),
  374. new SpecialMapping(KATAKANA_LETTER_YO, KATAKANA_LETTER_RO, jwrd),
  375. new SpecialMapping(KATAKANA_LETTER_SMALL_WA, postJwrd),
  376. new SpecialMapping(KATAKANA_LETTER_WA, KATAKANA_LETTER_VU, jwrd),
  377. new SpecialMapping(KATAKANA_LETTER_SMALL_KA, KATAKANA_LETTER_SMALL_KE, postJwrd),
  378. new SpecialMapping(KATAKANA_LETTER_VA, KATAKANA_LETTER_VO, jwrd),
  379. new SpecialMapping(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK, postJwrd),
  380. new SpecialMapping(KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK, postJwrd),
  381. new SpecialMapping(UNICODE_LOW_BOUND_HAN,UNICODE_HIGH_BOUND_HAN,jwrd),
  382. new SpecialMapping(CJK_COMPATIBILITY_F900,
  383. CJK_COMPATIBILITY_FA2D, jwrd),
  384. new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, nbsp),
  385. new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, postJwrd),
  386. new SpecialMapping(FULLWIDTH_COMMA, postJwrd),
  387. new SpecialMapping(FULLWIDTH_FULL_STOP, postJwrd),
  388. new SpecialMapping(FULLWIDTH_QUESTION_MARK, postJwrd),
  389. new SpecialMapping(END_OF_STRING, EOS)
  390. };
  391. private static final boolean LineExceptionFlags[] = {
  392. false, // kNonCharacter = 0,
  393. false, // kUppercaseLetter = 1,
  394. false, // kLowercaseLetter = 2,
  395. false, // kTitlecaseLetter = 3,
  396. true, // kModifierLetter = 4,
  397. true, // kOtherLetter = 5,
  398. true, // kNonSpacingMark = 6,
  399. false, // kEnclosingMark = 7,
  400. false, // kCombiningSpacingMark = 8,
  401. false, // kDecimalNumber = 9,
  402. false, // kLetterNumber = 10,
  403. false, // kOtherNumber = 11,
  404. true, // kSpaceSeparator = 12,
  405. true, // kLineSeparator = 13,
  406. true, // kParagraphSeparator = 14,
  407. true, // kControlCharacter = 15,
  408. true, // kFormatCharacter = 16,
  409. false, // UNDEFINED = 17,
  410. false, // kPrivateUseCharacter = 18,
  411. false, // kSurrogate = 19,
  412. true, // kDashPunctuation = 20,
  413. false, // kOpenPunctuation = 21,
  414. false, // kClosePunctuation = 22,
  415. false, // kConnectorPunctuation = 23,
  416. true, // kOtherPunctuation = 24,
  417. false, // kMathSymbol = 25,
  418. true, // kCurrencySymbol = 26,
  419. false, // kModifierSymbol = 27,
  420. true, // kOtherSymbol = 28,
  421. false, // kInitialQuotePunctuation = 29,
  422. false, // kFinalQuotePunctuation = 30,
  423. };
  424. private static final int kLineAsciiValues[] = {
  425. // null soh stx etx eot enq ask bell
  426. blank, blank, blank, BREAK, blank, blank, blank, blank,
  427. // bs ht lf vt ff cr so si
  428. blank, BREAK, BREAK, BREAK, BREAK, cr, blank, blank,
  429. // dle dc1 dc2 dc3 dc4 nak syn etb
  430. blank, blank, blank, blank, blank, blank, blank, blank,
  431. // can em sub esc fs gs rs us
  432. blank, blank, blank, blank, blank, blank, blank, blank,
  433. // sp ! " # $ % & '
  434. blank, postJwrd, quote, nonBlank, currency, postJwrd, nonBlank, nonBlank,
  435. // ( ) * + , - . /
  436. preJwrd, postJwrd, nonBlank, nonBlank, numPunct, op, numPunct, nonBlank,
  437. // 0 1 2 3 4 5 6 7
  438. digit, digit, digit, digit, digit, digit, digit, digit,
  439. // 8 9 : ; < = > ?
  440. digit, digit, postJwrd, postJwrd, nonBlank, nonBlank, nonBlank, postJwrd,
  441. // @ A B C D E F G
  442. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  443. // H I J K L M N O
  444. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  445. // P Q R S T U V W
  446. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  447. // X Y Z [ \ ] ^ _
  448. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, postJwrd, nonBlank, nonBlank,
  449. // ` a b c d e f g
  450. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  451. // h i j k l m n o
  452. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  453. // p q r s t u v w
  454. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  455. // x y z { | } ~ del
  456. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, postJwrd, nonBlank, blank,
  457. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  458. blank, blank, blank, blank, blank, blank, blank, blank,
  459. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  460. blank, blank, blank, blank, blank, blank, blank, blank,
  461. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  462. blank, blank, blank, blank, blank, blank, blank, blank,
  463. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  464. blank, blank, blank, blank, blank, blank, blank, blank,
  465. // nbsp inv-! cents pounds currency yen broken-bar section
  466. nbsp, nonBlank, postJwrd, currency, currency, currency, nonBlank, nonBlank,
  467. // umlaut copyright super-a gui-left not soft-hyph registered macron
  468. nonBlank, nonBlank, nonBlank, preJwrd, nonBlank, op, nonBlank, nonBlank,
  469. // degree +/- super-2 super-3 acute micro paragraph bullet
  470. postJwrd, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  471. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  472. nonBlank, nonBlank, nonBlank, postJwrd, digit, digit, digit, nonBlank,
  473. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  474. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  475. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  476. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  477. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  478. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  479. // O=slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  480. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  481. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  482. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  483. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  484. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  485. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  486. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank,
  487. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y=umlaut
  488. nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank, nonBlank
  489. };
  490. private static final UnicodeClassMapping kLineMap
  491. = new UnicodeClassMapping(kRawMapping, kExceptionChar, LineExceptionFlags,
  492. kLineAsciiValues);
  493. }