1. /*
  2. * @(#)CharacterBreakData.java 1.17 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  9. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  10. *
  11. * The original version of this source code and documentation
  12. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  13. * subsidiary of IBM. These materials are provided under terms
  14. * of a License Agreement between Taligent and Sun. This technology
  15. * is protected by multiple US and International patents.
  16. *
  17. * This notice and attribution to Taligent may not be removed.
  18. * Taligent is a registered trademark of Taligent, Inc.
  19. *
  20. */
  21. package java.text;
  22. /**
  23. * The CharacterBreakData contains data used by SimpleTextBoundary
  24. * to determine character breaks.
* @see BreakIterator
  26. */
  27. final class CharacterBreakData extends TextBoundaryData
  28. {
  29. // THEORY OF OPERATION: This class contains all the tables necessary to do
  30. // character-break iteration. This class descends from TextBoundaryData, which
  31. // is abstract. This class doesn't define any non-static members; it inherits the
  32. // non-static members from TextBoundaryData and fills them in with pointers to
  33. // the static members defined here.
  34. // There are two main parts to a TextBoundaryData object: the state-transition
  35. // tables and the character-mapping tables. The forward state table defines the
  36. // transitions for a deterministic finite state machine that locates character
  37. // boundaries. The rows are the states and the columns are character categories.
  38. // The cell values consist of two parts: The first is the row number of the next
  39. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  40. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  41. // fill it with STOP values by convention.) The second part is a flag indicating
  42. // whether the iterator should update its break position on this transition. When
  43. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  44. // the flag bit being turned on-- we do it this way rather than just using negative
  45. // numbers because we still need to see the SI flag when the value of the transition
  46. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  47. // is 1.
  48. // The backward state table works the same way as the forward state table, but is
  49. // usually simplified. The iterator uses the backward state table only to find a
  50. // "safe place" to start iterating forward. It then seeks forward from the "safe
  51. // place" to the actual break position using the forward table. A "safe place" is
  52. // a spot in the text that is guaranteed to be a break position.
  53. // The character-category mapping tables are split into several pieces, one for
  54. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  55. // character categories to the character categories used by this break iterator.
  56. // The index of the array is the Unicode category number as returned by
  57. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  58. // indicating whether all the characters in the Unicode category have the
  59. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  60. // an entry is true, then we find the right category using... 3) The kExceptionChar
  61. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  62. // a range of contiguous characters that share the same category and the category
  63. // number. This list is binary-searched to find an entry corresponding to the
  64. // charactre being mapped. Only characters whose breaking category is different from
  65. // the raw-mapping value (the breaking category for their Unicode category) are
  66. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  67. // in the Latin1 range. This table maps straight from a character value to a
  68. // category number, bypassing all the other tables. The programmer must take care
  69. // that all of the different category-mapping tables are consistent.
  70. // In the current implementation, all of these tables are created and maintained
  71. // by hand, not using a tool.
  72. // constant names for the category numbers
  73. private static final byte accent_diacritic = 0; // all Unicode non-spacing marks
  74. private static final byte baseForm = 1; // everything that isn't accounted for elsewhere
  75. private static final byte baseCR = 2; // the ASCII carriage return
  76. private static final byte baseLF = 3; // all other line/paragraph separators
  77. private static final byte choseong = 4; // Korean initial consonant
  78. private static final byte jungseong = 5; // Korean vowel
  79. private static final byte jongseong = 6; // Korean final consonant
  80. private static final byte EOS = 7; // end of string
  81. private static final int COL_COUNT = 8; // the number of items in this list (and therefore,
  82. // the number of columns in the state tables)
  83. private static final byte SI = (byte)0x80;
  84. private static final byte STOP = (byte) 0;
  85. private static final byte SI_STOP = (byte)SI + STOP;
  86. public CharacterBreakData() {
  87. super(kCharacterForwardTable, kCharacterBackwardTable, kCharacterMap);
  88. }
  89. // This table locates logical character ("grapheme") boundaries. A logical
  90. // character is a sequence of Unicode code-point values that are seen as a single
  91. // character by the user. This table implements the following logic:
  92. // 1) Unless otherwise mentioned, each individual code point is a character.
  93. // 2) A regular character followed by one or more Unicode non-spacing marks is
  94. // treated as a single character.
  95. // 3) The CR-LF sequence is treated as a single character.
  96. // 4) A Hangul syllable spelled out with individual jamos is treated as a single
  97. // character, according to the rules specified under "Conjoining Jamo Behavior"
  98. // in the Unicode standard.
  99. // UTF-16 surrogate pairs are NOT trated as single characters in this version of the
  100. // character-breaking tables. Rule 1 is implemented by state 2, rule 2 is implemented
  101. // by rules 3 and 7 (line/paragraph separators are NOT kept together with any non-
  102. // spacing marks that follow them!). Rule 4 is implemented with states 4, 5, and 6.
  103. private static final byte kCharacterForwardData[] =
  104. {
  105. // acct base cr lf
  106. // cho jung jong EOS
  107. STOP, STOP, STOP, STOP,
  108. STOP, STOP, STOP, STOP,
  109. // 1 - main dispatch state
  110. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), (byte)(SI+7),
  111. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  112. // 2 - if the character is regular base or accent, we end up in this
  113. // state, which eats accents until it sees something else
  114. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  115. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  116. // 3 - a CR character causes a transition. If the next character is
  117. // an LF, it transitions to state 7; otherwise, it does exactly
  118. // the same thing as state 7
  119. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7),
  120. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  121. // 4 - this state eats Korean initial consonants and uses
  122. // states 5 and 6 to take care of the other parts of the syllable
  123. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  124. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  125. // 5 - this state eats Korean vowels
  126. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  127. SI_STOP, (byte)(SI+5), (byte)(SI+6), SI_STOP,
  128. // 6 - this state eats Korean final consonants
  129. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  130. SI_STOP, SI_STOP, (byte)(SI+6), SI_STOP,
  131. // 7 - This state is reached when an LF or other line separator
  132. // is seen. It eats the LF and stops.
  133. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  134. SI_STOP, SI_STOP, SI_STOP, SI_STOP
  135. };
  136. private static final WordBreakTable kCharacterForwardTable =
  137. new WordBreakTable(COL_COUNT, kCharacterForwardData);
  138. // This table implements the backward-seeking logic. Here, we merely
  139. // eat characters until we see a Hangul syllable-initial consonant,
  140. // an ASCII carriage return, a "base" character (most characters), or
  141. // the end of the string. These characters all represent unambiguous
  142. // break positions.
  143. private static final byte kCharacterBackwardData[] =
  144. {
  145. // acct base cr lf
  146. // cho jung jong EOS
  147. STOP, STOP, STOP, STOP,
  148. STOP, STOP, STOP, STOP,
  149. // 1
  150. (byte)(SI+1), SI_STOP, SI_STOP, (byte)(SI+1),
  151. SI_STOP, (byte)(SI+1), (byte)(SI+1), SI_STOP
  152. };
  153. private static final WordBreakTable kCharacterBackwardTable =
  154. new WordBreakTable(COL_COUNT, kCharacterBackwardData);
  155. private static final int kRawMapping[] =
  156. {
  157. baseForm, //UNASSIGNED = 0,
  158. baseForm, //UPPERCASE_LETTER = 1,
  159. baseForm, //LOWERCASE_LETTER = 2,
  160. baseForm, //TITLECASE_LETTER = 3,
  161. baseForm, //MODIFIER_LETTER = 4,
  162. baseForm, //OTHER_LETTER = 5,
  163. accent_diacritic, //NON_SPACING_MARK = 6,
  164. accent_diacritic, //ENCLOSING_MARK = 7,
  165. baseForm, //COMBINING_SPACING_MARK = 8,
  166. baseForm, //DECIMAL_DIGIT_NUMBER = 9,
  167. baseForm, //LETTER_NUMBER = 10,
  168. baseForm, //OTHER_NUMBER = 11,
  169. baseForm, //SPACE_SEPARATOR = 12,
  170. baseForm, //LINE_SEPARATOR = 13,
  171. baseForm, //PARAGRAPH_SEPARATOR = 14,
  172. baseForm, //CONTROL = 15,
  173. baseForm, //FORMAT = 16,
  174. baseForm, //???? = 17,
  175. baseForm, //PRIVATE_USE = 18,
  176. baseForm, //SURROGATE = 19,
  177. baseForm, //DASH_PUNCTUATION = 20,
  178. baseForm, //START_PUNCTUATION = 21,
  179. baseForm, //END_PUNCTUATION = 22,
  180. baseForm, //CONNECTOR_PUNCTUATION = 23,
  181. baseForm, //OTHER_PUNCTUATION = 24,
  182. baseForm, //MATH_SYMBOL = 25,
  183. baseForm, //CURRENCY_SYMBOL = 26,
  184. baseForm, //MODIFIER_SYMBOL = 27,
  185. baseForm, //OTHER_SYMBOL = 28;
  186. baseForm, //INITIAL_QUOTE_PUNCTUATION = 29;
  187. baseForm, //FINAL_QUOTE_PUNCTUATION = 30;
  188. };
  189. private static final SpecialMapping kExceptionChar[] = //{};
  190. {
  191. new SpecialMapping(ASCII_LINEFEED, baseLF),
  192. new SpecialMapping(ASCII_CARRIAGE_RETURN, baseCR),
  193. new SpecialMapping(HANGUL_CHOSEONG_LOW, HANGUL_CHOSEONG_HIGH, choseong),
  194. new SpecialMapping(HANGUL_JUNGSEONG_LOW, HANGUL_JUNGSEONG_HIGH, jungseong),
  195. new SpecialMapping(HANGUL_JONGSEONG_LOW, HANGUL_JONGSEONG_HIGH, jongseong),
  196. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, PUNCTUATION_PARAGRAPH_SEPARATOR, baseLF),
  197. new SpecialMapping(END_OF_STRING, EOS)
  198. };
  199. private static final boolean CharacterExceptionFlags[] = {
  200. false, // kNonCharacter = 0,
  201. false, // kUppercaseLetter = 1,
  202. false, // kLowercaseLetter = 2,
  203. false, // kTitlecaseLetter = 3,
  204. false, // kModifierLetter = 4,
  205. true, // kOtherLetter = 5,
  206. false, // kNonSpacingMark = 6,
  207. false, // kEnclosingMark = 7,
  208. false, // kCombiningSpacingMark = 8,
  209. false, // kDecimalNumber = 9,
  210. false, // kLetterNumber = 10,
  211. false, // kOtherNumber = 11,
  212. false, // kSpaceSeparator = 12,
  213. true, // kLineSeparator = 13,
  214. true, // kParagraphSeparator = 14,
  215. true, // kControlCharacter = 15,
  216. false, // kFormatCharacter = 16,
  217. false, // UNDEFINED = 17,
  218. false, // kPrivateUseCharacter = 18,
  219. false, // kSurrogate = 19,
  220. false, // kDashPunctuation = 20,
  221. false, // kOpenPunctuation = 21,
  222. false, // kClosePunctuation = 22,
  223. false, // kConnectorPunctuation = 23,
  224. false, // kOtherPunctuation = 24,
  225. false, // kMathSymbol = 25,
  226. false, // kCurrencySymbol = 26,
  227. false, // kModifierSymbol = 27,
  228. false, // kOtherSymbol = 28,
  229. false, // kInitialQuotePunctuation = 29,
  230. false, // kFinalQuotePunctuation = 30,
  231. };
  232. private static final int kCharacterAsciiValues[] = {
  233. // null soh stx etx eot enq ask bell
  234. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  235. // bs ht lf vt ff cr so si
  236. baseForm, baseForm, baseLF, baseForm, baseForm, baseCR, baseForm, baseForm,
  237. // dle dc1 dc2 dc3 dc4 nak syn etb
  238. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  239. // can em sub esc fs gs rs us
  240. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  241. // sp ! " # $ % & '
  242. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  243. // ( ) * + , - . /
  244. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  245. // 0 1 2 3 4 5 6 7
  246. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  247. // 8 9 : ; < = > ?
  248. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  249. // @ A B C D E F G
  250. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  251. // H I J K L M N O
  252. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  253. // P Q R S T U V W
  254. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  255. // X Y Z [ \ ] ^ _
  256. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  257. // ` a b c d e f g
  258. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  259. // h i j k l m n o
  260. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  261. // p q r s t u v w
  262. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  263. // x y z { | } ~ del
  264. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  265. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  266. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  267. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  268. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  269. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  270. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  271. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  272. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  273. // nbsp inv-! cents pounds currency yen broken-bar section
  274. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  275. // umlaut copyright super-a gui-left not soft-hyph registered macron
  276. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  277. // degree +/- super-2 super-3 acute micro paragraph bullet
  278. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  279. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  280. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  281. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  282. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  283. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  284. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  285. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  286. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  287. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  288. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  289. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  290. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  291. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  292. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  293. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  294. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  295. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  296. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm
  297. };
  298. private static final UnicodeClassMapping kCharacterMap
  299. = new UnicodeClassMapping(kRawMapping, kExceptionChar, CharacterExceptionFlags,
  300. kCharacterAsciiValues);
  301. }