1. /*
  2. * @(#)CharacterBreakData.java 1.14 00/01/19
  3. *
  4. * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. /*
  11. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  12. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  13. *
  14. * The original version of this source code and documentation
  15. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  16. * subsidiary of IBM. These materials are provided under terms
  17. * of a License Agreement between Taligent and Sun. This technology
  18. * is protected by multiple US and International patents.
  19. *
  20. * This notice and attribution to Taligent may not be removed.
  21. * Taligent is a registered trademark of Taligent, Inc.
  22. *
  23. */
  24. package java.text;
  25. /**
  26. * The CharacterBreakData contains data used by SimpleTextBoundary
  27. * to determine character breaks.
  28. * @see #BreakIterator
  29. */
  30. final class CharacterBreakData extends TextBoundaryData
  31. {
  32. // THEORY OF OPERATION: This class contains all the tables necessary to do
  33. // character-break iteration. This class descends from TextBoundaryData, which
  34. // is abstract. This class doesn't define any non-static members; it inherits the
  35. // non-static members from TextBoundaryData and fills them in with pointers to
  36. // the static members defined here.
  37. // There are two main parts to a TextBoundaryData object: the state-transition
  38. // tables and the character-mapping tables. The forward state table defines the
  39. // transitions for a deterministic finite state machine that locates character
  40. // boundaries. The rows are the states and the columns are character categories.
  41. // The cell values consist of two parts: The first is the row number of the next
  42. // state to transition to, or a "stop" value (0). (Because 0 is the stop value
  43. // rather than a valid state number, row 0 of the array isn't ever looked at; we
  44. // fill it with STOP values by convention.) The second part is a flag indicating
  45. // whether the iterator should update its break position on this transition. When
  46. // the flag is set, the sign bit of the value is turned on (SI is used to represent
  47. // the flag bit being turned on-- we do it this way rather than just using negative
  48. // numbers because we still need to see the SI flag when the value of the transition
  49. // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
  50. // is 1.
  51. // The backward state table works the same way as the forward state table, but is
  52. // usually simplified. The iterator uses the backward state table only to find a
  53. // "safe place" to start iterating forward. It then seeks forward from the "safe
  54. // place" to the actual break position using the forward table. A "safe place" is
  55. // a spot in the text that is guaranteed to be a break position.
  56. // The character-category mapping tables are split into several pieces, one for
  57. // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
  58. // character categories to the character categories used by this break iterator.
  59. // The index of the array is the Unicode category number as returned by
  60. // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
  61. // indicating whether all the characters in the Unicode category have the
  62. // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
  63. // an entry is true, then we find the right category using... 3) The kExceptionChar
  64. // table. This table is a sorted list of SpecialMapping objects. Each entry defines
  65. // a range of contiguous characters that share the same category and the category
  66. // number. This list is binary-searched to find an entry corresponding to the
  67. // charactre being mapped. Only characters whose breaking category is different from
  68. // the raw-mapping value (the breaking category for their Unicode category) are
  69. // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
  70. // in the Latin1 range. This table maps straight from a character value to a
  71. // category number, bypassing all the other tables. The programmer must take care
  72. // that all of the different category-mapping tables are consistent.
  73. // In the current implementation, all of these tables are created and maintained
  74. // by hand, not using a tool.
  75. // constant names for the category numbers
  76. private static final byte accent_diacritic = 0; // all Unicode non-spacing marks
  77. private static final byte baseForm = 1; // everything that isn't accounted for elsewhere
  78. private static final byte baseCR = 2; // the ASCII carriage return
  79. private static final byte baseLF = 3; // all other line/paragraph separators
  80. private static final byte choseong = 4; // Korean initial consonant
  81. private static final byte jungseong = 5; // Korean vowel
  82. private static final byte jongseong = 6; // Korean final consonant
  83. private static final byte EOS = 7; // end of string
  84. private static final int COL_COUNT = 8; // the number of items in this list (and therefore,
  85. // the number of columns in the state tables)
  86. private static final byte SI = (byte)0x80;
  87. private static final byte STOP = (byte) 0;
  88. private static final byte SI_STOP = (byte)SI + STOP;
  89. public CharacterBreakData() {
  90. super(kCharacterForwardTable, kCharacterBackwardTable, kCharacterMap);
  91. }
  92. // This table locates logical character ("grapheme") boundaries. A logical
  93. // character is a sequence of Unicode code-point values that are seen as a single
  94. // character by the user. This table implements the following logic:
  95. // 1) Unless otherwise mentioned, each individual code point is a character.
  96. // 2) A regular character followed by one or more Unicode non-spacing marks is
  97. // treated as a single character.
  98. // 3) The CR-LF sequence is treated as a single character.
  99. // 4) A Hangul syllable spelled out with individual jamos is treated as a single
  100. // character, according to the rules specified under "Conjoining Jamo Behavior"
  101. // in the Unicode standard.
  102. // UTF-16 surrogate pairs are NOT trated as single characters in this version of the
  103. // character-breaking tables. Rule 1 is implemented by state 2, rule 2 is implemented
  104. // by rules 3 and 7 (line/paragraph separators are NOT kept together with any non-
  105. // spacing marks that follow them!). Rule 4 is implemented with states 4, 5, and 6.
  106. private static final byte kCharacterForwardData[] =
  107. {
  108. // acct base cr lf
  109. // cho jung jong EOS
  110. STOP, STOP, STOP, STOP,
  111. STOP, STOP, STOP, STOP,
  112. // 1 - main dispatch state
  113. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), (byte)(SI+7),
  114. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  115. // 2 - if the character is regular base or accent, we end up in this
  116. // state, which eats accents until it sees something else
  117. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  118. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  119. // 3 - a CR character causes a transition. If the next character is
  120. // an LF, it transitions to state 7; otherwise, it does exactly
  121. // the same thing as state 7
  122. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7),
  123. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  124. // 4 - this state eats Korean initial consonants and uses
  125. // states 5 and 6 to take care of the other parts of the syllable
  126. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  127. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  128. // 5 - this state eats Korean vowels
  129. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  130. SI_STOP, (byte)(SI+5), (byte)(SI+6), SI_STOP,
  131. // 6 - this state eats Korean final consonants
  132. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  133. SI_STOP, SI_STOP, (byte)(SI+6), SI_STOP,
  134. // 7 - This state is reached when an LF or other line separator
  135. // is seen. It eats the LF and stops.
  136. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  137. SI_STOP, SI_STOP, SI_STOP, SI_STOP
  138. };
  139. private static final WordBreakTable kCharacterForwardTable =
  140. new WordBreakTable(COL_COUNT, kCharacterForwardData);
  141. // This table implements the backward-seeking logic. Here, we merely
  142. // eat characters until we see a Hangul syllable-initial consonant,
  143. // an ASCII carriage return, a "base" character (most characters), or
  144. // the end of the string. These characters all represent unambiguous
  145. // break positions.
  146. private static final byte kCharacterBackwardData[] =
  147. {
  148. // acct base cr lf
  149. // cho jung jong EOS
  150. STOP, STOP, STOP, STOP,
  151. STOP, STOP, STOP, STOP,
  152. // 1
  153. (byte)(SI+1), SI_STOP, SI_STOP, (byte)(SI+1),
  154. SI_STOP, (byte)(SI+1), (byte)(SI+1), SI_STOP
  155. };
  156. private static final WordBreakTable kCharacterBackwardTable =
  157. new WordBreakTable(COL_COUNT, kCharacterBackwardData);
  158. private static final int kRawMapping[] =
  159. {
  160. baseForm, //UNASSIGNED = 0,
  161. baseForm, //UPPERCASE_LETTER = 1,
  162. baseForm, //LOWERCASE_LETTER = 2,
  163. baseForm, //TITLECASE_LETTER = 3,
  164. baseForm, //MODIFIER_LETTER = 4,
  165. baseForm, //OTHER_LETTER = 5,
  166. accent_diacritic, //NON_SPACING_MARK = 6,
  167. accent_diacritic, //ENCLOSING_MARK = 7,
  168. baseForm, //COMBINING_SPACING_MARK = 8,
  169. baseForm, //DECIMAL_DIGIT_NUMBER = 9,
  170. baseForm, //LETTER_NUMBER = 10,
  171. baseForm, //OTHER_NUMBER = 11,
  172. baseForm, //SPACE_SEPARATOR = 12,
  173. baseForm, //LINE_SEPARATOR = 13,
  174. baseForm, //PARAGRAPH_SEPARATOR = 14,
  175. baseForm, //CONTROL = 15,
  176. baseForm, //FORMAT = 16,
  177. baseForm, //???? = 17,
  178. baseForm, //PRIVATE_USE = 18,
  179. baseForm, //SURROGATE = 19,
  180. baseForm, //DASH_PUNCTUATION = 20,
  181. baseForm, //START_PUNCTUATION = 21,
  182. baseForm, //END_PUNCTUATION = 22,
  183. baseForm, //CONNECTOR_PUNCTUATION = 23,
  184. baseForm, //OTHER_PUNCTUATION = 24,
  185. baseForm, //MATH_SYMBOL = 25,
  186. baseForm, //CURRENCY_SYMBOL = 26,
  187. baseForm, //MODIFIER_SYMBOL = 27,
  188. baseForm, //OTHER_SYMBOL = 28;
  189. };
  190. private static final SpecialMapping kExceptionChar[] = //{};
  191. {
  192. new SpecialMapping(ASCII_LINEFEED, baseLF),
  193. new SpecialMapping(ASCII_CARRIAGE_RETURN, baseCR),
  194. new SpecialMapping(HANGUL_CHOSEONG_LOW, HANGUL_CHOSEONG_HIGH, choseong),
  195. new SpecialMapping(HANGUL_JUNGSEONG_LOW, HANGUL_JUNGSEONG_HIGH, jungseong),
  196. new SpecialMapping(HANGUL_JONGSEONG_LOW, HANGUL_JONGSEONG_HIGH, jongseong),
  197. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, PUNCTUATION_PARAGRAPH_SEPARATOR, baseLF),
  198. new SpecialMapping(END_OF_STRING, EOS)
  199. };
  200. private static final boolean CharacterExceptionFlags[] = {
  201. false, // kNonCharacter = 0,
  202. false, // kUppercaseLetter = 1,
  203. false, // kLowercaseLetter = 2,
  204. false, // kTitlecaseLetter = 3,
  205. false, // kModifierLetter = 4,
  206. true, // kOtherLetter = 5,
  207. false, // kNonSpacingMark = 6,
  208. false, // kEnclosingMark = 7,
  209. false, // kCombiningSpacingMark = 8,
  210. false, // kDecimalNumber = 9,
  211. false, // kLetterNumber = 10,
  212. false, // kOtherNumber = 11,
  213. false, // kSpaceSeparator = 12,
  214. true, // kLineSeparator = 13,
  215. true, // kParagraphSeparator = 14,
  216. true, // kControlCharacter = 15,
  217. false, // kFormatCharacter = 16,
  218. false, // UNDEFINED = 17,
  219. false, // kPrivateUseCharacter = 18,
  220. false, // kSurrogate = 19,
  221. false, // kDashPunctuation = 20,
  222. false, // kOpenPunctuation = 21,
  223. false, // kClosePunctuation = 22,
  224. false, // kConnectorPunctuation = 23,
  225. false, // kOtherPunctuation = 24,
  226. false, // kMathSymbol = 25,
  227. false, // kCurrencySymbol = 26,
  228. false, // kModifierSymbol = 27,
  229. false // kOtherSymbol = 28
  230. };
  231. private static final int kCharacterAsciiValues[] = {
  232. // null soh stx etx eot enq ask bell
  233. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  234. // bs ht lf vt ff cr so si
  235. baseForm, baseForm, baseLF, baseForm, baseForm, baseCR, baseForm, baseForm,
  236. // dle dc1 dc2 dc3 dc4 nak syn etb
  237. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  238. // can em sub esc fs gs rs us
  239. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  240. // sp ! " # $ % & '
  241. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  242. // ( ) * + , - . /
  243. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  244. // 0 1 2 3 4 5 6 7
  245. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  246. // 8 9 : ; < = > ?
  247. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  248. // @ A B C D E F G
  249. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  250. // H I J K L M N O
  251. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  252. // P Q R S T U V W
  253. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  254. // X Y Z [ \ ] ^ _
  255. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  256. // ` a b c d e f g
  257. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  258. // h i j k l m n o
  259. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  260. // p q r s t u v w
  261. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  262. // x y z { | } ~ del
  263. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  264. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  265. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  266. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  267. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  268. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  269. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  270. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  271. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  272. // nbsp inv-! cents pounds currency yen broken-bar section
  273. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  274. // umlaut copyright super-a gui-left not soft-hyph registered macron
  275. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  276. // degree +/- super-2 super-3 acute micro paragraph bullet
  277. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  278. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  279. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  280. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  281. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  282. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  283. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  284. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  285. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  286. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  287. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  288. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  289. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  290. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  291. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  292. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  293. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  294. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  295. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm
  296. };
  297. private static final UnicodeClassMapping kCharacterMap
  298. = new UnicodeClassMapping(kRawMapping, kExceptionChar, CharacterExceptionFlags,
  299. kCharacterAsciiValues);
  300. }