1. /*
  2. * @(#)CharacterBreakData.java 1.13 01/11/29
  3. *
  4. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. /*
  8. * @(#)CharacterBreakData.java 1.13 01/11/29
  9. *
  10. * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  11. * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  12. *
  13. * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  14. * All Rights Reserved.
  15. *
  16. * The original version of this source code and documentation
  17. * is copyrighted and owned by Taligent, Inc., a wholly-owned
  18. * subsidiary of IBM. These materials are provided under terms
  19. * of a License Agreement between Taligent and Sun. This technology
  20. * is protected by multiple US and International patents.
  21. *
  22. * This notice and attribution to Taligent may not be removed.
  23. * Taligent is a registered trademark of Taligent, Inc.
  24. *
  25. * Permission to use, copy, modify, and distribute this software
  26. * and its documentation for NON-COMMERCIAL purposes and without
  27. * fee is hereby granted provided that this copyright notice
  28. * appears in all copies. Please refer to the file "copyright.html"
  29. * for further important copyright and licensing information.
  30. *
  31. * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  32. * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  33. * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  34. * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  35. * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  36. * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  37. *
  38. */
  39. package java.text;
  40. /**
  41. * The CharacterBreakData contains data used by SimpleTextBoundary
  42. * to determine character breaks.
  43. * @see #BreakIterator
  44. */
  45. final class CharacterBreakData extends TextBoundaryData
  46. {
  47. private static final byte accent_diacritic = 0;
  48. private static final byte baseForm = 1;
  49. private static final byte baseCR = 2;
  50. private static final byte baseLF = 3;
  51. private static final byte choseong = 4; // Korean initial consonant
  52. private static final byte jungseong = 5; // Korean vowel
  53. private static final byte jongseong = 6; // Korean final consonant
  54. private static final byte EOS = 7;
  55. private static final int COL_COUNT = 8;
  56. private static final byte SI = (byte)0x80;
  57. private static final byte STOP = (byte) 0;
  58. private static final byte SI_STOP = (byte)SI + STOP;
  59. public CharacterBreakData() {
  60. super(kCharacterForwardTable, kCharacterBackwardTable, kCharacterMap);
  61. }
  62. private static final byte kCharacterForwardData[] =
  63. {
  64. // acct base cr lf
  65. // cho jung jong EOS
  66. STOP, STOP, STOP, STOP,
  67. STOP, STOP, STOP, STOP,
  68. // 1
  69. (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), (byte)(SI+7),
  70. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  71. // 2
  72. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  73. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  74. // 3
  75. SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7),
  76. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  77. // 4
  78. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  79. (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
  80. // 5
  81. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  82. SI_STOP, (byte)(SI+5), (byte)(SI+6), SI_STOP,
  83. // 6
  84. (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
  85. SI_STOP, SI_STOP, (byte)(SI+6), SI_STOP,
  86. // 7
  87. SI_STOP, SI_STOP, SI_STOP, SI_STOP,
  88. SI_STOP, SI_STOP, SI_STOP, SI_STOP
  89. };
  90. private static final WordBreakTable kCharacterForwardTable =
  91. new WordBreakTable(COL_COUNT, kCharacterForwardData);
  92. private static final byte kCharacterBackwardData[] =
  93. {
  94. // acct base cr lf
  95. // cho jung jong EOS
  96. STOP, STOP, STOP, STOP,
  97. STOP, STOP, STOP, STOP,
  98. // 1
  99. (byte)(SI+1), SI_STOP, SI_STOP, (byte)(SI+1),
  100. SI_STOP, (byte)(SI+1), (byte)(SI+1), SI_STOP
  101. };
  102. private static final WordBreakTable kCharacterBackwardTable =
  103. new WordBreakTable(COL_COUNT, kCharacterBackwardData);
  104. private static final int kRawMapping[] =
  105. {
  106. baseForm, //UNASSIGNED = 0,
  107. baseForm, //UPPERCASE_LETTER = 1,
  108. baseForm, //LOWERCASE_LETTER = 2,
  109. baseForm, //TITLECASE_LETTER = 3,
  110. baseForm, //MODIFIER_LETTER = 4,
  111. baseForm, //OTHER_LETTER = 5,
  112. accent_diacritic, //NON_SPACING_MARK = 6,
  113. accent_diacritic, //ENCLOSING_MARK = 7,
  114. baseForm, //COMBINING_SPACING_MARK = 8,
  115. baseForm, //DECIMAL_DIGIT_NUMBER = 9,
  116. baseForm, //LETTER_NUMBER = 10,
  117. baseForm, //OTHER_NUMBER = 11,
  118. baseForm, //SPACE_SEPARATOR = 12,
  119. baseForm, //LINE_SEPARATOR = 13,
  120. baseForm, //PARAGRAPH_SEPARATOR = 14,
  121. baseForm, //CONTROL = 15,
  122. baseForm, //FORMAT = 16,
  123. baseForm, //???? = 17,
  124. baseForm, //PRIVATE_USE = 18,
  125. baseForm, //SURROGATE = 19,
  126. baseForm, //DASH_PUNCTUATION = 20,
  127. baseForm, //START_PUNCTUATION = 21,
  128. baseForm, //END_PUNCTUATION = 22,
  129. baseForm, //CONNECTOR_PUNCTUATION = 23,
  130. baseForm, //OTHER_PUNCTUATION = 24,
  131. baseForm, //MATH_SYMBOL = 25,
  132. baseForm, //CURRENCY_SYMBOL = 26,
  133. baseForm, //MODIFIER_SYMBOL = 27,
  134. baseForm, //OTHER_SYMBOL = 28;
  135. };
  136. private static final SpecialMapping kExceptionChar[] = //{};
  137. {
  138. new SpecialMapping(ASCII_LINEFEED, baseLF),
  139. new SpecialMapping(ASCII_CARRIAGE_RETURN, baseCR),
  140. new SpecialMapping(HANGUL_CHOSEONG_LOW, HANGUL_CHOSEONG_HIGH, choseong),
  141. new SpecialMapping(HANGUL_JUNGSEONG_LOW, HANGUL_JUNGSEONG_HIGH, jungseong),
  142. new SpecialMapping(HANGUL_JONGSEONG_LOW, HANGUL_JONGSEONG_HIGH, jongseong),
  143. new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, PUNCTUATION_PARAGRAPH_SEPARATOR, baseLF),
  144. new SpecialMapping(END_OF_STRING, EOS)
  145. };
  146. private static final boolean CharacterExceptionFlags[] = {
  147. false, // kNonCharacter = 0,
  148. false, // kUppercaseLetter = 1,
  149. false, // kLowercaseLetter = 2,
  150. false, // kTitlecaseLetter = 3,
  151. false, // kModifierLetter = 4,
  152. true, // kOtherLetter = 5,
  153. false, // kNonSpacingMark = 6,
  154. false, // kEnclosingMark = 7,
  155. false, // kCombiningSpacingMark = 8,
  156. false, // kDecimalNumber = 9,
  157. false, // kLetterNumber = 10,
  158. false, // kOtherNumber = 11,
  159. false, // kSpaceSeparator = 12,
  160. true, // kLineSeparator = 13,
  161. true, // kParagraphSeparator = 14,
  162. true, // kControlCharacter = 15,
  163. false, // kFormatCharacter = 16,
  164. false, // UNDEFINED = 17,
  165. false, // kPrivateUseCharacter = 18,
  166. false, // kSurrogate = 19,
  167. false, // kDashPunctuation = 20,
  168. false, // kOpenPunctuation = 21,
  169. false, // kClosePunctuation = 22,
  170. false, // kConnectorPunctuation = 23,
  171. false, // kOtherPunctuation = 24,
  172. false, // kMathSymbol = 25,
  173. false, // kCurrencySymbol = 26,
  174. false, // kModifierSymbol = 27,
  175. false // kOtherSymbol = 28
  176. };
  177. private static final int kCharacterAsciiValues[] = {
  178. // null soh stx etx eot enq ask bell
  179. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  180. // bs ht lf vt ff cr so si
  181. baseForm, baseForm, baseLF, baseForm, baseForm, baseCR, baseForm, baseForm,
  182. // dle dc1 dc2 dc3 dc4 nak syn etb
  183. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  184. // can em sub esc fs gs rs us
  185. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  186. // sp ! " # $ % & '
  187. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  188. // ( ) * + , - . /
  189. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  190. // 0 1 2 3 4 5 6 7
  191. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  192. // 8 9 : ; < = > ?
  193. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  194. // @ A B C D E F G
  195. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  196. // H I J K L M N O
  197. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  198. // P Q R S T U V W
  199. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  200. // X Y Z [ \ ] ^ _
  201. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  202. // ` a b c d e f g
  203. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  204. // h i j k l m n o
  205. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  206. // p q r s t u v w
  207. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  208. // x y z { | } ~ del
  209. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  210. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  211. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  212. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  213. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  214. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  215. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  216. // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
  217. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  218. // nbsp inv-! cents pounds currency yen broken-bar section
  219. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  220. // umlaut copyright super-a gui-left not soft-hyph registered macron
  221. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  222. // degree +/- super-2 super-3 acute micro paragraph bullet
  223. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  224. // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
  225. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  226. // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
  227. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  228. // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
  229. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  230. // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
  231. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  232. // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
  233. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  234. // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
  235. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  236. // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
  237. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  238. // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
  239. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  240. // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
  241. baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm
  242. };
  243. private static final UnicodeClassMapping kCharacterMap
  244. = new UnicodeClassMapping(kRawMapping, kExceptionChar, CharacterExceptionFlags,
  245. kCharacterAsciiValues);
  246. }