1. /*
  2. * @(#)ConditionalSpecialCasing.java 1.3 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package java.lang;
  8. import java.text.BreakIterator;
  9. import java.util.HashSet;
  10. import java.util.Hashtable;
  11. import java.util.Iterator;
  12. import java.util.Locale;
  13. import sun.text.Normalizer;
/**
 * This is a utility class for <code>String.toLowerCase()</code> and
 * <code>String.toUpperCase()</code> that handles special casing with
 * conditions. In other words, it handles the mappings with conditions
 * that are defined in the
 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
 * Casing Properties</a> file.
 * <p>
 * Note that the unconditional case mappings (including 1:M mappings)
 * are handled in <code>Character.toLower/UpperCase()</code>.
 */
  25. final class ConditionalSpecialCasing {
  26. // context conditions.
  27. final static int FINAL_CASED = 1;
  28. final static int AFTER_SOFT_DOTTED = 2;
  29. final static int MORE_ABOVE = 3;
  30. final static int AFTER_I = 4;
  31. final static int NOT_BEFORE_DOT = 5;
  32. // combining class definitions
  33. final static int COMBINING_CLASS_ABOVE = 230;
  34. // Special case mapping entries
  35. static Entry[] entry = {
  36. //# ================================================================================
  37. //# Conditional mappings
  38. //# ================================================================================
  39. new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
  40. //# ================================================================================
  41. //# Locale-sensitive mappings
  42. //# ================================================================================
  43. //# Lithuanian
  44. new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
  45. new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
  46. new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
  47. new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
  48. new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
  49. new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
  50. new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
  51. //# ================================================================================
  52. //# Turkish and Azeri
  53. // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
  54. // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
  55. new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
  56. new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
  57. new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
  58. new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
  59. new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
  60. new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
  61. };
  62. // A hash table that contains the above entries
  63. static Hashtable entryTable = new Hashtable();
  64. static {
  65. // create hashtable from the entry
  66. for (int i = 0; i < entry.length; i ++) {
  67. Entry cur = entry[i];
  68. Integer cp = new Integer(cur.getCodePoint());
  69. HashSet set = (HashSet)entryTable.get(cp);
  70. if (set == null) {
  71. set = new HashSet();
  72. }
  73. set.add(cur);
  74. entryTable.put(cp, set);
  75. }
  76. }
  77. static int toLowerCaseEx(String src, int index, Locale locale) {
  78. char[] result = lookUpTable(src, index, locale, true);
  79. if (result != null) {
  80. if (result.length == 1) {
  81. return result[0];
  82. } else {
  83. return Character.CHAR_ERROR;
  84. }
  85. } else {
  86. // default to Character class' one
  87. return Character.toLowerCase(src.codePointAt(index));
  88. }
  89. }
  90. static int toUpperCaseEx(String src, int index, Locale locale) {
  91. char[] result = lookUpTable(src, index, locale, false);
  92. if (result != null) {
  93. if (result.length == 1) {
  94. return result[0];
  95. } else {
  96. return Character.CHAR_ERROR;
  97. }
  98. } else {
  99. // default to Character class' one
  100. return Character.toUpperCaseEx(src.codePointAt(index));
  101. }
  102. }
  103. static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
  104. return lookUpTable(src, index, locale, true);
  105. }
  106. static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
  107. char[] result = lookUpTable(src, index, locale, false);
  108. if (result != null) {
  109. return result;
  110. } else {
  111. return Character.toUpperCaseCharArray(src.codePointAt(index));
  112. }
  113. }
  114. private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
  115. HashSet set = (HashSet)entryTable.get(new Integer(src.codePointAt(index)));
  116. if (set != null) {
  117. Iterator iter = set.iterator();
  118. String currentLang = locale.getLanguage();
  119. while (iter.hasNext()) {
  120. Entry entry = (Entry)iter.next();
  121. String conditionLang= entry.getLanguage();
  122. if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
  123. isConditionMet(src, index, locale, entry.getCondition())) {
  124. return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
  125. }
  126. }
  127. }
  128. return null;
  129. }
  130. private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
  131. switch (condition) {
  132. case FINAL_CASED:
  133. return isFinalCased(src, index, locale);
  134. case AFTER_SOFT_DOTTED:
  135. return isAfterSoftDotted(src, index);
  136. case MORE_ABOVE:
  137. return isMoreAbove(src, index);
  138. case AFTER_I:
  139. return isAfterI(src, index);
  140. case NOT_BEFORE_DOT:
  141. return !isBeforeDot(src, index);
  142. default:
  143. return true;
  144. }
  145. }
  146. /**
  147. * Implements the "Final_Cased" condition
  148. *
  149. * Specification: Within the closest word boundaries containing C, there is a cased
  150. * letter before C, and there is no cased letter after C.
  151. *
  152. * Regular Expression:
  153. * Before C: [{cased==true}][{wordBoundary!=true}]*
  154. * After C: !([{wordBoundary!=true}]*[{cased}])
  155. */
  156. private static boolean isFinalCased(String src, int index, Locale locale) {
  157. BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
  158. wordBoundary.setText(src);
  159. int ch;
  160. // Look for a preceding 'cased' letter
  161. for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
  162. i -= Character.charCount(ch)) {
  163. ch = src.codePointBefore(i);
  164. if (isCased(ch)) {
  165. int len = src.length();
  166. // Check that there is no 'cased' letter after the index
  167. for (i = index + Character.charCount(src.codePointAt(index));
  168. (i < len) && !wordBoundary.isBoundary(i);
  169. i += Character.charCount(ch)) {
  170. ch = src.codePointAt(i);
  171. if (isCased(ch)) {
  172. return false;
  173. }
  174. }
  175. return true;
  176. }
  177. }
  178. return false;
  179. }
  180. /**
  181. * Implements the "After_I" condition
  182. *
  183. * Specification: The last preceding base character was an uppercase I,
  184. * and there is no intervening combining character class 230 (ABOVE).
  185. *
  186. * Regular Expression:
  187. * Before C: [I]([{cc!=230}&{cc!=0}])*
  188. */
  189. private static boolean isAfterI(String src, int index) {
  190. int ch;
  191. int cc;
  192. // Look for the last preceding base character
  193. for (int i = index; i > 0; i -= Character.charCount(ch)) {
  194. ch = src.codePointBefore(i);
  195. if (ch == 'I') {
  196. return true;
  197. } else {
  198. cc = Normalizer.getClass(ch);
  199. if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
  200. return false;
  201. }
  202. }
  203. }
  204. return false;
  205. }
  206. /**
  207. * Implements the "After_Soft_Dotted" condition
  208. *
  209. * Specification: The last preceding character with combining class
  210. * of zero before C was Soft_Dotted, and there is no intervening
  211. * combining character class 230 (ABOVE).
  212. *
  213. * Regular Expression:
  214. * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
  215. */
  216. private static boolean isAfterSoftDotted(String src, int index) {
  217. int ch;
  218. int cc;
  219. // Look for the last preceding character
  220. for (int i = index; i > 0; i -= Character.charCount(ch)) {
  221. ch = src.codePointBefore(i);
  222. if (isSoftDotted(ch)) {
  223. return true;
  224. } else {
  225. cc = Normalizer.getClass(ch);
  226. if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
  227. return false;
  228. }
  229. }
  230. }
  231. return false;
  232. }
  233. /**
  234. * Implements the "More_Above" condition
  235. *
  236. * Specification: C is followed by one or more characters of combining
  237. * class 230 (ABOVE) in the combining character sequence.
  238. *
  239. * Regular Expression:
  240. * After C: [{cc!=0}]*[{cc==230}]
  241. */
  242. private static boolean isMoreAbove(String src, int index) {
  243. int ch;
  244. int cc;
  245. int len = src.length();
  246. // Look for a following ABOVE combining class character
  247. for (int i = index + Character.charCount(src.codePointAt(index));
  248. i < len; i += Character.charCount(ch)) {
  249. ch = src.codePointAt(i);
  250. cc = Normalizer.getClass(ch);
  251. if (cc == COMBINING_CLASS_ABOVE) {
  252. return true;
  253. } else if (cc == 0) {
  254. return false;
  255. }
  256. }
  257. return false;
  258. }
  259. /**
  260. * Implements the "Before_Dot" condition
  261. *
  262. * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
  263. * Any sequence of characters with a combining class that is
  264. * neither 0 nor 230 may intervene between the current character
  265. * and the combining dot above.
  266. *
  267. * Regular Expression:
  268. * After C: ([{cc!=230}&{cc!=0}])*[\u0307]
  269. */
  270. private static boolean isBeforeDot(String src, int index) {
  271. int ch;
  272. int cc;
  273. int len = src.length();
  274. // Look for a following COMBINING DOT ABOVE
  275. for (int i = index + Character.charCount(src.codePointAt(index));
  276. i < len; i += Character.charCount(ch)) {
  277. ch = src.codePointAt(i);
  278. if (ch == '\u0307') {
  279. return true;
  280. } else {
  281. cc = Normalizer.getClass(ch);
  282. if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
  283. return false;
  284. }
  285. }
  286. }
  287. return false;
  288. }
  289. /**
  290. * Examines whether a character is 'cased'.
  291. *
  292. * A character C is defined to be 'cased' if and only if at least one of
  293. * following are true for C: uppercase==true, or lowercase==true, or
  294. * general_category==titlecase_letter.
  295. *
  296. * The uppercase and lowercase property values are specified in the data
  297. * file DerivedCoreProperties.txt in the Unicode Character Database.
  298. */
  299. private static boolean isCased(int ch) {
  300. int type = Character.getType(ch);
  301. if (type == Character.LOWERCASE_LETTER ||
  302. type == Character.UPPERCASE_LETTER ||
  303. type == Character.TITLECASE_LETTER) {
  304. return true;
  305. } else {
  306. // Check for Other_Lowercase and Other_Uppercase
  307. //
  308. if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
  309. // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
  310. return true;
  311. } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
  312. // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
  313. return true;
  314. } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
  315. // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
  316. return true;
  317. } else if (ch == 0x0345) {
  318. // COMBINING GREEK YPOGEGRAMMENI
  319. return true;
  320. } else if (ch == 0x037A) {
  321. // GREEK YPOGEGRAMMENI
  322. return true;
  323. } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
  324. // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
  325. return true;
  326. } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
  327. // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
  328. // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
  329. return true;
  330. } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
  331. // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
  332. // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
  333. return true;
  334. } else {
  335. return false;
  336. }
  337. }
  338. }
  339. private static boolean isSoftDotted(int ch) {
  340. switch (ch) {
  341. case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
  342. case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
  343. case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
  344. case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
  345. case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
  346. case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
  347. case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
  348. case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
  349. case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
  350. case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
  351. return true;
  352. default:
  353. return false;
  354. }
  355. }
  356. /**
  357. * An internal class that represents an entry in the Special Casing Properties.
  358. */
  359. static class Entry {
  360. int ch;
  361. char [] lower;
  362. char [] upper;
  363. String lang;
  364. int condition;
  365. Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
  366. this.ch = ch;
  367. this.lower = lower;
  368. this.upper = upper;
  369. this.lang = lang;
  370. this.condition = condition;
  371. }
  372. int getCodePoint() {
  373. return ch;
  374. }
  375. char[] getLowerCase() {
  376. return lower;
  377. }
  378. char[] getUpperCase() {
  379. return upper;
  380. }
  381. String getLanguage() {
  382. return lang;
  383. }
  384. int getCondition() {
  385. return condition;
  386. }
  387. }
  388. }