- /*
- * @(#)ConditionalSpecialCasing.java 1.3 03/12/19
- *
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
- */
-
- package java.lang;
-
- import java.text.BreakIterator;
- import java.util.HashSet;
- import java.util.Hashtable;
- import java.util.Iterator;
- import java.util.Locale;
- import sun.text.Normalizer;
-
-
- /**
- * This is a utility class for <code>String.toLowerCase()</code> and
- * <code>String.toUpperCase()</code>, that handles special casing with
- * conditions. In other words, it handles the mappings with conditions
- * that are defined in
- * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
- * Casing Properties</a> file.
- * <p>
- * Note that the unconditional case mappings (including 1:M mappings)
- * are handled in <code>Character.toLower/UpperCase()</code>.
- */
- final class ConditionalSpecialCasing {
-
- // context conditions.
- final static int FINAL_CASED = 1;
- final static int AFTER_SOFT_DOTTED = 2;
- final static int MORE_ABOVE = 3;
- final static int AFTER_I = 4;
- final static int NOT_BEFORE_DOT = 5;
-
- // combining class definitions
- final static int COMBINING_CLASS_ABOVE = 230;
-
- // Special case mapping entries
- static Entry[] entry = {
- //# ================================================================================
- //# Conditional mappings
- //# ================================================================================
- new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
-
- //# ================================================================================
- //# Locale-sensitive mappings
- //# ================================================================================
- //# Lithuanian
- new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
- new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
- new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
- new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
- new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
- new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
- new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
-
- //# ================================================================================
- //# Turkish and Azeri
- // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
- // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
- new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
- new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
- new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
- new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
- new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
- new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
- };
-
- // A hash table that contains the above entries
- static Hashtable entryTable = new Hashtable();
- static {
- // create hashtable from the entry
- for (int i = 0; i < entry.length; i ++) {
- Entry cur = entry[i];
- Integer cp = new Integer(cur.getCodePoint());
- HashSet set = (HashSet)entryTable.get(cp);
- if (set == null) {
- set = new HashSet();
- }
- set.add(cur);
- entryTable.put(cp, set);
- }
- }
-
- static int toLowerCaseEx(String src, int index, Locale locale) {
- char[] result = lookUpTable(src, index, locale, true);
-
- if (result != null) {
- if (result.length == 1) {
- return result[0];
- } else {
- return Character.CHAR_ERROR;
- }
- } else {
- // default to Character class' one
- return Character.toLowerCase(src.codePointAt(index));
- }
- }
-
- static int toUpperCaseEx(String src, int index, Locale locale) {
- char[] result = lookUpTable(src, index, locale, false);
-
- if (result != null) {
- if (result.length == 1) {
- return result[0];
- } else {
- return Character.CHAR_ERROR;
- }
- } else {
- // default to Character class' one
- return Character.toUpperCaseEx(src.codePointAt(index));
- }
- }
-
- static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
- return lookUpTable(src, index, locale, true);
- }
-
- static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
- char[] result = lookUpTable(src, index, locale, false);
- if (result != null) {
- return result;
- } else {
- return Character.toUpperCaseCharArray(src.codePointAt(index));
- }
- }
-
- private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
- HashSet set = (HashSet)entryTable.get(new Integer(src.codePointAt(index)));
-
- if (set != null) {
- Iterator iter = set.iterator();
- String currentLang = locale.getLanguage();
- while (iter.hasNext()) {
- Entry entry = (Entry)iter.next();
- String conditionLang= entry.getLanguage();
- if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
- isConditionMet(src, index, locale, entry.getCondition())) {
- return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
- }
- }
- }
-
- return null;
- }
-
- private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
- switch (condition) {
- case FINAL_CASED:
- return isFinalCased(src, index, locale);
-
- case AFTER_SOFT_DOTTED:
- return isAfterSoftDotted(src, index);
-
- case MORE_ABOVE:
- return isMoreAbove(src, index);
-
- case AFTER_I:
- return isAfterI(src, index);
-
- case NOT_BEFORE_DOT:
- return !isBeforeDot(src, index);
-
- default:
- return true;
- }
- }
-
- /**
- * Implements the "Final_Cased" condition
- *
- * Specification: Within the closest word boundaries containing C, there is a cased
- * letter before C, and there is no cased letter after C.
- *
- * Regular Expression:
- * Before C: [{cased==true}][{wordBoundary!=true}]*
- * After C: !([{wordBoundary!=true}]*[{cased}])
- */
- private static boolean isFinalCased(String src, int index, Locale locale) {
- BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
- wordBoundary.setText(src);
- int ch;
-
- // Look for a preceding 'cased' letter
- for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
- i -= Character.charCount(ch)) {
-
- ch = src.codePointBefore(i);
- if (isCased(ch)) {
-
- int len = src.length();
- // Check that there is no 'cased' letter after the index
- for (i = index + Character.charCount(src.codePointAt(index));
- (i < len) && !wordBoundary.isBoundary(i);
- i += Character.charCount(ch)) {
-
- ch = src.codePointAt(i);
- if (isCased(ch)) {
- return false;
- }
- }
-
- return true;
- }
- }
-
- return false;
- }
-
- /**
- * Implements the "After_I" condition
- *
- * Specification: The last preceding base character was an uppercase I,
- * and there is no intervening combining character class 230 (ABOVE).
- *
- * Regular Expression:
- * Before C: [I]([{cc!=230}&{cc!=0}])*
- */
- private static boolean isAfterI(String src, int index) {
- int ch;
- int cc;
-
- // Look for the last preceding base character
- for (int i = index; i > 0; i -= Character.charCount(ch)) {
-
- ch = src.codePointBefore(i);
-
- if (ch == 'I') {
- return true;
- } else {
- cc = Normalizer.getClass(ch);
- if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
- return false;
- }
- }
- }
-
- return false;
- }
-
- /**
- * Implements the "After_Soft_Dotted" condition
- *
- * Specification: The last preceding character with combining class
- * of zero before C was Soft_Dotted, and there is no intervening
- * combining character class 230 (ABOVE).
- *
- * Regular Expression:
- * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
- */
- private static boolean isAfterSoftDotted(String src, int index) {
- int ch;
- int cc;
-
- // Look for the last preceding character
- for (int i = index; i > 0; i -= Character.charCount(ch)) {
-
- ch = src.codePointBefore(i);
-
- if (isSoftDotted(ch)) {
- return true;
- } else {
- cc = Normalizer.getClass(ch);
- if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
- return false;
- }
- }
- }
-
- return false;
- }
-
- /**
- * Implements the "More_Above" condition
- *
- * Specification: C is followed by one or more characters of combining
- * class 230 (ABOVE) in the combining character sequence.
- *
- * Regular Expression:
- * After C: [{cc!=0}]*[{cc==230}]
- */
- private static boolean isMoreAbove(String src, int index) {
- int ch;
- int cc;
- int len = src.length();
-
- // Look for a following ABOVE combining class character
- for (int i = index + Character.charCount(src.codePointAt(index));
- i < len; i += Character.charCount(ch)) {
-
- ch = src.codePointAt(i);
- cc = Normalizer.getClass(ch);
-
- if (cc == COMBINING_CLASS_ABOVE) {
- return true;
- } else if (cc == 0) {
- return false;
- }
- }
-
- return false;
- }
-
- /**
- * Implements the "Before_Dot" condition
- *
- * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
- * Any sequence of characters with a combining class that is
- * neither 0 nor 230 may intervene between the current character
- * and the combining dot above.
- *
- * Regular Expression:
- * After C: ([{cc!=230}&{cc!=0}])*[\u0307]
- */
- private static boolean isBeforeDot(String src, int index) {
- int ch;
- int cc;
- int len = src.length();
-
- // Look for a following COMBINING DOT ABOVE
- for (int i = index + Character.charCount(src.codePointAt(index));
- i < len; i += Character.charCount(ch)) {
-
- ch = src.codePointAt(i);
-
- if (ch == '\u0307') {
- return true;
- } else {
- cc = Normalizer.getClass(ch);
- if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
- return false;
- }
- }
- }
-
- return false;
- }
-
- /**
- * Examines whether a character is 'cased'.
- *
- * A character C is defined to be 'cased' if and only if at least one of
- * following are true for C: uppercase==true, or lowercase==true, or
- * general_category==titlecase_letter.
- *
- * The uppercase and lowercase property values are specified in the data
- * file DerivedCoreProperties.txt in the Unicode Character Database.
- */
- private static boolean isCased(int ch) {
- int type = Character.getType(ch);
- if (type == Character.LOWERCASE_LETTER ||
- type == Character.UPPERCASE_LETTER ||
- type == Character.TITLECASE_LETTER) {
- return true;
- } else {
- // Check for Other_Lowercase and Other_Uppercase
- //
- if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
- // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
- return true;
- } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
- // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
- return true;
- } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
- // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
- return true;
- } else if (ch == 0x0345) {
- // COMBINING GREEK YPOGEGRAMMENI
- return true;
- } else if (ch == 0x037A) {
- // GREEK YPOGEGRAMMENI
- return true;
- } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
- // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
- return true;
- } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
- // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
- // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
- return true;
- } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
- // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
- // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
- return true;
- } else {
- return false;
- }
- }
- }
-
- private static boolean isSoftDotted(int ch) {
- switch (ch) {
- case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
- case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
- case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
- case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
- case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
- case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
- case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
- case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
- case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
- case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
- return true;
- default:
- return false;
- }
- }
-
- /**
- * An internal class that represents an entry in the Special Casing Properties.
- */
- static class Entry {
- int ch;
- char [] lower;
- char [] upper;
- String lang;
- int condition;
-
- Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
- this.ch = ch;
- this.lower = lower;
- this.upper = upper;
- this.lang = lang;
- this.condition = condition;
- }
-
- int getCodePoint() {
- return ch;
- }
-
- char[] getLowerCase() {
- return lower;
- }
-
- char[] getUpperCase() {
- return upper;
- }
-
- String getLanguage() {
- return lang;
- }
-
- int getCondition() {
- return condition;
- }
- }
- }