/*
 * $Id: XmlChars.java,v 1.1.1.1 2000/11/23 01:53:35 edwingo Exp $
 *
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Crimson" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
 * http://www.sun.com.  For more information on the Apache Software
 * Foundation, please see <http://www.apache.org/>.
 */
-
- package org.apache.crimson.util;
-
-
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <P> All methods are static and keep no state; the class cannot be
 * instantiated.
 *
 * @version 1.8
 * @author David Brownell
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() { }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <P> In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code is one of tab, line feed, carriage return,
     *  or lies in one of the ranges 0x20-0xD7FF, 0xE000-0xFFFD or
     *  0x10000-0x10FFFF; false otherwise (including negative values).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
            || ucs4char == 0x000A || ucs4char == 0x0009
            || ucs4char == 0x000D
            || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
            || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if the character may appear inside a name
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender
        //
        // isLetter2() accepts the letter, digit and combining-mark
        // categories (and rejects '>' early), so only the punctuation
        // and the extenders remain to be checked here.
        return isLetter2(c)
            || c == '.' || c == '-' || c == '_' || c == ':'
            || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if the character is not a colon and satisfies
     *  {@link #isNameChar}
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                             // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if the character is treated as an XML letter
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case: ASCII letters, plus a quick
        // rejection of '/'.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                    // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                    || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    // Returns true when the character is in one of the ranges listed
    // below; the high byte of the character selects which list applies.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                    || (c >= 0x013f && c <= 0x0140)
                    || c == 0x0149
                    || c == 0x017f
                    || (c >= 0x01c4 && c <= 0x01cc)
                    || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                    || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                     // Greek

            case 0x05:
                return c == 0x0587;                     // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;      // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                    || c == 0x1104
                    || c == 0x1108
                    || c == 0x110a
                    || c == 0x110d
                    || (c >= 0x1113 && c <= 0x113b)
                    || c == 0x113d
                    || c == 0x113f
                    || (c >= 0x1141 && c <= 0x114b)
                    || c == 0x114d
                    || c == 0x114f
                    || (c >= 0x1151 && c <= 0x1153)
                    || (c >= 0x1156 && c <= 0x1158)
                    || c == 0x1162
                    || c == 0x1164
                    || c == 0x1166
                    || c == 0x1168
                    || (c >= 0x116a && c <= 0x116c)
                    || (c >= 0x116f && c <= 0x1171)
                    || c == 0x1174
                    || (c >= 0x1176 && c <= 0x119d)
                    || (c >= 0x119f && c <= 0x11a2)
                    || (c >= 0x11a9 && c <= 0x11aa)
                    || (c >= 0x11ac && c <= 0x11ad)
                    || (c >= 0x11b0 && c <= 0x11b6)
                    || c == 0x11b9
                    || c == 0x11bb
                    || (c >= 0x11c3 && c <= 0x11ea)
                    || (c >= 0x11ec && c <= 0x11ef)
                    || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                     // superscript

            case 0x21:
                return
                    // various letterlike symbols
                    c == 0x2102
                    || c == 0x2107
                    || (c >= 0x210a && c <= 0x2113)
                    || c == 0x2115
                    || (c >= 0x2118 && c <= 0x211d)
                    || c == 0x2124
                    || c == 0x2128
                    || (c >= 0x212c && c <= 0x212d)
                    || (c >= 0x212f && c <= 0x2138)

                    // most Roman numerals (less 1K, 5K, 10K)
                    || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar:  like isLetter(), but also
    // accepting the mark, modifier-letter and decimal-digit categories
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case: ASCII letters, plus a quick
        // rejection of '>'.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                    // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE: not called from anywhere else in this class; decimal digits
    // in names are accepted through isLetter2() instead.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
            && !((c >= 0xff10) && (c <= 0xff19));
    }

    // true for the individual code points and small ranges listed
    // below, which stand for production [89]
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
            || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e)
            || (c >= 0x30fc && c <= 0x30fe);
    }
}