1. /*
  2. * $Id: XmlChars.java,v 1.1.1.1 2000/11/23 01:53:35 edwingo Exp $
  3. *
  4. * The Apache Software License, Version 1.1
  5. *
  6. *
  7. * Copyright (c) 2000 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution,
  23. * if any, must include the following acknowledgment:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowledgment may appear in the software itself,
  27. * if and wherever such third-party acknowledgments normally appear.
  28. *
  29. * 4. The names "Crimson" and "Apache Software Foundation" must
  30. * not be used to endorse or promote products derived from this
  31. * software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache",
  35. * nor may "Apache" appear in their name, without prior written
  36. * permission of the Apache Software Foundation.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation and was
  54. * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
  55. * http://www.sun.com. For more information on the Apache Software
  56. * Foundation, please see <http://www.apache.org/>.
  57. */
  58. package org.apache.crimson.util;
  /**
   * Static predicates that classify characters according to the roles the
   * XML 1.0 recommendation assigns to them (legal document character, name
   * character, letter, whitespace). They serve both parsers and code that
   * writes XML.
   *
   * <P> The name-related tests are driven by the general category reported
   * by {@link Character#getType(char)} together with the exceptions listed
   * in the footnotes of appendix B of the XML recommendation, rather than by
   * the full character tables.
   *
   * @version 1.8
   * @author David Brownell
   */
  public class XmlChars {

      /** Not instantiable: the class only offers static methods. */
      private XmlChars() {
      }

      /**
       * Tests whether a UCS-4 character code may appear in an XML document.
       * Codes that fit in sixteen bits correspond to single Java
       * <code>char</code> values; larger codes are written in documents
       * either as a surrogate pair or as a character reference.  A reference
       * such as {@code &#x12345678;} names a code outside the permitted
       * ranges, so this method rejects it.
       *
       * @param ucs4char The 32-bit UCS-4 character being tested.
       * @return true if the code matches production [2] Char.
       */
      public static boolean isChar(int ucs4char) {
          // [2] Char ::= #x0009 | #x000A | #x000D
          //            | [#x0020-#xD7FF]
          //            | [#xE000-#xFFFD]
          //            | [#x10000-#x10ffff]
          if (ucs4char < 0x0020) {
              // of the C0 controls (and anything negative) only TAB, LF, CR
              return ucs4char == 0x0009
                      || ucs4char == 0x000A
                      || ucs4char == 0x000D;
          }
          if (ucs4char <= 0xD7FF) {
              return true;
          }
          if (ucs4char < 0xE000) {
              // the surrogate block is excluded
              return false;
          }
          if (ucs4char <= 0xFFFD) {
              return true;
          }
          return ucs4char >= 0x10000 && ucs4char <= 0x10ffff;
      }

      /**
       * Tests whether the character may appear after the first position of
       * an XML name.
       *
       * @param c the character being tested.
       * @return true if the character is accepted as a name character.
       * @see #isNCNameChar
       * @see #isLetter
       */
      public static boolean isNameChar(char c) {
          // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
          //                | CombiningChar | Extender
          if (isLetter2(c)) {
              return true;
          }
          switch (c) {
              case '.':
              case '-':
              case '_':
              case ':':
                  return true;
              default:
                  // everything else, '>' included, is decided by the
                  // Extender table
                  return isExtender(c);
          }
      }

      /**
       * Tests whether the character may appear after the first position of
       * an unscoped (colon-free) name as defined by the XML Namespaces
       * rules.  The accepted set is that of {@link #isNameChar} minus the
       * colon, which separates a name from its scope.
       *
       * @param c the character being tested.
       * @return true if the character is a name character other than ':'.
       * @see #isNameChar
       * @see #isLetter
       */
      public static boolean isNCNameChar(char c) {
          // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
          //                     | CombiningChar | Extender
          if (c == ':') {
              return false;
          }
          return isNameChar(c);
      }

      /**
       * Tests whether the character is one of the four XML whitespace
       * characters: space, tab, line feed or carriage return.
       *
       * @param c the character being tested.
       * @return true for the four whitespace characters, false otherwise.
       */
      public static boolean isSpace(char c) {
          switch (c) {
              case ' ':
              case '\t':
              case '\n':
              case '\r':
                  return true;
              default:
                  return false;
          }
      }

      /*
       * For reference, the java.lang.Character.getType() values are:
       *
       *   0 UNASSIGNED
       *   1 UPPERCASE_LETTER (Lu)        2 LOWERCASE_LETTER (Ll)
       *   3 TITLECASE_LETTER (Lt)        4 MODIFIER_LETTER (Lm)
       *   5 OTHER_LETTER (Lo)            6 NON_SPACING_MARK (Mn)
       *   7 ENCLOSING_MARK (Me)          8 COMBINING_SPACING_MARK (Mc)
       *   9 DECIMAL_DIGIT_NUMBER (Nd)   10 LETTER_NUMBER (Nl)
       *  11 OTHER_NUMBER (No)           12 SPACE_SEPARATOR (Zs)
       *  13 LINE_SEPARATOR (Zl)         14 PARAGRAPH_SEPARATOR (Zp)
       *  15 CONTROL (Cc)                16 FORMAT (Cf)
       *  17 (reserved for a proposed Ci category)
       *  18 PRIVATE_USE (Co)            19 SURROGATE (Cs)
       *  20 DASH_PUNCTUATION (Pd)       21 START_PUNCTUATION (Ps)
       *  22 END_PUNCTUATION (Pe)        23 CONNECTOR_PUNCTUATION (Pc)
       *  24 OTHER_PUNCTUATION (Po)      25 MATH_SYMBOL (Sm)
       *  26 CURRENCY_SYMBOL (Sc)        27 MODIFIER_SYMBOL (Sk)
       *  28 OTHER_SYMBOL (So)
       */

      /**
       * Tests whether the character is an XML "letter".  Names have to
       * begin with a letter (or one of a few other characters, which this
       * method does not accept); the remaining characters of a name only
       * have to satisfy {@link #isNameChar}.
       *
       * @param c the character being tested.
       * @return true if the character is treated as a Letter.
       * @see #isNameChar
       * @see #isNCNameChar
       */
      public static boolean isLetter(char c) {
          // [84] Letter      ::= BaseChar | Ideographic
          // [85] BaseChar    ::= ... (long table)
          // [86] Ideographic ::= ... (long table)

          // Fast paths for the most common inputs.
          if (isAsciiLetter(c)) {
              return true;
          }
          if (c == '/') {
              return false;
          }

          // The full tables are impractical in code, so the appendix B
          // footnotes (category plus exceptions) drive the decision.
          if (isNameStartCategory(Character.getType(c))) {
              return isPermittedInNames(c);
          }

          // A few characters outside those categories count as "alphabetic".
          return inRange(c, 0x02bb, 0x02c1)
                  || c == 0x0559
                  || c == 0x06e5
                  || c == 0x06e6;
      }

      /**
       * Tests whether the character is a "compatibility" character.  XML 1.0
       * discourages these in names: they exist so that data from older,
       * non-Unicode character sets can pass through, and they always have
       * an alternative Unicode representation (e.g. with combining
       * characters).
       */
      private static boolean isCompatibilityChar(char c) {
          // Dispatch on the high byte so that only the comparisons relevant
          // to one 256-character page are evaluated.
          final int page = c >>> 8;

          if (page >= 0xf9) {
              // U+F900..U+FFFF: the whole area exists for compatibility
              return true;
          }

          switch (page) {
              case 0x00:
                  // ISO Latin/1 has a few compatibility characters
                  return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
              case 0x01:
                  // so do Latin Extended A and (parts of) B
                  return inRange(c, 0x0132, 0x0133)
                          || inRange(c, 0x013f, 0x0140)
                          || c == 0x0149
                          || c == 0x017f
                          || inRange(c, 0x01c4, 0x01cc)
                          || inRange(c, 0x01f1, 0x01f3);
              case 0x02:
                  // some spacing modifiers
                  return inRange(c, 0x02b0, 0x02b8)
                          || inRange(c, 0x02e0, 0x02e4);
              case 0x03:
                  // Greek
                  return c == 0x037a;
              case 0x05:
                  // Armenian
                  return c == 0x0587;
              case 0x0e:
                  // Laotian
                  return inRange(c, 0x0edc, 0x0edd);
              case 0x11:
                  return isCompatibilityJamo(c);
              case 0x20:
                  // superscript
                  return c == 0x207f;
              case 0x21:
                  return isCompatibilityLetterlike(c);
              case 0x30:
                  // some Hiragana
                  return inRange(c, 0x309b, 0x309c);
              case 0x31:
                  // all Hangul Compatibility Jamo
                  return inRange(c, 0x3131, 0x318e);
              default:
                  // most of Unicode isn't flagged as being for compatibility
                  return false;
          }
      }

      /**
       * Page 0x11 part of {@link #isCompatibilityChar}: large parts of the
       * Hangul Jamo block are "compatibility" characters.
       */
      private static boolean isCompatibilityJamo(char c) {
          switch (c) {
              case 0x1101:
              case 0x1104:
              case 0x1108:
              case 0x110a:
              case 0x110d:
              case 0x113d:
              case 0x113f:
              case 0x114d:
              case 0x114f:
              case 0x1162:
              case 0x1164:
              case 0x1166:
              case 0x1168:
              case 0x1174:
              case 0x11b9:
              case 0x11bb:
                  return true;
              default:
                  return inRange(c, 0x1113, 0x113b)
                          || inRange(c, 0x1141, 0x114b)
                          || inRange(c, 0x1151, 0x1153)
                          || inRange(c, 0x1156, 0x1158)
                          || inRange(c, 0x116a, 0x116c)
                          || inRange(c, 0x116f, 0x1171)
                          || inRange(c, 0x1176, 0x119d)
                          || inRange(c, 0x119f, 0x11a2)
                          || inRange(c, 0x11a9, 0x11aa)
                          || inRange(c, 0x11ac, 0x11ad)
                          || inRange(c, 0x11b0, 0x11b6)
                          || inRange(c, 0x11c3, 0x11ea)
                          || inRange(c, 0x11ec, 0x11ef)
                          || inRange(c, 0x11f1, 0x11f8);
          }
      }

      /**
       * Page 0x21 part of {@link #isCompatibilityChar}: various letterlike
       * symbols and most Roman numerals (all but 1K, 5K and 10K).
       */
      private static boolean isCompatibilityLetterlike(char c) {
          switch (c) {
              case 0x2102:
              case 0x2107:
              case 0x2115:
              case 0x2124:
              case 0x2128:
                  return true;
              default:
                  return inRange(c, 0x210a, 0x2113)
                          || inRange(c, 0x2118, 0x211d)
                          || inRange(c, 0x212c, 0x212d)
                          || inRange(c, 0x212f, 0x2138)
                          || inRange(c, 0x2160, 0x217f);
          }
      }

      /**
       * Workhorse of isNameChar/isNCNameChar: accepts letters as well as the
       * combining characters, modifier letters and digits that may follow
       * the first character of a name.
       */
      private static boolean isLetter2(char c) {
          // [84] Letter        ::= BaseChar | Ideographic
          // [85] BaseChar      ::= ... (long table)
          // [86] Ideographic   ::= ... (long table)
          // [87] CombiningChar ::= ... (long table)

          // Fast paths for the most common inputs.
          if (isAsciiLetter(c)) {
              return true;
          }
          if (c == '>') {
              return false;
          }

          // As in isLetter, the appendix B footnotes drive the decision.
          final int category = Character.getType(c);
          if (isNameStartCategory(category)
                  || isNameContinuationCategory(category)) {
              return isPermittedInNames(c);
          }

          // one extra character is accepted regardless of its category
          return c == 0x0387;
      }

      /** Tests for production [88] Digit. */
      private static boolean isDigit(char c) {
          // java.lang.Character.isDigit agrees with XML except that it also
          // accepts the "fullwidth" digits, so those are rejected first.
          if (c >= 0xff10 && c <= 0xff19) {
              return false;
          }
          return Character.isDigit(c);
      }

      /** Tests for production [89] Extender. */
      private static boolean isExtender(char c) {
          switch (c) {
              case 0x00b7:
              case 0x02d0:
              case 0x02d1:
              case 0x0387:
              case 0x0640:
              case 0x0e46:
              case 0x0ec6:
              case 0x3005:
                  return true;
              default:
                  return inRange(c, 0x3031, 0x3035)
                          || inRange(c, 0x309d, 0x309e)
                          || inRange(c, 0x30fc, 0x30fe);
          }
      }

      /** True for the unaccented letters a-z and A-Z. */
      private static boolean isAsciiLetter(char c) {
          return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
      }

      /** True when {@code lo <= c <= hi}. */
      private static boolean inRange(char c, int lo, int hi) {
          return c >= lo && c <= hi;
      }

      /**
       * True for the general categories which the appendix B footnote
       * describes as 'name start' characters: Ll, Lu, Lo, Lt and Nl.
       */
      private static boolean isNameStartCategory(int category) {
          switch (category) {
              case Character.LOWERCASE_LETTER:    // Ll
              case Character.UPPERCASE_LETTER:    // Lu
              case Character.OTHER_LETTER:        // Lo
              case Character.TITLECASE_LETTER:    // Lt
              case Character.LETTER_NUMBER:       // Nl
                  return true;
              default:
                  return false;
          }
      }

      /**
       * True for the general categories of name characters 'other than
       * name start characters': Mc, Me, Mn, Lm and Nd.
       */
      private static boolean isNameContinuationCategory(int category) {
          switch (category) {
              case Character.COMBINING_SPACING_MARK:  // Mc
              case Character.ENCLOSING_MARK:          // Me
              case Character.NON_SPACING_MARK:        // Mn
              case Character.MODIFIER_LETTER:         // Lm
              case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                  return true;
              default:
                  return false;
          }
      }

      /**
       * Applies the exceptions to the category rule: compatibility
       * characters are rejected, and so are the combiners U+20DD..U+20E0
       * (ruled out per "5.14 of Unicode").
       */
      private static boolean isPermittedInNames(char c) {
          if (isCompatibilityChar(c)) {
              return false;
          }
          return !inRange(c, 0x20dd, 0x20e0);
      }
  }