1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.betwixt;
  17. /**
  18. * <p><code>XMLUtils</code> contains basic utility methods for XML.</p>
  19. *
  20. * <p>The code for {@link #isWellFormedXMLName} is based on code in
  21. * <code>org.apache.xerces.util.XMLChar</code>
  22. * in <a href='http://xml.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
  23. * The authors of this class are credited below.</p>
  24. *
  25. * @author Glenn Marcy, IBM
  26. * @author Andy Clark, IBM
  27. * @author Eric Ye, IBM
  28. * @author Arnaud Le Hors, IBM
  29. * @author Rahul Srivastava, Sun Microsystems Inc.
  30. *
  31. * @author Robert Burrell Donkin
  32. * @since 0.5
  33. */
  34. public class XMLUtils {
  35. // Constants
  36. //-------------------------------------------------------------------------
  37. /** Escaped <code><</code> entity */
  38. public static final String LESS_THAN_ENTITY = "<";
  39. /** Escaped <code>></code> entity */
  40. public static final String GREATER_THAN_ENTITY = ">";
  41. /** Escaped <code>&</code> entity */
  42. public static final String AMPERSAND_ENTITY = "&";
  43. /** Escaped <code>'</code> entity */
  44. public static final String APOSTROPHE_ENTITY = "'";
  45. /** Escaped <code>"</code> entity */
  46. public static final String QUOTE_ENTITY = """;
  47. // Used by isWellFormedXMLName
  48. /** Name start character mask. */
  49. private static final int MASK_NAME_START = 0x01;
  50. /** Name character mask. */
  51. private static final int MASK_NAME = 0x02;
  52. // Class attributes
  53. //-------------------------------------------------------------------------
  54. /** Character flags. */
  55. private static final byte[] CHARS = new byte[1 << 16];
  56. //
  57. // Static initialization
  58. //
  59. static {
  60. //
  61. // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
  62. // CombiningChar | Extender
  63. //
  64. int nameChar[] = {
  65. 0x002D, 0x002E, // '-' and '.'
  66. };
  67. //
  68. // [5] Name ::= (Letter | '_' | ':') (NameChar)*
  69. //
  70. int nameStartChar[] = {
  71. 0x003A, 0x005F, // ':' and '_'
  72. };
  73. //
  74. // [84] Letter ::= BaseChar | Ideographic
  75. //
  76. int letterRange[] = {
  77. // BaseChar
  78. 0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
  79. 0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
  80. 0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
  81. 0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
  82. 0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
  83. 0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
  84. 0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
  85. 0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
  86. 0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
  87. 0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
  88. 0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
  89. 0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
  90. 0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
  91. 0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
  92. 0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
  93. 0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
  94. 0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
  95. 0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
  96. 0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
  97. 0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
  98. 0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
  99. 0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
  100. 0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
  101. 0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
  102. 0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
  103. 0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
  104. 0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
  105. 0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
  106. 0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
  107. 0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
  108. 0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
  109. 0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
  110. 0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
  111. 0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
  112. 0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
  113. 0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
  114. 0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
  115. 0xAC00, 0xD7A3,
  116. // Ideographic
  117. 0x3021, 0x3029, 0x4E00, 0x9FA5,
  118. };
  119. int letterChar[] = {
  120. // BaseChar
  121. 0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
  122. 0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
  123. 0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
  124. 0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
  125. 0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
  126. 0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
  127. 0x1F5D, 0x1FBE, 0x2126, 0x212E,
  128. // Ideographic
  129. 0x3007,
  130. };
  131. //
  132. // [87] CombiningChar ::= ...
  133. //
  134. int combiningCharRange[] = {
  135. 0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
  136. 0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
  137. 0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
  138. 0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
  139. 0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
  140. 0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
  141. 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
  142. 0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
  143. 0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
  144. 0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
  145. 0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
  146. 0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
  147. 0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
  148. 0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
  149. 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
  150. 0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
  151. 0x20D0, 0x20DC, 0x302A, 0x302F,
  152. };
  153. int combiningCharChar[] = {
  154. 0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
  155. 0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
  156. 0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
  157. 0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
  158. };
  159. //
  160. // [88] Digit ::= ...
  161. //
  162. int digitRange[] = {
  163. 0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
  164. 0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
  165. 0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
  166. 0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
  167. };
  168. //
  169. // [89] Extender ::= ...
  170. //
  171. int extenderRange[] = {
  172. 0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
  173. };
  174. int extenderChar[] = {
  175. 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
  176. };
  177. //
  178. // Initialize
  179. //
  180. // set name start characters
  181. for (int i = 0; i < nameStartChar.length; i++) {
  182. CHARS[nameStartChar[i]] |= MASK_NAME_START | MASK_NAME;
  183. }
  184. for (int i = 0; i < letterRange.length; i += 2) {
  185. for (int j = letterRange[i]; j <= letterRange[i + 1]; j++) {
  186. CHARS[j] |= MASK_NAME_START | MASK_NAME;
  187. }
  188. }
  189. for (int i = 0; i < letterChar.length; i++) {
  190. CHARS[letterChar[i]] |= MASK_NAME_START | MASK_NAME;
  191. }
  192. // set name characters
  193. for (int i = 0; i < nameChar.length; i++) {
  194. CHARS[nameChar[i]] |= MASK_NAME;
  195. }
  196. for (int i = 0; i < digitRange.length; i += 2) {
  197. for (int j = digitRange[i]; j <= digitRange[i + 1]; j++) {
  198. CHARS[j] |= MASK_NAME;
  199. }
  200. }
  201. for (int i = 0; i < combiningCharRange.length; i += 2) {
  202. for (int j = combiningCharRange[i]; j <= combiningCharRange[i + 1]; j++) {
  203. CHARS[j] |= MASK_NAME;
  204. }
  205. }
  206. for (int i = 0; i < combiningCharChar.length; i++) {
  207. CHARS[combiningCharChar[i]] |= MASK_NAME;
  208. }
  209. for (int i = 0; i < extenderRange.length; i += 2) {
  210. for (int j = extenderRange[i]; j <= extenderRange[i + 1]; j++) {
  211. CHARS[j] |= MASK_NAME;
  212. }
  213. }
  214. for (int i = 0; i < extenderChar.length; i++) {
  215. CHARS[extenderChar[i]] |= MASK_NAME;
  216. }
  217. }
  218. // Constructor
  219. //-------------------------------------------------------------------------
  220. /**
  221. * <p>Constructor for use by tools that required <code>JavaBean</code> instances.</p>
  222. *
  223. * <p>This constructor is public <strong>only</strong>
  224. * to permit tools that require a JavaBean instance to operate.
  225. * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
  226. * programming. Instead, the class methods should be called directly.</p>
  227. */
  228. public XMLUtils() {}
  229. // Class methods
  230. //-------------------------------------------------------------------------
  231. /**
  232. * <p>Escape the <code>toString</code> of the given object.
  233. * For use as body text.</p>
  234. *
  235. * @param value escape <code>value.toString()</code>
  236. * @return text with escaped delimiters
  237. */
  238. public static final String escapeBodyValue(Object value) {
  239. StringBuffer buffer = new StringBuffer(value.toString());
  240. for (int i=0, size = buffer.length(); i <size; i++) {
  241. switch (buffer.charAt(i)) {
  242. case '<':
  243. buffer.replace(i, i+1, LESS_THAN_ENTITY);
  244. size += 3;
  245. i+=3;
  246. break;
  247. case '>':
  248. buffer.replace(i, i+1, GREATER_THAN_ENTITY);
  249. size += 3;
  250. i += 3;
  251. break;
  252. case '&':
  253. buffer.replace(i, i+1, AMPERSAND_ENTITY);
  254. size += 4;
  255. i += 4;
  256. break;
  257. }
  258. }
  259. return buffer.toString();
  260. }
  261. /**
  262. * <p>Escape the <code>toString</code> of the given object.
  263. * For use in an attribute value.</p>
  264. *
  265. * @param value escape <code>value.toString()</code>
  266. * @return text with characters restricted (for use in attributes) escaped
  267. */
  268. public static final String escapeAttributeValue(Object value) {
  269. StringBuffer buffer = new StringBuffer(value.toString());
  270. for (int i=0, size = buffer.length(); i <size; i++) {
  271. switch (buffer.charAt(i)) {
  272. case '<':
  273. buffer.replace(i, i+1, LESS_THAN_ENTITY);
  274. size += 3;
  275. i+=3;
  276. break;
  277. case '>':
  278. buffer.replace(i, i+1, GREATER_THAN_ENTITY);
  279. size += 3;
  280. i += 3;
  281. break;
  282. case '&':
  283. buffer.replace(i, i+1, AMPERSAND_ENTITY);
  284. size += 4;
  285. i += 4;
  286. break;
  287. case '\'':
  288. buffer.replace(i, i+1, APOSTROPHE_ENTITY);
  289. size += 5;
  290. i += 5;
  291. break;
  292. case '\"':
  293. buffer.replace(i, i+1, QUOTE_ENTITY);
  294. size += 5;
  295. i += 5;
  296. break;
  297. }
  298. }
  299. return buffer.toString();
  300. }
  301. /**
  302. * Escapes the given content suitable for insertion within a
  303. * <code>CDATA</code> sequence.
  304. * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
  305. * string ']]>' is recognized as markup.
  306. * @param content the body content whose character data should
  307. * be escaped in a way appropriate for use within a <code>CDATA</code>
  308. * section of xml.
  309. * @return escaped character data, not null
  310. */
  311. public static final String escapeCDATAContent(String content) {
  312. StringBuffer buffer = new StringBuffer(content);
  313. escapeCDATAContent(buffer);
  314. return buffer.toString();
  315. }
  316. /**
  317. * Escapes the given content suitable for insertion within a
  318. * <code>CDATA</code> sequence.
  319. * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
  320. * string ']]>' is recognized as markup.
  321. * @param bufferedContent the body content within a buffer
  322. * whose character data should
  323. * be escaped in a way appropriate for use within a <code>CDATA</code>
  324. * section of xml.
  325. * @return escaped character data, not null
  326. */
  327. public static final void escapeCDATAContent(StringBuffer bufferedContent) {
  328. for (int i=2, size = bufferedContent.length(); i<size; i++) {
  329. char at = bufferedContent.charAt(i);
  330. if ( at == '>'
  331. && bufferedContent.charAt(i-1) == ']'
  332. && bufferedContent.charAt(i-2) == ']') {
  333. bufferedContent.replace(i, i+1, GREATER_THAN_ENTITY);
  334. size += 3;
  335. i+=3;
  336. }
  337. }
  338. }
  339. /**
  340. * <p>Is this string a well formed xml name?</p>
  341. *
  342. * <p>Only certain characters are allowed in well formed element and attribute
  343. * names in xml. For example, white space is not allowed in a name.</p>
  344. *
  345. * <p>The code for this method is based on code in
  346. * <code>org.apache.xerces.util.XMLChar</code>
  347. * in <a href='http://xml.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
  348. * The authors of this class are credited at the top of this class.</p>
  349. *
  350. * @param name the <code>String</code> to be checked for use as an xml attribute
  351. * or element name. Returns false if <code>name</code> is null
  352. * @return true if this string would be a well-formed name
  353. */
  354. public static boolean isWellFormedXMLName( String name ) {
  355. if ( name == null ) {
  356. return false;
  357. }
  358. if ( name.length() == 0 ) {
  359. return false;
  360. }
  361. char ch = name.charAt(0);
  362. if( isNameStartChar(ch) == false) {
  363. return false;
  364. }
  365. for (int i = 1; i < name.length(); i++ ) {
  366. ch = name.charAt(i);
  367. if( isNameChar( ch ) == false ) {
  368. return false;
  369. }
  370. }
  371. return true;
  372. }
  373. /**
  374. * Returns true if the specified character is a valid name
  375. * character as defined by the XML 1.0 specification.
  376. *
  377. * @param c The character to check.
  378. * @return true if this is an XML name character
  379. */
  380. public static boolean isNameChar(int c) {
  381. return c < 0x10000 && (CHARS[c] & MASK_NAME) != 0;
  382. }
  383. /**
  384. * Returns true if the specified character is a valid name start
  385. * character as defined in the XML 1.0 specification.
  386. *
  387. * @param c The character to check.
  388. * @return trus if this is an XML name start character
  389. */
  390. public static boolean isNameStartChar(int c) {
  391. return c < 0x10000 && (CHARS[c] & MASK_NAME_START) != 0;
  392. }
  393. }