1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xalan" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, Lotus
  53. * Development Corporation., http://www.lotus.com. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package org.apache.xalan.serialize;
  58. import java.util.BitSet;
  59. import java.io.InputStream;
  60. import java.io.InputStreamReader;
  61. import java.io.BufferedReader;
  62. import java.net.*;
  63. import java.util.Hashtable;
  64. import org.apache.xml.utils.CharKey;
  65. import org.apache.xalan.res.XSLMessages;
  66. import org.apache.xalan.res.XSLTErrorResources;
  67. /**
  68. * This class provides services that tell if a character should have
  69. * special treatement, such as entity reference substitution or normalization
  70. * of a newline character. It also provides character to entity reference
  71. * lookup.
  72. *
  73. * DEVELOPERS: See Known Issue in the constructor.
  74. */
  75. public class CharInfo
  76. {
  77. /** Bit map that tells if a given character should have special treatment. */
  78. BitSet m_specialsMap = new BitSet(65535);
  79. /** Lookup table for characters to entity references. */
  80. private Hashtable m_charToEntityRef = new Hashtable();
  81. /**
  82. * The name of the HTML entities file.
  83. * If specified, the file will be resource loaded with the default class loader.
  84. */
  85. public static String HTML_ENTITIES_RESOURCE = "HTMLEntities.res";
  86. /**
  87. * The name of the XML entities file.
  88. * If specified, the file will be resource loaded with the default class loader.
  89. */
  90. public static String XML_ENTITIES_RESOURCE = "XMLEntities.res";
  91. /** The linefeed character, which the parser should always normalize. */
  92. public static char S_LINEFEED = 0x0A;
  93. /** The carriage return character, which the parser should always normalize. */
  94. public static char S_CARRIAGERETURN = 0x0D;
  95. /** a zero length Class array used in the constructor */
  96. private static final Class[] NO_CLASSES = new Class[0];
  97. /** a zero length Object array used in the constructor */
  98. private static final Object[] NO_OBJS = new Object[0];
  99. /**
  100. * Constructor that reads in a resource file that describes the mapping of
  101. * characters to entity references.
  102. *
  103. * Resource files must be encoded in UTF-8 and have a format like:
  104. * <pre>
  105. * # First char # is a comment
  106. * Entity numericValue
  107. * quot 34
  108. * amp 38
  109. * </pre>
  110. * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
  111. *
  112. * @param entitiesResource Name of entities resource file that should
  113. * be loaded, which describes that mapping of characters to entity references.
  114. */
  115. public CharInfo(String entitiesResource)
  116. {
  117. InputStream is = null;
  118. BufferedReader reader = null;
  119. int index;
  120. String name;
  121. String value;
  122. int code;
  123. String line;
  124. try
  125. {
  126. try
  127. {
  128. // Maintenance note: we should evaluate replacing getting the
  129. // ClassLoader with javax.xml.transform.FactoryFinder.findClassLoader()
  130. // or similar code
  131. ClassLoader cl = CharInfo.class.getClassLoader();
  132. if (cl == null) {
  133. is = ClassLoader.getSystemResourceAsStream(entitiesResource);
  134. } else {
  135. is = cl.getResourceAsStream(entitiesResource);
  136. }
  137. }
  138. catch (Exception e) {}
  139. if (is == null)
  140. is = CharInfo.class.getResourceAsStream(entitiesResource);
  141. if (is == null)
  142. {
  143. URL url = new URL(entitiesResource);
  144. is = url.openStream();
  145. }
  146. if (is == null)
  147. throw new RuntimeException(XSLMessages.createMessage(XSLTErrorResources.ER_RESOURCE_COULD_NOT_FIND, new Object[]{entitiesResource, entitiesResource }));
  148. // Fix Bugzilla#4000: force reading in UTF-8
  149. // This creates the de facto standard that Xalan's resource
  150. // files must be encoded in UTF-8. This should work in all JVMs.
  151. //
  152. // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
  153. // didn't implement the UTF-8 encoding. Theoretically, we should
  154. // simply let it fail in that case, since the JVM is obviously
  155. // broken if it doesn't support such a basic standard. But
  156. // since there are still some users attempting to use VJ++ for
  157. // development, we have dropped in a fallback which makes a
  158. // second attempt using the platform's default encoding. In VJ++
  159. // this is apparently ASCII, which is subset of UTF-8... and
  160. // since the strings we'll be reading here are also primarily
  161. // limited to the 7-bit ASCII range (at least, in English
  162. // versions of Xalan), this should work well enough to keep us
  163. // on the air until we're ready to officially decommit from
  164. // VJ++.
  165. try
  166. {
  167. reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
  168. }
  169. catch(java.io.UnsupportedEncodingException e)
  170. {
  171. reader = new BufferedReader(new InputStreamReader(is));
  172. }
  173. line = reader.readLine();
  174. while (line != null)
  175. {
  176. if (line.length() == 0 || line.charAt(0) == '#')
  177. {
  178. line = reader.readLine();
  179. continue;
  180. }
  181. index = line.indexOf(' ');
  182. if (index > 1)
  183. {
  184. name = line.substring(0, index);
  185. ++index;
  186. if (index < line.length())
  187. {
  188. value = line.substring(index);
  189. index = value.indexOf(' ');
  190. if (index > 0)
  191. value = value.substring(0, index);
  192. code = Integer.parseInt(value);
  193. defineEntity(name, (char) code);
  194. }
  195. }
  196. line = reader.readLine();
  197. }
  198. is.close();
  199. m_specialsMap.set(S_LINEFEED);
  200. m_specialsMap.set(S_CARRIAGERETURN);
  201. }
  202. catch (Exception except)
  203. {
  204. throw new RuntimeException(XSLMessages.createMessage(XSLTErrorResources.ER_RESOURCE_COULD_NOT_LOAD, new Object[]{entitiesResource, except.toString(), entitiesResource, except.toString() }));
  205. }
  206. finally
  207. {
  208. if (is != null)
  209. {
  210. try
  211. {
  212. is.close();
  213. }
  214. catch (Exception except){}
  215. }
  216. }
  217. }
  218. /**
  219. * Defines a new character reference. The reference's name and value are
  220. * supplied. Nothing happens if the character reference is already defined.
  221. * <p>Unlike internal entities, character references are a string to single
  222. * character mapping. They are used to map non-ASCII characters both on
  223. * parsing and printing, primarily for HTML documents. '<amp;' is an
  224. * example of a character reference.</p>
  225. *
  226. * @param name The entity's name
  227. * @param value The entity's value
  228. */
  229. protected void defineEntity(String name, char value)
  230. {
  231. CharKey character = new CharKey(value);
  232. m_charToEntityRef.put(character, name);
  233. m_specialsMap.set(value);
  234. }
  235. private CharKey m_charKey = new CharKey();
  236. /**
  237. * Resolve a character to an entity reference name.
  238. *
  239. * This is reusing a stored key object, in an effort to avoid
  240. * heap activity. Unfortunately, that introduces a threading risk.
  241. * Simplest fix for now is to make it a synchronized method, or to give
  242. * up the reuse; I see very little performance difference between them.
  243. * Long-term solution would be to replace the hashtable with a sparse array
  244. * keyed directly from the character's integer value; see DTM's
  245. * string pool for a related solution.
  246. *
  247. * @param value character value that should be resolved to a name.
  248. *
  249. * @return name of character entity, or null if not found.
  250. */
  251. synchronized
  252. public String getEntityNameForChar(char value)
  253. {
  254. // CharKey m_charKey = new CharKey(); //Alternative to synchronized
  255. m_charKey.setChar(value);
  256. return (String) m_charToEntityRef.get(m_charKey);
  257. }
  258. /**
  259. * Tell if the character argument should have special treatment.
  260. *
  261. * @param value character value.
  262. *
  263. * @return true if the character should have any special treatment.
  264. */
  265. public boolean isSpecial(char value)
  266. {
  267. return m_specialsMap.get(value);
  268. }
  269. }