1. /*
  2. * Copyright 1999-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*
  17. * $Id: CharInfo.java,v 1.11 2004/02/23 10:29:37 aruny Exp $
  18. */
  19. package com.sun.org.apache.xml.internal.serializer;
  20. import java.io.BufferedReader;
  21. import java.io.InputStream;
  22. import java.io.InputStreamReader;
  23. import java.io.UnsupportedEncodingException;
  24. import java.net.URL;
  25. import java.util.Hashtable;
  26. import java.util.PropertyResourceBundle;
  27. import java.util.Enumeration;
  28. import java.util.ResourceBundle;
  29. import javax.xml.transform.TransformerException;
  30. import com.sun.org.apache.xml.internal.res.XMLErrorResources;
  31. import com.sun.org.apache.xml.internal.res.XMLMessages;
  32. import com.sun.org.apache.xml.internal.utils.CharKey;
  33. import com.sun.org.apache.xml.internal.utils.SystemIDResolver;
  34. import com.sun.org.apache.xml.internal.utils.WrappedRuntimeException;
  35. /**
  36. * This class provides services that tell if a character should have
  37. * special treatement, such as entity reference substitution or normalization
  38. * of a newline character. It also provides character to entity reference
  39. * lookup.
  40. *
  41. * DEVELOPERS: See Known Issue in the constructor.
  42. *
  43. * @xsl.usage internal
  44. */
  45. class CharInfo
  46. {
  47. /** Lookup table for characters to entity references. */
  48. private Hashtable m_charToEntityRef = new Hashtable();
  49. /**
  50. * The name of the HTML entities file.
  51. * If specified, the file will be resource loaded with the default class loader.
  52. */
  53. public static String HTML_ENTITIES_RESOURCE = "com.sun.org.apache.xml.internal.serializer.HTMLEntities";
  54. /**
  55. * The name of the XML entities file.
  56. * If specified, the file will be resource loaded with the default class loader.
  57. */
  58. public static String XML_ENTITIES_RESOURCE = "com.sun.org.apache.xml.internal.serializer.XMLEntities";
  59. /** The horizontal tab character, which the parser should always normalize. */
  60. public static final char S_HORIZONAL_TAB = 0x09;
  61. /** The linefeed character, which the parser should always normalize. */
  62. public static final char S_LINEFEED = 0x0A;
  63. /** The carriage return character, which the parser should always normalize. */
  64. public static char S_CARRIAGERETURN = 0x0D;
  65. /** This flag is an optimization for HTML entities. It false if entities
  66. * other than quot (34), amp (38), lt (60) and gt (62) are defined
  67. * in the range 0 to 127.
  68. * @xsl.usage internal
  69. */
  70. final boolean onlyQuotAmpLtGt;
  71. /** Copy the first 0,1 ... ASCII_MAX values into an array */
  72. private static final int ASCII_MAX = 128;
  73. /** Array of values is faster access than a set of bits
  74. * to quickly check ASCII characters in attribute values.
  75. */
  76. private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
  77. /** Array of values is faster access than a set of bits
  78. * to quickly check ASCII characters in text nodes.
  79. */
  80. private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
  81. private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
  82. /** An array of bits to record if the character is in the set.
  83. * Although information in this array is complete, the
  84. * isSpecialAttrASCII array is used first because access to its values
  85. * is common and faster.
  86. */
  87. private int array_of_bits[] = createEmptySetOfIntegers(65535);
  88. // 5 for 32 bit words, 6 for 64 bit words ...
  89. /*
  90. * This constant is used to shift an integer to quickly
  91. * calculate which element its bit is stored in.
  92. * 5 for 32 bit words (int) , 6 for 64 bit words (long)
  93. */
  94. private static final int SHIFT_PER_WORD = 5;
  95. /*
  96. * A mask to get the low order bits which are used to
  97. * calculate the value of the bit within a given word,
  98. * that will represent the presence of the integer in the
  99. * set.
  100. *
  101. * 0x1F for 32 bit words (int),
  102. * or 0x3F for 64 bit words (long)
  103. */
  104. private static final int LOW_ORDER_BITMASK = 0x1f;
  105. /*
  106. * This is used for optimizing the lookup of bits representing
  107. * the integers in the set. It is the index of the first element
  108. * in the array array_of_bits[] that is not used.
  109. */
  110. private int firstWordNotUsed;
  111. /**
  112. * Constructor that reads in a resource file that describes the mapping of
  113. * characters to entity references.
  114. * This constructor is private, just to force the use
  115. * of the getCharInfo(entitiesResource) factory
  116. *
  117. * Resource files must be encoded in UTF-8 and can either be properties
  118. * files with a .properties extension assumed. Alternatively, they can
  119. * have the following form, with no particular extension assumed:
  120. *
  121. * <pre>
  122. * # First char # is a comment
  123. * Entity numericValue
  124. * quot 34
  125. * amp 38
  126. * </pre>
  127. *
  128. * @param entitiesResource Name of properties or resource file that should
  129. * be loaded, which describes that mapping of characters to entity
  130. * references.
  131. */
  132. private CharInfo(String entitiesResource, String method)
  133. {
  134. this(entitiesResource, method, false);
  135. }
  136. private CharInfo(String entitiesResource, String method, boolean internal)
  137. {
  138. ResourceBundle entities = null;
  139. boolean noExtraEntities = true;
  140. // Make various attempts to interpret the parameter as a properties
  141. // file or resource file, as follows:
  142. //
  143. // 1) attempt to load .properties file using ResourceBundle
  144. // 2) try using the class loader to find the specified file a resource
  145. // file
  146. // 3) try treating the resource a URI
  147. if (internal) {
  148. try {
  149. // Load entity property files by using PropertyResourceBundle,
  150. // cause of security issure for applets
  151. entities = PropertyResourceBundle.getBundle(entitiesResource);
  152. } catch (Exception e) {}
  153. }
  154. if (entities != null) {
  155. Enumeration keys = entities.getKeys();
  156. while (keys.hasMoreElements()){
  157. String name = (String) keys.nextElement();
  158. String value = entities.getString(name);
  159. int code = Integer.parseInt(value);
  160. defineEntity(name, (char) code);
  161. if (extraEntity(code))
  162. noExtraEntities = false;
  163. }
  164. set(S_LINEFEED);
  165. set(S_CARRIAGERETURN);
  166. } else {
  167. InputStream is = null;
  168. // Load user specified resource file by using URL loading, it
  169. // requires a valid URI as parameter
  170. try {
  171. if (internal) {
  172. is = CharInfo.class.getResourceAsStream(entitiesResource);
  173. } else {
  174. ClassLoader cl = ObjectFactory.findClassLoader();
  175. if (cl == null) {
  176. is = ClassLoader.getSystemResourceAsStream(entitiesResource);
  177. } else {
  178. is = cl.getResourceAsStream(entitiesResource);
  179. }
  180. if (is == null) {
  181. try {
  182. URL url = new URL(entitiesResource);
  183. is = url.openStream();
  184. } catch (Exception e) {}
  185. }
  186. }
  187. if (is == null) {
  188. throw new RuntimeException(
  189. XMLMessages.createXMLMessage(
  190. XMLErrorResources.ER_RESOURCE_COULD_NOT_FIND,
  191. new Object[] {entitiesResource, entitiesResource}));
  192. }
  193. // Fix Bugzilla#4000: force reading in UTF-8
  194. // This creates the de facto standard that Xalan's resource
  195. // files must be encoded in UTF-8. This should work in all
  196. // JVMs.
  197. //
  198. // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
  199. // didn't implement the UTF-8 encoding. Theoretically, we should
  200. // simply let it fail in that case, since the JVM is obviously
  201. // broken if it doesn't support such a basic standard. But
  202. // since there are still some users attempting to use VJ++ for
  203. // development, we have dropped in a fallback which makes a
  204. // second attempt using the platform's default encoding. In VJ++
  205. // this is apparently ASCII, which is subset of UTF-8... and
  206. // since the strings we'll be reading here are also primarily
  207. // limited to the 7-bit ASCII range (at least, in English
  208. // versions of Xalan), this should work well enough to keep us
  209. // on the air until we're ready to officially decommit from
  210. // VJ++.
  211. BufferedReader reader;
  212. try {
  213. reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
  214. } catch (UnsupportedEncodingException e) {
  215. reader = new BufferedReader(new InputStreamReader(is));
  216. }
  217. String line = reader.readLine();
  218. while (line != null) {
  219. if (line.length() == 0 || line.charAt(0) == '#') {
  220. line = reader.readLine();
  221. continue;
  222. }
  223. int index = line.indexOf(' ');
  224. if (index > 1) {
  225. String name = line.substring(0, index);
  226. ++index;
  227. if (index < line.length()) {
  228. String value = line.substring(index);
  229. index = value.indexOf(' ');
  230. if (index > 0) {
  231. value = value.substring(0, index);
  232. }
  233. int code = Integer.parseInt(value);
  234. defineEntity(name, (char) code);
  235. if (extraEntity(code))
  236. noExtraEntities = false;
  237. }
  238. }
  239. line = reader.readLine();
  240. }
  241. is.close();
  242. set(S_LINEFEED);
  243. set(S_CARRIAGERETURN);
  244. } catch (Exception e) {
  245. throw new RuntimeException(
  246. XMLMessages.createXMLMessage(
  247. XMLErrorResources.ER_RESOURCE_COULD_NOT_LOAD,
  248. new Object[] { entitiesResource,
  249. e.toString(),
  250. entitiesResource,
  251. e.toString()}));
  252. } finally {
  253. if (is != null) {
  254. try {
  255. is.close();
  256. } catch (Exception except) {}
  257. }
  258. }
  259. }
  260. /* initialize the array isCleanTextASCII[] with a cache of values
  261. * for use by ToStream.character(char[], int , int)
  262. * and the array isSpecialTextASCII[] with the opposite values
  263. * (all in the name of performance!)
  264. */
  265. for (int ch = 0; ch <ASCII_MAX; ch++)
  266. if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
  267. && (!get(ch))) || ('"' == ch))
  268. {
  269. isCleanTextASCII[ch] = true;
  270. isSpecialTextASCII[ch] = false;
  271. }
  272. else {
  273. isCleanTextASCII[ch] = false;
  274. isSpecialTextASCII[ch] = true;
  275. }
  276. /* Now that we've used get(ch) just above to initialize the
  277. * two arrays we will change by adding a tab to the set of
  278. * special chars for XML (but not HTML!).
  279. * We do this because a tab is always a
  280. * special character in an XML attribute,
  281. * but only a special character in XML text
  282. * if it has an entity defined for it.
  283. * This is the reason for this delay.
  284. */
  285. if (Method.XML.equals(method))
  286. {
  287. set(S_HORIZONAL_TAB);
  288. }
  289. onlyQuotAmpLtGt = noExtraEntities;
  290. // initialize the array with a cache of the BitSet values
  291. for (int i=0; i<ASCII_MAX; i++)
  292. isSpecialAttrASCII[i] = get(i);
  293. }
  294. /**
  295. * Defines a new character reference. The reference's name and value are
  296. * supplied. Nothing happens if the character reference is already defined.
  297. * <p>Unlike internal entities, character references are a string to single
  298. * character mapping. They are used to map non-ASCII characters both on
  299. * parsing and printing, primarily for HTML documents. '<amp;' is an
  300. * example of a character reference.</p>
  301. *
  302. * @param name The entity's name
  303. * @param value The entity's value
  304. */
  305. private void defineEntity(String name, char value)
  306. {
  307. CharKey character = new CharKey(value);
  308. m_charToEntityRef.put(character, name);
  309. set(value);
  310. }
  311. private CharKey m_charKey = new CharKey();
  312. /**
  313. * Resolve a character to an entity reference name.
  314. *
  315. * This is reusing a stored key object, in an effort to avoid
  316. * heap activity. Unfortunately, that introduces a threading risk.
  317. * Simplest fix for now is to make it a synchronized method, or to give
  318. * up the reuse; I see very little performance difference between them.
  319. * Long-term solution would be to replace the hashtable with a sparse array
  320. * keyed directly from the character's integer value; see DTM's
  321. * string pool for a related solution.
  322. *
  323. * @param value character value that should be resolved to a name.
  324. *
  325. * @return name of character entity, or null if not found.
  326. * @xsl.usage internal
  327. */
  328. synchronized public String getEntityNameForChar(char value)
  329. {
  330. // CharKey m_charKey = new CharKey(); //Alternative to synchronized
  331. m_charKey.setChar(value);
  332. return (String) m_charToEntityRef.get(m_charKey);
  333. }
  334. /**
  335. * Tell if the character argument that is from
  336. * an attribute value should have special treatment.
  337. *
  338. * @param value the value of a character that is in an attribute value
  339. * @return true if the character should have any special treatment,
  340. * such as when writing out attribute values,
  341. * or entity references.
  342. * @xsl.usage internal
  343. */
  344. public final boolean isSpecialAttrChar(int value)
  345. {
  346. // for performance try the values in the boolean array first,
  347. // this is faster access than the BitSet for common ASCII values
  348. if (value < ASCII_MAX)
  349. return isSpecialAttrASCII[value];
  350. // rather than java.util.BitSet, our private
  351. // implementation is faster (and less general).
  352. return get(value);
  353. }
  354. /**
  355. * Tell if the character argument that is from a
  356. * text node should have special treatment.
  357. *
  358. * @param value the value of a character that is in a text node
  359. * @return true if the character should have any special treatment,
  360. * such as when writing out attribute values,
  361. * or entity references.
  362. * @xsl.usage internal
  363. */
  364. public final boolean isSpecialTextChar(int value)
  365. {
  366. // for performance try the values in the boolean array first,
  367. // this is faster access than the BitSet for common ASCII values
  368. if (value < ASCII_MAX)
  369. return isSpecialTextASCII[value];
  370. // rather than java.util.BitSet, our private
  371. // implementation is faster (and less general).
  372. return get(value);
  373. }
  374. /**
  375. * This method is used to determine if an ASCII character in
  376. * a text node (not an attribute value) is "clean".
  377. * @param value the character to check (0 to 127).
  378. * @return true if the character can go to the writer as-is
  379. * @xsl.usage internal
  380. */
  381. public final boolean isTextASCIIClean(int value)
  382. {
  383. return isCleanTextASCII[value];
  384. }
  385. // In the future one might want to use the array directly and avoid
  386. // the method call, but I think the JIT alreay inlines this well enough
  387. // so don't do it (for now) - bjm
  388. // public final boolean[] getASCIIClean()
  389. // {
  390. // return isCleanTextASCII;
  391. // }
  392. /**
  393. * Factory that reads in a resource file that describes the mapping of
  394. * characters to entity references.
  395. *
  396. * Resource files must be encoded in UTF-8 and have a format like:
  397. * <pre>
  398. * # First char # is a comment
  399. * Entity numericValue
  400. * quot 34
  401. * amp 38
  402. * </pre>
  403. * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
  404. *
  405. * @param entitiesResource Name of entities resource file that should
  406. * be loaded, which describes that mapping of characters to entity references.
  407. * @param method the output method type, which should be one of "xml", "html", "text"...
  408. *
  409. * @xsl.usage internal
  410. */
  411. public static CharInfo getCharInfo(String entitiesFileName, String method)
  412. {
  413. CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
  414. if (charInfo != null) {
  415. return charInfo;
  416. }
  417. // try to load it internally - cache
  418. try {
  419. charInfo = new CharInfo(entitiesFileName, method, true);
  420. m_getCharInfoCache.put(entitiesFileName, charInfo);
  421. return charInfo;
  422. } catch (Exception e) {}
  423. // try to load it externally - do not cache
  424. try {
  425. return new CharInfo(entitiesFileName, method);
  426. } catch (Exception e) {}
  427. String absoluteEntitiesFileName;
  428. if (entitiesFileName.indexOf(':') < 0) {
  429. absoluteEntitiesFileName =
  430. SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName);
  431. } else {
  432. try {
  433. absoluteEntitiesFileName =
  434. SystemIDResolver.getAbsoluteURI(entitiesFileName, null);
  435. } catch (TransformerException te) {
  436. throw new WrappedRuntimeException(te);
  437. }
  438. }
  439. return new CharInfo(absoluteEntitiesFileName, method, false);
  440. }
  441. /** Table of user-specified char infos. */
  442. private static Hashtable m_getCharInfoCache = new Hashtable();
  443. /**
  444. * Returns the array element holding the bit value for the
  445. * given integer
  446. * @param i the integer that might be in the set of integers
  447. *
  448. */
  449. private static int arrayIndex(int i) {
  450. return (i >> SHIFT_PER_WORD);
  451. }
  452. /**
  453. * For a given integer in the set it returns the single bit
  454. * value used within a given word that represents whether
  455. * the integer is in the set or not.
  456. */
  457. private static int bit(int i) {
  458. int ret = (1 << (i & LOW_ORDER_BITMASK));
  459. return ret;
  460. }
  461. /**
  462. * Creates a new empty set of integers (characters)
  463. * @param max the maximum integer to be in the set.
  464. */
  465. private int[] createEmptySetOfIntegers(int max) {
  466. firstWordNotUsed = 0; // an optimization
  467. int[] arr = new int[arrayIndex(max - 1) + 1];
  468. return arr;
  469. }
  470. /**
  471. * Adds the integer (character) to the set of integers.
  472. * @param i the integer to add to the set, valid values are
  473. * 0, 1, 2 ... up to the maximum that was specified at
  474. * the creation of the set.
  475. */
  476. private final void set(int i) {
  477. int j = (i >> SHIFT_PER_WORD); // this word is used
  478. int k = j + 1;
  479. if(firstWordNotUsed < k) // for optimization purposes.
  480. firstWordNotUsed = k;
  481. array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
  482. }
  483. /**
  484. * Return true if the integer (character)is in the set of integers.
  485. *
  486. * This implementation uses an array of integers with 32 bits per
  487. * integer. If a bit is set to 1 the corresponding integer is
  488. * in the set of integers.
  489. *
  490. * @param i an integer that is tested to see if it is the
  491. * set of integers, or not.
  492. */
  493. private final boolean get(int i) {
  494. boolean in_the_set = false;
  495. int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
  496. // an optimization here, ... a quick test to see
  497. // if this integer is beyond any of the words in use
  498. if(j < firstWordNotUsed)
  499. in_the_set = (array_of_bits[j] &
  500. (1 << (i & LOW_ORDER_BITMASK))
  501. ) != 0; // 0L for 64 bit words
  502. return in_the_set;
  503. }
  504. // record if there are any entities other than
  505. // quot, amp, lt, gt (probably user defined)
  506. /**
  507. * @return true if the entity
  508. * @param code The value of the character that has an entity defined
  509. * for it.
  510. */
  511. private boolean extraEntity(int entityValue)
  512. {
  513. boolean extra = false;
  514. if (entityValue < 128)
  515. {
  516. switch (entityValue)
  517. {
  518. case 34 : // quot
  519. case 38 : // amp
  520. case 60 : // lt
  521. case 62 : // gt
  522. break;
  523. default : // other entity in range 0 to 127
  524. extra = true;
  525. }
  526. }
  527. return extra;
  528. }
  529. }