1. /*
  2. * $Id: Resolver.java,v 1.1.1.1 2000/11/23 01:53:33 edwingo Exp $
  3. *
  4. * The Apache Software License, Version 1.1
  5. *
  6. *
  7. * Copyright (c) 2000 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution,
  23. * if any, must include the following acknowledgment:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowledgment may appear in the software itself,
  27. * if and wherever such third-party acknowledgments normally appear.
  28. *
  29. * 4. The names "Crimson" and "Apache Software Foundation" must
  30. * not be used to endorse or promote products derived from this
  31. * software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache",
  35. * nor may "Apache" appear in their name, without prior written
  36. * permission of the Apache Software Foundation.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation and was
  54. * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
  55. * http://www.sun.com. For more information on the Apache Software
  56. * Foundation, please see <http://www.apache.org/>.
  57. */
  58. package org.apache.crimson.parser;
  59. import java.io.File;
  60. import java.io.FileInputStream;
  61. import java.io.InputStream;
  62. import java.io.IOException;
  63. import java.net.URL;
  64. import java.net.URLConnection;
  65. import java.net.HttpURLConnection;
  66. import java.util.Hashtable;
  67. import org.xml.sax.*;
  68. /**
  69. * This entity resolver class provides a number of utilities which can help
  70. * management of external parsed entities in XML. These are commonly used
  71. * to hold markup declarations that are to be used as part of a Document
  72. * Type Declaration (DTD), or to hold text marked up with XML.
  73. *
  74. * <P> Features include: <UL>
  75. *
  76. * <LI> Static factory methods are provided for constructing SAX InputSource
  77. * objects from Files, URLs, or MIME objects. This eliminates a class of
  78. * error-prone coding in applications.
  79. *
  80. * <LI> Character encodings for XML documents are correctly supported: <UL>
  81. *
  82. * <LI> The encodings defined in the RFCs for MIME content types
  83. * (2046 for general MIME, and 2376 for XML in particular), are
  84. * supported, handling <em>charset=...</em> attributes and accepting
  85. * content types which are known to be safe for use with XML;
  86. *
  87. * <LI> The character encoding autodetection algorithm identified
  88. * in the XML specification is used, and leverages all of
  89. * the JDK 1.1 (and later) character encoding support.
  90. *
  91. * <LI> The use of MIME typing may optionally be disabled, forcing the
  92. * use of autodetection, to support web servers which don't correctly
  93. * report MIME types for XML. For example, they may report text that
  94. * is encoded in EUC-JP as being US-ASCII text, leading to fatal
  95. * errors during parsing.
  96. *
  97. * <LI> The InputSource objects returned by this class always
  98. * have a <code>java.io.Reader</code> available as the "character
  99. * stream" property.
  100. *
  101. * </UL>
  102. *
  103. * <LI> Catalog entries can map public identifiers to Java resources or
  104. * to local URLs. These are used to reduce network dependencies and loads,
  105. * and will often be used for external DTD components. For example, packages
  106. * shipping DTD files as resources in JAR files can eliminate network traffic
  107. * when accessing them, and sites may provide local caches of common DTDs.
  108. * Note that no particular catalog syntax is supported by this class, only
  109. * the notion of a set of entries.
  110. *
  111. * </UL>
  112. *
  113. * <P> Subclasses can perform tasks such as supporting new URI schemes for
  114. * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
  115. * MIME entities which are part of a <em>multipart/related</em> group
  116. * (see RFC 2387). They may also be used to support particular catalog
  117. * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
  118. * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
  119. * Public Identifiers" (FPIs).
  120. *
  121. * @author David Brownell
  122. * @author Rajiv Mordani
  123. * @version $Revision: 1.1.1.1 $
  124. */
  125. public class Resolver implements EntityResolver
  126. {
  127. private boolean ignoringMIME;
  128. // table mapping public IDs to (local) URIs
  129. private Hashtable id2uri;
  130. // tables mapping public IDs to resources and classloaders
  131. private Hashtable id2resource;
  132. private Hashtable id2loader;
  133. //
  134. // table of MIME content types (less attributes!) known
  135. // to be mostly "OK" to use with XML MIME entities. the
  136. // idea is to rule out obvious braindamage ("image/jpg")
  137. // not the subtle stuff ("text/html") that might actually
  138. // be (or become) safe.
  139. //
  140. private static final String types [] = {
  141. "application/xml",
  142. "text/xml",
  143. "text/plain",
  144. "text/html", // commonly mis-inferred
  145. "application/x-netcdf", // this is often illegal XML
  146. "content/unknown"
  147. };
  148. /** Constructs a resolver. */
  149. public Resolver () { }
  150. /**
  151. * Returns an input source, using the MIME type information and URL
  152. * scheme to statically determine the correct character encoding if
  153. * possible and otherwise autodetecting it. MIME carefully specifies
  154. * the character encoding defaults, and how attributes of the content
  155. * type can change it. XML further specifies two mandatory encodings
  156. * (UTF-8 and UTF-16), and includes an XML declaration which can be
  157. * used to internally label most documents encoded using US-ASCII
  158. * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
  159. * more).
  160. *
  161. * <P> This method can be used to access XML documents which do not
  162. * have URIs (such as servlet input streams, or most JavaMail message
  163. * entities) and to support access methods such as HTTP POST or PUT.
  164. * (URLs normally return content using the GET method.)
  165. *
  166. * <P> <em> The caller should set the system ID in order for relative URIs
  167. * found in this document to be interpreted correctly.</em> In some cases,
  168. * a custom resolver will need to be used; for example, documents
  169. * may be grouped in a single MIME "multipart/related" bundle, and
  170. * relative URLs would refer to other documents in that bundle.
  171. *
  172. * @param contentType The MIME content type for the source for which
  173. * an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
  174. * @param stream The input byte stream for the input source.
  175. * @param checkType If true, this verifies that the content type is known
  176. * to support XML documents, such as <em>application/xml</em>.
  177. * @param scheme Unless this is "file", unspecified MIME types
  178. * default to US-ASCII. Files are always autodetected since most
  179. * file systems discard character encoding information.
  180. */
  181. public static InputSource createInputSource (
  182. String contentType,
  183. InputStream stream,
  184. boolean checkType,
  185. String scheme
  186. ) throws IOException
  187. {
  188. InputSource retval;
  189. String charset = null;
  190. if (contentType != null) {
  191. int index;
  192. contentType = contentType.toLowerCase ();
  193. index = contentType.indexOf (';');
  194. if (index != -1) {
  195. String attributes;
  196. attributes = contentType.substring (index + 1);
  197. contentType = contentType.substring (0, index);
  198. // use "charset=..." if it's available
  199. index = attributes.indexOf ("charset");
  200. if (index != -1) {
  201. attributes = attributes.substring (index + 7);
  202. // strip out subsequent attributes
  203. if ((index = attributes.indexOf (';')) != -1)
  204. attributes = attributes.substring (0, index);
  205. // find start of value
  206. if ((index = attributes.indexOf ('=')) != -1) {
  207. attributes = attributes.substring (index + 1);
  208. // strip out rfc822 comments
  209. if ((index = attributes.indexOf ('(')) != -1)
  210. attributes = attributes.substring (0, index);
  211. // double quotes are optional
  212. if ((index = attributes.indexOf ('"')) != -1) {
  213. attributes = attributes.substring (index + 1);
  214. attributes = attributes.substring (0,
  215. attributes.indexOf ('"'));
  216. }
  217. charset = attributes.trim ();
  218. // XXX "\;", "\)" etc were mishandled above
  219. }
  220. }
  221. }
  222. //
  223. // Check MIME type.
  224. //
  225. if (checkType) {
  226. boolean isOK = false;
  227. for (int i = 0; i < types.length; i++)
  228. if (types [i].equals (contentType)) {
  229. isOK = true;
  230. break;
  231. }
  232. if (!isOK)
  233. throw new IOException ("Not XML: " + contentType);
  234. }
  235. //
  236. // "text/*" MIME types have hard-wired character set
  237. // defaults, as specified in the RFCs. For XML, we
  238. // ignore the system "file.encoding" property since
  239. // autodetection is more correct.
  240. //
  241. if (charset == null) {
  242. contentType = contentType.trim ();
  243. if (contentType.startsWith ("text/")) {
  244. if (!"file".equalsIgnoreCase (scheme))
  245. charset = "US-ASCII";
  246. }
  247. // "application/*" has no default
  248. }
  249. }
  250. retval = new InputSource (XmlReader.createReader (stream, charset));
  251. retval.setByteStream (stream);
  252. retval.setEncoding (charset);
  253. return retval;
  254. }
  255. /**
  256. * Creates an input source from a given URI.
  257. *
  258. * @param uri the URI (system ID) for the entity
  259. * @param checkType if true, the MIME content type for the entity
  260. * is checked for document type and character set encoding.
  261. */
  262. static public InputSource createInputSource (URL uri, boolean checkType)
  263. throws IOException
  264. {
  265. URLConnection conn = uri.openConnection ();
  266. if (conn instanceof HttpURLConnection) {
  267. int status = ((HttpURLConnection)conn).getResponseCode ();
  268. if ((status >= 400 && status <= 417) ||
  269. (status >=500 && status <=505))
  270. {
  271. throw new IOException ("Error in opening uri " + uri +
  272. "status code=" + status);
  273. }
  274. }
  275. InputSource retval;
  276. if (checkType) {
  277. String contentType = conn.getContentType ();
  278. retval = createInputSource (contentType, conn.getInputStream (),
  279. false, uri.getProtocol ());
  280. } else {
  281. retval = new InputSource (
  282. XmlReader.createReader (conn.getInputStream ()));
  283. }
  284. retval.setSystemId (conn.getURL ().toString ());
  285. return retval;
  286. }
  287. /**
  288. * Creates an input source from a given file, autodetecting
  289. * the character encoding.
  290. *
  291. * @param uri the URI (system ID) for the entity
  292. */
  293. static public InputSource createInputSource (File file)
  294. throws IOException
  295. {
  296. InputSource retval;
  297. String path;
  298. retval = new InputSource (
  299. XmlReader.createReader (new FileInputStream (file)));
  300. // On JDK 1.2 and later, simplify this:
  301. // "path = file.toURL ().toString ()".
  302. path = file.getAbsolutePath ();
  303. if (File.separatorChar != '/')
  304. path = path.replace (File.separatorChar, '/');
  305. if (!path.startsWith ("/"))
  306. path = "/" + path;
  307. if (!path.endsWith ("/") && file.isDirectory ())
  308. path = path + "/";
  309. retval.setSystemId ("file:" + path);
  310. return retval;
  311. }
  312. /**
  313. * <b>SAX:</b>
  314. * Resolve the given entity into an input source. If the name can't
  315. * be mapped to a preferred form of the entity, the URI is used. To
  316. * resolve the entity, first a local catalog mapping names to URIs is
  317. * consulted. If no mapping is found there, a catalog mapping names
  318. * to java resources is consulted. Finally, if neither mapping found
  319. * a copy of the entity, the specified URI is used.
  320. *
  321. * <P> When a URI is used, <a href="#createInputSource">
  322. * createInputSource</a> is used to correctly deduce the character
  323. * encoding used by this entity. No MIME type checking is done.
  324. *
  325. * @param name Used to find alternate copies of the entity, when
  326. * this value is non-null; this is the XML "public ID".
  327. * @param uri Used when no alternate copy of the entity is found;
  328. * this is the XML "system ID", normally a URI.
  329. */
  330. public InputSource resolveEntity (String name, String uri)
  331. throws IOException, SAXException
  332. {
  333. InputSource retval;
  334. String mappedURI = name2uri (name);
  335. InputStream stream;
  336. // prefer explicit URI mappings, then bundled resources...
  337. if (mappedURI == null && (stream = mapResource (name)) != null) {
  338. uri = "java:resource:" + (String) id2resource.get (name);
  339. retval = new InputSource (XmlReader.createReader (stream));
  340. // ...and treat all URIs the same (as URLs for now).
  341. } else {
  342. URL url;
  343. URLConnection conn;
  344. if (mappedURI != null)
  345. uri = mappedURI;
  346. else if (uri == null)
  347. return null;
  348. url = new URL (uri);
  349. conn = url.openConnection ();
  350. uri = conn.getURL ().toString ();
  351. // System.out.println ("++ URI: " + url);
  352. if (ignoringMIME)
  353. retval = new InputSource (
  354. XmlReader.createReader (conn.getInputStream ()));
  355. else {
  356. String contentType = conn.getContentType ();
  357. retval = createInputSource (contentType,
  358. conn.getInputStream (),
  359. false, url.getProtocol ());
  360. }
  361. }
  362. retval.setSystemId (uri);
  363. retval.setPublicId (name);
  364. return retval;
  365. }
  366. /**
  367. * Returns true if this resolver is ignoring MIME types in the documents
  368. * it returns, to work around bugs in how servers have reported the
  369. * documents' MIME types.
  370. */
  371. public boolean isIgnoringMIME ()
  372. { return ignoringMIME; }
  373. /**
  374. * Tells the resolver whether to ignore MIME types in the documents it
  375. * retrieves. Many web servers incorrectly assign text documents a
  376. * default character encoding, even when that is incorrect. For example,
  377. * all HTTP text documents default to use ISO-8859-1 (used for Western
  378. * European languages), and other MIME sources default text documents
  379. * to use US-ASCII (a seven bit encoding). For XML documents which
  380. * include text encoding declarations (as most should do), these server
  381. * bugs can be worked around by ignoring the MIME type entirely.
  382. */
  383. public void setIgnoringMIME (boolean value)
  384. { ignoringMIME = value; }
  385. // maps the public ID to an alternate URI, if one is registered
  386. private String name2uri (String publicId)
  387. {
  388. if (publicId == null || id2uri == null)
  389. return null;
  390. return (String) id2uri.get (publicId);
  391. }
  392. /**
  393. * Registers the given public ID as corresponding to a particular
  394. * URI, typically a local copy. This URI will be used in preference
  395. * to ones provided as system IDs in XML entity declarations. This
  396. * mechanism would most typically be used for Document Type Definitions
  397. * (DTDs), where the public IDs are formally managed and versioned.
  398. *
  399. * @param publicId The managed public ID being mapped
  400. * @param uri The URI of the preferred copy of that entity
  401. */
  402. public void registerCatalogEntry (
  403. String publicId,
  404. String uri
  405. )
  406. {
  407. if (id2uri == null)
  408. id2uri = new Hashtable (17);
  409. id2uri.put (publicId, uri);
  410. }
  411. // return the resource as a stream
  412. private InputStream mapResource (String publicId)
  413. {
  414. // System.out.println ("++ PUBLIC: " + publicId);
  415. if (publicId == null || id2resource == null)
  416. return null;
  417. String resourceName = (String) id2resource.get (publicId);
  418. ClassLoader loader = null;
  419. if (resourceName == null)
  420. return null;
  421. // System.out.println ("++ Resource: " + resourceName);
  422. if (id2loader != null)
  423. loader = (ClassLoader) id2loader.get (publicId);
  424. // System.out.println ("++ Loader: " + loader);
  425. if (loader == null)
  426. return ClassLoader.getSystemResourceAsStream (resourceName);
  427. return loader.getResourceAsStream (resourceName);
  428. }
  429. /**
  430. * Registers a given public ID as corresponding to a particular Java
  431. * resource in a given class loader, typically distributed with a
  432. * software package. This resource will be preferred over system IDs
  433. * included in XML documents. This mechanism should most typically be
  434. * used for Document Type Definitions (DTDs), where the public IDs are
  435. * formally managed and versioned.
  436. *
  437. * <P> If a mapping to a URI has been provided, that mapping takes
  438. * precedence over this one.
  439. *
  440. * @param publicId The managed public ID being mapped
  441. * @param resourceName The name of the Java resource
  442. * @param loader The class loader holding the resource, or null if
  443. * it is a system resource.
  444. */
  445. public void registerCatalogEntry (
  446. String publicId,
  447. String resourceName,
  448. ClassLoader loader
  449. )
  450. {
  451. if (id2resource == null)
  452. id2resource = new Hashtable (17);
  453. id2resource.put (publicId, resourceName);
  454. if (loader != null) {
  455. if (id2loader == null)
  456. id2loader = new Hashtable (17);
  457. id2loader.put (publicId, loader);
  458. }
  459. }
  460. }