// Resolver

/*
 * $Id: Resolver.java,v 1.1.1.1 2000/11/23 01:53:33 edwingo Exp $
 *
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights 
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:  
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Crimson" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written 
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, Sun Microsystems, Inc., 
 * http://www.sun.com.  For more information on the Apache Software 
 * Foundation, please see <http://www.apache.org/>.
 */


package org.apache.crimson.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.util.Hashtable;

import org.xml.sax.*;


/**
 * This entity resolver class provides a number of utilities which can help
 * management of external parsed entities in XML.  These are commonly used
 * to hold markup declarations that are to be used as part of a Document
 * Type Declaration (DTD), or to hold text marked up with XML.
 *
 * <P> Features include: <UL>
 *
 * <LI> Static factory methods are provided for constructing SAX InputSource
 * objects from Files, URLs, or MIME objects.  This eliminates a class of
 * error-prone coding in applications.
 *
 * <LI> Character encodings for XML documents are correctly supported: <UL>
 *
 *	<LI> The encodings defined in the RFCs for MIME content types
 *	(2046 for general MIME, and 2376 for XML in particular), are
 *	supported, handling <em>charset=...</em> attributes and accepting
 *	content types which are known to be safe for use with XML;
 *
 *	<LI> The character encoding autodetection algorithm identified
 *	in the XML specification is used, and leverages all of
 *	the JDK 1.1 (and later) character encoding support.
 *
 *	<LI> The use of MIME typing may optionally be disabled, forcing the
 *	use of autodetection, to support web servers which don't correctly
 *	report MIME types for XML.  For example, they may report text that
 *	is encoded in EUC-JP as being US-ASCII text, leading to fatal
 *	errors during parsing.
 *
 *	<LI> The InputSource objects returned by this class always
 *	have a <code>java.io.Reader</code> available as the "character
 *	stream" property.
 *
 *	</UL>
 *
 * <LI> Catalog entries can map public identifiers to Java resources or
 * to local URLs.  These are used to reduce network dependencies and loads,
 * and will often be used for external DTD components.  For example, packages
 * shipping DTD files as resources in JAR files can eliminate network traffic
 * when accessing them, and sites may provide local caches of common DTDs.
 * Note that no particular catalog syntax is supported by this class, only
 * the notion of a set of entries.
 *
 * </UL>
 *
 * <P> Subclasses can perform tasks such as supporting new URI schemes for
 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
 * MIME entities which are part of a <em>multipart/related</em> group
 * (see RFC 2387).  They may also be used to support particular catalog
 * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
 * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
 * Public Identifiers" (FPIs).
 *
 * @author David Brownell
 * @author Rajiv Mordani
 * @version $Revision: 1.1.1.1 $
 */
public class Resolver implements EntityResolver
{
    // when true, resolveEntity() ignores the MIME content type reported
    // for an entity and always autodetects the character encoding
    private boolean             ignoringMIME;

    // table mapping public IDs to (local) URIs; created on first use
    private Hashtable           id2uri;

    // tables mapping public IDs to resources and classloaders;
    // both are created on first use
    private Hashtable           id2resource;
    private Hashtable           id2loader;

    //
    // table of MIME content types (less attributes!) known
    // to be mostly "OK" to use with XML MIME entities.  the
    // idea is to rule out obvious braindamage ("image/jpg")
    // not the subtle stuff ("text/html") that might actually
    // be (or become) safe.
    //
    private static final String types [] = {
        "application/xml",
        "text/xml",
        "text/plain",
        "text/html",                    // commonly mis-inferred
        "application/x-netcdf",         // this is often illegal XML
        "content/unknown"
    };

    /** Constructs a resolver with no catalog entries. */
    public Resolver () { }

    /**
     * Returns an input source, using the MIME type information and URL
     * scheme to statically determine the correct character encoding if
     * possible and otherwise autodetecting it.  MIME carefully specifies
     * the character encoding defaults, and how attributes of the content
     * type can change it.  XML further specifies two mandatory encodings
     * (UTF-8 and UTF-16), and includes an XML declaration which can be
     * used to internally label most documents encoded using US-ASCII
     * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
     * more).
     *
     * <P> This method can be used to access XML documents which do not
     * have URIs (such as servlet input streams, or most JavaMail message
     * entities) and to support access methods such as HTTP POST or PUT.
     * (URLs normally return content using the GET method.)
     *
     * <P> <em> The caller should set the system ID in order for relative URIs
     * found in this document to be interpreted correctly.</em> In some cases,
     * a custom resolver will need to be used; for example, documents
     * may be grouped in a single MIME "multipart/related" bundle, and
     * relative URLs would refer to other documents in that bundle.
     *
     * <P> The content type is matched case-insensitively and surrounding
     * white space is ignored.  A <em>charset</em> attribute whose value is
     * empty is treated as if it had not been given at all.
     *
     * @param contentType The MIME content type for the source for which
     *  an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
     *  May be null, in which case no type check is made and no encoding
     *  is assumed.
     * @param stream The input byte stream for the input source.
     * @param checkType If true, this verifies that the content type is known
     *  to support XML documents, such as <em>application/xml</em>.
     * @param scheme Unless this is "file", unspecified MIME types
     *  default to US-ASCII.  Files are always autodetected since most
     *  file systems discard character encoding information.
     * @return an input source whose character stream, byte stream and
     *  encoding properties have been set; the encoding is null when no
     *  character set could be determined from the content type
     * @throws IOException if <em>checkType</em> is true and the content
     *  type is not one of the known types, or if the reader for the
     *  stream cannot be created
     */
    public static InputSource createInputSource (
        String          contentType,
        InputStream     stream,
        boolean         checkType,
        String          scheme
    ) throws IOException
    {
        InputSource     retval;
        String          charset = null;

        if (contentType != null) {
            int         index;

            // MIME type and attribute names are case-insensitive.  A fixed
            // locale is used so the result does not depend on the default
            // locale (e.g. Turkish lowercases "I" to a dotless "i").
            contentType = contentType.toLowerCase (java.util.Locale.ENGLISH);

            // split "type/subtype" from its attributes
            index = contentType.indexOf (';');
            if (index != -1) {
                // use "charset=..." if it's available
                charset = parseCharset (contentType.substring (index + 1));
                contentType = contentType.substring (0, index);
            }

            // trim before matching so "text/xml ; charset=..." is accepted
            contentType = contentType.trim ();

            //
            // Check MIME type.
            //
            if (checkType && !isKnownType (contentType))
                throw new IOException ("Not XML: " + contentType);

            //
            // "text/*" MIME types have hard-wired character set
            // defaults, as specified in the RFCs.  For XML, we
            // ignore the system "file.encoding" property since
            // autodetection is more correct.
            // "application/*" has no default.
            //
            if (charset == null
                    && contentType.startsWith ("text/")
                    && !"file".equalsIgnoreCase (scheme))
                charset = "US-ASCII";
        }

        retval = new InputSource (XmlReader.createReader (stream, charset));
        retval.setByteStream (stream);
        retval.setEncoding (charset);
        return retval;
    }

    // Extracts the value of the "charset" attribute from the (already
    // lowercased) attribute part of a MIME content type.  Returns null
    // when there is no charset attribute or its value is empty.
    private static String parseCharset (String attributes)
    {
        int     index = attributes.indexOf ("charset");
        String  value;

        if (index == -1)
            return null;
        value = attributes.substring (index + 7);

        // strip out subsequent attributes
        if ((index = value.indexOf (';')) != -1)
            value = value.substring (0, index);

        // find start of value
        if ((index = value.indexOf ('=')) == -1)
            return null;
        value = value.substring (index + 1);

        // strip out rfc822 comments
        if ((index = value.indexOf ('(')) != -1)
            value = value.substring (0, index);

        // double quotes are optional; a missing closing quote is
        // tolerated rather than failing on a negative index
        if ((index = value.indexOf ('"')) != -1) {
            value = value.substring (index + 1);
            if ((index = value.indexOf ('"')) != -1)
                value = value.substring (0, index);
        }

        // XXX "\;", "\)" etc are mishandled above
        value = value.trim ();
        return (value.length () == 0) ? null : value;
    }

    // Tells whether the given (lowercased, trimmed, attribute-free)
    // content type appears in the table of types accepted for XML.
    private static boolean isKnownType (String contentType)
    {
        for (int i = 0; i < types.length; i++)
            if (types [i].equals (contentType))
                return true;
        return false;
    }


    /**
     * Creates an input source from a given URI.  For HTTP connections,
     * a response status of 400 or above is reported as an error before
     * any content is read.  The system ID of the returned input source
     * is set from the URL reported by the connection.
     *
     * @param uri the URI (system ID) for the entity
     * @param checkType if true, the MIME content type reported for the
     *  entity is used to determine the character encoding; if false,
     *  the content type is ignored and the encoding is autodetected.
     *  Note that the content type is not verified against the list of
     *  types known to hold XML in either case.
     * @return an input source for the entity, with its system ID set
     * @throws IOException if the connection cannot be opened or read, or
     *  if an HTTP connection reports an error status
     */
    static public InputSource createInputSource (URL uri, boolean checkType)
    throws IOException
    {
        URLConnection   conn = uri.openConnection ();
        InputSource     retval;

        if (conn instanceof HttpURLConnection) {
            int status = ((HttpURLConnection) conn).getResponseCode ();

            // all 4xx (client) and 5xx (server) codes are failures
            if (status >= 400)
                throw new IOException ("Error in opening uri " + uri
                        + " status code=" + status);
        }

        if (checkType) {
            String      contentType = conn.getContentType ();

            // the type check itself is deliberately left off here; only
            // the charset information in the content type is used
            retval = createInputSource (contentType, conn.getInputStream (),
                    false, uri.getProtocol ());
        } else {
            retval = new InputSource (
                    XmlReader.createReader (conn.getInputStream ()));
        }
        retval.setSystemId (conn.getURL ().toString ());
        return retval;
    }


    /**
     * Creates an input source from a given file, autodetecting
     * the character encoding.  The system ID of the returned input
     * source is a "file:" URI built from the file's absolute path.
     *
     * @param file the file holding the entity
     * @return an input source reading from that file
     * @throws IOException if the file cannot be opened, or if the reader
     *  for it cannot be created (in which case the file is closed again)
     */
    static public InputSource createInputSource (File file)
    throws IOException
    {
        InputSource     retval;
        String          path;
        FileInputStream in = new FileInputStream (file);
        boolean         created = false;

        // don't leak the file descriptor when no reader can be created
        try {
            retval = new InputSource (XmlReader.createReader (in));
            created = true;
        } finally {
            if (!created) {
                try {
                    in.close ();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }

        // On JDK 1.2 and later, simplify this:
        //      "path = file.toURL ().toString ()".
        path = file.getAbsolutePath ();
        if (File.separatorChar != '/')
            path = path.replace (File.separatorChar, '/');
        if (!path.startsWith ("/"))
            path = "/" + path;
        if (!path.endsWith ("/") && file.isDirectory ())
            path = path + "/";

        retval.setSystemId ("file:" + path);
        return retval;
    }


    /**
     * <b>SAX:</b>
     * Resolve the given entity into an input source.  If the name can't
     * be mapped to a preferred form of the entity, the URI is used.  To
     * resolve the entity, first a local catalog mapping names to URIs is
     * consulted.  If no mapping is found there, a catalog mapping names
     * to java resources is consulted.  Finally, if neither mapping found
     * a copy of the entity, the specified URI is used.
     *
     * <P> When a URI is used, <a href="#createInputSource">
     * createInputSource</a> is used to correctly deduce the character
     * encoding used by this entity, unless this resolver has been told
     * to ignore MIME types.  No MIME type checking is done.
     *
     * <P> Entities found as Java resources are given a system ID of the
     * form <em>java:resource:</em> followed by the resource name.
     *
     * @param name Used to find alternate copies of the entity, when
     *  this value is non-null; this is the XML "public ID".
     * @param uri Used when no alternate copy of the entity is found;
     *  this is the XML "system ID", normally a URI.
     * @return an input source for the entity with its system and public
     *  IDs set, or null if no alternate copy was found and
     *  <em>uri</em> is null
     * @throws IOException if the URI is malformed or the entity cannot
     *  be opened
     * @throws SAXException as declared by the SAX EntityResolver interface
     */
    public InputSource resolveEntity (String name, String uri)
    throws IOException, SAXException
    {
        InputSource     retval;
        String          mappedURI = name2uri (name);
        InputStream     stream;

        // prefer explicit URI mappings, then bundled resources...
        if (mappedURI == null && (stream = mapResource (name)) != null) {
            uri = "java:resource:" + (String) id2resource.get (name);
            retval = new InputSource (XmlReader.createReader (stream));

        // ...and treat all URIs the same (as URLs for now).
        } else {
            URL                 url;
            URLConnection       conn;

            if (mappedURI != null)
                uri = mappedURI;
            else if (uri == null)
                return null;

            url = new URL (uri);
            conn = url.openConnection ();
            uri = conn.getURL ().toString ();
            if (ignoringMIME)
                retval = new InputSource (
                        XmlReader.createReader (conn.getInputStream ()));
            else {
                String          contentType = conn.getContentType ();
                retval = createInputSource (contentType,
                        conn.getInputStream (),
                        false, url.getProtocol ());
            }
        }
        retval.setSystemId (uri);
        retval.setPublicId (name);
        return retval;
    }


    /**
     * Returns true if this resolver is ignoring MIME types in the documents
     * it returns, to work around bugs in how servers have reported the
     * documents' MIME types.
     *
     * @return the current setting of the "ignoring MIME" flag
     */
    public boolean isIgnoringMIME ()
        { return ignoringMIME; }

    /**
     * Tells the resolver whether to ignore MIME types in the documents it
     * retrieves.  Many web servers incorrectly assign text documents a
     * default character encoding, even when that is incorrect.  For example,
     * all HTTP text documents default to use ISO-8859-1 (used for Western
     * European languages), and other MIME sources default text documents
     * to use US-ASCII (a seven bit encoding).  For XML documents which
     * include text encoding declarations (as most should do), these server
     * bugs can be worked around by ignoring the MIME type entirely.
     *
     * @param value true to ignore MIME types and always autodetect the
     *  character encoding, false to use them
     */
    public void setIgnoringMIME (boolean value)
        { ignoringMIME = value; }


    // maps the public ID to an alternate URI, if one is registered;
    // returns null for a null ID or when nothing is registered for it
    private String name2uri (String publicId)
    {
        if (publicId == null || id2uri == null)
            return null;
        return (String) id2uri.get (publicId);
    }


    /**
     * Registers the given public ID as corresponding to a particular
     * URI, typically a local copy.  This URI will be used in preference
     * to ones provided as system IDs in XML entity declarations.  This
     * mechanism would most typically be used for Document Type Definitions
     * (DTDs), where the public IDs are formally managed and versioned.
     * Registering the same public ID again replaces the earlier URI.
     *
     * @param publicId The managed public ID being mapped; must not be null
     * @param uri The URI of the preferred copy of that entity; must not
     *  be null
     */
    public void registerCatalogEntry (
        String          publicId,
        String          uri
    )
    {
        if (id2uri == null)
            id2uri = new Hashtable (17);
        id2uri.put (publicId, uri);
    }


    // return the resource registered for the public ID as a stream, or
    // null if the ID is null, nothing is registered for it, or the
    // resource cannot be found
    private InputStream mapResource (String publicId)
    {
        if (publicId == null || id2resource == null)
            return null;

        String          resourceName = (String) id2resource.get (publicId);
        ClassLoader     loader = null;

        if (resourceName == null)
            return null;

        if (id2loader != null)
            loader = (ClassLoader) id2loader.get (publicId);

        // no registered loader means it's a system resource
        if (loader == null)
            return ClassLoader.getSystemResourceAsStream (resourceName);
        return loader.getResourceAsStream (resourceName);
    }

    /**
     * Registers a given public ID as corresponding to a particular Java
     * resource in a given class loader, typically distributed with a
     * software package.  This resource will be preferred over system IDs
     * included in XML documents.  This mechanism should most typically be
     * used for Document Type Definitions (DTDs), where the public IDs are
     * formally managed and versioned.
     *
     * <P> If a mapping to a URI has been provided, that mapping takes
     * precedence over this one.
     *
     * <P> Registering the same public ID again replaces both the resource
     * name and the class loader of the earlier registration.
     *
     * @param publicId The managed public ID being mapped; must not be null
     * @param resourceName The name of the Java resource; must not be null
     * @param loader The class loader holding the resource, or null if
     *  it is a system resource.
     */
    public void registerCatalogEntry (
        String          publicId,
        String          resourceName,
        ClassLoader     loader
    )
    {
        if (id2resource == null)
            id2resource = new Hashtable (17);
        id2resource.put (publicId, resourceName);

        if (loader != null) {
            if (id2loader == null)
                id2loader = new Hashtable (17);
            id2loader.put (publicId, loader);
        } else if (id2loader != null) {
            // forget a loader left over from an earlier registration of
            // this ID, so the new entry really is a system resource
            id2loader.remove (publicId);
        }
    }
}