XIncludeTextReader

/*
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2001-2004 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Xerces" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 2003, International
 * Business Machines, Inc., http://www.apache.org.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

package com.sun.org.apache.xerces.internal.xinclude;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Locale;

import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
import com.sun.org.apache.xerces.internal.util.EncodingMap;
import com.sun.org.apache.xerces.internal.util.MessageFormatter;
import com.sun.org.apache.xerces.internal.util.XMLChar;
import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;

/**
 * This class is used for reading resources requested in <include> elements,
 * when the parse attribute of the <include> element is "text".  Using this
 * class will open the location, detect the encoding, and discard the byte order
 * mark, if applicable.
 * 
 * REVISIT:
 * Much of the code in this class is taken from XMLEntityManager.  It would be nice
 * if this code could be shared in some way.  However, since XMLEntityManager is used
 * for reading files as XML, and this needs to read files as text, there would need
 * to be some refactoring done.
 * 
 * @author Michael Glavassevich, IBM
 * @author Peter McCracken, IBM
 * @author Arun Yadav, Sun Microsystems Inc.
 *
 * @version $Id: XIncludeTextReader.java,v 1.10 2004/04/15 04:51:56 mrglavas Exp $
 *
 * @see XIncludeHandler
 */
public class XIncludeTextReader {

    private Reader fReader;
    private XIncludeHandler fHandler;
    private XMLInputSource fSource;
    private XMLErrorReporter fErrorReporter;
    
    // Content negotation parameters
    private String fAccept;
    private String fAcceptLanguage;
 
    /**
     * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
     *
     * @param source The XMLInputSource to use.
     * @param handler The XIncludeHandler to use.
     */
    public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
        throws IOException {
        fHandler = handler;
        fSource = source;
    }
    
    /**
     * Sets the XMLErrorReporter used for reporting errors while
     * reading the text include.
     *
     * @param errorReporter the XMLErrorReporter to be used for
     * reporting errors.
     */
    public void setErrorReporter(XMLErrorReporter errorReporter) {
        fErrorReporter = errorReporter;
    }
    
    /**
     * Sets content negotation parameters to be attached to an HTTP request.
     * 
     * @param accept the Accept HTTP request property
     * @param acceptLanguage the Accept-Language HTTP request property
     */
    public void setHttpProperties(String accept, String acceptLanguage) {
        fAccept = accept;
        fAcceptLanguage = acceptLanguage;
    }

    /**
     * Return the Reader for given XMLInputSource.
     *
     * @param source The XMLInputSource to use.
     */
    protected Reader getReader(XMLInputSource source) throws IOException {
        if (source.getCharacterStream() != null) {
            return source.getCharacterStream();
        }
        else {
            InputStream stream = null;

            String encoding = source.getEncoding();
            if (encoding == null) {
                encoding = "UTF-8";
            }
            if (source.getByteStream() != null) {
                stream = source.getByteStream();
                // Wrap the InputStream so that it is possible to rewind it.
                if (!(stream instanceof BufferedInputStream)) {
                    stream = new BufferedInputStream(stream);
                }
            }
            else {
                String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);

                URL url = new URL(expandedSystemId);
                URLConnection urlCon = url.openConnection();
				
                // If this is an HTTP connection attach any 
                // content negotation parameters to the request.
                if (urlCon instanceof HttpURLConnection) {
                    if( fAccept != null && fAccept.length() > 0) {
                        urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
                    }
                    if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
                        urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
                    }
                }
                
                // Wrap the InputStream so that it is possible to rewind it.
                stream = new BufferedInputStream(urlCon.getInputStream());
                
                // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
                String rawContentType = urlCon.getContentType();
                
                // text/xml and application/xml offer only one optional parameter
                int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;

                String contentType = null;
                String charset = null;
                if (index != -1) {
                    // this should be something like "text/xml"
                    contentType = rawContentType.substring(0, index).trim();

                    // this should be something like "charset=UTF-8", but we want to
                    // strip it down to just "UTF-8"
                    charset = rawContentType.substring(index + 1).trim();
                    if (charset.startsWith("charset=")) {
                        // 8 is the length of "charset="
                        charset = charset.substring(8).trim();
                        // strip quotes, if present
                        if ((charset.charAt(0) == '"'
                            && charset.charAt(charset.length() - 1) == '"')
                            || (charset.charAt(0) == '\''
                                && charset.charAt(charset.length() - 1)
                                    == '\'')) {
                            charset =
                                charset.substring(1, charset.length() - 1);
                        }
                    }
                    else {
                        charset = null;
                    }
                }
                else {
                    contentType = rawContentType.trim();
                }

                String detectedEncoding = null;
                /**  The encoding of such a resource is determined by:
                    1 external encoding information, if available, otherwise
                         -- the most common type of external information is the "charset" parameter of a MIME package
                    2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
                    3 the value of the encoding attribute if one exists, otherwise
                    4 UTF-8.
                 **/
                if (contentType.equals("text/xml")) {
                    if (charset != null) {
                        detectedEncoding = charset;
                    }
                    else {
                        // see RFC2376 or 3023, section 3.1
                        detectedEncoding = "US-ASCII";
                    }
                }
                else if (contentType.equals("application/xml")) {
                    if (charset != null) {
                        detectedEncoding = charset;
                    }
                    else {
                        // see RFC2376 or 3023, section 3.2
                        detectedEncoding = getEncodingName(stream);
                    }
                }
                else if (contentType.endsWith("+xml")) {
                    detectedEncoding = getEncodingName(stream);
                }

                if (detectedEncoding != null) {
                    encoding = detectedEncoding;
                }
                // else 3 or 4.
            }
            
            encoding = encoding.toUpperCase(Locale.ENGLISH);
            
            // eat the Byte Order Mark
            consumeBOM(stream, encoding);
            
            // If the document is UTF-8 or US-ASCII use 
            // the Xerces readers for these encodings. For
            // US-ASCII consult the encoding map since
            // this encoding has many aliases.
            if (encoding.equals("UTF-8")) {
                return new UTF8Reader(stream, 
                    XMLEntityManager.DEFAULT_BUFFER_SIZE, 
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
                    fErrorReporter.getLocale() );
            }
            
            // Try to use a Java reader.
            String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
            
            // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
            // The XIncludeHandler will report this as a ResourceError and then will
            // attempt to include a fallback if there is one.
            if (javaEncoding == null) {
            	MessageFormatter aFormatter = 
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
            	Locale aLocale = fErrorReporter.getLocale();
            	throw new IOException( aFormatter.formatMessage( aLocale, 
            	    "EncodingDeclInvalid", 
                    new Object[] {encoding} ) );
            }
            else if (javaEncoding.equals("ASCII")) {
                return new ASCIIReader(stream,
                    XMLEntityManager.DEFAULT_BUFFER_SIZE,
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
                    fErrorReporter.getLocale() );
            }
            
            return new InputStreamReader(stream, javaEncoding);
        }
    }

    /** 
     * XMLEntityManager cares about endian-ness, since it creates its own optimized
     * readers. Since we're just using generic Java readers for now, we're not caring
     * about endian-ness.  If this changes, even more code needs to be copied from
     * XMLEntity manager. -- PJM
     */
    protected String getEncodingName(InputStream stream) throws IOException {
        final byte[] b4 = new byte[4];
        String encoding = null;

        // this has the potential to throw an exception
        // it will be fixed when we ensure the stream is rewindable (see note above)
        stream.mark(4);
        int count = stream.read(b4, 0, 4);
        stream.reset();
        if (count == 4) {
            encoding = getEncodingName(b4);
        }

        return encoding;
    }

    /**
     * Removes the byte order mark from the stream, if it exists.
     * @param stream
     * @param encoding
     * @throws IOException
     */
    protected void consumeBOM(InputStream stream, String encoding)
        throws IOException {

        byte[] b = new byte[3];
        int count = 0;
        stream.mark(3);
        if (encoding.equals("UTF-8")) {
            count = stream.read(b, 0, 3);
            if (count == 3) {
                int b0 = b[0] & 0xFF; 
                int b1 = b[1] & 0xFF;
                int b2 = b[2] & 0xFF;
                if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
                    // First three bytes are not BOM, so reset.
                    stream.reset();
                }
            }
            else {
                stream.reset();
            }
        }
        else if (encoding.startsWith("UTF-16")) {
            count = stream.read(b, 0, 2);
            if (count == 2) {
                int b0 = b[0] & 0xFF;
                int b1 = b[1] & 0xFF;
                if ((b0 != 0xFE || b1 != 0xFF) 
                    && (b0 != 0xFF || b1 != 0xFE)) {
                    // First two bytes are not BOM, so reset.
                    stream.reset();
                }
            }
            else {
                stream.reset();
            }
        }
        // We could do UTF-32, but since the getEncodingName() doesn't support that
        // we won't support it here.
        // To implement UTF-32, look for:  00 00 FE FF for big-endian
        //                             or  FF FE 00 00 for little-endian
    }

    /**
     * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
     *          Is there any way we can share the code, without having it implemented twice?
     *          I think we should make it public and static in XMLEntityManager. --PJM
     *
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where appropriate.
     *
     * @param b4    The first four bytes of the input.
     * @return the encoding name, or null if no encoding could be detected
     */
    protected String getEncodingName(byte[] b4) {

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return "UTF-16LE";
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // other encodings
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octect order (3412)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        }

        // this signals us to use the value from the encoding attribute
        return null;

    } // getEncodingName(byte[]):Object[]

    /**
     * Read the input stream as text, and pass the text on to the XIncludeHandler
     * using calls to characters().  This will read all of the text it can from the
     * resource.
     * 
     * @throws IOException
     */
    public void parse() throws IOException {
        // REVISIT: This method needs to be rewritten to improve performance: both
        // time and memory. We should be reading chunks and reporting chunks instead 
        // of reading characters individually and reporting all the characters in 
        // one callback. Also, currently we don't provide any locator information:
        // line number, column number, etc... so if we report an error it will appear
        // as if the invalid XML character was in the include parent. -- mrglavas
        XMLStringBuffer buffer = new XMLStringBuffer();
        fReader = getReader(fSource);
        int ch;
        while((ch = fReader.read()) != -1) {
            if (isValid(ch)) {
                buffer.append((char)ch);
            }
            else if (XMLChar.isHighSurrogate(ch)) {
            	int ch2 = fReader.read();
            	if (XMLChar.isLowSurrogate(ch2)) {

                    // convert surrogates to a supplemental character
                    int sup = XMLChar.supplemental((char)ch, (char)ch2);

                    // supplemental character must be a valid XML character
                    if (!isValid(sup)) {
                        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                   "InvalidCharInContent", 
                                                   new Object[] { Integer.toString(sup, 16) },
                                                   XMLErrorReporter.SEVERITY_FATAL_ERROR);
                        continue;
                    }                 
                    buffer.append((char) ch);
                    buffer.append((char) ch2);
                }
                else {
                    fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                               "InvalidCharInContent", 
                                               new Object[] { Integer.toString(ch, 16) },
                                               XMLErrorReporter.SEVERITY_FATAL_ERROR);
                }
            }
            else {
                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                           "InvalidCharInContent", 
                                           new Object[] { Integer.toString(ch, 16) },
                                           XMLErrorReporter.SEVERITY_FATAL_ERROR);
            }
        }
        if (fHandler != null && buffer.length > 0) {
            fHandler.characters(
                buffer,
                fHandler.modifyAugmentations(null, true));
        }
    }
    
    /**
     * Closes the stream.  Call this after parse(), or when there is no longer any need
     * for this object.
     * 
     * @throws IOException
     */
    public void close() throws IOException {
        if (fReader != null) {
            fReader.close();
        }
    }
    
    /**
     * Returns true if the specified character is a valid XML character
     * as per the rules of XML 1.0.
     *
     * @param ch The character to check.
     */
    protected boolean isValid(int ch) {
        return XMLChar.isValid(ch);
    }
}