- /*
- * $Id: InputEntity.java,v 1.3 2001/09/29 00:44:34 edwingo Exp $
- *
- * The Apache Software License, Version 1.1
- *
- *
- * Copyright (c) 2000 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Crimson" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation and was
- * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
- * http://www.sun.com. For more information on the Apache Software
- * Foundation, please see <http://www.apache.org/>.
- */
-
-
- package org.apache.crimson.parser;
-
- import java.io.CharConversionException;
- import java.io.UnsupportedEncodingException;
- import java.io.InputStreamReader;
- import java.io.InputStream;
- import java.io.IOException;
- import java.io.Reader;
- import java.io.File;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.Locale;
-
- import org.xml.sax.*;
-
- import org.apache.crimson.util.XmlChars;
-
-
- /**
- * This is how the parser talks to its input entities, of all kinds.
- * The entities are in a stack.
- *
- * <P> For internal entities, the character arrays are referenced here,
- * and read from as needed (they're read-only). External entities have
- * mutable buffers, that are read into as needed.
- *
- * <P> <em>Note:</em> This maps CRLF (and CR) to LF without regard for
- * whether it's in an external (parsed) entity or not. The XML 1.0 spec
- * is inconsistent in explaining EOL handling; this is the sensible way.
- *
- * @author David Brownell
- * @version $Revision: 1.3 $
- */
- final class InputEntity implements Locator
- {
- private int start, finish; // unread data is buf[start..finish): start = next char to read, finish = one past the last valid char
- private char buf []; // entity text; allocated by init(InputSource...) or borrowed read-only by init(char[]...)
- private int lineNumber = 1; // reported through getLineNumber(); only adjusted for external entities
- private boolean returnedFirstHalf = false; // getc() has just returned the first (high) half of a surrogate pair
- private boolean maybeInCRLF = false; // set while getc() looks past a CR to see if an LF follows
-
- // name of entity (never main document or unnamed DTD PE)
- private String name;
-
- private InputEntity next; // entity below this one on the stack; null means this is the document (see isDocument)
-
- // for system and public IDs in diagnostics; stays null when init(char[]...) was used
- private InputSource input;
-
- // source used by fillbuf() to replenish buf; null for internal (fixed buffer) entities.
- private Reader reader;
- private boolean isClosed; // set by close(); fillbuf() does nothing once this is true
-
- private ErrorHandler errHandler; // receives fatalError()/error() callbacks
- private Locale locale; // passed to Parser2.messages when formatting diagnostics
-
- private StringBuffer rememberedText; // remembered text already copied out of buf by fillbuf(); null until needed
- private int startRemember; // buf offset where remembered text starts; 0 doubles as "not remembering"
-
- // record if this is a PE, so endParsedEntity won't be called
- private boolean isPE;
-
- // InputStreamReader throws an internal per-read exception, so
- // we minimize reads. We also add a byte to compensate for the
- // "ungetc" byte we keep, so that our downstream reads are as
- // nicely sized as we can make them.
- final private static int BUFSIZ = 8 * 1024 + 1;
-
- final private static char newline [] = { '\n' }; // handed to the ContentHandler in place of a CR or CRLF
-
- /**
-  * Factory method: creates an entity that knows only its error handler
-  * and locale.  The buffer, name and stack link are filled in later by
-  * one of the init() methods.
-  *
-  * @param h handler that fatal errors and validity errors are reported to
-  * @param l locale used when formatting diagnostic messages
-  * @return a new entity with no buffer yet
-  */
- public static InputEntity getInputEntity (ErrorHandler h, Locale l)
- {
-     InputEntity entity = new InputEntity ();
-
-     entity.locale = l;
-     entity.errHandler = h;
-     return entity;
- }
-
- // instances are only created through getInputEntity()
- private InputEntity ()
- {
- }
-
- //
- // predicate: return true iff this is an internal entity reader,
- // and so may safely be "popped" as needed. external entities have
- // syntax to uphold; internal parameter entities have at most validity
- // constraints to monitor. also, only external entities get decent
- // location diagnostics.
- //
- public boolean isInternal () { return reader == null; }
-
- //
- // predicate: return true iff this is the toplevel document
- //
- public boolean isDocument () { return next == null; }
-
- //
- // predicate: return true iff this is a PE expansion (so that
- // LexicalEventListner.endParsedEntity won't be called)
- //
- public boolean isParameterEntity () { return isPE; }
-
- //
- // return name of current entity
- //
- public String getName () { return name; }
-
- /**
-  * Turns a plain file name into a "file:" URL string: the name is made
-  * absolute, platform separators become '/', and a leading '/' is
-  * added when the absolute path does not already start with one.
-  */
- private static String convertToFileURL(String filename) {
-     // On JDK 1.2 and later, simplify this to:
-     // "path = file.toURL().toString()".
-     String absolute = new File(filename).getAbsolutePath();
-     String slashed = (File.separatorChar == '/')
-         ? absolute
-         : absolute.replace(File.separatorChar, '/');
-
-     return slashed.startsWith("/")
-         ? "file:" + slashed
-         : "file:" + "/" + slashed;
- }
-
- /**
-  * Use this for an external parsed entity.
-  *
-  * <P> The reader is taken from the first of these that the input
-  * source supplies: its character stream, its byte stream (with the
-  * declared encoding when there is one), or a stream opened on its
-  * system ID.
-  *
-  * @param in where the entity text comes from
-  * @param name entity name, used for recursion checking
-  * @param stack the entity this one is pushed on top of, or null
-  * @param isPE true iff this is a parameter entity
-  * @throws IOException if the stream cannot be opened; this includes a
-  *  MalformedURLException when the source has neither a stream nor a
-  *  system ID
-  * @throws SAXException if the entity is found to be recursive
-  */
- public void init(InputSource in, String name, InputEntity stack,
-         boolean isPE)
-     throws IOException, SAXException
- {
-     input = in;
-     this.isPE = isPE;
-     reader = in.getCharacterStream ();
-
-     if (reader == null) {
-         InputStream bytes = in.getByteStream ();
-
-         if (bytes == null) {
-             // When the app first provides an external InputSource, the
-             // SystemId may not be a valid URI and just be a simple
-             // filename.  In this case, convert the filename to a
-             // "file:" URL instead of throwing an exception.  Note:
-             // this does not strictly conform to the SAX spec but is
-             // convenient for users.
-             String systemId = in.getSystemId();
-
-             // Nothing at all to read from: report that as an I/O
-             // problem rather than letting new File(null) throw a
-             // NullPointerException further down.
-             if (systemId == null)
-                 throw new MalformedURLException (
-                     "InputSource has no character stream, byte stream, or system ID");
-
-             URL url;
-             try {
-                 url = new URL(systemId);
-             } catch (MalformedURLException e) {
-                 String urlString = convertToFileURL(systemId);
-                 in.setSystemId(urlString);
-                 url = new URL(urlString);
-             }
-
-             reader = XmlReader.createReader(url.openStream());
-         } else if (in.getEncoding () != null)
-             reader = XmlReader.createReader (bytes, in.getEncoding ());
-         else
-             reader = XmlReader.createReader (bytes);
-     }
-     next = stack;
-     buf = new char [BUFSIZ];
-     this.name = name;      // must be set before checkRecursion reads it
-     checkRecursion (stack);
- }
-
- /**
-  * Use this for an internal parsed entity; the buffer is used as
-  * given and is treated as read-only (no Reader is attached).
-  *
-  * @param b the complete entity text
-  * @param name entity name, used for recursion checking
-  * @param stack the entity this one is pushed on top of, or null
-  * @param isPE true iff this is a parameter entity
-  * @throws SAXException if the entity is found to be recursive
-  */
- public void init (char b [], String name,
-         InputEntity stack, boolean isPE)
-     throws SAXException
- {
-     this.next = stack;
-     this.buf = b;
-     this.finish = b.length;
-     this.isPE = isPE;
-     this.name = name;      // checkRecursion compares against this
-
-     checkRecursion (stack);
- }
-
- /**
-  * Reports fatal error P-069 if an entity with this entity's name is
-  * already on the stack.  The walk starts at the entity below
-  * <code>stack</code>; <code>stack</code> itself is not compared.
-  */
- private void checkRecursion (InputEntity stack) throws SAXException
- {
-     if (stack == null)
-         return;
-
-     InputEntity ancestor = stack.next;
-     while (ancestor != null) {
-         if (ancestor.name != null && ancestor.name.equals (name))
-             fatal ("P-069", new Object [] { name });
-         ancestor = ancestor.next;
-     }
- }
-
- /**
-  * Closes this entity and returns the one below it on the stack
-  * (null if this was the bottom).  The caller has ensured there's
-  * nothing left to read.
-  */
- public InputEntity pop () throws IOException
- {
-     InputEntity below = next;
-
-     close ();
-     return below;
- }
-
- /**
-  * Returns true iff there's no more data to consume.  When the buffer
-  * is empty this tries to refill it first.
-  */
- public boolean isEOF ()
-     throws IOException, SAXException
- {
-     // called to ensure WF-ness of included entities and to pop
-     // input entities appropriately ... EOF is not always legal.
-     if (start < finish)
-         return false;
-
-     fillbuf ();
-     return !(start < finish);
- }
-
- /**
-  * Returns the name of the encoding in use, else null; the name
-  * returned is in as standard a form as we can get.  Null is returned
-  * when there is no reader, or when the reader is neither an XmlReader
-  * nor an InputStreamReader.
-  */
- public String getEncoding ()
- {
-     if (reader instanceof XmlReader) {
-         XmlReader xml = (XmlReader) reader;
-         return xml.getEncoding ();
-     }
-
-     // XXX prefer a java2std() call to normalize names...
-
-     if (reader instanceof InputStreamReader) {
-         InputStreamReader stream = (InputStreamReader) reader;
-         return stream.getEncoding ();
-     }
-
-     // no reader at all, or one we cannot ask
-     return null;
- }
-
-
- /**
-  * Returns the next name char, or NUL ... faster than getc(),
-  * and the common "name or nmtoken must be next" case won't
-  * need ungetc().  A character that is not a name char is left
-  * unconsumed.
-  */
- public char getNameChar () throws IOException, SAXException
- {
-     if (start >= finish)
-         fillbuf ();
-     if (start >= finish)
-         return 0;
-
-     char candidate = buf [start];
-     if (!XmlChars.isNameChar (candidate))
-         return 0;
-
-     start++;
-     return candidate;
- }
-
- /**
- * gets the next Java character -- might be part of an XML
- * text character represented by a surrogate pair, or be
- * the end of the entity.
- *
- * <P> In an external entity a CR, and a CR directly followed by LF,
- * are each returned as one LF and counted as one line.  In an internal
- * entity CR and LF are returned unchanged and lines are not counted.
- *
- * <P> NOTE(review): when a CR is the last character available, the
- * nested getc() throws EndOfInputException, so the CR is consumed
- * without being returned and maybeInCRLF is left set.  Also, for two
- * CRs in a row the nested call returns LF, which the outer call takes
- * for the LF of a CRLF: one LF is returned but lineNumber goes up by
- * two.  Confirm whether either case matters to callers.
- *
- * @return the next character, with line ends mapped as described
- * @throws EndOfInputException if no character is left after a refill
- * @throws SAXException (after reporting through fatal()) with P-070
- * when a first surrogate half is not followed by a second half, or
- * P-071 for any other character outside production [2]
- */
- public char getc () throws IOException, SAXException
- {
- if (finish <= start)
- fillbuf ();
- if (finish > start) {
- char c = buf [start++];
-
- // [2] Char ::= #x0009 | #x000A | #x000D
- // | [#x0020-#xD7FF]
- // | [#xE000-#xFFFD]
- // plus surrogate _pairs_ representing [#x10000-#x10ffff]
- if (returnedFirstHalf) {
- if (c >= 0xdc00 && c <= 0xdfff) {
- returnedFirstHalf = false;
- return c;
- } else
- fatal ("P-070", new Object [] { Integer.toHexString (c) });
- }
- if ((c >= 0x0020 && c <= 0xD7FF)
- || c == 0x0009
- // no surrogates!
- || (c >= 0xE000 && c <= 0xFFFD))
- return c;
-
- //
- // CRLF and CR are both line ends; map both to LF, and
- // keep line count correct.
- //
- else if (c == '\r' && !isInternal ()) {
- maybeInCRLF = true; // tells the nested call not to count a following LF
- c = getc ();
- if (c != '\n')
- ungetc (); // lone CR: the character after it is read again later
- maybeInCRLF = false;
-
- lineNumber++;
- return '\n';
-
- } else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF
- if (!isInternal () && !maybeInCRLF)
- lineNumber++;
- return c;
- }
-
- // surrogates...
- if (c >= 0xd800 && c < 0xdc00) {
- returnedFirstHalf = true; // the next call must see the second half
- return c;
- }
-
- fatal ("P-071", new Object [] { Integer.toHexString (c) });
- }
- throw new EndOfInputException ();
- }
-
-
- /**
-  * Consumes the next character and returns true iff it equals
-  * <code>c</code>; otherwise (including at end of input) nothing is
-  * consumed and false is returned.
-  */
- public boolean peekc (char c) throws IOException, SAXException
- {
-     if (start >= finish)
-         fillbuf ();
-
-     if (start < finish && buf [start] == c) {
-         start++;
-         return true;
-     }
-     return false;
- }
-
-
- /**
- * two character pushback is guaranteed
- */
- public void ungetc ()
- {
- if (start == 0)
- throw new InternalError ("ungetc");
- start--;
-
- if (buf [start] == '\n' || buf [start] == '\r') {
- if (!isInternal ())
- lineNumber--;
- } else if (returnedFirstHalf)
- returnedFirstHalf = false;
- }
-
-
- /**
-  * Skips optional grammatical whitespace (which is discarded).
-  *
-  * <P> For external entities the line count is kept up to date: CR,
-  * LF and CRLF each count as exactly one line end.  Only an LF that
-  * comes directly after a CR is treated as part of that CR's line end.
-  *
-  * @return true iff at least one whitespace character was skipped
-  */
- public boolean maybeWhitespace ()
-     throws IOException, SAXException
- {
-     char c;
-     boolean isSpace = false;
-     boolean sawCR = false;      // previous character was a CR
-     boolean external = !isInternal ();
-
-     // [3] S ::= #20 | #09 | #0D | #0A
-     for (;;) {
-         if (finish <= start)
-             fillbuf ();
-         if (finish <= start)
-             return isSpace;
-
-         c = buf [start++];
-         if (c == '\r') {
-             isSpace = true;
-             if (external)
-                 lineNumber++;
-             sawCR = true;
-         } else if (c == '\n') {
-             isSpace = true;
-             //
-             // CR, LF are line endings ... CRLF is one, not two!
-             //
-             if (external && !sawCR)
-                 lineNumber++;
-             // The flag must be cleared here: otherwise every later LF
-             // in this run (CR LF LF ...) would be skipped uncounted.
-             sawCR = false;
-         } else if (c == 0x20 || c == 0x09) {
-             isSpace = true;
-             // a space or tab separates a CR from any later LF
-             sawCR = false;
-         } else {
-             start--;
-             return isSpace;
-         }
-     }
- }
-
-
- /**
- * normal content; whitespace in markup may be handled
- * specially if the parser uses the content model.
- *
- * <P> content terminates with markup delimiter characters,
- * namely ampersand (&) and left angle bracket (<).
- *
- * <P> the document handler's characters() method is called
- * on all the content found
- *
- * <P> Characters are delivered straight out of the buffer, in as many
- * characters() calls as buffer refills and line-end mappings require.
- * On return, <code>start</code> is at the delimiter (or at whatever
- * else stopped the scan).
- *
- * @param contentHandler receives the character data
- * @param validator its text() method is called before runs of
- * characters are reported
- * @return true iff any characters were reported to the handler
- * @throws SAXException (after reporting through fatal()) for "]]>" in
- * content (P-072), an unpaired surrogate (P-074, P-081), or an
- * illegal character (P-071)
- */
- public boolean parsedContent (
- ContentHandler contentHandler,
- ElementValidator validator
- ) throws IOException, SAXException
- {
- // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
-
- int first; // first char to return
- int last; // last char to return
- boolean sawContent; // sent any chars?
- char c;
-
- // deliver right out of the buffer, until delimiter, EOF,
- // or error, refilling as we go
- for (first = last = start, sawContent = false; ; last++) {
-
- // buffer empty?
- if (last >= finish) {
- if (last > first) {
- validator.text ();
- contentHandler.characters (buf, first, last - first);
- sawContent = true;
- start = last;
- }
- if (isEOF ()) // calls fillbuf
- return sawContent;
- first = start;
- last = first - 1; // incremented in loop
- continue;
- }
-
- c = buf [last];
-
- //
- // pass most chars through ASAP; this inlines the code of
- // [2] !XmlChars.isChar(c) leaving only characters needing
- // special treatment ... line ends, surrogates, and:
- // 0x0026 == '&'
- // 0x003C == '<'
- // 0x005D == ']'
- // Comparisons ordered for speed on 'typical' text
- //
- if ( (c > 0x005D && c <= 0xD7FF) // a-z and more
- || (c < 0x0026 && c >= 0x0020) // space & punct
- || (c > 0x003C && c < 0x005D) // A-Z & punct
- || (c > 0x0026 && c < 0x003C) // 0-9 & punct
- || c == 0x0009
- || (c >= 0xE000 && c <= 0xFFFD)
- )
- continue;
-
- // terminate on markup delimiters
- if (c == '<' || c == '&')
- break;
-
- // count lines
- if (c == '\n') {
- if (!isInternal ())
- lineNumber++;
- continue;
- }
-
- // External entities get CR, CRLF --> LF mapping
- // Internal ones got it already, and we can't repeat
- // else we break char ref handling!!
- if (c == '\r') {
- if (isInternal ())
- continue;
-
- contentHandler.characters (buf, first, last - first); // NOTE(review): unlike the other flushes, no validator.text() here, and the length may be 0 -- confirm intended
- contentHandler.characters (newline, 0, 1);
- sawContent = true;
- lineNumber++;
- if (finish > (last + 1)) {
- if (buf [last + 1] == '\n')
- last++; // swallow the LF of a CRLF
- } else { // CR at end of buffer
- // XXX case not yet handled: CRLF here will look like two lines
- }
- first = start = last + 1;
- continue;
- }
-
- // ']]>' is a WF error -- must fail if we see it
- if (c == ']') {
- switch (finish - last) {
- // for suspicious end-of-buffer cases, get more data
- // into the buffer to rule out this sequence.
- case 2:
- if (buf [last + 1] != ']')
- continue;
- // FALLTHROUGH
-
- case 1:
- if (reader == null || isClosed)
- continue; // nothing more can arrive, so this cannot become "]]>"
- if (last == first)
- throw new InternalError ("fillbuf");
- last--;
- if (last > first) {
- validator.text ();
- contentHandler.characters (buf, first, last - first);
- sawContent = true;
- start = last;
- }
- fillbuf ();
- first = last = start;
- continue;
-
- // otherwise any "]]>" would be buffered, and we can
- // see right away if that's what we have
- default:
- if (buf [last + 1] == ']' && buf [last + 2] == '>')
- fatal ("P-072", null);
- continue;
- }
- }
-
- // correctly paired surrogates are OK
- if (c >= 0xd800 && c <= 0xdfff) {
- if ((last + 1) >= finish) {
- if (last > first) {
- validator.text ();
- contentHandler.characters (buf, first, last - first);
- sawContent = true;
- start = last + 1;
- }
- if (isEOF ()) { // calls fillbuf
- fatal ("P-081",
- new Object [] { Integer.toHexString (c) });
- }
- first = start;
- last = first ;
- continue;
- }
- if (checkSurrogatePair (last))
- last++; // step over the second half as well
- else {
- last--;
- // also terminate on surrogate pair oddities
- break;
- }
- continue;
- }
-
- fatal ("P-071", new Object [] { Integer.toHexString (c) });
- }
- if (last == first)
- return sawContent;
- validator.text ();
- contentHandler.characters (buf, first, last - first);
- start = last;
- return true;
- }
-
-
- /**
- * CDATA -- character data, terminated by "]]>" and optionally
- * including unescaped markup delimiters (ampersand and left angle
- * bracket). This should otherwise be exactly like character data,
- * modulo differences in error report details.
- *
- * <P> The document handler's characters() or ignorableWhitespace()
- * methods are invoked on all the character data found
- *
- * @param contentHandler gets callbacks for character data
- * @param validator its text() method is called before data is
- * reported through characters()
- * @param ignorableWhitespace if true, data consisting only of
- * whitespace is reported using contentHandler.ignorableWhitespace();
- * anything else is reported using characters()
- * @param whitespaceInvalidMessage if not null, the ID of the message
- * for the (non-fatal) error reported to the error handler each time
- * whitespace is delivered as ignorable
- * @throws SAXException (after reporting through fatal()) for an
- * illegal character (P-071), a bad surrogate pair (P-074), or end of
- * input before "]]>" is seen (P-073)
- */
- public void unparsedContent (
- ContentHandler contentHandler,
- ElementValidator validator,
- boolean ignorableWhitespace,
- String whitespaceInvalidMessage
- ) throws IOException, SAXException
- {
- // [18] CDSect ::= CDStart CData CDEnd
- // [19] CDStart ::= '<![CDATA['
- // [20] CData ::= (Char* - (Char* ']]>' Char*))
- // [21] CDEnd ::= ']]>'
-
- // Caller has already consumed the leading '<![CDATA[' so all that
- // remains to be parsed of [18] is "CData CDEnd"
-
- // only a literal ']]>' stops this ...
- int last;
-
- for (;;) { // until ']]>' seen
- boolean done = false;
- char c;
-
- // don't report ignorable whitespace as "text" for
- // validation purposes.
- boolean white = ignorableWhitespace;
-
- for (last = start; last < finish; last++) {
- c = buf [last];
-
- //
- // Reject illegal characters.
- //
- if (!XmlChars.isChar (c)) {
- white = false;
- if (c >= 0xd800 && c <= 0xdfff) {
- if (checkSurrogatePair (last)) {
- last++;
- continue;
- } else {
- last--; // NOTE(review): if last == start this leaves last < start, so the report below gets a negative length -- confirm
- break;
- }
- }
- fatal ("P-071", new Object []
- { Integer.toHexString (buf [last]) });
- }
- if (c == '\n') {
- if (!isInternal ())
- lineNumber++;
- continue;
- }
- if (c == '\r') {
- // As above, we can't repeat CR/CRLF --> LF mapping
- if (isInternal ())
- continue;
-
- if (white) {
- if (whitespaceInvalidMessage != null)
- errHandler.error (new SAXParseException (
- Parser2.messages.getMessage (locale,
- whitespaceInvalidMessage),
- this));
- contentHandler.ignorableWhitespace (buf, start,
- last - start);
- contentHandler.ignorableWhitespace (newline, 0, 1);
- } else {
- validator.text ();
- contentHandler.characters (buf, start, last - start);
- contentHandler.characters (newline, 0, 1);
- }
- lineNumber++;
- if (finish > (last + 1)) {
- if (buf [last + 1] == '\n')
- last++; // swallow the LF of a CRLF
- } else { // CR at end of buffer
- // XXX case not yet handled ... as above
- }
- start = last + 1;
- continue;
- }
- if (c != ']') {
- if (c != ' ' && c != '\t')
- white = false;
- continue;
- }
- // assert(buf[last] == ']');
- if ((last + 2) < finish) {
- if (buf [last + 1] == ']' && buf [last + 2] == '>') {
- done = true;
- break;
- }
- white = false;
- continue;
- } else {
- // "last" is at or one before end of buffered data.
- // Report what we have so far, not including "last", by
- // breaking and executing code below, outside inner
- // loop, then continuing on to find end of CDATA section.
- break;
- }
- }
- if (white) {
- if (whitespaceInvalidMessage != null)
- errHandler.error (new SAXParseException (
- Parser2.messages.getMessage (locale,
- whitespaceInvalidMessage),
- this));
- contentHandler.ignorableWhitespace (buf, start, last - start);
- } else {
- validator.text ();
- contentHandler.characters (buf, start, last - start);
- }
- if (done) {
- start = last + 3; // skip over the "]]>" itself
- break;
- }
- start = last;
- fillbuf();
- if (isEOF ())
- fatal ("P-073", null);
- }
- }
-
- /**
-  * Checks that buf[offset] and buf[offset + 1] are a first and second
-  * surrogate half, in that order.
-  *
-  * @return true for a proper pair; false (to backstep) when the second
-  *  character is not in the buffer yet
-  * @throws SAXException (after reporting P-074 through fatal()) when
-  *  both characters are buffered but do not form a pair
-  */
- private boolean checkSurrogatePair (int offset)
-     throws SAXException
- {
-     int second = offset + 1;
-
-     if (second >= finish)
-         return false;
-
-     char high = buf [offset];
-     char low = buf [second];
-     boolean highOk = high >= 0xd800 && high < 0xdc00;
-     boolean lowOk = low >= 0xdc00 && low <= 0xdfff;
-
-     if (highOk && lowOk)
-         return true;
-
-     fatal ("P-074", new Object [] {
-         Integer.toHexString (high & 0x0ffff),
-         Integer.toHexString (low & 0x0ffff)
-     });
-     return false;
- }
-
-
- /**
- * whitespace in markup (flagged to app, discardable)
- *
- * <P> the document handler's ignorableWhitespace() method
- * is called on all the whitespace found
- *
- * <P> A CR, and a CR followed by LF in the same buffer, are reported
- * as a single LF.  The whitespace may be delivered in several
- * callbacks, split at buffer refills and at each CR.
- *
- * @param handler receives the whitespace
- * @return true iff at least one whitespace character was consumed
- */
- public boolean ignorableWhitespace (ContentHandler handler)
- throws IOException, SAXException
- {
- char c;
- boolean isSpace = false;
- int first; // start of the whitespace not yet reported
-
- // [3] S ::= #20 | #09 | #0D | #0A
- for (first = start;;) {
- if (finish <= start) {
- if (isSpace)
- handler.ignorableWhitespace (buf, first, start - first); // flush before fillbuf() moves the data
- fillbuf ();
- first = start;
- }
- if (finish <= start)
- return isSpace;
-
- c = buf [start++];
- switch (c) {
- case '\n':
- if (!isInternal ())
- lineNumber++;
- // XXX handles Macintosh line endings wrong
- // fallthrough
- case 0x09:
- case 0x20:
- isSpace = true;
- continue;
-
- case '\r':
- isSpace = true;
- if (!isInternal ())
- lineNumber++;
- handler.ignorableWhitespace (buf, first,
- (start - 1) - first);
- handler.ignorableWhitespace (newline, 0, 1);
- if (start < finish && buf [start] == '\n')
- ++start; // NOTE(review): an LF that only arrives with the next refill is not merged with this CR
- first = start;
- continue;
-
- default:
- ungetc ();
- if (isSpace)
- handler.ignorableWhitespace (buf, first, start - first);
- return isSpace;
- }
- }
- }
-
- /**
- * returns false iff 'next' string isn't as provided,
- * else skips that text and returns true
- *
- * <P> NOTE: two alternative string representations are
- * both passed in, since one is faster.
- */
- public boolean peek (String next, char chars [])
- throws IOException, SAXException
- {
- int len;
- int i;
-
- if (chars != null)
- len = chars.length;
- else
- len = next.length ();
-
- // buffer should hold the whole thing ... give it a
- // chance for the end-of-buffer case and cope with EOF
- // by letting fillbuf compact and fill
- if (finish <= start || (finish - start) < len)
- fillbuf ();
-
- // can't peek past EOF
- if (finish <= start)
- return false;
-
- // compare the string; consume iff it matches
- if (chars != null) {
- for (i = 0; i < len && (start + i) < finish; i++) {
- if (buf [start + i] != chars [i])
- return false;
- }
- } else {
- for (i = 0; i < len && (start + i) < finish; i++) {
- if (buf [start + i] != next.charAt (i))
- return false;
- }
- }
-
- // if the first fillbuf didn't get enough data, give
- // fillbuf another chance to read
- if (i < len) {
- if (reader == null || isClosed)
- return false;
-
- //
- // This diagnostic "knows" that the only way big strings would
- // fail to be peeked is where it's a symbol ... e.g. for an
- // </EndTag> construct. That knowledge could also be applied
- // to get rid of the symbol length constraint, since having
- // the wrong symbol is a fatal error anyway ...
- //
- if (len > buf.length)
- fatal ("P-077", new Object [] { new Integer (buf.length) });
-
- fillbuf ();
- return peek (next, chars);
- }
-
- start += len;
- return true;
- }
-
- /**
-  * This method is used to disambiguate between XMLDecl, TextDecl, and
-  * PI by doing a lookahead w/o consuming any characters.  We look for
-  * "<?xml" plus a whitespace character, but no more.  For example, we
-  * could have input documents with the PI "<?xml-stylesheet ... >".
-  *
-  * <P> When fewer than six characters are buffered and the reader can
-  * still supply more, the buffer is refilled and the check retried;
-  * when no more can arrive the answer is false.
-  *
-  * @return true iff next chars match either the prefix for XMLDecl or
-  *  TextDecl
-  */
- boolean isXmlDeclOrTextDeclPrefix()
-     throws IOException, SAXException
- {
-     // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl?
-     //                  SDDecl? S? '>'
-     // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
-     // [24] VersionInfo ::= S 'version' Eq \'|\" versionNum \'|\"
-
-     String match = "<?xml";
-     int matchLen = match.length();
-
-     // Length of the entire prefix including whitespace
-     int prefixLen = matchLen + 1;
-
-     // buffer should hold the whole thing ... give it a
-     // chance for the end-of-buffer case and cope with EOF
-     // by letting fillbuf compact and fill
-     if (finish <= start || (finish - start) < prefixLen)
-         fillbuf ();
-
-     // can't peek past EOF
-     if (finish <= start)
-         return false;
-
-     // Compare the non-whitespace part of the prefix
-     int i;
-     for (i = 0; i < matchLen && (start + i) < finish; i++) {
-         if (buf [start + i] != match.charAt (i))
-             return false;
-     }
-
-     // Ran out of buffered data before the whole prefix, including
-     // the whitespace character, could be examined: give fillbuf
-     // another chance to read, or give up if nothing more can come.
-     if ((start + i) >= finish) {
-         if (reader == null || isClosed)
-             return false;
-
-         fillbuf ();
-         return isXmlDeclOrTextDeclPrefix();
-     }
-
-     // assert(i == matchLen);
-     // Match whitespace.  The index is relative to "start", like the
-     // comparison above; buf[i] alone is only right when start is 0.
-     return XmlChars.isSpace(buf [start + i]);
- }
-
-
- /**
-  * Support for reporting the internal DTD subset, so <!DOCTYPE...>
-  * declarations can be recreated.  This is collected as a single
-  * string; such subsets are normally small, and many applications
-  * don't even care about this.
-  *
-  * <P> Marks the current read position as the start of the text that
-  * rememberText() will return.
-  *
-  * @throws InternalError if a start position is already recorded
-  */
- public void startRemembering ()
- {
-     if (startRemember == 0) {
-         startRemember = start;
-         return;
-     }
-     throw new InternalError ();
- }
-
- public String rememberText ()
- {
- String retval;
-
- // If the internal subset crossed a buffer boundary, we
- // created a temporary buffer.
- if (rememberedText != null) {
- rememberedText.append (buf, startRemember,
- start - startRemember);
- retval = rememberedText.toString ();
- } else
- retval = new String (buf, startRemember,
- start - startRemember);
-
- startRemember = 0;
- rememberedText = null;
- return retval;
- }
-
- // LOCATOR METHODS
-
- /**
-  * Finds the entity whose location should be reported: the nearest
-  * one, from this entity down the stack, that has an InputSource.
-  * Falls back to this entity when there is none.
-  */
- private Locator getLocator ()
- {
-     // don't report locations within internal entities!
-     for (InputEntity entity = this; entity != null; entity = entity.next) {
-         if (entity.input != null)
-             return entity;
-     }
-     return this;
- }
-
-
- /**
-  * Returns the public ID of this input source, if known.
-  *
-  * @return the public ID of the nearest entity on the stack that has
-  *  an InputSource; null if it has none, or if no entity on the stack
-  *  has an InputSource at all
-  */
- public String getPublicId ()
- {
-     Locator where = getLocator ();
-     if (where == this) {
-         // getLocator() also answers "this" as a fallback when nothing
-         // on the stack has an InputSource; input is null in that case.
-         return (input == null) ? null : input.getPublicId ();
-     }
-     return where.getPublicId ();
- }
-
- /**
-  * Returns the system ID of this input source, if known.
-  *
-  * @return the system ID of the nearest entity on the stack that has
-  *  an InputSource; null if it has none, or if no entity on the stack
-  *  has an InputSource at all
-  */
- public String getSystemId ()
- {
-     Locator where = getLocator ();
-     if (where == this) {
-         // getLocator() also answers "this" as a fallback when nothing
-         // on the stack has an InputSource; input is null in that case.
-         return (input == null) ? null : input.getSystemId ();
-     }
-     return where.getSystemId ();
- }
-
- /**
-  * Returns the current line number in this input source; the count
-  * comes from the entity chosen by getLocator().
-  */
- public int getLineNumber ()
- {
-     Locator where = getLocator ();
-
-     return (where != this) ? where.getLineNumber () : lineNumber;
- }
-
- /**
- * Returns -1; maintaining column numbers hurts performance, so no
- * column position is tracked by this class.
- *
- * @return always -1
- */
- public int getColumnNumber ()
- {
- return -1; // not maintained (speed)
- }
-
-
-
- //
- // Compacts the unread data to the front of the buffer and reads more
- // after it.  n.b. for non-EOF end-of-buffer cases, reader should return
- // at least a handful of bytes so various lookaheads behave.
- //
- // two character pushback exists except at first; characters
- // represented by surrogate pairs can't be pushed back (they'd
- // only be in character data anyway).
- //
- // SAX exception thrown on char conversion problems; line number
- // will be low, as a rule.
- //
- private void fillbuf () throws IOException, SAXException
- {
- // don't touch fixed buffers, that'll usually
- // change entity values (and isn't needed anyway)
- // likewise, ignore closed streams
- if (reader == null || isClosed)
- return;
-
- // if remembering DTD text, copy!
- if (startRemember != 0) {
- if (rememberedText == null)
- rememberedText = new StringBuffer (buf.length);
- rememberedText.append (buf, startRemember,
- start - startRemember);
- }
-
- boolean extra = (finish > 0) && (start > 0); // is there an already-read char to keep for ungetc()?
- int len;
-
- if (extra) // extra pushback
- start--;
- len = finish - start;
-
- System.arraycopy (buf, start, buf, 0, len); // slide kept data to the front
- start = 0;
- finish = len;
-
- try {
- len = buf.length - len; // room left after the kept data
- len = reader.read (buf, finish, len);
- } catch (UnsupportedEncodingException e) {
- fatal ("P-075", new Object [] { e.getMessage () });
- } catch (CharConversionException e) {
- fatal ("P-076", new Object [] { e.getMessage () });
- }
- if (len >= 0)
- finish += len;
- else
- close (); // negative count from read(): no more input
- if (extra) // extra pushback
- start++;
-
- if (startRemember != 0)
- // assert extra == true
- startRemember = 1; // text up to the old "start" was appended above; unread data now begins at index 1
- }
-
- public void close ()
- {
- try {
- if (reader != null && !isClosed)
- reader.close ();
- isClosed = true;
- } catch (IOException e) {
- /* NOTHING */
- }
- }
-
-
- /**
-  * Reports a fatal (not continuable, e.g. well-formedness) error: the
-  * entity is closed, the error handler is told, and the exception is
-  * thrown.  This method never returns normally.
-  *
-  * @param messageId ID of the message to look up
-  * @param params values for the message, or null
-  * @throws SAXException always
-  */
- private void fatal (String messageId, Object params []) throws SAXException
- {
-     String text = Parser2.messages.getMessage (locale, messageId, params);
-     SAXParseException problem = new SAXParseException (text, this);
-
-     // not continuable ... e.g. WF errors
-     close ();
-     errHandler.fatalError (problem);
-     throw problem;
- }
- }