1. /*
  2. * $Id: InputEntity.java,v 1.3 2001/09/29 00:44:34 edwingo Exp $
  3. *
  4. * The Apache Software License, Version 1.1
  5. *
  6. *
  7. * Copyright (c) 2000 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution,
  23. * if any, must include the following acknowledgment:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowledgment may appear in the software itself,
  27. * if and wherever such third-party acknowledgments normally appear.
  28. *
  29. * 4. The names "Crimson" and "Apache Software Foundation" must
  30. * not be used to endorse or promote products derived from this
  31. * software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache",
  35. * nor may "Apache" appear in their name, without prior written
  36. * permission of the Apache Software Foundation.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation and was
  54. * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
  55. * http://www.sun.com. For more information on the Apache Software
  56. * Foundation, please see <http://www.apache.org/>.
  57. */
  58. package org.apache.crimson.parser;
  59. import java.io.CharConversionException;
  60. import java.io.UnsupportedEncodingException;
  61. import java.io.InputStreamReader;
  62. import java.io.InputStream;
  63. import java.io.IOException;
  64. import java.io.Reader;
  65. import java.io.File;
  66. import java.net.MalformedURLException;
  67. import java.net.URL;
  68. import java.util.Locale;
  69. import org.xml.sax.*;
  70. import org.apache.crimson.util.XmlChars;
  71. /**
  72. * This is how the parser talks to its input entities, of all kinds.
  73. * The entities are in a stack.
  74. *
  75. * <P> For internal entities, the character arrays are referenced here,
  76. * and read from as needed (they're read-only). External entities have
  77. * mutable buffers, that are read into as needed.
  78. *
  79. * <P> <em>Note:</em> This maps CRLF (and CR) to LF without regard for
  80. * whether it's in an external (parsed) entity or not. The XML 1.0 spec
  81. * is inconsistent in explaining EOL handling; this is the sensible way.
  82. *
  83. * @author David Brownell
  84. * @version $Revision: 1.3 $
  85. */
  86. final class InputEntity implements Locator
  87. {
  // Window of unread characters in buf: buf [start] is the next one to be
  // handed out and finish is one past the last buffered character.
  private int start, finish;
  // Character storage: the caller's array for internal entities, or a
  // BUFSIZ-sized array that fillbuf() refills for external ones.
  private char buf [];
  // Line counter reported through the Locator interface; starts at 1 and is
  // only adjusted while reading external (reader-backed) entities.
  private int lineNumber = 1;
  // True after getc() has returned the first half of a surrogate pair.
  private boolean returnedFirstHalf = false;
  // CR/LF bookkeeping flag reserved for getc().
  private boolean maybeInCRLF = false;
  // name of entity (never main document or unnamed DTD PE)
  private String name;
  // Entity below this one on the stack; null for the document entity.
  private InputEntity next;
  // for system and public IDs in diagnostics
  private InputSource input;
  // this is a buffer; some buffers can be replenished.
  // (null reader means a fixed, internal-entity buffer)
  private Reader reader;
  // Set once close() has run; fillbuf() does nothing afterwards.
  private boolean isClosed;
  // Receives the errors and fatal errors raised by this entity.
  private ErrorHandler errHandler;
  // Locale used when looking up diagnostic messages.
  private Locale locale;
  // Text of the internal DTD subset saved by fillbuf() when the remembered
  // region crosses a buffer refill; null when nothing was spilled.
  private StringBuffer rememberedText;
  // Buffer offset where remembering began; zero means "not remembering".
  private int startRemember;
  // record if this is a PE, so endParsedEntity won't be called
  private boolean isPE;
  // InputStreamReader throws an internal per-read exception, so
  // we minimize reads. We also add a byte to compensate for the
  // "ungetc" byte we keep, so that our downstream reads are as
  // nicely sized as we can make them.
  final private static int BUFSIZ = 8 * 1024 + 1;
  // The single LF delivered to handlers in place of a CR or CRLF.
  final private static char newline [] = { '\n' };
  /**
   * Creates a blank entity bound to the given error handler and locale.
   * One of the <code>init</code> methods must be called before it is
   * read from.
   *
   * @param h handler that will receive errors raised by the entity
   * @param l locale used to format diagnostic messages
   * @return a new, uninitialized entity
   */
  public static InputEntity getInputEntity (ErrorHandler h, Locale l)
  {
      final InputEntity entity = new InputEntity ();

      entity.locale = l;
      entity.errHandler = h;
      return entity;
  }

  // Instances are only handed out by getInputEntity().
  private InputEntity ()
  {
  }
  121. //
  122. // predicate: return true iff this is an internal entity reader,
  123. // and so may safely be "popped" as needed. external entities have
  124. // syntax to uphold; internal parameter entities have at most validity
  125. // constraints to monitor. also, only external entities get decent
  126. // location diagnostics.
  127. //
  128. public boolean isInternal () { return reader == null; }
  129. //
  130. // predicate: return true iff this is the toplevel document
  131. //
  132. public boolean isDocument () { return next == null; }
  133. //
  134. // predicate: return true iff this is a PE expansion (so that
  135. // LexicalEventListner.endParsedEntity won't be called)
  136. //
  137. public boolean isParameterEntity () { return isPE; }
  138. //
  139. // return name of current entity
  140. //
  141. public String getName () { return name; }
  142. private static String convertToFileURL(String filename) {
  143. // On JDK 1.2 and later, simplify this to:
  144. // "path = file.toURL().toString()".
  145. String path = new File(filename).getAbsolutePath();
  146. if (File.separatorChar != '/') {
  147. path = path.replace(File.separatorChar, '/');
  148. }
  149. if (!path.startsWith("/")) {
  150. path = "/" + path;
  151. }
  152. return "file:" + path;
  153. }
  154. /**
  155. * Use this for an external parsed entity
  156. */
  157. public void init(InputSource in, String name, InputEntity stack,
  158. boolean isPE)
  159. throws IOException, SAXException
  160. {
  161. input = in;
  162. this.isPE = isPE;
  163. reader = in.getCharacterStream ();
  164. if (reader == null) {
  165. InputStream bytes = in.getByteStream ();
  166. if (bytes == null) {
  167. // When the app first provides an external InputSource, the
  168. // SystemId may not be a valid URI and just be a simple
  169. // filename. In this case, convert the filename to a
  170. // "file:" URL instead of throwing an exception. Note:
  171. // this does not strictly conform to the SAX spec but is
  172. // convenient for users.
  173. String systemId = in.getSystemId();
  174. URL url;
  175. try {
  176. url = new URL(systemId);
  177. } catch (MalformedURLException e) {
  178. String urlString = convertToFileURL(systemId);
  179. in.setSystemId(urlString);
  180. url = new URL(urlString);
  181. }
  182. reader = XmlReader.createReader(url.openStream());
  183. } else if (in.getEncoding () != null)
  184. reader = XmlReader.createReader (
  185. in.getByteStream (),
  186. in.getEncoding ());
  187. else
  188. reader = XmlReader.createReader (in.getByteStream ());
  189. }
  190. next = stack;
  191. buf = new char [BUFSIZ];
  192. this.name = name;
  193. checkRecursion (stack);
  194. }
  195. //
  196. // use this for an internal parsed entity; buffer is readonly
  197. //
  198. public void init (char b [], String name,
  199. InputEntity stack, boolean isPE)
  200. throws SAXException
  201. {
  202. next = stack;
  203. buf = b;
  204. finish = b.length;
  205. this.name = name;
  206. this.isPE = isPE;
  207. checkRecursion (stack);
  208. }
  209. private void checkRecursion (InputEntity stack) throws SAXException
  210. {
  211. if (stack == null)
  212. return;
  213. for (stack = stack.next; stack != null; stack = stack.next) {
  214. if (stack.name != null && stack.name.equals (name))
  215. fatal ("P-069", new Object [] { name });
  216. }
  217. }
  218. public InputEntity pop () throws IOException
  219. {
  220. // caller has ensured there's nothing left to read
  221. close ();
  222. return next;
  223. }
  224. /** returns true iff there's no more data to consume ... */
  225. public boolean isEOF ()
  226. throws IOException, SAXException
  227. {
  228. // called to ensure WF-ness of included entities and to pop
  229. // input entities appropriately ... EOF is not always legal.
  230. if (start >= finish) {
  231. fillbuf ();
  232. return start >= finish;
  233. } else
  234. return false;
  235. }
  236. /**
  237. * Returns the name of the encoding in use, else null; the name
  238. * returned is in as standard a form as we can get.
  239. */
  240. public String getEncoding ()
  241. {
  242. if (reader == null)
  243. return null;
  244. if (reader instanceof XmlReader)
  245. return ((XmlReader)reader).getEncoding ();
  246. // XXX prefer a java2std() call to normalize names...
  247. if (reader instanceof InputStreamReader)
  248. return ((InputStreamReader)reader).getEncoding ();
  249. return null;
  250. }
  251. /**
  252. * returns the next name char, or NUL ... faster than getc(),
  253. * and the common "name or nmtoken must be next" case won't
  254. * need ungetc().
  255. */
  256. public char getNameChar () throws IOException, SAXException
  257. {
  258. if (finish <= start)
  259. fillbuf ();
  260. if (finish > start) {
  261. char c = buf [start++];
  262. if (XmlChars.isNameChar (c))
  263. return c;
  264. start--;
  265. }
  266. return 0;
  267. }
  268. /**
  269. * gets the next Java character -- might be part of an XML
  270. * text character represented by a surrogate pair, or be
  271. * the end of the entity.
  272. */
  273. public char getc () throws IOException, SAXException
  274. {
  275. if (finish <= start)
  276. fillbuf ();
  277. if (finish > start) {
  278. char c = buf [start++];
  279. // [2] Char ::= #x0009 | #x000A | #x000D
  280. // | [#x0020-#xD7FF]
  281. // | [#xE000-#xFFFD]
  282. // plus surrogate _pairs_ representing [#x10000-#x10ffff]
  283. if (returnedFirstHalf) {
  284. if (c >= 0xdc00 && c <= 0xdfff) {
  285. returnedFirstHalf = false;
  286. return c;
  287. } else
  288. fatal ("P-070", new Object [] { Integer.toHexString (c) });
  289. }
  290. if ((c >= 0x0020 && c <= 0xD7FF)
  291. || c == 0x0009
  292. // no surrogates!
  293. || (c >= 0xE000 && c <= 0xFFFD))
  294. return c;
  295. //
  296. // CRLF and CR are both line ends; map both to LF, and
  297. // keep line count correct.
  298. //
  299. else if (c == '\r' && !isInternal ()) {
  300. maybeInCRLF = true;
  301. c = getc ();
  302. if (c != '\n')
  303. ungetc ();
  304. maybeInCRLF = false;
  305. lineNumber++;
  306. return '\n';
  307. } else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF
  308. if (!isInternal () && !maybeInCRLF)
  309. lineNumber++;
  310. return c;
  311. }
  312. // surrogates...
  313. if (c >= 0xd800 && c < 0xdc00) {
  314. returnedFirstHalf = true;
  315. return c;
  316. }
  317. fatal ("P-071", new Object [] { Integer.toHexString (c) });
  318. }
  319. throw new EndOfInputException ();
  320. }
  321. public boolean peekc (char c) throws IOException, SAXException
  322. {
  323. if (finish <= start)
  324. fillbuf ();
  325. if (finish > start) {
  326. if (buf [start] == c) {
  327. start++;
  328. return true;
  329. } else
  330. return false;
  331. }
  332. return false;
  333. }
  334. /**
  335. * two character pushback is guaranteed
  336. */
  337. public void ungetc ()
  338. {
  339. if (start == 0)
  340. throw new InternalError ("ungetc");
  341. start--;
  342. if (buf [start] == '\n' || buf [start] == '\r') {
  343. if (!isInternal ())
  344. lineNumber--;
  345. } else if (returnedFirstHalf)
  346. returnedFirstHalf = false;
  347. }
  348. /**
  349. * optional grammatical whitespace (discarded)
  350. */
  351. public boolean maybeWhitespace ()
  352. throws IOException, SAXException
  353. {
  354. char c;
  355. boolean isSpace = false;
  356. boolean sawCR = false;
  357. // [3] S ::= #20 | #09 | #0D | #0A
  358. for (;;) {
  359. if (finish <= start)
  360. fillbuf ();
  361. if (finish <= start)
  362. return isSpace;
  363. c = buf [start++];
  364. if (c == 0x20 || c == 0x09 || c == '\n' || c == '\r') {
  365. isSpace = true;
  366. //
  367. // CR, LF are line endings ... CLRF is one, not two!
  368. //
  369. if ((c == '\n' || c == '\r') && !isInternal ()) {
  370. if (!(c == '\n' && sawCR)) {
  371. lineNumber++;
  372. sawCR = false;
  373. }
  374. if (c == '\r')
  375. sawCR = true;
  376. }
  377. } else {
  378. start--;
  379. return isSpace;
  380. }
  381. }
  382. }
  383. /**
  384. * normal content; whitespace in markup may be handled
  385. * specially if the parser uses the content model.
  386. *
  387. * <P> content terminates with markup delimiter characters,
  388. * namely ampersand (&amp;) and left angle bracket (&lt;).
  389. *
  390. * <P> the document handler's characters() method is called
  391. * on all the content found
  392. */
  393. public boolean parsedContent (
  394. ContentHandler contentHandler,
  395. ElementValidator validator
  396. ) throws IOException, SAXException
  397. {
  398. // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  399. int first; // first char to return
  400. int last; // last char to return
  401. boolean sawContent; // sent any chars?
  402. char c;
  403. // deliver right out of the buffer, until delimiter, EOF,
  404. // or error, refilling as we go
  405. for (first = last = start, sawContent = false; ; last++) {
  406. // buffer empty?
  407. if (last >= finish) {
  408. if (last > first) {
  409. validator.text ();
  410. contentHandler.characters (buf, first, last - first);
  411. sawContent = true;
  412. start = last;
  413. }
  414. if (isEOF ()) // calls fillbuf
  415. return sawContent;
  416. first = start;
  417. last = first - 1; // incremented in loop
  418. continue;
  419. }
  420. c = buf [last];
  421. //
  422. // pass most chars through ASAP; this inlines the code of
  423. // [2] !XmlChars.isChar(c) leaving only characters needing
  424. // special treatment ... line ends, surrogates, and:
  425. // 0x0026 == '&'
  426. // 0x003C == '<'
  427. // 0x005D == ']'
  428. // Comparisons ordered for speed on 'typical' text
  429. //
  430. if ( (c > 0x005D && c <= 0xD7FF) // a-z and more
  431. || (c < 0x0026 && c >= 0x0020) // space & punct
  432. || (c > 0x003C && c < 0x005D) // A-Z & punct
  433. || (c > 0x0026 && c < 0x003C) // 0-9 & punct
  434. || c == 0x0009
  435. || (c >= 0xE000 && c <= 0xFFFD)
  436. )
  437. continue;
  438. // terminate on markup delimiters
  439. if (c == '<' || c == '&')
  440. break;
  441. // count lines
  442. if (c == '\n') {
  443. if (!isInternal ())
  444. lineNumber++;
  445. continue;
  446. }
  447. // External entities get CR, CRLF --> LF mapping
  448. // Internal ones got it already, and we can't repeat
  449. // else we break char ref handling!!
  450. if (c == '\r') {
  451. if (isInternal ())
  452. continue;
  453. contentHandler.characters (buf, first, last - first);
  454. contentHandler.characters (newline, 0, 1);
  455. sawContent = true;
  456. lineNumber++;
  457. if (finish > (last + 1)) {
  458. if (buf [last + 1] == '\n')
  459. last++;
  460. } else { // CR at end of buffer
  461. // XXX case not yet handled: CRLF here will look like two lines
  462. }
  463. first = start = last + 1;
  464. continue;
  465. }
  466. // ']]>' is a WF error -- must fail if we see it
  467. if (c == ']') {
  468. switch (finish - last) {
  469. // for suspicious end-of-buffer cases, get more data
  470. // into the buffer to rule out this sequence.
  471. case 2:
  472. if (buf [last + 1] != ']')
  473. continue;
  474. // FALLTHROUGH
  475. case 1:
  476. if (reader == null || isClosed)
  477. continue;
  478. if (last == first)
  479. throw new InternalError ("fillbuf");
  480. last--;
  481. if (last > first) {
  482. validator.text ();
  483. contentHandler.characters (buf, first, last - first);
  484. sawContent = true;
  485. start = last;
  486. }
  487. fillbuf ();
  488. first = last = start;
  489. continue;
  490. // otherwise any "]]>" would be buffered, and we can
  491. // see right away if that's what we have
  492. default:
  493. if (buf [last + 1] == ']' && buf [last + 2] == '>')
  494. fatal ("P-072", null);
  495. continue;
  496. }
  497. }
  498. // correctly paired surrogates are OK
  499. if (c >= 0xd800 && c <= 0xdfff) {
  500. if ((last + 1) >= finish) {
  501. if (last > first) {
  502. validator.text ();
  503. contentHandler.characters (buf, first, last - first);
  504. sawContent = true;
  505. start = last + 1;
  506. }
  507. if (isEOF ()) { // calls fillbuf
  508. fatal ("P-081",
  509. new Object [] { Integer.toHexString (c) });
  510. }
  511. first = start;
  512. last = first ;
  513. continue;
  514. }
  515. if (checkSurrogatePair (last))
  516. last++;
  517. else {
  518. last--;
  519. // also terminate on surrogate pair oddities
  520. break;
  521. }
  522. continue;
  523. }
  524. fatal ("P-071", new Object [] { Integer.toHexString (c) });
  525. }
  526. if (last == first)
  527. return sawContent;
  528. validator.text ();
  529. contentHandler.characters (buf, first, last - first);
  530. start = last;
  531. return true;
  532. }
  533. /**
  534. * CDATA -- character data, terminated by "]]>" and optionally
  535. * including unescaped markup delimiters (ampersand and left angle
  536. * bracket). This should otherwise be exactly like character data,
  537. * modulo differences in error report details.
  538. *
  539. * <P> The document handler's characters() or ignorableWhitespace()
  540. * methods are invoked on all the character data found
  541. *
  542. * @param contentHandler gets callbacks for character data
  543. * @param validator text() or ignorableWhitespace() methods are
  544. * called appropriately
  545. * @param ignorableWhitespace if true, whitespace characters will
  546. * be reported using contentHandler.ignorableWhitespace(); implicitly,
  547. * non-whitespace characters will cause validation errors
  548. * @param standaloneWhitespaceInvalid if true, ignorable whitespace
  549. * causes a validity error report as well as a callback
  550. */
  551. public void unparsedContent (
  552. ContentHandler contentHandler,
  553. ElementValidator validator,
  554. boolean ignorableWhitespace,
  555. String whitespaceInvalidMessage
  556. ) throws IOException, SAXException
  557. {
  558. // [18] CDSect ::= CDStart CData CDEnd
  559. // [19] CDStart ::= '<![CDATA['
  560. // [20] CData ::= (Char* - (Char* ']]>' Char*))
  561. // [21] CDEnd ::= ']]>'
  562. // Caller has already consumed the leading '<![CDATA[' so all that
  563. // remains to be parsed of [18] is "CData CDEnd"
  564. // only a literal ']]>' stops this ...
  565. int last;
  566. for (;;) { // until ']]>' seen
  567. boolean done = false;
  568. char c;
  569. // don't report ignorable whitespace as "text" for
  570. // validation purposes.
  571. boolean white = ignorableWhitespace;
  572. for (last = start; last < finish; last++) {
  573. c = buf [last];
  574. //
  575. // Reject illegal characters.
  576. //
  577. if (!XmlChars.isChar (c)) {
  578. white = false;
  579. if (c >= 0xd800 && c <= 0xdfff) {
  580. if (checkSurrogatePair (last)) {
  581. last++;
  582. continue;
  583. } else {
  584. last--;
  585. break;
  586. }
  587. }
  588. fatal ("P-071", new Object []
  589. { Integer.toHexString (buf [last]) });
  590. }
  591. if (c == '\n') {
  592. if (!isInternal ())
  593. lineNumber++;
  594. continue;
  595. }
  596. if (c == '\r') {
  597. // As above, we can't repeat CR/CRLF --> LF mapping
  598. if (isInternal ())
  599. continue;
  600. if (white) {
  601. if (whitespaceInvalidMessage != null)
  602. errHandler.error (new SAXParseException (
  603. Parser2.messages.getMessage (locale,
  604. whitespaceInvalidMessage),
  605. this));
  606. contentHandler.ignorableWhitespace (buf, start,
  607. last - start);
  608. contentHandler.ignorableWhitespace (newline, 0, 1);
  609. } else {
  610. validator.text ();
  611. contentHandler.characters (buf, start, last - start);
  612. contentHandler.characters (newline, 0, 1);
  613. }
  614. lineNumber++;
  615. if (finish > (last + 1)) {
  616. if (buf [last + 1] == '\n')
  617. last++;
  618. } else { // CR at end of buffer
  619. // XXX case not yet handled ... as above
  620. }
  621. start = last + 1;
  622. continue;
  623. }
  624. if (c != ']') {
  625. if (c != ' ' && c != '\t')
  626. white = false;
  627. continue;
  628. }
  629. // assert(buf[last] == ']');
  630. if ((last + 2) < finish) {
  631. if (buf [last + 1] == ']' && buf [last + 2] == '>') {
  632. done = true;
  633. break;
  634. }
  635. white = false;
  636. continue;
  637. } else {
  638. // "last" is at or one before end of buffered data.
  639. // Report what we have so far, not including "last", by
  640. // breaking and executing code below, outside inner
  641. // loop, then continuing on to find end of CDATA section.
  642. break;
  643. }
  644. }
  645. if (white) {
  646. if (whitespaceInvalidMessage != null)
  647. errHandler.error (new SAXParseException (
  648. Parser2.messages.getMessage (locale,
  649. whitespaceInvalidMessage),
  650. this));
  651. contentHandler.ignorableWhitespace (buf, start, last - start);
  652. } else {
  653. validator.text ();
  654. contentHandler.characters (buf, start, last - start);
  655. }
  656. if (done) {
  657. start = last + 3;
  658. break;
  659. }
  660. start = last;
  661. fillbuf();
  662. if (isEOF ())
  663. fatal ("P-073", null);
  664. }
  665. }
  666. // return false to backstep at end of buffer)
  667. private boolean checkSurrogatePair (int offset)
  668. throws SAXException
  669. {
  670. if ((offset + 1) >= finish)
  671. return false;
  672. char c1 = buf [offset++];
  673. char c2 = buf [offset];
  674. if ((c1 >= 0xd800 && c1 < 0xdc00) && (c2 >= 0xdc00 && c2 <= 0xdfff))
  675. return true;
  676. fatal ("P-074", new Object [] {
  677. Integer.toHexString (c1 & 0x0ffff),
  678. Integer.toHexString (c2 & 0x0ffff)
  679. });
  680. return false;
  681. }
  682. /**
  683. * whitespace in markup (flagged to app, discardable)
  684. *
  685. * <P> the document handler's ignorableWhitespace() method
  686. * is called on all the whitespace found
  687. */
  688. public boolean ignorableWhitespace (ContentHandler handler)
  689. throws IOException, SAXException
  690. {
  691. char c;
  692. boolean isSpace = false;
  693. int first;
  694. // [3] S ::= #20 | #09 | #0D | #0A
  695. for (first = start;;) {
  696. if (finish <= start) {
  697. if (isSpace)
  698. handler.ignorableWhitespace (buf, first, start - first);
  699. fillbuf ();
  700. first = start;
  701. }
  702. if (finish <= start)
  703. return isSpace;
  704. c = buf [start++];
  705. switch (c) {
  706. case '\n':
  707. if (!isInternal ())
  708. lineNumber++;
  709. // XXX handles Macintosh line endings wrong
  710. // fallthrough
  711. case 0x09:
  712. case 0x20:
  713. isSpace = true;
  714. continue;
  715. case '\r':
  716. isSpace = true;
  717. if (!isInternal ())
  718. lineNumber++;
  719. handler.ignorableWhitespace (buf, first,
  720. (start - 1) - first);
  721. handler.ignorableWhitespace (newline, 0, 1);
  722. if (start < finish && buf [start] == '\n')
  723. ++start;
  724. first = start;
  725. continue;
  726. default:
  727. ungetc ();
  728. if (isSpace)
  729. handler.ignorableWhitespace (buf, first, start - first);
  730. return isSpace;
  731. }
  732. }
  733. }
  734. /**
  735. * returns false iff 'next' string isn't as provided,
  736. * else skips that text and returns true
  737. *
  738. * <P> NOTE: two alternative string representations are
  739. * both passed in, since one is faster.
  740. */
  741. public boolean peek (String next, char chars [])
  742. throws IOException, SAXException
  743. {
  744. int len;
  745. int i;
  746. if (chars != null)
  747. len = chars.length;
  748. else
  749. len = next.length ();
  750. // buffer should hold the whole thing ... give it a
  751. // chance for the end-of-buffer case and cope with EOF
  752. // by letting fillbuf compact and fill
  753. if (finish <= start || (finish - start) < len)
  754. fillbuf ();
  755. // can't peek past EOF
  756. if (finish <= start)
  757. return false;
  758. // compare the string; consume iff it matches
  759. if (chars != null) {
  760. for (i = 0; i < len && (start + i) < finish; i++) {
  761. if (buf [start + i] != chars [i])
  762. return false;
  763. }
  764. } else {
  765. for (i = 0; i < len && (start + i) < finish; i++) {
  766. if (buf [start + i] != next.charAt (i))
  767. return false;
  768. }
  769. }
  770. // if the first fillbuf didn't get enough data, give
  771. // fillbuf another chance to read
  772. if (i < len) {
  773. if (reader == null || isClosed)
  774. return false;
  775. //
  776. // This diagnostic "knows" that the only way big strings would
  777. // fail to be peeked is where it's a symbol ... e.g. for an
  778. // </EndTag> construct. That knowledge could also be applied
  779. // to get rid of the symbol length constraint, since having
  780. // the wrong symbol is a fatal error anyway ...
  781. //
  782. if (len > buf.length)
  783. fatal ("P-077", new Object [] { new Integer (buf.length) });
  784. fillbuf ();
  785. return peek (next, chars);
  786. }
  787. start += len;
  788. return true;
  789. }
  790. /**
  791. * This method is used to disambiguate between XMLDecl, TextDecl, and
  792. * PI by doing a lookahead w/o consuming any characters. We look for
  793. * "<?xml" plus a whitespace character, but no more. For example, we
  794. * could have input documents with the PI "<?xml-stylesheet ... >".
  795. *
  796. * @return true iff next chars match either the prefix for XMLDecl or
  797. * TextDecl
  798. */
  799. boolean isXmlDeclOrTextDeclPrefix()
  800. throws IOException, SAXException
  801. {
  802. // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl?
  803. // SDDecl? S? '>'
  804. // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  805. // [24] VersionInfo ::= S 'version' Eq \'|\" versionNum \'|\"
  806. String match = "<?xml";
  807. int matchLen = match.length();
  808. // Length of the entire prefix including whitespace
  809. int prefixLen = matchLen + 1;
  810. // buffer should hold the whole thing ... give it a
  811. // chance for the end-of-buffer case and cope with EOF
  812. // by letting fillbuf compact and fill
  813. if (finish <= start || (finish - start) < prefixLen)
  814. fillbuf ();
  815. // can't peek past EOF
  816. if (finish <= start)
  817. return false;
  818. // Compare the non-whitespace part of the prefix
  819. int i;
  820. for (i = 0; i < matchLen && (start + i) < finish; i++) {
  821. if (buf [start + i] != match.charAt (i))
  822. return false;
  823. }
  824. // if the first fillbuf didn't get enough data, give
  825. // fillbuf another chance to read
  826. if (i < matchLen) {
  827. if (reader == null || isClosed)
  828. return false;
  829. fillbuf ();
  830. return isXmlDeclOrTextDeclPrefix();
  831. }
  832. // assert(i == matchLen);
  833. // Match whitespace
  834. if (!XmlChars.isSpace(buf[i])) {
  835. return false;
  836. }
  837. return true;
  838. }
  839. //
  840. // Support for reporting the internal DTD subset, so <!DOCTYPE...>
  841. // declarations can be recreated. This is collected as a single
  842. // string; such subsets are normally small, and many applications
  843. // don't even care about this.
  844. //
  845. public void startRemembering ()
  846. {
  847. if (startRemember != 0)
  848. throw new InternalError ();
  849. startRemember = start;
  850. }
  851. public String rememberText ()
  852. {
  853. String retval;
  854. // If the internal subset crossed a buffer boundary, we
  855. // created a temporary buffer.
  856. if (rememberedText != null) {
  857. rememberedText.append (buf, startRemember,
  858. start - startRemember);
  859. retval = rememberedText.toString ();
  860. } else
  861. retval = new String (buf, startRemember,
  862. start - startRemember);
  863. startRemember = 0;
  864. rememberedText = null;
  865. return retval;
  866. }
  867. // LOCATOR METHODS
  868. private Locator getLocator ()
  869. {
  870. InputEntity current = this;
  871. // don't report locations within internal entities!
  872. while (current != null && current.input == null)
  873. current = current.next;
  874. return current == null ? this : current;
  875. }
  876. /** Returns the public ID of this input source, if known */
  877. public String getPublicId ()
  878. {
  879. Locator where = getLocator ();
  880. if (where == this)
  881. return input.getPublicId ();
  882. return where.getPublicId ();
  883. }
  884. /** Returns the system ID of this input source, if known */
  885. public String getSystemId ()
  886. {
  887. Locator where = getLocator ();
  888. if (where == this)
  889. return input.getSystemId ();
  890. return where.getSystemId ();
  891. }
  892. /** Returns the current line number in this input source */
  893. public int getLineNumber ()
  894. {
  895. Locator where = getLocator ();
  896. if (where == this)
  897. return lineNumber;
  898. return where.getLineNumber ();
  899. }
  900. /** returns -1; maintaining column numbers hurts performance */
  901. public int getColumnNumber ()
  902. {
  903. return -1; // not maintained (speed)
  904. }
  905. //
  906. // n.b. for non-EOF end-of-buffer cases, reader should return
  907. // at least a handful of bytes so various lookaheads behave.
  908. //
  909. // two character pushback exists except at first; characters
  910. // represented by surrogate pairs can't be pushed back (they'd
  911. // only be in character data anyway).
  912. //
  913. // SAX exception thrown on char conversion problems; line number
  914. // will be low, as a rule.
  915. //
  916. private void fillbuf () throws IOException, SAXException
  917. {
  918. // don't touched fixed buffers, that'll usually
  919. // change entity values (and isn't needed anyway)
  920. // likewise, ignore closed streams
  921. if (reader == null || isClosed)
  922. return;
  923. // if remembering DTD text, copy!
  924. if (startRemember != 0) {
  925. if (rememberedText == null)
  926. rememberedText = new StringBuffer (buf.length);
  927. rememberedText.append (buf, startRemember,
  928. start - startRemember);
  929. }
  930. boolean extra = (finish > 0) && (start > 0);
  931. int len;
  932. if (extra) // extra pushback
  933. start--;
  934. len = finish - start;
  935. System.arraycopy (buf, start, buf, 0, len);
  936. start = 0;
  937. finish = len;
  938. try {
  939. len = buf.length - len;
  940. len = reader.read (buf, finish, len);
  941. } catch (UnsupportedEncodingException e) {
  942. fatal ("P-075", new Object [] { e.getMessage () });
  943. } catch (CharConversionException e) {
  944. fatal ("P-076", new Object [] { e.getMessage () });
  945. }
  946. if (len >= 0)
  947. finish += len;
  948. else
  949. close ();
  950. if (extra) // extra pushback
  951. start++;
  952. if (startRemember != 0)
  953. // assert extra == true
  954. startRemember = 1;
  955. }
  956. public void close ()
  957. {
  958. try {
  959. if (reader != null && !isClosed)
  960. reader.close ();
  961. isClosed = true;
  962. } catch (IOException e) {
  963. /* NOTHING */
  964. }
  965. }
  966. private void fatal (String messageId, Object params []) throws SAXException
  967. {
  968. SAXParseException x = new SAXParseException (
  969. Parser2.messages.getMessage (locale, messageId, params),
  970. this);
  971. // not continuable ... e.g. WF errors
  972. close ();
  973. errHandler.fatalError (x);
  974. throw x;
  975. }
  976. }