1. /*
  2. * $Id: XmlReader.java,v 1.3 2002/01/24 20:25:27 edwingo Exp $
  3. *
  4. * The Apache Software License, Version 1.1
  5. *
  6. *
  7. * Copyright (c) 2000 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution,
  23. * if any, must include the following acknowledgment:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowledgment may appear in the software itself,
  27. * if and wherever such third-party acknowledgments normally appear.
  28. *
  29. * 4. The names "Crimson" and "Apache Software Foundation" must
  30. * not be used to endorse or promote products derived from this
  31. * software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache",
  35. * nor may "Apache" appear in their name, without prior written
  36. * permission of the Apache Software Foundation.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation and was
  54. * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
  55. * http://www.sun.com. For more information on the Apache Software
  56. * Foundation, please see <http://www.apache.org/>.
  57. */
  58. package org.apache.crimson.parser;
  59. import java.io.*;
  60. import java.util.Hashtable;
  61. // NOTE: Add I18N support to this class when JDK gets the ability to
  62. // defer selection of locale for exception messages ... use the same
  63. // technique for both.
  /**
   * This handles several XML-related tasks that normal java.io Readers
   * don't support, including use of IETF standard encoding names and
   * automatic detection of most XML encodings.  The former is needed
   * for interoperability; the latter is needed to conform with the XML
   * spec.  This class also optimizes reading some common encodings by
   * providing low-overhead unsynchronized Reader support.
   *
   * <P> Note that the autodetection facility should be used only on
   * data streams which have an unknown character encoding.  For example,
   * it should never be used on MIME text/xml entities.
   *
   * <P> Note that XML processors are only required to support UTF-8 and
   * UTF-16 character encodings.  Autodetection permits the underlying Java
   * implementation to provide support for many other encodings, such as
   * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
   *
   * @author David Brownell
   * @version $Revision: 1.3 $
   */
  // package private
  final class XmlReader extends Reader
  {
      /** Capacity of the pushback buffer, and the most we buffer of an XML declaration. */
      private static final int MAXPUSHBACK = 512;

      /** Delegate doing the real decoding; null once this reader is closed. */
      private Reader in;
      /** Name of the encoding that was detected or declared. */
      private String assignedEncoding;
      private boolean closed;

      //
      // This class always delegates I/O to a reader, which gets
      // its data from the very beginning of the XML text.  It needs
      // to use a pushback stream since (a) autodetection can read
      // partial UTF-8 characters which need to be fully processed,
      // (b) the "Unicode" readers swallow characters that they think
      // are byte order marks, so tests fail if they don't see the
      // real byte order mark.
      //
      // It's got to do this efficiently:  character I/O is solidly on the
      // critical path.  (So keep buffer length over 2 Kbytes to avoid
      // excess buffering.  Many URL handlers stuff a BufferedInputStream
      // between here and the real data source, and larger buffers keep
      // that from slowing you down.)
      //

      /**
       * Constructs the reader from an input stream, autodetecting
       * the encoding to use according to the heuristic specified
       * in the XML 1.0 recommendation.
       *
       * @param in the input stream from which the reader is constructed
       * @exception IOException on error, such as unrecognized encoding
       */
      public static Reader createReader (InputStream in) throws IOException
      {
          return new XmlReader (in);
      }

      /**
       * Creates a reader supporting the given encoding, mapping
       * from standard encoding names to ones that are understood by
       * Java where necessary.
       *
       * @param in the input stream from which the reader is constructed
       * @param encoding the IETF standard name of the encoding to use;
       *  if null, autodetection is used.
       * @exception IOException on error, including unrecognized encoding
       */
      public static Reader createReader (InputStream in, String encoding)
      throws IOException
      {
          if (encoding == null)
              return new XmlReader (in);

          // Hand-written readers for the most common encodings.
          if ("UTF-8".equalsIgnoreCase (encoding)
                  || "UTF8".equalsIgnoreCase (encoding))
              return new Utf8Reader (in);
          if ("US-ASCII".equalsIgnoreCase (encoding)
                  || "ASCII".equalsIgnoreCase (encoding))
              return new AsciiReader (in);
          if ("ISO-8859-1".equalsIgnoreCase (encoding)
                  // plus numerous aliases ...
                  )
              return new Iso8859_1Reader (in);

          //
          // What we really want is an administerable resource mapping
          // encoding names/aliases to classnames.  For example a property
          // file resource, "readers/mapping.props", holding a set
          // of readers in that (sub)package... defaulting to this call
          // only if no better choice is available.
          //
          return new InputStreamReader (in, std2java (encoding));
      }

      //
      // JDK doesn't know all of the standard encoding names, and
      // in particular none of the EBCDIC ones IANA defines (and
      // which IBM encourages).
      //
      static private final Hashtable charsets = new Hashtable (31);

      static {
          charsets.put ("UTF-16", "Unicode");
          charsets.put ("ISO-10646-UCS-2", "Unicode");

          // NOTE: no support for ISO-10646-UCS-4 yet.

          charsets.put ("EBCDIC-CP-US", "cp037");
          charsets.put ("EBCDIC-CP-CA", "cp037");
          charsets.put ("EBCDIC-CP-NL", "cp037");
          charsets.put ("EBCDIC-CP-WT", "cp037");

          charsets.put ("EBCDIC-CP-DK", "cp277");
          charsets.put ("EBCDIC-CP-NO", "cp277");
          charsets.put ("EBCDIC-CP-FI", "cp278");
          charsets.put ("EBCDIC-CP-SE", "cp278");

          charsets.put ("EBCDIC-CP-IT", "cp280");
          charsets.put ("EBCDIC-CP-ES", "cp284");
          charsets.put ("EBCDIC-CP-GB", "cp285");
          charsets.put ("EBCDIC-CP-FR", "cp297");

          charsets.put ("EBCDIC-CP-AR1", "cp420");
          charsets.put ("EBCDIC-CP-HE", "cp424");
          charsets.put ("EBCDIC-CP-BE", "cp500");
          charsets.put ("EBCDIC-CP-CH", "cp500");

          charsets.put ("EBCDIC-CP-ROECE", "cp870");
          charsets.put ("EBCDIC-CP-YU", "cp870");
          charsets.put ("EBCDIC-CP-IS", "cp871");
          charsets.put ("EBCDIC-CP-AR2", "cp918");

          // IANA also defines two that JDK 1.2 doesn't handle:
          //  EBCDIC-CP-GR        --> CP423
          //  EBCDIC-CP-TR        --> CP905
      }

      // returns an encoding name supported by JDK >= 1.1.6
      // for some cases required by the XML spec
      private static String std2java (String encoding)
      {
          // The table keys are upper case ASCII.  Upper-casing must not
          // depend on the default locale: under a Turkish locale "i" maps
          // to a dotted capital I and the lookup would silently miss.
          String mapped = (String) charsets.get (
                  encoding.toUpperCase (java.util.Locale.ENGLISH));

          return mapped != null ? mapped : encoding;
      }

      /** Returns the standard name of the encoding in use */
      public String getEncoding ()
      {
          return assignedEncoding;
      }

      /**
       * Reads from <em>src</em> into <em>dest</em> (starting at index zero)
       * until the array is full, the stream reports end of input, or a byte
       * equal to <em>stop</em> has been read.  A single read() call may
       * legitimately return fewer bytes than are wanted, so one call is
       * not enough to peek reliably at the head of a stream.
       *
       * @param src stream to read from
       * @param dest receives the bytes
       * @param stop unsigned byte value which ends the read once seen,
       *  or -1 to read until the array is full
       * @return the number of bytes stored in <em>dest</em>, possibly zero
       */
      private static int fill (InputStream src, byte dest [], int stop)
      throws IOException
      {
          int total = 0;

          while (total < dest.length) {
              int count = src.read (dest, total, dest.length - total);

              if (count <= 0)
                  break;

              // keep everything this read delivered, even past the stop byte
              int end = total + count;

              while (total < end) {
                  if ((dest [total++] & 0x0ff) == stop)
                      return end;
              }
          }
          return total;
      }

      /**
       * Autodetects the encoding by peeking at the first four bytes of
       * the stream, then sets up the delegate reader over a pushback
       * stream holding everything that was peeked.
       */
      private XmlReader (InputStream stream) throws IOException
      {
          super (stream);

          PushbackInputStream pb;
          byte                buf [];
          int                 len;

          //
          // Always wrap the stream, even when it already is a pushback
          // stream:  URL connections may hand us a PushbackInputStream
          // with a tiny buffer (size 7), and pushing back up to
          // MAXPUSHBACK bytes into that one raises an exception.
          //
          pb = new PushbackInputStream (stream, MAXPUSHBACK);

          //
          // See if we can figure out the character encoding used
          // in this file by peeking at the first few bytes.
          //
          buf = new byte [4];
          len = fill (pb, buf, -1);
          if (len > 0)
              pb.unread (buf, 0, len);

          if (len == 4) switch (buf [0] & 0x0ff) {
              case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) {
                    setEncoding (pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

              case '<':      // 0x3c: the most common cases!
                switch (buf [1] & 0x0ff) {
                  // First character is '<'; could be XML without
                  // an XML directive such as "<hello>", "<!-- ...",
                  // and so on.
                  default:
                    break;

                  // 3c 00 3f 00 == illegal UTF-16 little endian
                  case 0x00:
                    if (buf [2] == 0x3f && buf [3] == 0x00) {
                        setEncoding (pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                  // 3c 3f 78 6d == ASCII and supersets '<?xm'
                  case '?':
                    if (buf [2] != 'x' || buf [3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl (pb, "UTF8");
                    return;
                }
                break;

              // 4c 6f a7 94 ... some EBCDIC code page
              case 0x4c:
                if (buf [1] == 0x6f
                        && (0x0ff & buf [2]) == 0x0a7
                        && (0x0ff & buf [3]) == 0x094) {
                    useEncodingDecl (pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

              // UTF-16 big-endian
              case 0xfe:
                if ((buf [1] & 0x0ff) != 0xff)
                    break;
                setEncoding (pb, "UTF-16");
                return;

              // UTF-16 little-endian
              case 0xff:
                if ((buf [1] & 0x0ff) != 0xfe)
                    break;
                setEncoding (pb, "UTF-16");
                return;

              // default ... no XML declaration
              default:
                break;
          }

          //
          // If all else fails, assume XML without a declaration, and
          // using UTF-8 encoding.
          //
          setEncoding (pb, "UTF-8");
      }

      /**
       * Tells whether <em>name</em> matches the XML production
       * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
       * The empty string does not match.
       */
      private static boolean isEncName (String name)
      {
          int length = name.length ();

          if (length == 0)
              return false;
          for (int i = 0; i < length; i++) {
              char ch = name.charAt (i);

              if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
                  continue;
              if (i > 0 && (ch == '-'
                      || (ch >= '0' && ch <= '9')
                      || ch == '.' || ch == '_'))
                  continue;
              return false;
          }
          return true;
      }

      /*
       * Read the encoding decl on the stream, knowing that it should
       * be readable using the specified encoding (basically, ASCII or
       * EBCDIC).  The body of the document may use a wider range of
       * characters than the XML/Text decl itself, so we switch to use
       * the specified encoding as soon as we can.  (ASCII is a subset
       * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
       * has a variety of "code pages" that have these characters as
       * a common subset.)
       *
       * Whenever no usable encoding pseudo-attribute is found (no
       * declaration after all, no encoding given, or an encoding name
       * that isn't a legal EncName) the document is read as UTF-8.
       */
      private void useEncodingDecl (PushbackInputStream pb, String encoding)
      throws IOException
      {
          byte    buffer [] = new byte [MAXPUSHBACK];
          int     len;
          Reader  r;
          int     c;

          //
          // Buffer up input until the '>' which ends the declaration
          // has arrived (or we run out of room or input), push all of
          // it back, and set up to read it in the specified encoding.
          // We can skip the first four bytes since we know that "<?xm"
          // was read to determine what encoding to use!
          //
          len = fill (pb, buffer, ">".getBytes (encoding) [0] & 0x0ff);
          if (len > 0)
              pb.unread (buffer, 0, len);
          // third argument is a byte count, not an end offset
          r = new InputStreamReader (
                  new ByteArrayInputStream (buffer, 4, Math.max (len - 4, 0)),
                  encoding);

          //
          // Next must be "l" (and whitespace) else we conclude
          // error and choose UTF-8.
          //
          if ((c = r.read ()) != 'l') {
              setEncoding (pb, "UTF-8");
              return;
          }

          //
          // Then, we'll skip any
          //  S version="..."     [or single quotes]
          // bit and get any subsequent
          //  S encoding="..."    [or single quotes]
          //
          // We put an arbitrary size limit on how far we read; lots
          // of space will break this algorithm.
          //
          StringBuffer  buf = new StringBuffer ();
          StringBuffer  keyBuf = null;    // non-null while a key is being collected
          String        key = null;
          boolean       sawEq = false;
          char          quoteChar = 0;    // zero until a value's opening quote is seen
          boolean       sawQuestion = false;

          for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
              if ((c = r.read ()) == -1)
                  break;

              // ignore whitespace before/between "key = 'value'"
              if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                  continue;

              // ... but require at least a little!
              if (i == 0)
                  break;

              // terminate the loop ASAP
              if (c == '?')
                  sawQuestion = true;
              else if (sawQuestion) {
                  if (c == '>')
                      break;
                  sawQuestion = false;
              }

              // did we get the "key =" bit yet?
              if (key == null || !sawEq) {
                  if (keyBuf == null) {
                      if (Character.isWhitespace ((char) c))
                          continue;
                      keyBuf = buf;
                      buf.setLength (0);
                      buf.append ((char) c);
                      sawEq = false;
                  } else if (Character.isWhitespace ((char) c)) {
                      key = keyBuf.toString ();
                  } else if (c == '=') {
                      if (key == null)
                          key = keyBuf.toString ();
                      sawEq = true;
                      keyBuf = null;
                      quoteChar = 0;
                  } else
                      keyBuf.append ((char) c);
                  continue;
              }

              // space before quoted value
              if (Character.isWhitespace ((char) c))
                  continue;
              if (c == '"' || c == '\'') {
                  if (quoteChar == 0) {
                      // opening quote:  start collecting the value
                      quoteChar = (char) c;
                      buf.setLength (0);
                      continue;
                  } else if (c == quoteChar) {
                      // closing quote:  the value is complete
                      if ("encoding".equals (key)) {
                          String name = buf.toString ();

                          // map illegal names to UTF-8 default
                          setEncoding (pb, isEncName (name) ? name : "UTF-8");
                          return;
                      } else {
                          // some other pseudo-attribute; look for the next key
                          key = null;
                          continue;
                      }
                  }
              }
              buf.append ((char) c);
          }

          setEncoding (pb, "UTF-8");
      }

      /** Records the encoding name and creates the delegate reader for it. */
      private void setEncoding (InputStream stream, String encoding)
      throws IOException
      {
          assignedEncoding = encoding;
          in = createReader (stream, encoding);
      }

      /**
       * Returns the number of characters read into the buffer, or -1 on
       * EOF.  Reaching EOF closes this reader; reading again after that
       * (or after an explicit close) keeps returning -1.
       */
      public int read (char buf [], int off, int len) throws IOException
      {
          int val;

          if (closed)
              return -1;      // throw new IOException ("closed");
          val = in.read (buf, off, len);
          if (val == -1)
              close ();
          return val;
      }

      /**
       * Reads a single character, or returns -1 on EOF.  Reaching EOF
       * closes this reader.  Unlike the block read, this method throws
       * when called on a reader that is already closed.
       *
       * @exception IOException if the reader is closed, or on I/O error
       */
      public int read () throws IOException
      {
          int val;

          if (closed)
              throw new IOException ("closed");
          val = in.read ();
          if (val == -1)
              close ();
          return val;
      }

      /**
       * Returns true iff the reader supports mark/reset.
       */
      public boolean markSupported ()
      {
          return in == null ? false : in.markSupported ();
      }

      /**
       * Sets a mark allowing a limited number of characters to
       * be "peeked", by reading and then resetting.  Does nothing
       * once the reader is closed.
       * @param value how many characters may be "peeked".
       */
      public void mark (int value) throws IOException
      {
          if (in != null) in.mark (value);
      }

      /**
       * Resets the current position to the last marked position.
       * Does nothing once the reader is closed.
       */
      public void reset () throws IOException
      {
          if (in != null) in.reset ();
      }

      /**
       * Skips a specified number of characters; skips nothing
       * (returning zero) once the reader is closed.
       */
      public long skip (long value) throws IOException
      {
          return in == null ? 0 : in.skip (value);
      }

      /**
       * Returns true iff input characters are known to be ready.
       */
      public boolean ready () throws IOException
      {
          return in == null ? false : in.ready ();
      }

      /**
       * Closes the reader.  Closing more than once is harmless.  The
       * reader counts as closed even if closing the underlying reader
       * fails; that failure is still reported to the caller.
       */
      public void close () throws IOException
      {
          if (closed)
              return;
          try {
              in.close ();
          } finally {
              in = null;
              closed = true;
          }
      }

      //
      // Delegating to a converter module will always be slower than
      // direct conversion.  Use a similar approach for any other
      // readers that need to be particularly fast; only block I/O
      // speed matters to this package.  For UTF-16, separate readers
      // for big and little endian streams make a difference, too;
      // fewer conditionals in the critical path!
      //
      static abstract class BaseReader extends Reader
      {
          /** Source of bytes; null once closed. */
          protected InputStream instream;
          /** Bytes read but not yet converted; null once closed. */
          protected byte        buffer [];
          /** Unconverted bytes are buffer [start] up to buffer [finish - 1]. */
          protected int         start, finish;

          BaseReader (InputStream stream)
          {
              super (stream);

              instream = stream;
              buffer = new byte [8192];
          }

          /**
           * Reports true when closed, when bytes are buffered, or when
           * the stream says some are available.
           */
          public boolean ready () throws IOException
          {
              return instream == null
                  || (finish - start) > 0
                  || instream.available () != 0;
          }

          // caller shouldn't read again
          public void close () throws IOException
          {
              if (instream != null) {
                  instream.close ();
                  start = finish = 0;
                  buffer = null;
                  instream = null;
              }
          }
      }

      //
      // We want this reader, to make the default encoding be as fast
      // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
      // InputStreamReader works, but 20+% slower speed isn't OK for
      // the default/primary encoding.
      //
      static final class Utf8Reader extends BaseReader
      {
          // 2nd half of UTF-8 surrogate pair
          private char nextChar;

          Utf8Reader (InputStream stream)
          {
              super (stream);
          }

          /**
           * Decodes UTF-8 into <em>buf</em>.  Characters beyond U+FFFF
           * are delivered as a surrogate pair, whose second half is held
           * over to the next call if it doesn't fit.
           *
           * @return the number of characters stored; zero if <em>len</em>
           *  isn't positive; -1 on end of input (which closes the reader)
           * @exception CharConversionException on input which is not
           *  convertible UTF-8, or which ends in mid-character
           */
          public int read (char buf [], int offset, int len) throws IOException
          {
              int i = 0, c = 0;

              if (len <= 0)
                  return 0;

              // avoid many runtime bounds checks ... a good optimizer
              // (static or JIT) will now remove checks from the loop.
              if ((offset + len) > buf.length || offset < 0)
                  throw new ArrayIndexOutOfBoundsException ();

              // Consume remaining half of any surrogate pair immediately
              if (nextChar != 0) {
                  buf [offset + i++] = nextChar;
                  nextChar = 0;
              }

              while (i < len) {
                  // stop or read data if needed
                  if (finish <= start) {
                      if (instream == null) {
                          c = -1;
                          break;
                      }
                      start = 0;
                      finish = instream.read (buffer, 0, buffer.length);
                      if (finish <= 0) {
                          this.close ();
                          c = -1;
                          break;
                      }
                  }

                  //
                  // RFC 2279 describes UTF-8; there are six encodings.
                  // Each encoding takes a fixed number of bytes
                  // (1-6 bytes) and is flagged by a bit pattern in the
                  // first byte.  The five and six byte-per-character
                  // encodings address characters which are disallowed
                  // in XML documents, as do some four byte ones.
                  //

                  //
                  // Single byte == ASCII.  Common; optimize.
                  //
                  c = buffer [start] & 0x0ff;
                  if ((c & 0x80) == 0x00) {
                      // 0x0000 <= c <= 0x007f
                      start++;
                      buf [offset + i++] = (char) c;
                      continue;
                  }

                  //
                  // Multibyte chars -- the first byte says how many
                  // bytes the character takes.  5 and 6 byte versions
                  // are XML WF errors, but typically come from
                  // mislabeled encodings.
                  //
                  int need;

                  if ((c & 0x0E0) == 0x0C0)
                      need = 2;
                  else if ((c & 0x0F0) == 0x0E0)
                      need = 3;
                  else if ((c & 0x0F8) == 0x0F0)
                      need = 4;
                  else
                      throw new CharConversionException (
                          "Unconvertible UTF-8 character"
                          + " beginning with 0x"
                          + Integer.toHexString (c)
                          );

                  //
                  // if the buffer held only a partial character,
                  // compact it and try to read the rest of the
                  // character.  worst case involves three
                  // single-byte reads -- quite rare.  This must be
                  // done before decoding, else stale bytes beyond
                  // "finish" would be taken for part of the character.
                  //
                  if (start + need > finish) {
                      System.arraycopy (buffer, start,
                              buffer, 0, finish - start);
                      finish -= start;
                      start = 0;

                      int more = instream.read (buffer, finish,
                              buffer.length - finish);

                      if (more < 0) {
                          this.close ();
                          throw new CharConversionException (
                              "Partial UTF-8 char");
                      }
                      finish += more;
                      continue;
                  }

                  int off = start;

                  if (need == 2) {
                      c = (buffer [off++] & 0x1f) << 6;
                      c += buffer [off++] & 0x3f;

                      // 0x0080 <= c <= 0x07ff

                  } else if (need == 3) {
                      c = (buffer [off++] & 0x0f) << 12;
                      c += (buffer [off++] & 0x3f) << 6;
                      c += buffer [off++] & 0x3f;

                      // 0x0800 <= c <= 0xffff

                  } else {
                      c = (buffer [off++] & 0x07) << 18;
                      c += (buffer [off++] & 0x3f) << 12;
                      c += (buffer [off++] & 0x3f) << 6;
                      c += buffer [off++] & 0x3f;

                      // 0x0001 0000  <= c  <= 0x001f ffff

                      // Unicode supports c <= 0x0010 ffff ...
                      if (c > 0x0010ffff)
                          throw new CharConversionException (
                              "UTF-8 encoding of character 0x00"
                              + Integer.toHexString (c)
                              + " can't be converted to Unicode."
                              );

                      else if (c > 0xffff) {
                          // Convert UCS-4 char to surrogate pair (UTF-16)
                          c -= 0x10000;
                          nextChar = (char) (0xDC00 + (c & 0x03ff));
                          c = 0xD800 + (c >> 10);
                      }
                  }

                  //
                  // check the format of the non-initial bytes
                  //
                  for (start++; start < off; start++) {
                      if ((buffer [start] & 0xC0) != 0x80) {
                          // don't hand out half of a bogus character later
                          nextChar = 0;
                          this.close ();
                          throw new CharConversionException (
                              "Malformed UTF-8 char -- "
                              + "is an XML encoding declaration missing?"
                              );
                      }
                  }

                  //
                  // If this needed a surrogate pair, consume ASAP
                  //
                  buf [offset + i++] = (char) c;
                  if (nextChar != 0 && i < len) {
                      buf [offset + i++] = nextChar;
                      nextChar = 0;
                  }
              }
              if (i > 0)
                  return i;
              return (c == -1) ? -1 : 0;
          }
      }

      //
      // We want ASCII and ISO-8859 Readers since they're the most common
      // encodings in the US and Europe, and we don't want performance
      // regressions for them.  They're also easy to implement efficiently,
      // since they're bitmask subsets of UNICODE.
      //
      // XXX haven't benchmarked these readers vs what we get out of JDK.
      //
      static final class AsciiReader extends BaseReader
      {
          AsciiReader (InputStream in) { super (in); }

          /**
           * Converts seven bit ASCII bytes to characters.
           *
           * @return the number of characters stored; zero if <em>len</em>
           *  isn't positive; -1 on end of input (which closes the reader)
           *  or when the reader is already closed
           * @exception CharConversionException on a byte with the high
           *  bit set
           */
          public int read (char buf [], int offset, int len) throws IOException
          {
              int i, c;

              if (instream == null)
                  return -1;

              // asking for nothing is not the same as hitting EOF
              if (len <= 0)
                  return 0;

              // avoid many runtime bounds checks ... a good optimizer
              // (static or JIT) will now remove checks from the loop.
              if ((offset + len) > buf.length || offset < 0)
                  throw new ArrayIndexOutOfBoundsException ();

              for (i = 0; i < len; i++) {
                  if (start >= finish) {
                      start = 0;
                      finish = instream.read (buffer, 0, buffer.length);
                      if (finish <= 0) {
                          this.close ();
                          break;
                      }
                  }
                  c = buffer [start++];
                  if ((c & 0x80) != 0)
                      throw new CharConversionException (
                          "Illegal ASCII character, 0x"
                          + Integer.toHexString (c & 0xff)
                          );
                  buf [offset + i] = (char) c;
              }
              if (i == 0 && finish <= 0)
                  return -1;
              return i;
          }
      }

      static final class Iso8859_1Reader extends BaseReader
      {
          Iso8859_1Reader (InputStream in) { super (in); }

          /**
           * Converts ISO-8859-1 bytes to characters; each byte maps to
           * the character with the same (unsigned) value.
           *
           * @return the number of characters stored; zero if <em>len</em>
           *  isn't positive; -1 on end of input (which closes the reader)
           *  or when the reader is already closed
           */
          public int read (char buf [], int offset, int len) throws IOException
          {
              int i;

              if (instream == null)
                  return -1;

              // asking for nothing is not the same as hitting EOF
              if (len <= 0)
                  return 0;

              // avoid many runtime bounds checks ... a good optimizer
              // (static or JIT) will now remove checks from the loop.
              if ((offset + len) > buf.length || offset < 0)
                  throw new ArrayIndexOutOfBoundsException ();

              for (i = 0; i < len; i++) {
                  if (start >= finish) {
                      start = 0;
                      finish = instream.read (buffer, 0, buffer.length);
                      if (finish <= 0) {
                          this.close ();
                          break;
                      }
                  }
                  buf [offset + i] = (char) (0x0ff & buffer [start++]);
              }
              if (i == 0 && finish <= 0)
                  return -1;
              return i;
          }
      }
  }