1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 2001-2004 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 2003, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.xinclude;
  58. import java.io.BufferedInputStream;
  59. import java.io.IOException;
  60. import java.io.InputStream;
  61. import java.io.InputStreamReader;
  62. import java.io.Reader;
  63. import java.net.HttpURLConnection;
  64. import java.net.URL;
  65. import java.net.URLConnection;
  66. import java.util.Locale;
  67. import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
  68. import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
  69. import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  70. import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
  71. import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
  72. import com.sun.org.apache.xerces.internal.util.EncodingMap;
  73. import com.sun.org.apache.xerces.internal.util.MessageFormatter;
  74. import com.sun.org.apache.xerces.internal.util.XMLChar;
  75. import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
  76. import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
  77. /**
  78. * This class is used for reading resources requested in <include> elements,
  79. * when the parse attribute of the <include> element is "text". Using this
  80. * class will open the location, detect the encoding, and discard the byte order
  81. * mark, if applicable.
  82. *
  83. * REVISIT:
  84. * Much of the code in this class is taken from XMLEntityManager. It would be nice
  85. * if this code could be shared in some way. However, since XMLEntityManager is used
  86. * for reading files as XML, and this needs to read files as text, there would need
  87. * to be some refactoring done.
  88. *
  89. * @author Michael Glavassevich, IBM
  90. * @author Peter McCracken, IBM
  91. * @author Arun Yadav, Sun Microsystems Inc.
  92. *
  93. * @version $Id: XIncludeTextReader.java,v 1.10 2004/04/15 04:51:56 mrglavas Exp $
  94. *
  95. * @see XIncludeHandler
  96. */
  97. public class XIncludeTextReader {
  98. private Reader fReader;
  99. private XIncludeHandler fHandler;
  100. private XMLInputSource fSource;
  101. private XMLErrorReporter fErrorReporter;
  102. // Content negotation parameters
  103. private String fAccept;
  104. private String fAcceptLanguage;
  105. /**
  106. * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
  107. *
  108. * @param source The XMLInputSource to use.
  109. * @param handler The XIncludeHandler to use.
  110. */
  111. public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
  112. throws IOException {
  113. fHandler = handler;
  114. fSource = source;
  115. }
  116. /**
  117. * Sets the XMLErrorReporter used for reporting errors while
  118. * reading the text include.
  119. *
  120. * @param errorReporter the XMLErrorReporter to be used for
  121. * reporting errors.
  122. */
  123. public void setErrorReporter(XMLErrorReporter errorReporter) {
  124. fErrorReporter = errorReporter;
  125. }
  126. /**
  127. * Sets content negotation parameters to be attached to an HTTP request.
  128. *
  129. * @param accept the Accept HTTP request property
  130. * @param acceptLanguage the Accept-Language HTTP request property
  131. */
  132. public void setHttpProperties(String accept, String acceptLanguage) {
  133. fAccept = accept;
  134. fAcceptLanguage = acceptLanguage;
  135. }
  136. /**
  137. * Return the Reader for given XMLInputSource.
  138. *
  139. * @param source The XMLInputSource to use.
  140. */
  141. protected Reader getReader(XMLInputSource source) throws IOException {
  142. if (source.getCharacterStream() != null) {
  143. return source.getCharacterStream();
  144. }
  145. else {
  146. InputStream stream = null;
  147. String encoding = source.getEncoding();
  148. if (encoding == null) {
  149. encoding = "UTF-8";
  150. }
  151. if (source.getByteStream() != null) {
  152. stream = source.getByteStream();
  153. // Wrap the InputStream so that it is possible to rewind it.
  154. if (!(stream instanceof BufferedInputStream)) {
  155. stream = new BufferedInputStream(stream);
  156. }
  157. }
  158. else {
  159. String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
  160. URL url = new URL(expandedSystemId);
  161. URLConnection urlCon = url.openConnection();
  162. // If this is an HTTP connection attach any
  163. // content negotation parameters to the request.
  164. if (urlCon instanceof HttpURLConnection) {
  165. if( fAccept != null && fAccept.length() > 0) {
  166. urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
  167. }
  168. if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
  169. urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
  170. }
  171. }
  172. // Wrap the InputStream so that it is possible to rewind it.
  173. stream = new BufferedInputStream(urlCon.getInputStream());
  174. // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
  175. String rawContentType = urlCon.getContentType();
  176. // text/xml and application/xml offer only one optional parameter
  177. int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
  178. String contentType = null;
  179. String charset = null;
  180. if (index != -1) {
  181. // this should be something like "text/xml"
  182. contentType = rawContentType.substring(0, index).trim();
  183. // this should be something like "charset=UTF-8", but we want to
  184. // strip it down to just "UTF-8"
  185. charset = rawContentType.substring(index + 1).trim();
  186. if (charset.startsWith("charset=")) {
  187. // 8 is the length of "charset="
  188. charset = charset.substring(8).trim();
  189. // strip quotes, if present
  190. if ((charset.charAt(0) == '"'
  191. && charset.charAt(charset.length() - 1) == '"')
  192. || (charset.charAt(0) == '\''
  193. && charset.charAt(charset.length() - 1)
  194. == '\'')) {
  195. charset =
  196. charset.substring(1, charset.length() - 1);
  197. }
  198. }
  199. else {
  200. charset = null;
  201. }
  202. }
  203. else {
  204. contentType = rawContentType.trim();
  205. }
  206. String detectedEncoding = null;
  207. /** The encoding of such a resource is determined by:
  208. 1 external encoding information, if available, otherwise
  209. -- the most common type of external information is the "charset" parameter of a MIME package
  210. 2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
  211. 3 the value of the encoding attribute if one exists, otherwise
  212. 4 UTF-8.
  213. **/
  214. if (contentType.equals("text/xml")) {
  215. if (charset != null) {
  216. detectedEncoding = charset;
  217. }
  218. else {
  219. // see RFC2376 or 3023, section 3.1
  220. detectedEncoding = "US-ASCII";
  221. }
  222. }
  223. else if (contentType.equals("application/xml")) {
  224. if (charset != null) {
  225. detectedEncoding = charset;
  226. }
  227. else {
  228. // see RFC2376 or 3023, section 3.2
  229. detectedEncoding = getEncodingName(stream);
  230. }
  231. }
  232. else if (contentType.endsWith("+xml")) {
  233. detectedEncoding = getEncodingName(stream);
  234. }
  235. if (detectedEncoding != null) {
  236. encoding = detectedEncoding;
  237. }
  238. // else 3 or 4.
  239. }
  240. encoding = encoding.toUpperCase(Locale.ENGLISH);
  241. // eat the Byte Order Mark
  242. consumeBOM(stream, encoding);
  243. // If the document is UTF-8 or US-ASCII use
  244. // the Xerces readers for these encodings. For
  245. // US-ASCII consult the encoding map since
  246. // this encoding has many aliases.
  247. if (encoding.equals("UTF-8")) {
  248. return new UTF8Reader(stream,
  249. XMLEntityManager.DEFAULT_BUFFER_SIZE,
  250. fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
  251. fErrorReporter.getLocale() );
  252. }
  253. // Try to use a Java reader.
  254. String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
  255. // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
  256. // The XIncludeHandler will report this as a ResourceError and then will
  257. // attempt to include a fallback if there is one.
  258. if (javaEncoding == null) {
  259. MessageFormatter aFormatter =
  260. fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
  261. Locale aLocale = fErrorReporter.getLocale();
  262. throw new IOException( aFormatter.formatMessage( aLocale,
  263. "EncodingDeclInvalid",
  264. new Object[] {encoding} ) );
  265. }
  266. else if (javaEncoding.equals("ASCII")) {
  267. return new ASCIIReader(stream,
  268. XMLEntityManager.DEFAULT_BUFFER_SIZE,
  269. fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
  270. fErrorReporter.getLocale() );
  271. }
  272. return new InputStreamReader(stream, javaEncoding);
  273. }
  274. }
  275. /**
  276. * XMLEntityManager cares about endian-ness, since it creates its own optimized
  277. * readers. Since we're just using generic Java readers for now, we're not caring
  278. * about endian-ness. If this changes, even more code needs to be copied from
  279. * XMLEntity manager. -- PJM
  280. */
  281. protected String getEncodingName(InputStream stream) throws IOException {
  282. final byte[] b4 = new byte[4];
  283. String encoding = null;
  284. // this has the potential to throw an exception
  285. // it will be fixed when we ensure the stream is rewindable (see note above)
  286. stream.mark(4);
  287. int count = stream.read(b4, 0, 4);
  288. stream.reset();
  289. if (count == 4) {
  290. encoding = getEncodingName(b4);
  291. }
  292. return encoding;
  293. }
  294. /**
  295. * Removes the byte order mark from the stream, if it exists.
  296. * @param stream
  297. * @param encoding
  298. * @throws IOException
  299. */
  300. protected void consumeBOM(InputStream stream, String encoding)
  301. throws IOException {
  302. byte[] b = new byte[3];
  303. int count = 0;
  304. stream.mark(3);
  305. if (encoding.equals("UTF-8")) {
  306. count = stream.read(b, 0, 3);
  307. if (count == 3) {
  308. int b0 = b[0] & 0xFF;
  309. int b1 = b[1] & 0xFF;
  310. int b2 = b[2] & 0xFF;
  311. if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
  312. // First three bytes are not BOM, so reset.
  313. stream.reset();
  314. }
  315. }
  316. else {
  317. stream.reset();
  318. }
  319. }
  320. else if (encoding.startsWith("UTF-16")) {
  321. count = stream.read(b, 0, 2);
  322. if (count == 2) {
  323. int b0 = b[0] & 0xFF;
  324. int b1 = b[1] & 0xFF;
  325. if ((b0 != 0xFE || b1 != 0xFF)
  326. && (b0 != 0xFF || b1 != 0xFE)) {
  327. // First two bytes are not BOM, so reset.
  328. stream.reset();
  329. }
  330. }
  331. else {
  332. stream.reset();
  333. }
  334. }
  335. // We could do UTF-32, but since the getEncodingName() doesn't support that
  336. // we won't support it here.
  337. // To implement UTF-32, look for: 00 00 FE FF for big-endian
  338. // or FF FE 00 00 for little-endian
  339. }
  340. /**
  341. * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
  342. * Is there any way we can share the code, without having it implemented twice?
  343. * I think we should make it public and static in XMLEntityManager. --PJM
  344. *
  345. * Returns the IANA encoding name that is auto-detected from
  346. * the bytes specified, with the endian-ness of that encoding where appropriate.
  347. *
  348. * @param b4 The first four bytes of the input.
  349. * @return the encoding name, or null if no encoding could be detected
  350. */
  351. protected String getEncodingName(byte[] b4) {
  352. // UTF-16, with BOM
  353. int b0 = b4[0] & 0xFF;
  354. int b1 = b4[1] & 0xFF;
  355. if (b0 == 0xFE && b1 == 0xFF) {
  356. // UTF-16, big-endian
  357. return "UTF-16BE";
  358. }
  359. if (b0 == 0xFF && b1 == 0xFE) {
  360. // UTF-16, little-endian
  361. return "UTF-16LE";
  362. }
  363. // UTF-8 with a BOM
  364. int b2 = b4[2] & 0xFF;
  365. if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
  366. return "UTF-8";
  367. }
  368. // other encodings
  369. int b3 = b4[3] & 0xFF;
  370. if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
  371. // UCS-4, big endian (1234)
  372. return "ISO-10646-UCS-4";
  373. }
  374. if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
  375. // UCS-4, little endian (4321)
  376. return "ISO-10646-UCS-4";
  377. }
  378. if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
  379. // UCS-4, unusual octet order (2143)
  380. return "ISO-10646-UCS-4";
  381. }
  382. if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
  383. // UCS-4, unusual octect order (3412)
  384. return "ISO-10646-UCS-4";
  385. }
  386. if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
  387. // UTF-16, big-endian, no BOM
  388. // (or could turn out to be UCS-2...
  389. return "UTF-16BE";
  390. }
  391. if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
  392. // UTF-16, little-endian, no BOM
  393. // (or could turn out to be UCS-2...
  394. return "UTF-16LE";
  395. }
  396. if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
  397. // EBCDIC
  398. // a la xerces1, return CP037 instead of EBCDIC here
  399. return "CP037";
  400. }
  401. // this signals us to use the value from the encoding attribute
  402. return null;
  403. } // getEncodingName(byte[]):Object[]
  404. /**
  405. * Read the input stream as text, and pass the text on to the XIncludeHandler
  406. * using calls to characters(). This will read all of the text it can from the
  407. * resource.
  408. *
  409. * @throws IOException
  410. */
  411. public void parse() throws IOException {
  412. // REVISIT: This method needs to be rewritten to improve performance: both
  413. // time and memory. We should be reading chunks and reporting chunks instead
  414. // of reading characters individually and reporting all the characters in
  415. // one callback. Also, currently we don't provide any locator information:
  416. // line number, column number, etc... so if we report an error it will appear
  417. // as if the invalid XML character was in the include parent. -- mrglavas
  418. XMLStringBuffer buffer = new XMLStringBuffer();
  419. fReader = getReader(fSource);
  420. int ch;
  421. while((ch = fReader.read()) != -1) {
  422. if (isValid(ch)) {
  423. buffer.append((char)ch);
  424. }
  425. else if (XMLChar.isHighSurrogate(ch)) {
  426. int ch2 = fReader.read();
  427. if (XMLChar.isLowSurrogate(ch2)) {
  428. // convert surrogates to a supplemental character
  429. int sup = XMLChar.supplemental((char)ch, (char)ch2);
  430. // supplemental character must be a valid XML character
  431. if (!isValid(sup)) {
  432. fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  433. "InvalidCharInContent",
  434. new Object[] { Integer.toString(sup, 16) },
  435. XMLErrorReporter.SEVERITY_FATAL_ERROR);
  436. continue;
  437. }
  438. buffer.append((char) ch);
  439. buffer.append((char) ch2);
  440. }
  441. else {
  442. fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  443. "InvalidCharInContent",
  444. new Object[] { Integer.toString(ch, 16) },
  445. XMLErrorReporter.SEVERITY_FATAL_ERROR);
  446. }
  447. }
  448. else {
  449. fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  450. "InvalidCharInContent",
  451. new Object[] { Integer.toString(ch, 16) },
  452. XMLErrorReporter.SEVERITY_FATAL_ERROR);
  453. }
  454. }
  455. if (fHandler != null && buffer.length > 0) {
  456. fHandler.characters(
  457. buffer,
  458. fHandler.modifyAugmentations(null, true));
  459. }
  460. }
  461. /**
  462. * Closes the stream. Call this after parse(), or when there is no longer any need
  463. * for this object.
  464. *
  465. * @throws IOException
  466. */
  467. public void close() throws IOException {
  468. if (fReader != null) {
  469. fReader.close();
  470. }
  471. }
  472. /**
  473. * Returns true if the specified character is a valid XML character
  474. * as per the rules of XML 1.0.
  475. *
  476. * @param ch The character to check.
  477. */
  478. protected boolean isValid(int ch) {
  479. return XMLChar.isValid(ch);
  480. }
  481. }