1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. // Sep 14, 2000:
  58. // Fixed problem with namespace handling. Contributed by
  59. // David Blondeau <blondeau@intalio.com>
  60. // Sep 14, 2000:
  61. // Fixed serializer to report IO exception directly, instead at
  62. // the end of document processing.
  63. // Reported by Patrick Higgins <phiggins@transzap.com>
  64. // Aug 21, 2000:
  65. // Fixed bug in startDocument not calling prepare.
  66. // Reported by Mikael Staldal <d96-mst-ingen-reklam@d.kth.se>
  67. // Aug 21, 2000:
  68. // Added ability to omit DOCTYPE declaration.
  69. package com.sun.org.apache.xml.internal.serialize;
  70. import java.io.IOException;
  71. import java.io.OutputStream;
  72. import java.io.Writer;
  73. import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
  74. import org.w3c.dom.DOMError;
  75. import com.sun.org.apache.xerces.internal.impl.Constants;
  76. import com.sun.org.apache.xerces.internal.util.NamespaceSupport;
  77. import com.sun.org.apache.xerces.internal.util.SymbolTable;
  78. import com.sun.org.apache.xerces.internal.util.XML11Char;
  79. import com.sun.org.apache.xerces.internal.util.XMLChar;
  80. import org.xml.sax.SAXException;
  81. /**
  82. * Implements an XML serializer supporting both DOM and SAX pretty
  83. * serializing. For usage instructions see {@link Serializer}.
  84. * <p>
 * If an output stream is used, the encoding is taken from the
 * output format (defaults to <tt>UTF-8</tt>). If a writer is
 * used, make sure the writer uses the same encoding (if applicable)
 * as specified in the output format.
  89. * <p>
  90. * The serializer supports both DOM and SAX. SAX serializing is done by firing
  91. * SAX events and using the serializer as a document handler. DOM serializing is done
  92. * by calling {@link #serialize(Document)} or by using DOM Level 3
  93. * {@link org.w3c.dom.ls.DOMSerializer} and
  94. * serializing with {@link org.w3c.dom.ls.DOMSerializer#write},
  95. * {@link org.w3c.dom.ls.DOMSerializer#writeToString}.
  96. * <p>
 * If an I/O exception occurs while serializing, the serializer
 * will not throw an exception directly, but only throw it
 * at the end of serializing (either DOM or SAX's {@link
 * org.xml.sax.DocumentHandler#endDocument}).
  101. * <p>
  102. * For elements that are not specified as whitespace preserving,
  103. * the serializer will potentially break long text lines at space
  104. * boundaries, indent lines, and serialize elements on separate
  105. * lines. Line terminators will be regarded as spaces, and
  106. * spaces at beginning of line will be stripped.
  107. * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  108. * @author <a href="mailto:rahul.srivastava@sun.com">Rahul Srivastava</a>
  109. * @author Elena Litani IBM
  110. * @version $Revision: 1.8 $ $Date: 2004/01/29 21:11:30 $
  111. * @see Serializer
  112. */
  113. public class XML11Serializer
  114. extends XMLSerializer {
  115. //
  116. // constants
  117. //
// Compile-time debug switch. No code visible in this class reads it.
protected static final boolean DEBUG = false;
//
// data
//
//
// DOM Level 3 implementation: variables initialized in DOMSerializerImpl
//
// NOTE(review): none of the fields below is read or written by the methods
// visible in this class. If the superclass declares fields with the same
// names, these declarations hide them -- confirm against XMLSerializer.
/** stores namespaces in scope */
protected NamespaceSupport fNSBinder;
/** stores all namespace bindings on the current element */
protected NamespaceSupport fLocalNSBinder;
/** symbol table for serialization */
protected SymbolTable fSymbolTable;
// is node dom level 1 node?
protected boolean fDOML1 = false;
// counter for new prefix names
protected int fNamespaceCounter = 1;
// Constant string "NS"; presumably the stem for generated namespace
// prefixes together with fNamespaceCounter -- TODO confirm, it is not used
// in the code visible here.
protected final static String PREFIX = "NS";
/**
* Controls whether namespace fixup should be performed during
* the serialization.
* NOTE: if this field is set to true the following
* fields need to be initialized: fNSBinder, fLocalNSBinder, fSymbolTable,
* XMLSymbols.EMPTY_STRING, fXmlSymbol, fXmlnsSymbol, fNamespaceCounter.
*/
protected boolean fNamespaces = false;
// Private flag that no method visible in this class reads or assigns.
private boolean fPreserveSpace;
  145. /**
  146. * Constructs a new serializer. The serializer cannot be used without
  147. * calling {@link #setOutputCharStream} or {@link #setOutputByteStream}
  148. * first.
  149. */
  150. public XML11Serializer() {
  151. super( );
  152. _format.setVersion("1.1");
  153. }
/**
 * Constructs a new serializer that uses the specified output format.
 * The serializer cannot be used without calling
 * {@link #setOutputCharStream} or {@link #setOutputByteStream} first.
 *
 * @param format The output format to use
 */
  159. public XML11Serializer( OutputFormat format ) {
  160. super( format );
  161. _format.setVersion("1.1");
  162. }
  163. /**
  164. * Constructs a new serializer that writes to the specified writer
  165. * using the specified output format. If <tt>format</tt> is null,
  166. * will use a default output format.
  167. *
  168. * @param writer The writer to use
  169. * @param format The output format to use, null for the default
  170. */
  171. public XML11Serializer( Writer writer, OutputFormat format ) {
  172. super( writer, format );
  173. _format.setVersion("1.1");
  174. }
  175. /**
  176. * Constructs a new serializer that writes to the specified output
  177. * stream using the specified output format. If <tt>format</tt>
  178. * is null, will use a default output format.
  179. *
  180. * @param output The output stream to use
  181. * @param format The output format to use, null for the default
  182. */
  183. public XML11Serializer( OutputStream output, OutputFormat format ) {
  184. super( output, format != null ? format : new OutputFormat( Method.XML, null, false ) );
  185. _format.setVersion("1.1");
  186. }
  187. //-----------------------------------------//
  188. // SAX content handler serializing methods //
  189. //-----------------------------------------//
  190. public void characters( char[] chars, int start, int length )
  191. throws SAXException
  192. {
  193. ElementState state;
  194. try {
  195. state = content();
  196. // Check if text should be print as CDATA section or unescaped
  197. // based on elements listed in the output format (the element
  198. // state) or whether we are inside a CDATA section or entity.
  199. if ( state.inCData || state.doCData ) {
  200. int saveIndent;
  201. // Print a CDATA section. The text is not escaped, but ']]>'
  202. // appearing in the code must be identified and dealt with.
  203. // The contents of a text node is considered space preserving.
  204. if ( ! state.inCData ) {
  205. _printer.printText( "<![CDATA[" );
  206. state.inCData = true;
  207. }
  208. saveIndent = _printer.getNextIndent();
  209. _printer.setNextIndent( 0 );
  210. char ch;
  211. for ( int index = start ; index < length ; ++index ) {
  212. ch = chars[index];
  213. if ( ch == ']' && index + 2 < length &&
  214. chars[ index + 1 ] == ']' && chars[ index + 2 ] == '>' ) {
  215. _printer.printText("]]]]><![CDATA[>");
  216. index +=2;
  217. continue;
  218. }
  219. if (!XML11Char.isXML11Valid(ch)) {
  220. // check if it is surrogate
  221. if (++index <length) {
  222. surrogates(ch, chars[index]);
  223. }
  224. else {
  225. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  226. }
  227. continue;
  228. } else {
  229. if ( _encodingInfo.isPrintable((char)ch) && XML11Char.isXML11ValidLiteral(ch)) {
  230. _printer.printText((char)ch);
  231. } else {
  232. // The character is not printable -- split CDATA section
  233. _printer.printText("]]>&#x");
  234. _printer.printText(Integer.toHexString(ch));
  235. _printer.printText(";<![CDATA[");
  236. }
  237. }
  238. }
  239. _printer.setNextIndent( saveIndent );
  240. } else {
  241. int saveIndent;
  242. if ( state.preserveSpace ) {
  243. // If preserving space then hold of indentation so no
  244. // excessive spaces are printed at line breaks, escape
  245. // the text content without replacing spaces and print
  246. // the text breaking only at line breaks.
  247. saveIndent = _printer.getNextIndent();
  248. _printer.setNextIndent( 0 );
  249. printText( chars, start, length, true, state.unescaped );
  250. _printer.setNextIndent( saveIndent );
  251. } else {
  252. printText( chars, start, length, false, state.unescaped );
  253. }
  254. }
  255. } catch ( IOException except ) {
  256. throw new SAXException( except );
  257. }
  258. }
  259. //
// override printing functions to make sure serializer prints out valid XML
  261. //
  262. protected void printEscaped( String source ) throws IOException {
  263. int length = source.length();
  264. for ( int i = 0 ; i < length ; ++i ) {
  265. int ch = source.charAt(i);
  266. if (!XML11Char.isXML11Valid(ch)) {
  267. if (++i <length) {
  268. surrogates(ch, source.charAt(i));
  269. } else {
  270. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  271. }
  272. continue;
  273. }
  274. if (ch == '\n' || ch == '\r' || ch == '\t' || ch == 0x0085 || ch == 0x2028){
  275. printHex(ch);
  276. } else if (ch == '<') {
  277. _printer.printText("<");
  278. } else if (ch == '&') {
  279. _printer.printText("&");
  280. } else if (ch == '"') {
  281. _printer.printText(""");
  282. } else if ((ch >= ' ' && _encodingInfo.isPrintable((char) ch))) {
  283. _printer.printText((char) ch);
  284. } else {
  285. printHex(ch);
  286. }
  287. }
  288. }
  289. protected final void printCDATAText(String text) throws IOException {
  290. int length = text.length();
  291. char ch;
  292. for (int index = 0; index < length; ++index) {
  293. ch = text.charAt(index);
  294. if (ch == ']'
  295. && index + 2 < length
  296. && text.charAt(index + 1) == ']'
  297. && text.charAt(index + 2) == '>') { // check for ']]>'
  298. if (fDOMErrorHandler != null){
  299. // REVISIT: this means that if DOM Error handler is not registered we don't report any
  300. // fatal errors and might serialize not wellformed document
  301. if ((features & DOMSerializerImpl.SPLITCDATA) == 0
  302. && (features & DOMSerializerImpl.WELLFORMED) == 0) {
  303. // issue fatal error
  304. String msg =
  305. DOMMessageFormatter.formatMessage(
  306. DOMMessageFormatter.SERIALIZER_DOMAIN,
  307. "EndingCDATA",
  308. null);
  309. modifyDOMError(
  310. msg,
  311. DOMError.SEVERITY_FATAL_ERROR,
  312. fCurrentNode);
  313. boolean continueProcess =
  314. fDOMErrorHandler.handleError(fDOMError);
  315. if (!continueProcess) {
  316. throw new IOException();
  317. }
  318. } else {
  319. // issue warning
  320. String msg =
  321. DOMMessageFormatter.formatMessage(
  322. DOMMessageFormatter.SERIALIZER_DOMAIN,
  323. "SplittingCDATA",
  324. null);
  325. modifyDOMError(
  326. msg,
  327. DOMError.SEVERITY_WARNING,
  328. fCurrentNode);
  329. fDOMErrorHandler.handleError(fDOMError);
  330. }
  331. }
  332. // split CDATA section
  333. _printer.printText("]]]]><![CDATA[>");
  334. index += 2;
  335. continue;
  336. }
  337. if (!XML11Char.isXML11Valid(ch)) {
  338. // check if it is surrogate
  339. if (++index < length) {
  340. surrogates(ch, text.charAt(index));
  341. } else {
  342. fatalError(
  343. "The character '"
  344. + (char) ch
  345. + "' is an invalid XML character");
  346. }
  347. continue;
  348. } else {
  349. if (_encodingInfo.isPrintable((char) ch)
  350. && XML11Char.isXML11ValidLiteral(ch)) {
  351. _printer.printText((char) ch);
  352. } else {
  353. // The character is not printable -- split CDATA section
  354. _printer.printText("]]>&#x");
  355. _printer.printText(Integer.toHexString(ch));
  356. _printer.printText(";<![CDATA[");
  357. }
  358. }
  359. }
  360. }
  361. // note that this "int" should, in all cases, be a char.
  362. // REVISIT: make it a char...
  363. protected final void printXMLChar( int ch ) throws IOException {
  364. if (ch == '\r' || ch == 0x0085 || ch == 0x2028) {
  365. printHex(ch);
  366. } else if ( ch == '<') {
  367. _printer.printText("<");
  368. } else if (ch == '&') {
  369. _printer.printText("&");
  370. } else if (ch == '>'){
  371. // character sequence "]]>" can't appear in content, therefore
  372. // we should escape '>'
  373. _printer.printText(">");
  374. } else if ( _encodingInfo.isPrintable((char)ch) && XML11Char.isXML11ValidLiteral(ch)) {
  375. _printer.printText((char)ch);
  376. } else {
  377. printHex(ch);
  378. }
  379. }
  380. protected final void surrogates(int high, int low) throws IOException{
  381. if (XMLChar.isHighSurrogate(high)) {
  382. if (!XMLChar.isLowSurrogate(low)) {
  383. //Invalid XML
  384. fatalError("The character '"+(char)low+"' is an invalid XML character");
  385. }
  386. else {
  387. int supplemental = XMLChar.supplemental((char)high, (char)low);
  388. if (!XML11Char.isXML11Valid(supplemental)) {
  389. //Invalid XML
  390. fatalError("The character '"+(char)supplemental+"' is an invalid XML character");
  391. }
  392. else {
  393. if (content().inCData ) {
  394. _printer.printText("]]>&#x");
  395. _printer.printText(Integer.toHexString(supplemental));
  396. _printer.printText(";<![CDATA[");
  397. }
  398. else {
  399. printHex(supplemental);
  400. }
  401. }
  402. }
  403. } else {
  404. fatalError("The character '"+(char)high+"' is an invalid XML character");
  405. }
  406. }
  407. protected void printText( String text, boolean preserveSpace, boolean unescaped )
  408. throws IOException {
  409. int index;
  410. char ch;
  411. int length = text.length();
  412. if ( preserveSpace ) {
  413. // Preserving spaces: the text must print exactly as it is,
  414. // without breaking when spaces appear in the text and without
  415. // consolidating spaces. If a line terminator is used, a line
  416. // break will occur.
  417. for ( index = 0 ; index < length ; ++index ) {
  418. ch = text.charAt( index );
  419. if (!XML11Char.isXML11Valid(ch)) {
  420. // check if it is surrogate
  421. if (++index <length) {
  422. surrogates(ch, text.charAt(index));
  423. } else {
  424. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  425. }
  426. continue;
  427. }
  428. if ( unescaped && XML11Char.isXML11ValidLiteral(ch)) {
  429. _printer.printText( ch );
  430. } else
  431. printXMLChar( ch );
  432. }
  433. } else {
  434. // Not preserving spaces: print one part at a time, and
  435. // use spaces between parts to break them into different
  436. // lines. Spaces at beginning of line will be stripped
  437. // by printing mechanism. Line terminator is treated
  438. // no different than other text part.
  439. for ( index = 0 ; index < length ; ++index ) {
  440. ch = text.charAt( index );
  441. if (!XML11Char.isXML11Valid(ch)) {
  442. // check if it is surrogate
  443. if (++index <length) {
  444. surrogates(ch, text.charAt(index));
  445. } else {
  446. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  447. }
  448. continue;
  449. }
  450. if ( unescaped && XML11Char.isXML11ValidLiteral(ch) )
  451. _printer.printText( ch );
  452. else
  453. printXMLChar( ch);
  454. }
  455. }
  456. }
  457. protected void printText( char[] chars, int start, int length,
  458. boolean preserveSpace, boolean unescaped ) throws IOException {
  459. int index;
  460. char ch;
  461. if ( preserveSpace ) {
  462. // Preserving spaces: the text must print exactly as it is,
  463. // without breaking when spaces appear in the text and without
  464. // consolidating spaces. If a line terminator is used, a line
  465. // break will occur.
  466. while ( length-- > 0 ) {
  467. ch = chars[ start ];
  468. ++start;
  469. if (!XML11Char.isXML11Valid(ch)) {
  470. // check if it is surrogate
  471. if (++start <length) {
  472. surrogates(ch, chars[start]);
  473. } else {
  474. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  475. }
  476. continue;
  477. }
  478. if ( unescaped && XML11Char.isXML11ValidLiteral(ch))
  479. _printer.printText( ch );
  480. else
  481. printXMLChar( ch );
  482. }
  483. } else {
  484. // Not preserving spaces: print one part at a time, and
  485. // use spaces between parts to break them into different
  486. // lines. Spaces at beginning of line will be stripped
  487. // by printing mechanism. Line terminator is treated
  488. // no different than other text part.
  489. while ( length-- > 0 ) {
  490. ch = chars[ start ];
  491. ++start;
  492. if (!XML11Char.isXML11Valid(ch)) {
  493. // check if it is surrogate
  494. if (++start <length) {
  495. surrogates(ch, chars[start]);
  496. } else {
  497. fatalError("The character '"+(char)ch+"' is an invalid XML character");
  498. }
  499. continue;
  500. }
  501. if ( unescaped && XML11Char.isXML11ValidLiteral(ch))
  502. _printer.printText( ch );
  503. else
  504. printXMLChar( ch );
  505. }
  506. }
  507. }
  508. public boolean reset() {
  509. super.reset();
  510. return true;
  511. }
  512. }