1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.html.internal.dom;
  58. import java.util.Vector;
  59. import com.sun.org.apache.xerces.internal.dom.ElementImpl;
  60. import com.sun.org.apache.xerces.internal.dom.ProcessingInstructionImpl;
  61. import com.sun.org.apache.xerces.internal.dom.TextImpl;
  62. import org.w3c.dom.Node;
  63. import org.w3c.dom.html.HTMLDocument;
  64. import org.xml.sax.AttributeList;
  65. import org.xml.sax.DocumentHandler;
  66. import org.xml.sax.Locator;
  67. import org.xml.sax.SAXException;
  68. /**
  69. * This is a SAX document handler that is used to build an HTML document.
  70. * It can build a document from any SAX parser, but is specifically tuned
  71. * for working with the OpenXML HTML parser.
  72. *
  73. *
  74. * @version $Revision: 1.6 $ $Date: 2003/05/08 20:13:09 $
  75. * @author <a href="mailto:arkin@openxml.org">Assaf Arkin</a>
  76. */
  77. public class HTMLBuilder
  78. implements DocumentHandler
  79. {
  80. /**
  81. * The document that is being built.
  82. */
  83. protected HTMLDocumentImpl _document;
  84. /**
  85. * The current node in the document into which elements, text and
  86. * other nodes will be inserted. This starts as the document iself
  87. * and reflects each element that is currently being parsed.
  88. */
  89. protected ElementImpl _current;
  90. /**
  91. * A reference to the current locator, this is generally the parser
  92. * itself. The locator is used to locate errors and identify the
  93. * source locations of elements.
  94. */
  95. private Locator _locator;
  96. /**
  97. * Applies only to whitespace appearing between element tags in element content,
  98. * as per the SAX definition, and true by default.
  99. */
  100. private boolean _ignoreWhitespace = true;
  101. /**
  102. * Indicates whether finished building a document. If so, can start building
  103. * another document. Must be initially true to get the first document processed.
  104. */
  105. private boolean _done = true;
  106. /**
  107. * The document is only created the same time as the document element, however, certain
  108. * nodes may precede the document element (comment and PI), and they are accumulated
  109. * in this vector.
  110. */
  111. protected Vector _preRootNodes;
  112. public void startDocument()
  113. throws SAXException
  114. {
  115. if ( ! _done )
  116. throw new SAXException( "HTM001 State error: startDocument fired twice on one builder." );
  117. _document = null;
  118. _done = false;
  119. }
  120. public void endDocument()
  121. throws SAXException
  122. {
  123. if ( _document == null )
  124. throw new SAXException( "HTM002 State error: document never started or missing document element." );
  125. if ( _current != null )
  126. throw new SAXException( "HTM003 State error: document ended before end of document element." );
  127. _current = null;
  128. _done = true;
  129. }
  130. public synchronized void startElement( String tagName, AttributeList attrList )
  131. throws SAXException
  132. {
  133. ElementImpl elem;
  134. int i;
  135. if ( tagName == null )
  136. throw new SAXException( "HTM004 Argument 'tagName' is null." );
  137. // If this is the root element, this is the time to create a new document,
  138. // because only know we know the document element name and namespace URI.
  139. if ( _document == null )
  140. {
  141. // No need to create the element explicitly.
  142. _document = new HTMLDocumentImpl();
  143. elem = (ElementImpl) _document.getDocumentElement();
  144. _current = elem;
  145. if ( _current == null )
  146. throw new SAXException( "HTM005 State error: Document.getDocumentElement returns null." );
  147. // Insert nodes (comment and PI) that appear before the root element.
  148. if ( _preRootNodes != null )
  149. {
  150. for ( i = _preRootNodes.size() ; i-- > 0 ; )
  151. _document.insertBefore( (Node) _preRootNodes.elementAt( i ), elem );
  152. _preRootNodes = null;
  153. }
  154. }
  155. else
  156. {
  157. // This is a state error, indicates that document has been parsed in full,
  158. // or that there are two root elements.
  159. if ( _current == null )
  160. throw new SAXException( "HTM006 State error: startElement called after end of document element." );
  161. elem = (ElementImpl) _document.createElement( tagName );
  162. _current.appendChild( elem );
  163. _current = elem;
  164. }
  165. // Add the attributes (specified and not-specified) to this element.
  166. if ( attrList != null )
  167. {
  168. for ( i = 0 ; i < attrList.getLength() ; ++ i )
  169. elem.setAttribute( attrList.getName( i ), attrList.getValue( i ) );
  170. }
  171. }
  172. public void endElement( String tagName )
  173. throws SAXException
  174. {
  175. if ( _current == null )
  176. throw new SAXException( "HTM007 State error: endElement called with no current node." );
  177. if ( ! _current.getNodeName().equalsIgnoreCase( tagName ))
  178. throw new SAXException( "HTM008 State error: mismatch in closing tag name " + tagName + "\n" + tagName);
  179. // Move up to the parent element. When you reach the top (closing the root element).
  180. // the parent is document and current is null.
  181. if ( _current.getParentNode() == _current.getOwnerDocument() )
  182. _current = null;
  183. else
  184. _current = (ElementImpl) _current.getParentNode();
  185. }
  186. public void characters( String text )
  187. throws SAXException
  188. {
  189. if ( _current == null )
  190. throw new SAXException( "HTM009 State error: character data found outside of root element." );
  191. _current.appendChild( new TextImpl( _document, text ) );
  192. }
  193. public void characters( char[] text, int start, int length )
  194. throws SAXException
  195. {
  196. if ( _current == null )
  197. throw new SAXException( "HTM010 State error: character data found outside of root element." );
  198. _current.appendChild( new TextImpl( _document, new String( text, start, length ) ) );
  199. }
  200. public void ignorableWhitespace( char[] text, int start, int length )
  201. throws SAXException
  202. {
  203. Node node;
  204. if ( ! _ignoreWhitespace )
  205. _current.appendChild( new TextImpl( _document, new String( text, start, length ) ) );
  206. }
  207. public void processingInstruction( String target, String instruction )
  208. throws SAXException
  209. {
  210. Node node;
  211. // Processing instruction may appear before the document element (in fact, before the
  212. // document has been created, or after the document element has been closed.
  213. if ( _current == null && _document == null )
  214. {
  215. if ( _preRootNodes == null )
  216. _preRootNodes = new Vector();
  217. _preRootNodes.addElement( new ProcessingInstructionImpl( null, target, instruction ) );
  218. }
  219. else
  220. if ( _current == null && _document != null )
  221. _document.appendChild( new ProcessingInstructionImpl( _document, target, instruction ) );
  222. else
  223. _current.appendChild( new ProcessingInstructionImpl( _document, target, instruction ) );
  224. }
  225. public HTMLDocument getHTMLDocument()
  226. {
  227. return (HTMLDocument) _document;
  228. }
  229. public void setDocumentLocator( Locator locator )
  230. {
  231. _locator = locator;
  232. }
  233. }