1. /*
  2. * @(#)DocumentParser.java 1.22 00/02/02
  3. *
  4. * Copyright 1998-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. package javax.swing.text.html.parser;
  11. import javax.swing.text.SimpleAttributeSet;
  12. import javax.swing.text.html.HTMLEditorKit;
  13. import javax.swing.text.html.HTML;
  14. import javax.swing.text.ChangedCharSetException;
  15. import java.util.*;
  16. import java.io.*;
  17. import java.net.*;
  18. import sun.io.*;
  19. /**
  20. * A Parser for HTML Documents (actually, you can specify a DTD, but
  21. * you should really only use this class with the html dtd in swing).
  22. * Reads an InputStream of HTML and
  23. * invokes the appropriate methods in the ParserCallback class. This
  24. * is the default parser used by HTMLEditorKit to parse HTML url's.
  25. * <p>This will message the callback for all valid tags, as well as
  26. * tags that are implied but not explicitly specified. For example, the
  27. * html string (&lt;p&gt;blah) only has a p tag defined. The callback
  28. * will see the following methods:
  29. * <ol><li><i>handleStartTag(html, ...)</i></li>
  30. * <li><i>handleStartTag(head, ...)</i></li>
  31. * <li><i>handleEndTag(head)</i></li>
  32. * <li><i>handleStartTag(body, ...)</i></li>
  33. * <li>handleStartTag(p, ...)</li>
  34. * <li>handleText(...)</li>
  35. * <li><i>handleEndTag(p)</i></li>
  36. * <li><i>handleEndTag(body)</i></li>
  37. * <li><i>handleEndTag(html)</i></li>
  38. * </ol>
  39. * The items in <i>italic</i> are implied, that is, although they were not
  40. * explicitly specified, to be correct html they should have been present
  41. * (head isn't necessary, but it is still generated). For tags that
  42. * are implied, the AttributeSet argument will have a value of
  43. * <code>Boolean.TRUE</code> for the key
  44. * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
  45. * <p>HTML.Attribute defines a type safe enumeration of html attributes.
  46. * If an attribute key of a tag is defined in HTML.Attribute, the
  47. * HTML.Attribute will be used as the key, otherwise a String will be used.
  48. * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
  49. * not defined in HTML.Attribute, whereas class is, therefore the
  50. * AttributeSet will have two values in it, HTML.Attribute.CLASS with
  51. * a String value of 'neat' and the String key 'foo' with a String value of
  52. * 'bar'.
  53. * <p>The position argument will indicate the start of the tag, comment
  54. * or text. Similar to arrays, the first character in the stream has a
  55. * position of 0. For tags that are
  56. * implied the position will indicate
  57. * the location of the next encountered tag. In the first example,
  58. * the implied start body and html tags will have the same position as the
  59. * p tag, and the implied end p, html and body tags will all have the same
  60. * position.
  61. * <p>As html skips whitespace the position for text will be the position
  62. * of the first valid character, e.g. in the string '\n\n\nblah'
  63. * the text 'blah' will have a position of 3, the newlines are skipped.
  64. * <p>
  65. * For attributes that do not have a value, e.g. in the html
  66. * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
  67. * does not have a value, there are two possible values that will be
  68. * placed in the AttributeSet's value:
  69. * <ul>
  70. * <li>If the DTD does not contain a definition for the element, or the
  71. * definition does not have an explicit value then the value in the
  72. * AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
  73. * <li>If the DTD contains an explicit value, as in:
  74. * <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
  75. * this value from the dtd (in this case selected) will be used.
  76. * </ul>
  77. * <p>
  78. * Once the stream has been parsed, the callback is notified of the most
  79. * likely end of line string. The end of line string will be one of
  80. * \n, \r or \r\n, whichever is encountered the most in parsing the
  81. * stream.
  82. *
  83. * @version 1.22 02/02/00
  84. * @author Sunita Mani
  85. */
  86. public class DocumentParser extends javax.swing.text.html.parser.Parser {
  87. private int inbody;
  88. private int intitle;
  89. private int inhead;
  90. private int instyle;
  91. private boolean seentitle;
  92. private HTMLEditorKit.ParserCallback callback = null;
  93. private boolean ignoreCharSet = false;
  94. private static final boolean debugFlag = false;
  95. public DocumentParser(DTD dtd) {
  96. super(dtd);
  97. }
  98. public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
  99. this.ignoreCharSet = ignoreCharSet;
  100. this.callback = callback;
  101. parse(in);
  102. // end of line
  103. callback.handleEndOfLineString(getEndOfLineString());
  104. }
  105. /**
  106. * Handle Start Tag.
  107. */
  108. protected void handleStartTag(TagElement tag) {
  109. Element elem = tag.getElement();
  110. if (elem == dtd.body) {
  111. inbody++;
  112. } else if (elem == dtd.html) {
  113. } else if (elem == dtd.head) {
  114. inhead++;
  115. } else if (elem == dtd.title) {
  116. intitle++;
  117. } else if (elem == dtd.style) {
  118. instyle++;
  119. }
  120. if (debugFlag) {
  121. if (tag.fictional()) {
  122. debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  123. } else {
  124. debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
  125. getAttributes() + " pos: " + getCurrentPos());
  126. }
  127. }
  128. if (tag.fictional()) {
  129. SimpleAttributeSet attrs = new SimpleAttributeSet();
  130. attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  131. Boolean.TRUE);
  132. callback.handleStartTag(tag.getHTMLTag(), attrs,
  133. getBlockStartPosition());
  134. } else {
  135. callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
  136. getBlockStartPosition());
  137. flushAttributes();
  138. }
  139. }
  140. protected void handleComment(char text[]) {
  141. if (debugFlag) {
  142. debug("comment: ->" + new String(text) + "<-"
  143. + " pos: " + getCurrentPos());
  144. }
  145. callback.handleComment(text, getBlockStartPosition());
  146. }
  147. /**
  148. * Handle Empty Tag.
  149. */
  150. protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
  151. Element elem = tag.getElement();
  152. if (elem == dtd.meta && !ignoreCharSet) {
  153. SimpleAttributeSet atts = getAttributes();
  154. if (atts != null) {
  155. String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
  156. if (content != null) {
  157. if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  158. throw new ChangedCharSetException(content, false);
  159. } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  160. throw new ChangedCharSetException(content, true);
  161. }
  162. }
  163. }
  164. }
  165. if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
  166. if (debugFlag) {
  167. if (tag.fictional()) {
  168. debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  169. } else {
  170. debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
  171. + getAttributes() + " pos: " + getCurrentPos());
  172. }
  173. }
  174. if (tag.fictional()) {
  175. SimpleAttributeSet attrs = new SimpleAttributeSet();
  176. attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  177. Boolean.TRUE);
  178. callback.handleSimpleTag(tag.getHTMLTag(), attrs,
  179. getBlockStartPosition());
  180. } else {
  181. callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
  182. getBlockStartPosition());
  183. flushAttributes();
  184. }
  185. }
  186. }
  187. /**
  188. * Handle End Tag.
  189. */
  190. protected void handleEndTag(TagElement tag) {
  191. Element elem = tag.getElement();
  192. if (elem == dtd.body) {
  193. inbody--;
  194. } else if (elem == dtd.title) {
  195. intitle--;
  196. seentitle = true;
  197. } else if (elem == dtd.head) {
  198. inhead--;
  199. } else if (elem == dtd.style) {
  200. instyle--;
  201. }
  202. if (debugFlag) {
  203. debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  204. }
  205. callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
  206. }
  207. /**
  208. * Handle Text.
  209. */
  210. protected void handleText(char data[]) {
  211. if (data != null) {
  212. if (inbody != 0 || ((instyle != 0) ||
  213. ((intitle != 0) && !seentitle))) {
  214. if (debugFlag) {
  215. debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
  216. }
  217. callback.handleText(data, getBlockStartPosition());
  218. }
  219. }
  220. }
  221. /*
  222. * Error handling.
  223. */
  224. protected void handleError(int ln, String errorMsg) {
  225. if (debugFlag) {
  226. debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
  227. }
  228. /* PENDING: need to improve the error string. */
  229. callback.handleError(errorMsg, getCurrentPos());
  230. }
  231. /*
  232. * debug messages
  233. */
  234. private void debug(String msg) {
  235. System.out.println(msg);
  236. }
  237. }