1. /*
  2. * @(#)DocumentParser.java 1.28 03/12/19
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package javax.swing.text.html.parser;
  8. import javax.swing.text.SimpleAttributeSet;
  9. import javax.swing.text.html.HTMLEditorKit;
  10. import javax.swing.text.html.HTML;
  11. import javax.swing.text.ChangedCharSetException;
  12. import java.util.*;
  13. import java.io.*;
  14. import java.net.*;
  15. /**
  16. * A Parser for HTML Documents (actually, you can specify a DTD, but
  17. * you should really only use this class with the html dtd in swing).
  18. * Reads an InputStream of HTML and
  19. * invokes the appropriate methods in the ParserCallback class. This
  20. * is the default parser used by HTMLEditorKit to parse HTML url's.
  21. * <p>This will message the callback for all valid tags, as well as
  22. * tags that are implied but not explicitly specified. For example, the
 * html string (&lt;p&gt;blah) only has a p tag defined. The callback
  24. * will see the following methods:
  25. * <ol><li><i>handleStartTag(html, ...)</i></li>
  26. * <li><i>handleStartTag(head, ...)</i></li>
  27. * <li><i>handleEndTag(head)</i></li>
  28. * <li><i>handleStartTag(body, ...)</i></li>
 * <li>handleStartTag(p, ...)</li>
  30. * <li>handleText(...)</li>
  31. * <li><i>handleEndTag(p)</i></li>
  32. * <li><i>handleEndTag(body)</i></li>
  33. * <li><i>handleEndTag(html)</i></li>
  34. * </ol>
  35. * The items in <i>italic</i> are implied, that is, although they were not
  36. * explicitly specified, to be correct html they should have been present
  37. * (head isn't necessary, but it is still generated). For tags that
  38. * are implied, the AttributeSet argument will have a value of
  39. * <code>Boolean.TRUE</code> for the key
  40. * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
 * <p>HTML.Attribute defines a type safe enumeration of html attributes.
 * If an attribute key of a tag is defined in HTML.Attribute, the
 * HTML.Attribute will be used as the key, otherwise a String will be used.
 * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
 * not defined in HTML.Attribute, whereas class is, therefore the
 * AttributeSet will have two values in it, HTML.Attribute.CLASS with
 * a String value of 'neat' and the String key 'foo' with a String value of
 * 'bar'.
  49. * <p>The position argument will indicate the start of the tag, comment
 * or text. Similar to arrays, the first character in the stream has a
  51. * position of 0. For tags that are
  52. * implied the position will indicate
  53. * the location of the next encountered tag. In the first example,
  54. * the implied start body and html tags will have the same position as the
  55. * p tag, and the implied end p, html and body tags will all have the same
  56. * position.
  57. * <p>As html skips whitespace the position for text will be the position
  58. * of the first valid character, eg in the string '\n\n\nblah'
  59. * the text 'blah' will have a position of 3, the newlines are skipped.
  60. * <p>
 * For attributes that do not have a value, e.g. in the html
 * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
 * does not have a value, there are two possible values that will be
 * placed in the AttributeSet's value:
 * <ul>
 * <li>If the DTD does not contain a definition for the element, or the
 * definition does not have an explicit value then the value in the
 * AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
 * <li>If the DTD contains an explicit value, as in:
 * <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
 * this value from the dtd (in this case selected) will be used.
 * </ul>
  73. * <p>
  74. * Once the stream has been parsed, the callback is notified of the most
  75. * likely end of line string. The end of line string will be one of
 * \n, \r or \r\n, whichever is encountered the most in parsing the
  77. * stream.
  78. *
  79. * @version 1.28 12/19/03
  80. * @author Sunita Mani
  81. */
  82. public class DocumentParser extends javax.swing.text.html.parser.Parser {
  83. private int inbody;
  84. private int intitle;
  85. private int inhead;
  86. private int instyle;
  87. private int inscript;
  88. private boolean seentitle;
  89. private HTMLEditorKit.ParserCallback callback = null;
  90. private boolean ignoreCharSet = false;
  91. private static final boolean debugFlag = false;
  92. public DocumentParser(DTD dtd) {
  93. super(dtd);
  94. }
  95. public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
  96. this.ignoreCharSet = ignoreCharSet;
  97. this.callback = callback;
  98. parse(in);
  99. // end of line
  100. callback.handleEndOfLineString(getEndOfLineString());
  101. }
  102. /**
  103. * Handle Start Tag.
  104. */
  105. protected void handleStartTag(TagElement tag) {
  106. Element elem = tag.getElement();
  107. if (elem == dtd.body) {
  108. inbody++;
  109. } else if (elem == dtd.html) {
  110. } else if (elem == dtd.head) {
  111. inhead++;
  112. } else if (elem == dtd.title) {
  113. intitle++;
  114. } else if (elem == dtd.style) {
  115. instyle++;
  116. } else if (elem == dtd.script) {
  117. inscript++;
  118. }
  119. if (debugFlag) {
  120. if (tag.fictional()) {
  121. debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  122. } else {
  123. debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
  124. getAttributes() + " pos: " + getCurrentPos());
  125. }
  126. }
  127. if (tag.fictional()) {
  128. SimpleAttributeSet attrs = new SimpleAttributeSet();
  129. attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  130. Boolean.TRUE);
  131. callback.handleStartTag(tag.getHTMLTag(), attrs,
  132. getBlockStartPosition());
  133. } else {
  134. callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
  135. getBlockStartPosition());
  136. flushAttributes();
  137. }
  138. }
  139. protected void handleComment(char text[]) {
  140. if (debugFlag) {
  141. debug("comment: ->" + new String(text) + "<-"
  142. + " pos: " + getCurrentPos());
  143. }
  144. callback.handleComment(text, getBlockStartPosition());
  145. }
  146. /**
  147. * Handle Empty Tag.
  148. */
  149. protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
  150. Element elem = tag.getElement();
  151. if (elem == dtd.meta && !ignoreCharSet) {
  152. SimpleAttributeSet atts = getAttributes();
  153. if (atts != null) {
  154. String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
  155. if (content != null) {
  156. if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  157. if (!content.equalsIgnoreCase("text/html") &&
  158. !content.equalsIgnoreCase("text/plain")) {
  159. throw new ChangedCharSetException(content, false);
  160. }
  161. } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  162. throw new ChangedCharSetException(content, true);
  163. }
  164. }
  165. }
  166. }
  167. if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
  168. if (debugFlag) {
  169. if (tag.fictional()) {
  170. debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  171. } else {
  172. debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
  173. + getAttributes() + " pos: " + getCurrentPos());
  174. }
  175. }
  176. if (tag.fictional()) {
  177. SimpleAttributeSet attrs = new SimpleAttributeSet();
  178. attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  179. Boolean.TRUE);
  180. callback.handleSimpleTag(tag.getHTMLTag(), attrs,
  181. getBlockStartPosition());
  182. } else {
  183. callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
  184. getBlockStartPosition());
  185. flushAttributes();
  186. }
  187. }
  188. }
  189. /**
  190. * Handle End Tag.
  191. */
  192. protected void handleEndTag(TagElement tag) {
  193. Element elem = tag.getElement();
  194. if (elem == dtd.body) {
  195. inbody--;
  196. } else if (elem == dtd.title) {
  197. intitle--;
  198. seentitle = true;
  199. } else if (elem == dtd.head) {
  200. inhead--;
  201. } else if (elem == dtd.style) {
  202. instyle--;
  203. } else if (elem == dtd.script) {
  204. inscript--;
  205. }
  206. if (debugFlag) {
  207. debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  208. }
  209. callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
  210. }
  211. /**
  212. * Handle Text.
  213. */
  214. protected void handleText(char data[]) {
  215. if (data != null) {
  216. if (inscript != 0) {
  217. callback.handleComment(data, getBlockStartPosition());
  218. return;
  219. }
  220. if (inbody != 0 || ((instyle != 0) ||
  221. ((intitle != 0) && !seentitle))) {
  222. if (debugFlag) {
  223. debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
  224. }
  225. callback.handleText(data, getBlockStartPosition());
  226. }
  227. }
  228. }
  229. /*
  230. * Error handling.
  231. */
  232. protected void handleError(int ln, String errorMsg) {
  233. if (debugFlag) {
  234. debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
  235. }
  236. /* PENDING: need to improve the error string. */
  237. callback.handleError(errorMsg, getCurrentPos());
  238. }
  239. /*
  240. * debug messages
  241. */
  242. private void debug(String msg) {
  243. System.out.println(msg);
  244. }
  245. }