1. /*
  2. * @(#)Parser.java 1.24 00/02/02
  3. *
  4. * Copyright 1998-2000 Sun Microsystems, Inc. All Rights Reserved.
  5. *
  6. * This software is the proprietary information of Sun Microsystems, Inc.
  7. * Use is subject to license terms.
  8. *
  9. */
  10. package javax.swing.text.html.parser;
  11. import javax.swing.text.SimpleAttributeSet;
  12. import javax.swing.text.html.HTML;
  13. import javax.swing.text.ChangedCharSetException;
  14. import java.io.*;
  15. import java.util.Hashtable;
  16. import java.util.Properties;
  17. import java.util.Vector;
  18. import java.util.Enumeration;
  19. import java.net.URL;
  20. import sun.misc.MessageUtils;
  21. /**
  22. * A simple DTD-driven HTML parser. The parser reads an
  23. * HTML file from an InputStream and calls various methods
  24. * (which should be overridden in a subclass) when tags and
  25. * data are encountered.
  26. * <p>
  27. * Unfortunately there are many badly implemented HTML parsers
  28. * out there, and as a result there are many badly formatted
  29. * HTML files. This parser attempts to parse most HTML files.
  30. * This means that the implementation sometimes deviates from
  31. * the SGML specification in favor of HTML.
  32. * <p>
 * The parser treats \r and \r\n as \n. Newlines after start tags
 * and before end tags are ignored, as specified in the SGML/HTML
 * specification.
  36. *
  37. * @see DTD
  38. * @see TagElement
  39. * @see SimpleAttributeSet
  40. * @version 1.24, 02/02/00
  41. * @author Arthur van Hoff
  42. * @author Sunita Mani
  43. */
  44. public
  45. class Parser implements DTDConstants {
    /** Character data collected for the current text run; flushed by handleText(TagElement). */
    private char text[] = new char[1024];
    /** Number of valid characters currently held in text. */
    private int textpos = 0;
    /** The tag most recently recorded by handleText(TagElement) or startTag. */
    private TagElement last;
    /** True when a single space is pending; handleText(TagElement) appends it before flushing. */
    private boolean space;
    /** Scratch buffer filled by addString and drained by getString/getChars. */
    private char str[] = new char[128];
    /** Number of valid characters currently held in str. */
    private int strpos = 0;
    /** The DTD driving this parser; supplied to the constructor. */
    protected DTD dtd = null;
    /** The current (look-ahead) character; the parse methods treat -1 as end of input. */
    private int ch;
    /** Current line number, as reported by getCurrentLine(). */
    private int ln;
    /** The character source; handleEOFInComment may swap it for a CharArrayReader. */
    private Reader in;
    /** Element of the tag on top of the stack; maintained by startTag and endTag. */
    private Element recent;
    /** Stack of open tags, linked through TagStack.next; null when nothing is open. */
    private TagStack stack;
    /** Result of the last ignoreElement check made by legalElementContext. */
    private boolean skipTag = false;
    /** NOTE(review): not referenced in the part of the file visible here - confirm its use. */
    private TagElement lastFormSent = null;
    /** Attribute set exposed by getAttributes() and emptied by flushAttributes(). */
    private SimpleAttributeSet attributes = new SimpleAttributeSet();
    // State for <html>, <head> and <body>. Since people like to slap
    // together HTML documents without thinking, occasionally they
    // have multiple instances of these tags. These booleans track
    // the first sightings of these tags so they can be safely ignored
    // by the parser if repeated.
    private boolean seenHtml = false;
    private boolean seenHead = false;
    private boolean seenBody = false;
    /**
     * This flag determines whether or not the Parser will be strict
     * in enforcing SGML compatibility. If false, it will be lenient
     * with certain common classes of erroneous HTML constructs.
     * Strict or not, in either case an error will be recorded.
     *
     */
    protected boolean strict = false;
    /** Number of \r\n's encountered. */
    private int crlfCount;
    /** Number of \r's encountered. A \r\n will not increment this. */
    private int crCount;
    /** Number of \n's encountered. A \r\n will not increment this. */
    private int lfCount;
    //
    // To correctly identify the start of a tag/comment/text we need two
    // ivars. Two are needed as handleText isn't invoked until the tag
    // after the text has been parsed, that is the parser parses the text,
    // then a tag, then invokes handleText followed by handleStart.
    //
    /** The start position of the current block. Block is overloaded here,
     * it really means the current start position for the current comment,
     * tag, text. Use getBlockStartPosition to access this. */
    private int currentBlockStartPos;
    /** Start position of the last block. */
    private int lastBlockStartPos;
  95. public Parser(DTD dtd) {
  96. this.dtd = dtd;
  97. }
  98. /**
  99. * @return the line number of the line currently being parsed
  100. */
  101. protected int getCurrentLine() {
  102. return ln;
  103. }
  104. /**
  105. * Returns the start position of the current block. Block is
  106. * overloaded here, it really means the current start position for
  107. * the current comment tag, text, block.... This is provided for
  108. * subclassers that wish to know the start of the current block when
  109. * called with one of the handleXXX methods.
  110. */
  111. int getBlockStartPosition() {
  112. return Math.max(0, lastBlockStartPos - 1);
  113. }
  114. /**
  115. * Makes a TagElement.
  116. */
  117. protected TagElement makeTag(Element elem, boolean fictional) {
  118. return new TagElement(elem, fictional);
  119. }
  120. protected TagElement makeTag(Element elem) {
  121. return makeTag(elem, false);
  122. }
  123. protected SimpleAttributeSet getAttributes() {
  124. return attributes;
  125. }
  126. protected void flushAttributes() {
  127. attributes.removeAttributes(attributes);
  128. }
  129. /**
  130. * Called when PCDATA is encountered.
  131. */
  132. protected void handleText(char text[]) {
  133. }
  134. /**
  135. * Called when an HTML title tag is encountered.
  136. */
  137. protected void handleTitle(char text[]) {
  138. // default behavior is to call handleText. Subclasses
  139. // can override if necessary.
  140. handleText(text);
  141. }
  142. /**
  143. * Called when an HTML comment is encountered.
  144. */
  145. protected void handleComment(char text[]) {
  146. }
  147. protected void handleEOFInComment() {
  148. // We've reached EOF. Our recovery strategy is to
  149. // see if we have more than one line in the comment;
  150. // if so, we pretend that the comment was an unterminated
  151. // single line comment, and reparse the lines after the
  152. // first line as normal HTML content.
  153. int commentEndPos = strIndexOf('\n');
  154. if (commentEndPos >= 0) {
  155. handleComment(getChars(0, commentEndPos));
  156. try {
  157. in.close();
  158. in = new CharArrayReader(getChars(commentEndPos + 1));
  159. ch = '>';
  160. } catch (IOException e) {
  161. error("ioexception");
  162. }
  163. resetStrBuffer();
  164. } else {
  165. // no newline, so signal an error
  166. error("eof.comment");
  167. }
  168. }
  169. /**
  170. * Called when an empty tag is encountered.
  171. */
  172. protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
  173. }
  174. /**
  175. * Called when a start tag is encountered.
  176. */
  177. protected void handleStartTag(TagElement tag) {
  178. }
  179. /**
  180. * Called when an end tag is encountered.
  181. */
  182. protected void handleEndTag(TagElement tag) {
  183. }
  184. /**
  185. * An error has occurred.
  186. */
  187. protected void handleError(int ln, String msg) {
  188. /*
  189. Thread.dumpStack();
  190. System.out.println("**** " + stack);
  191. System.out.println("line " + ln + ": error: " + msg);
  192. System.out.println();
  193. */
  194. }
  195. /**
  196. * Output text.
  197. */
  198. void handleText(TagElement tag) {
  199. if (tag.breaksFlow()) {
  200. space = false;
  201. }
  202. if (textpos == 0) {
  203. if ((!space) || (stack == null) || last.breaksFlow() ||
  204. !stack.advance(dtd.pcdata)) {
  205. last = tag;
  206. space = false;
  207. lastBlockStartPos = currentBlockStartPos;
  208. return;
  209. }
  210. }
  211. if (space) {
  212. // enlarge buffer if needed
  213. if (textpos + 1 > text.length) {
  214. char newtext[] = new char[text.length + 200];
  215. System.arraycopy(text, 0, newtext, 0, text.length);
  216. text = newtext;
  217. }
  218. // output pending space
  219. text[textpos++] = ' ';
  220. space = false;
  221. }
  222. char newtext[] = new char[textpos];
  223. System.arraycopy(text, 0, newtext, 0, textpos);
  224. // Handles cases of bad html where the title tag
  225. // was getting lost when we did error recovery.
  226. if (tag.getElement().getName().equals("title")) {
  227. handleTitle(newtext);
  228. } else {
  229. handleText(newtext);
  230. }
  231. lastBlockStartPos = currentBlockStartPos;
  232. textpos = 0;
  233. last = tag;
  234. space = false;
  235. }
  236. /**
  237. * Invoke the error handler.
  238. */
  239. protected void error(String err, String arg1, String arg2,
  240. String arg3) {
  241. // big hack, but this should never get used...
  242. handleError (ln, err + arg1 + arg2 + arg3);
  243. }
  244. protected void error(String err, String arg1, String arg2) {
  245. error(err, arg1, arg2, "?");
  246. }
  247. protected void error(String err, String arg1) {
  248. error(err, arg1, "?", "?");
  249. }
  250. protected void error(String err) {
  251. error(err, "?", "?", "?");
  252. }
  253. /**
  254. * Handle a start tag. The new tag is pushed
  255. * onto the tag stack. The attribute list is
  256. * checked for required attributes.
  257. */
  258. protected void startTag(TagElement tag) throws ChangedCharSetException {
  259. Element elem = tag.getElement();
  260. // If the tag is an empty tag and texpos != 0
  261. // this implies that there is text before the
  262. // start tag that needs to be processed before
  263. // handling the tag.
  264. //
  265. if (!elem.isEmpty() || textpos != 0) {
  266. handleText(tag);
  267. } else {
  268. // this variable gets updated in handleText().
  269. // Since in this case we do not call handleText()
  270. // we need to update it here.
  271. //
  272. last = tag;
  273. // Note that we should really check last.breakFlows before
  274. // assuming this should be false.
  275. space = false;
  276. }
  277. lastBlockStartPos = currentBlockStartPos;
  278. // check required attributes
  279. for (AttributeList a = elem.atts ; a != null ; a = a.next) {
  280. if ((a.modifier == REQUIRED) && ((attributes.isEmpty()) || (!attributes.isDefined(a.name)))) {
  281. error("req.att ", a.getName(), elem.getName());
  282. }
  283. }
  284. if (elem.isEmpty()) {
  285. handleEmptyTag(tag);
  286. } else if (elem.getName().equals("form")) {
  287. handleStartTag(tag);
  288. } else {
  289. recent = elem;
  290. stack = new TagStack(tag, stack);
  291. handleStartTag(tag);
  292. }
  293. }
  294. /**
  295. * Handle an end tag. The end tag is popped
  296. * from the tag stack.
  297. */
  298. protected void endTag(boolean omitted) {
  299. handleText(stack.tag);
  300. if (omitted && !stack.elem.omitEnd()) {
  301. error("end.missing", stack.elem.getName());
  302. } else if (!stack.terminate()) {
  303. error("end.unexpected", stack.elem.getName());
  304. }
  305. // handle the tag
  306. handleEndTag(stack.tag);
  307. stack = stack.next;
  308. recent = (stack != null) ? stack.elem : null;
  309. }
  310. boolean ignoreElement(Element elem) {
  311. String stackElement = stack.elem.getName();
  312. String elemName = elem.getName();
  313. /* We ignore all elements that are not valid in the context of
  314. a table except <td>, <th> (these we handle in
  315. legalElementContext()) and #pcdata. We also ignore the
  316. <font> tag in the context of <ul> and <ol> We additonally
  317. ignore the <meta> and the <style> tag if the body tag has
  318. been seen. **/
  319. if ((elemName.equals("html") && seenHtml) ||
  320. (elemName.equals("head") && seenHead) ||
  321. (elemName.equals("body") && seenBody)) {
  322. return true;
  323. }
  324. if (elemName.equals("dt") || elemName.equals("dd")) {
  325. TagStack s = stack;
  326. while (s != null && !s.elem.getName().equals("dl")) {
  327. s = s.next;
  328. }
  329. if (s == null) {
  330. return true;
  331. }
  332. }
  333. if (((stackElement.equals("table")) &&
  334. (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
  335. ((elemName.equals("font")) &&
  336. (stackElement.equals("ul") || stackElement.equals("ol"))) ||
  337. (elemName.equals("meta") && stack != null) ||
  338. elemName.equals("style") ||
  339. (stackElement.equals("table") && elemName.equals("a"))) {
  340. return true;
  341. }
  342. return false;
  343. }
  344. /**
  345. * Marks the first time a tag has been seen in a document
  346. */
  347. protected void markFirstTime(Element elem) {
  348. String elemName = elem.getName();
  349. if (elemName.equals("html")) {
  350. seenHtml = true;
  351. } else if (elemName.equals("head")) {
  352. seenHead = true;
  353. } else if (elemName.equals("body")) {
  354. seenBody = true;
  355. }
  356. }
  357. /**
  358. * Create a legal content for an element.
  359. */
  360. boolean legalElementContext(Element elem) throws ChangedCharSetException {
  361. // System.out.println("-- legalContext -- " + elem);
  362. // Deal with the empty stack
  363. if (stack == null) {
  364. // System.out.println("-- stack is empty");
  365. if (elem != dtd.html) {
  366. // System.out.println("-- pushing html");
  367. startTag(makeTag(dtd.html, true));
  368. return legalElementContext(elem);
  369. }
  370. return true;
  371. }
  372. // Is it allowed in the current context
  373. if (stack.advance(elem)) {
  374. // System.out.println("-- legal context");
  375. markFirstTime(elem);
  376. return true;
  377. }
  378. boolean insertTag = false;
  379. // The use of all error recovery strategies are contingent
  380. // on the value of the strict property.
  381. //
  382. // These are commonly occuring errors. if insertTag is true,
  383. // then we want to adopt an error recovery strategy that
  384. // involves attempting to insert an additional tag to
  385. // legalize the context. The two errors addressed here
  386. // are:
  387. // 1) when a <td> or <th> is seen soon after a <table> tag.
  388. // In this case we insert a <tr>.
  389. // 2) when any other tag apart from a <tr> is seen
  390. // in the context of a <tr>. In this case we would
  391. // like to add a <td>. If a <tr> is seen within a
  392. // <tr> context, then we will close out the current
  393. // <tr>.
  394. //
  395. // This insertion strategy is handled later in the method.
  396. // The reason for checking this now, is that in other cases
  397. // we would like to apply other error recovery strategies for example
  398. // ignoring tags.
  399. //
  400. // In certain cases it is better to ignore a tag than try to
  401. // fix the situation. So the first test is to see if this
  402. // is what we need to do.
  403. //
  404. String stackElemName = stack.elem.getName();
  405. String elemName = elem.getName();
  406. if (!strict &&
  407. ((stackElemName.equals("table") && elemName.equals("td")) ||
  408. (stackElemName.equals("table") && elemName.equals("th")) ||
  409. (stackElemName.equals("tr") && !elemName.equals("tr")))){
  410. insertTag = true;
  411. }
  412. if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
  413. elem.getName().equals("body"))) {
  414. if (skipTag = ignoreElement(elem)) {
  415. error("tag.ignore", elem.getName());
  416. return skipTag;
  417. }
  418. }
  419. // Check for anything after the start of the table besides tr, td, th
  420. // or caption, and if those aren't there, insert the <tr> and call
  421. // legalElementContext again.
  422. if (!strict && stackElemName.equals("table") &&
  423. !elemName.equals("tr") && !elemName.equals("td") &&
  424. !elemName.equals("th") && !elemName.equals("caption")) {
  425. Element e = dtd.getElement("tr");
  426. TagElement t = makeTag(e, true);
  427. legalTagContext(t);
  428. startTag(t);
  429. error("start.missing", elem.getName());
  430. return legalElementContext(elem);
  431. }
  432. // They try to find a legal context by checking if the current
  433. // tag is valid in an enclosing context. If so
  434. // close out the tags by outputing end tags and then
  435. // insert the curent tag. If the tags that are
  436. // being closed out do not have an optional end tag
  437. // specification in the DTD then an html error is
  438. // reported.
  439. //
  440. if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
  441. for (TagStack s = stack.next ; s != null ; s = s.next) {
  442. if (s.advance(elem)) {
  443. while (stack != s) {
  444. endTag(true);
  445. }
  446. return true;
  447. }
  448. if (!s.terminate() || (strict && !s.elem.omitEnd())) {
  449. break;
  450. }
  451. }
  452. }
  453. // Check if we know what tag is expected next.
  454. // If so insert the tag. Report an error if the
  455. // tag does not have its start tag spec in the DTD as optional.
  456. //
  457. Element next = stack.first();
  458. if (next != null && (!strict || next.omitStart()) &&
  459. !(next==dtd.head && elem==dtd.pcdata) ) {
  460. // System.out.println("-- omitting start tag: " + next);
  461. TagElement t = makeTag(next, true);
  462. legalTagContext(t);
  463. startTag(t);
  464. if (!next.omitStart()) {
  465. error("start.missing", elem.getName());
  466. }
  467. return legalElementContext(elem);
  468. }
  469. // Traverse the list of expected elements and determine if adding
  470. // any of these elements would make for a legal context.
  471. //
  472. if (!strict) {
  473. ContentModel content = stack.contentModel();
  474. Vector elemVec = new Vector();
  475. if (content != null) {
  476. content.getElements(elemVec);
  477. for (Enumeration v = elemVec.elements(); v.hasMoreElements();) {
  478. Element e = (Element)v.nextElement();
  479. // Ensure that this element has not been included as
  480. // part of the exclusions in the DTD.
  481. //
  482. if (stack.excluded(e.getIndex())) {
  483. continue;
  484. }
  485. boolean reqAtts = false;
  486. for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
  487. if (a.modifier == REQUIRED) {
  488. reqAtts = true;
  489. break;
  490. }
  491. }
  492. // Ensure that no tag that has required attributes
  493. // gets inserted.
  494. //
  495. if (reqAtts) {
  496. continue;
  497. }
  498. ContentModel m = e.getContent();
  499. if (m != null && m.first(elem)) {
  500. // System.out.println("-- adding a legal tag: " + e);
  501. TagElement t = makeTag(e, true);
  502. legalTagContext(t);
  503. startTag(t);
  504. error("start.missing", e.getName());
  505. return legalElementContext(elem);
  506. }
  507. }
  508. }
  509. }
  510. // Check if the stack can be terminated. If so add the appropriate
  511. // end tag. Report an error if the tag being ended does not have its
  512. // end tag spec in the DTD as optional.
  513. //
  514. if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
  515. // System.out.println("-- omitting end tag: " + stack.elem);
  516. if (!stack.elem.omitEnd()) {
  517. error("end.missing", elem.getName());
  518. }
  519. endTag(true);
  520. return legalElementContext(elem);
  521. }
  522. // At this point we know that something is screwed up.
  523. return false;
  524. }
  525. /**
  526. * Create a legal context for a tag.
  527. */
  528. void legalTagContext(TagElement tag) throws ChangedCharSetException {
  529. if (legalElementContext(tag.getElement())) {
  530. markFirstTime(tag.getElement());
  531. return;
  532. }
  533. // Avoid putting a block tag in a flow tag.
  534. if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
  535. endTag(true);
  536. legalTagContext(tag);
  537. return;
  538. }
  539. // Avoid putting something wierd in the head of the document.
  540. for (TagStack s = stack ; s != null ; s = s.next) {
  541. if (s.tag.getElement() == dtd.head) {
  542. while (stack != s) {
  543. endTag(true);
  544. }
  545. endTag(true);
  546. legalTagContext(tag);
  547. return;
  548. }
  549. }
  550. // Everything failed
  551. error("tag.unexpected", tag.getElement().getName());
  552. }
  553. /**
  554. * Error context. Something went wrong, make sure we are in
  555. * the document's body context
  556. */
  557. void errorContext() throws ChangedCharSetException {
  558. for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
  559. handleEndTag(stack.tag);
  560. }
  561. if (stack == null) {
  562. legalElementContext(dtd.body);
  563. startTag(makeTag(dtd.body, true));
  564. }
  565. }
  566. /**
  567. * Add a char to the string buffer.
  568. */
  569. void addString(int c) {
  570. if (strpos == str.length) {
  571. char newstr[] = new char[str.length + 128];
  572. System.arraycopy(str, 0, newstr, 0, str.length);
  573. str = newstr;
  574. }
  575. str[strpos++] = (char)c;
  576. }
  577. /**
  578. * Get the string that's been accumulated.
  579. */
  580. String getString(int pos) {
  581. char newStr[] = new char[strpos - pos];
  582. System.arraycopy(str, pos, newStr, 0, strpos - pos);
  583. strpos = pos;
  584. return new String(newStr);
  585. }
  586. char[] getChars(int pos) {
  587. char newStr[] = new char[strpos - pos];
  588. System.arraycopy(str, pos, newStr, 0, strpos - pos);
  589. strpos = pos;
  590. return newStr;
  591. }
  592. char[] getChars(int pos, int endPos) {
  593. char newStr[] = new char[endPos - pos];
  594. System.arraycopy(str, pos, newStr, 0, endPos - pos);
  595. // REMIND: it's not clear whether this version should set strpos or not
  596. // strpos = pos;
  597. return newStr;
  598. }
  599. void resetStrBuffer() {
  600. strpos = 0;
  601. }
  602. int strIndexOf(char target) {
  603. for (int i = 0; i < strpos; i++) {
  604. if (str[i] == target) {
  605. return i;
  606. }
  607. }
  608. return -1;
  609. }
  610. /**
  611. * Skip space.
  612. * [5] 297:5
  613. */
  614. void skipSpace() throws IOException {
  615. while (true) {
  616. switch (ch) {
  617. case '\n':
  618. ln++;
  619. ch = readCh();
  620. lfCount++;
  621. break;
  622. case '\r':
  623. ln++;
  624. if ((ch = readCh()) == '\n') {
  625. ch = readCh();
  626. crlfCount++;
  627. }
  628. else {
  629. crCount++;
  630. }
  631. break;
  632. case ' ':
  633. case '\t':
  634. ch = readCh();
  635. break;
  636. default:
  637. return;
  638. }
  639. }
  640. }
  641. /**
  642. * Parse identifier. Uppercase characters are folded
  643. * to lowercase when lower is true. Returns falsed if
  644. * no identifier is found. [55] 346:17
  645. */
  646. boolean parseIdentifier(boolean lower) throws IOException {
  647. switch (ch) {
  648. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  649. case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  650. case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  651. case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  652. case 'Y': case 'Z':
  653. if (lower) {
  654. ch = 'a' + (ch - 'A');
  655. }
  656. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  657. case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  658. case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  659. case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  660. case 'y': case 'z':
  661. break;
  662. default:
  663. return false;
  664. }
  665. while (true) {
  666. addString(ch);
  667. switch (ch = readCh()) {
  668. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  669. case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  670. case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  671. case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  672. case 'Y': case 'Z':
  673. if (lower) {
  674. ch = 'a' + (ch - 'A');
  675. }
  676. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  677. case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  678. case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  679. case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  680. case 'y': case 'z':
  681. case '0': case '1': case '2': case '3': case '4':
  682. case '5': case '6': case '7': case '8': case '9':
  683. case '.': case '-':
  684. case '_': // not officially allowed
  685. break;
  686. default:
  687. return true;
  688. }
  689. }
  690. }
  691. /**
  692. * Parse an entity reference. [59] 350:17
  693. */
  694. private char[] parseEntityReference() throws IOException {
  695. int pos = strpos;
  696. if ((ch = readCh()) == '#') {
  697. int n = 0;
  698. ch = readCh();
  699. if ((ch >= '0') && (ch <= '9')) {
  700. while ((ch >= '0') && (ch <= '9')) {
  701. n = (n * 10) + ch - '0';
  702. ch = readCh();
  703. }
  704. switch (ch) {
  705. case '\n':
  706. ln++;
  707. ch = readCh();
  708. lfCount++;
  709. break;
  710. case '\r':
  711. ln++;
  712. if ((ch = readCh()) == '\n') {
  713. ch = readCh();
  714. crlfCount++;
  715. }
  716. else {
  717. crCount++;
  718. }
  719. break;
  720. case ';':
  721. ch = readCh();
  722. break;
  723. }
  724. char data[] = {(char)n};
  725. return data;
  726. }
  727. addString('#');
  728. if (!parseIdentifier(false)) {
  729. error("ident.expected");
  730. strpos = pos;
  731. char data[] = {'&', '#'};
  732. return data;
  733. }
  734. } else if (!parseIdentifier(false)) {
  735. char data[] = {'&'};
  736. return data;
  737. }
  738. switch (ch) {
  739. case '\n':
  740. ln++;
  741. ch = readCh();
  742. lfCount++;
  743. break;
  744. case '\r':
  745. ln++;
  746. if ((ch = readCh()) == '\n') {
  747. ch = readCh();
  748. crlfCount++;
  749. }
  750. else {
  751. crCount++;
  752. }
  753. break;
  754. case ';':
  755. ch = readCh();
  756. break;
  757. }
  758. String nm = getString(pos);
  759. Entity ent = dtd.getEntity(nm);
  760. // entities are case sensitive - however if strict
  761. // is false then we will try to make a match by
  762. // converting the string to all lowercase.
  763. //
  764. if (!strict && (ent == null)) {
  765. ent = dtd.getEntity(nm.toLowerCase());
  766. }
  767. if ((ent == null) || !ent.isGeneral()) {
  768. if (nm.length() == 0) {
  769. error("invalid.entref", nm);
  770. return new char[0];
  771. }
  772. /* given that there is not a match restore the entity reference */
  773. String str = "&" + nm;
  774. char b[] = new char[str.length()];
  775. str.getChars(0, b.length, b, 0);
  776. return b;
  777. }
  778. return ent.getData();
  779. }
    /**
     * Parse a comment. [92] 391:7
     * <p>
     * Appends the comment text to the string buffer (via addString)
     * until the end of the comment is recognised, then returns with the
     * current character being the one that ended it. A '\r' or "\r\n" is
     * stored as a single '\n'. Reaching end of input (-1) hands over to
     * handleEOFInComment().
     *
     * @throws IOException if reading the next character fails
     */
    void parseComment() throws IOException {
        while (true) {
            // c is what gets stored; ch moves on to the look-ahead.
            int c = ch;
            switch (c) {
              case '-':
                /** Presuming that the start string of a comment "<!--" has
                    already been parsed, the '-' character is valid only as
                    part of a comment termination and further more it must
                    be present in even numbers. Hence if strict is true, we
                    presume the comment has been terminated and return.
                    However if strict is false, then there is no even number
                    requirement and this character can appear anywhere in the
                    comment. The parser reads on until it sees the following
                    pattern: "-->" or "--!>".
                 **/
                // Lenient mode, and the previously stored character was
                // also a '-': this dash may complete "-->" or "--!>".
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    break;
                }

                // First dash of a possible "--": look at what follows.
                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    // In strict mode "--" alone ends the comment.
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                // End of input inside the comment.
                handleEOFInComment();
                return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                // A '>' that is not part of a terminator is plain text.
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // Store the line break as '\n'.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }
  862. /**
  863. * Parse literal content. [46] 343:1 and [47] 344:1
  864. */
  865. void parseLiteral(boolean replace) throws IOException {
  866. while (true) {
  867. int c = ch;
  868. switch (c) {
  869. case -1:
  870. error("eof.literal", stack.elem.getName());
  871. endTag(true);
  872. return;
  873. case '>':
  874. ch = readCh();
  875. int i = textpos - (stack.elem.name.length() + 2), j = 0;
  876. // match end tag
  877. if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
  878. while ((++i < textpos) &&
  879. (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
  880. if (i == textpos) {
  881. textpos -= (stack.elem.name.length() + 2);
  882. if ((textpos > 0) && (text[textpos-1] == '\n')) {
  883. textpos--;
  884. }
  885. endTag(false);
  886. return;
  887. }
  888. }
  889. break;
  890. case '&':
  891. char data[] = parseEntityReference();
  892. if (textpos + data.length > text.length) {
  893. char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
  894. System.arraycopy(text, 0, newtext, 0, text.length);
  895. text = newtext;
  896. }
  897. System.arraycopy(data, 0, text, textpos, data.length);
  898. textpos += data.length;
  899. continue;
  900. case '\n':
  901. ln++;
  902. ch = readCh();
  903. lfCount++;
  904. break;
  905. case '\r':
  906. ln++;
  907. if ((ch = readCh()) == '\n') {
  908. ch = readCh();
  909. crlfCount++;
  910. }
  911. else {
  912. crCount++;
  913. }
  914. c = '\n';
  915. break;
  916. default:
  917. ch = readCh();
  918. break;
  919. }
  920. // output character
  921. if (textpos == text.length) {
  922. char newtext[] = new char[text.length + 128];
  923. System.arraycopy(text, 0, newtext, 0, text.length);
  924. text = newtext;
  925. }
  926. text[textpos++] = (char)c;
  927. }
  928. }
    /**
     * Parse attribute value. [33] 331:1
     * <p>
     * Reads a quoted or unquoted attribute value into the string buffer
     * and returns the whole buffer content (from index 0), which also
     * empties the buffer. A quoted value ends at the matching quote; an
     * unquoted value ends at white space, '&lt;', '&gt;' or end of input.
     *
     * @param lower whether to fold 'A'-'Z' in the value to lower case
     * @return the attribute value
     * @throws IOException if reading the next character fails
     */
    String parseAttributeValue(boolean lower) throws IOException {
        // -1 means "no delimiter", i.e. an unquoted value
        int delim = -1;

        // Check for a delimiter
        switch(ch) {
          case '\'':
          case '"':
            delim = ch;
            ch = readCh();
            break;
        }

        // Parse the rest of the value
        while (true) {
            int c = ch;

            switch (c) {
              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                // A line break ends an unquoted value.
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\r':
                ln++;

                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                if (delim < 0) {
                    return getString(0);
                }
                // NOTE(review): c is still '\r' here, so inside a quoted
                // value the raw '\r' is stored (and the '\n' of a "\r\n"
                // is dropped), whereas parseComment and parseLiteral
                // store '\n' - confirm this is intended.
                break;

              case '\t':
                if (delim < 0)
                    c = ' ';
                // falls through to the space handling
              case ' ':
                ch = readCh();
                // White space ends an unquoted value.
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '>':
              case '<':
                // Left unread so that the caller sees the tag boundary.
                if (delim < 0) {
                    return getString(0);
                }
                ch = readCh();
                break;

              case '\'':
              case '"':
                ch = readCh();
                if (c == delim) {
                    // closing quote
                    return getString(0);
                } else if (delim == -1) {
                    // A quote in the middle of an unquoted value.
                    error("attvalerr");
                    if (strict || ch == ' ') {
                        return getString(0);
                    } else {
                        // lenient: drop the stray quote and carry on
                        continue;
                    }
                }
                // the other kind of quote inside a quoted value: keep it
                break;

              case '=':
                if (delim < 0) {
                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
                       is considered invalid since an = sign can only be contained
                       in an attributes value if the string is quoted.
                    */
                    error("attvalerr");
                    /* If strict is true then we return with the string we have thus far.
                       Otherwise we accept the = sign as part of the attribute's value and
                       process the rest of the img tag. */
                    if (strict) {
                        return getString(0);
                    }
                }
                ch = readCh();
                break;

              case '&':
                // Strict mode keeps '&' literally in an unquoted value.
                if (strict && delim < 0) {
                    ch = readCh();
                    break;
                }

                // Otherwise expand the reference, folding case if asked.
                char data[] = parseEntityReference();
                for (int i = 0 ; i < data.length ; i++) {
                    c = data[i];
                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
                }
                continue;

              case -1:
                // end of input: return what has been collected
                return getString(0);

              default:
                if (lower && (c >= 'A') && (c <= 'Z')) {
                    c = 'a' + c - 'A';
                }
                ch = readCh();
                break;
            }
            addString(c);
        }
    }
  1036. /**
  1037. * Parse attribute specification List. [31] 327:17
  1038. */
  1039. void parseAttributeSpecificationList(Element elem) throws IOException {
  1040. while (true) {
  1041. skipSpace();
  1042. switch (ch) {
  1043. case '/':
  1044. case '>':
  1045. case '<':
  1046. case -1:
  1047. return;
  1048. case '-':
  1049. if ((ch = readCh()) == '-') {
  1050. ch = readCh();
  1051. parseComment();
  1052. strpos = 0;
  1053. } else {
  1054. error("invalid.tagchar", "-", elem.getName());
  1055. ch = readCh();
  1056. }
  1057. continue;
  1058. }
  1059. AttributeList att = null;
  1060. String attname = null;
  1061. String attvalue = null;
  1062. if (parseIdentifier(true)) {
  1063. attname = getString(0);
  1064. skipSpace();
  1065. if (ch == '=') {
  1066. ch = readCh();
  1067. skipSpace();
  1068. att = elem.getAttribute(attname);
  1069. // Bug ID 4102750
  1070. // Load the NAME of an Attribute Case Sensitive
  1071. // The case of the NAME must be intact
  1072. // MG 021898
  1073. attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
  1074. // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
  1075. } else {
  1076. attvalue = attname;
  1077. att = elem.getAttributeByValue(attvalue);
  1078. if (att == null) {
  1079. att = elem.getAttribute(attname);
  1080. if (att != null) {
  1081. attvalue = att.getValue();
  1082. }
  1083. else {
  1084. // Make it null so that NULL_ATTRIBUTE_VALUE is
  1085. // used
  1086. attvalue = null;
  1087. }
  1088. }
  1089. }
  1090. } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
  1091. ch = readCh();
  1092. continue;
  1093. } else if (!strict && ch == '"') { // allows for quoted attributes
  1094. ch = readCh();
  1095. skipSpace();
  1096. if (parseIdentifier(true)) {
  1097. attname = getString(0);
  1098. if (ch == '"') {
  1099. ch = readCh();
  1100. }
  1101. skipSpace();
  1102. if (ch == '=') {
  1103. ch = readCh();
  1104. skipSpace();
  1105. att = elem.getAttribute(attname);
  1106. attvalue = parseAttributeValue((att != null) &&
  1107. (att.type != CDATA) &&
  1108. (att.type != NOTATION));
  1109. } else {
  1110. attvalue = attname;
  1111. att = elem.getAttributeByValue(attvalue);
  1112. if (att == null) {
  1113. att = elem.getAttribute(attname);
  1114. if (att != null) {
  1115. attvalue = att.getValue();
  1116. }
  1117. }
  1118. }
  1119. } else {
  1120. char str[] = {(char)ch};
  1121. error("invalid.tagchar", new String(str), elem.getName());
  1122. ch = readCh();
  1123. continue;
  1124. }
  1125. } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
  1126. ch = readCh();
  1127. skipSpace();
  1128. attname = elem.getName();
  1129. att = elem.getAttribute(attname);
  1130. attvalue = parseAttributeValue((att != null) &&
  1131. (att.type != CDATA) &&
  1132. (att.type != NOTATION));
  1133. } else if (!strict && (ch == '=')) {
  1134. ch = readCh();
  1135. skipSpace();
  1136. attvalue = parseAttributeValue(true);
  1137. error("attvalerr");
  1138. return;
  1139. } else {
  1140. char str[] = {(char)ch};
  1141. error("invalid.tagchar", new String(str), elem.getName());
  1142. if (!strict) {
  1143. ch = readCh();
  1144. continue;
  1145. } else {
  1146. return;
  1147. }
  1148. }
  1149. if (att != null) {
  1150. attname = att.getName();
  1151. } else {
  1152. error("invalid.tagatt", attname, elem.getName());
  1153. }
  1154. // Check out the value
  1155. if (attributes.isDefined(attname)) {
  1156. error("multi.tagatt", attname, elem.getName());
  1157. }
  1158. if (attvalue == null) {
  1159. attvalue = ((att != null) && (att.value != null)) ? att.value :
  1160. HTML.NULL_ATTRIBUTE_VALUE;
  1161. } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
  1162. error("invalid.tagattval", attname, elem.getName());
  1163. }
  1164. HTML.Attribute attkey = HTML.getAttributeKey(attname);
  1165. if (attkey == null) {
  1166. attributes.addAttribute(attname, attvalue);
  1167. } else {
  1168. attributes.addAttribute(attkey, attvalue);
  1169. }
  1170. }
  1171. }
  1172. /**
  1173. * Parses th Document Declaration Type markup declaration.
  1174. * Currently ignores it.
  1175. */
  1176. public String parseDTDMarkup() throws IOException {
  1177. StringBuffer strBuff = new StringBuffer();
  1178. ch = readCh();
  1179. while(true) {
  1180. switch (ch) {
  1181. case '>':
  1182. ch = readCh();
  1183. return strBuff.toString();
  1184. case -1:
  1185. error("invalid.markup");
  1186. return strBuff.toString();
  1187. case '\n':
  1188. ln++;
  1189. ch = readCh();
  1190. lfCount++;
  1191. break;
  1192. case '"':
  1193. ch = readCh();
  1194. break;
  1195. case '\r':
  1196. ln++;
  1197. if ((ch = readCh()) == '\n') {
  1198. ch = readCh();
  1199. crlfCount++;
  1200. }
  1201. else {
  1202. crCount++;
  1203. }
  1204. break;
  1205. default:
  1206. strBuff.append((char)(ch & 0xFF));
  1207. ch = readCh();
  1208. break;
  1209. }
  1210. }
  1211. }
  1212. /**
  1213. * Parse markup declarations.
  1214. * Currently only handles the Document Type Declaration markup.
  1215. * Returns true if it is a markup declaration false otherwise.
  1216. */
  1217. protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
  1218. /* Currently handles only the DOCTYPE */
  1219. if ((strBuff.length() == "DOCTYPE".length()) &&
  1220. (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
  1221. parseDTDMarkup();
  1222. return true;
  1223. }
  1224. return false;
  1225. }
  1226. /**
  1227. * Parse an invalid tag.
  1228. */
  1229. void parseInvalidTag() throws IOException {
  1230. // ignore all data upto the close bracket '>'
  1231. while (true) {
  1232. skipSpace();
  1233. switch (ch) {
  1234. case '>':
  1235. case -1:
  1236. ch = readCh();
  1237. return;
  1238. case '<':
  1239. return;
  1240. default:
  1241. ch = readCh();
  1242. }
  1243. }
  1244. }
  1245. /**
  1246. * Parse a start or end tag.
  1247. */
  1248. void parseTag() throws IOException {
  1249. Element elem = null;
  1250. boolean net = false;
  1251. boolean warned = false;
  1252. boolean unknown = false;
  1253. switch (ch = readCh()) {
  1254. case '!':
  1255. switch (ch = readCh()) {
  1256. case '-':
  1257. // Parse comment. [92] 391:7
  1258. while (true) {
  1259. if (ch == '-') {
  1260. if (!strict || ((ch = readCh()) == '-')) {
  1261. ch = readCh();
  1262. if (!strict && ch == '-') {
  1263. ch = readCh();
  1264. }
  1265. // send over any text you might see
  1266. // before parsing and sending the
  1267. // comment
  1268. if (textpos != 0) {
  1269. char newtext[] = new char[textpos];
  1270. System.arraycopy(text, 0, newtext, 0, textpos);
  1271. handleText(newtext);
  1272. lastBlockStartPos = currentBlockStartPos;
  1273. textpos = 0;
  1274. }
  1275. parseComment();
  1276. handleComment(getChars(0));
  1277. continue;
  1278. } else if (!warned) {
  1279. warned = true;
  1280. error("invalid.commentchar", "-");
  1281. }
  1282. }
  1283. skipSpace();
  1284. switch (ch) {
  1285. case '-':
  1286. continue;
  1287. case '>':
  1288. ch = readCh();
  1289. case -1:
  1290. return;
  1291. default:
  1292. ch = readCh();
  1293. if (!warned) {
  1294. warned = true;
  1295. error("invalid.commentchar",
  1296. String.valueOf((char)ch));
  1297. }
  1298. break;
  1299. }
  1300. }
  1301. default:
  1302. // deal with marked sections
  1303. StringBuffer strBuff = new StringBuffer();
  1304. while (true) {
  1305. strBuff.append((char)ch);
  1306. if (parseMarkupDeclarations(strBuff)) {
  1307. return;
  1308. }
  1309. switch(ch) {
  1310. case '>':
  1311. ch = readCh();
  1312. case -1:
  1313. error("invalid.markup");
  1314. return;
  1315. case '\n':
  1316. ln++;
  1317. ch = readCh();
  1318. lfCount++;
  1319. break;
  1320. case '\r':
  1321. ln++;
  1322. if ((ch = readCh()) == '\n') {
  1323. ch = readCh();
  1324. crlfCount++;
  1325. }
  1326. else {
  1327. crCount++;
  1328. }
  1329. break;
  1330. default:
  1331. ch = readCh();
  1332. break;
  1333. }
  1334. }
  1335. }
  1336. case '/':
  1337. // parse end tag [19] 317:4
  1338. switch (ch = readCh()) {
  1339. case '>':
  1340. ch = readCh();
  1341. case '<':
  1342. // empty end tag. either </> or </<
  1343. if (recent == null) {
  1344. error("invalid.shortend");
  1345. return;
  1346. }
  1347. elem = recent;
  1348. break;
  1349. default:
  1350. if (!parseIdentifier(true)) {
  1351. error("expected.endtagname");
  1352. return;
  1353. }
  1354. skipSpace();
  1355. switch (ch) {
  1356. case '>':
  1357. ch = readCh();
  1358. case '<':
  1359. break;
  1360. default:
  1361. error("expected", "'>'");
  1362. while ((ch != -1) && (ch != '\n') && (ch != '>')) {
  1363. ch = readCh();
  1364. }
  1365. if (ch == '>') {
  1366. ch = readCh();
  1367. }
  1368. break;
  1369. }
  1370. String elemStr = getString(0);
  1371. if (!dtd.elementExists(elemStr)) {
  1372. error("end.unrecognized", elemStr);
  1373. // Ignore RE before end tag
  1374. if ((textpos > 0) && (text[textpos-1] == '\n')) {
  1375. textpos--;
  1376. }
  1377. elem = dtd.getElement("unknown");
  1378. elem.name = elemStr;
  1379. unknown = true;
  1380. } else {
  1381. elem = dtd.getElement(elemStr);
  1382. }
  1383. break;
  1384. }
  1385. // If the stack is null, we're seeing end tags without any begin
  1386. // tags. Ignore them.
  1387. if (stack == null) {
  1388. error("end.extra.tag", elem.getName());
  1389. return;
  1390. }
  1391. // Ignore RE before end tag
  1392. if ((textpos > 0) && (text[textpos-1] == '\n')) {
  1393. // In a pre tag, if there are blank lines
  1394. // we do not want to remove the newline
  1395. // before the end tag. Hence this code.
  1396. //
  1397. if (stack.pre) {
  1398. if ((textpos > 1) && (text[textpos-2] != '\n')) {
  1399. textpos--;
  1400. }
  1401. } else {
  1402. textpos--;
  1403. }
  1404. }
  1405. // If the end tag is a form, since we did not put it
  1406. // on the tag stack, there is no corresponding start
  1407. // start tag to find. Hence do not touch the tag stack.
  1408. //
  1409. if (!strict && elem.getName().equals("form")) {
  1410. if (lastFormSent != null) {
  1411. handleEndTag(lastFormSent);
  1412. return;
  1413. } else {
  1414. // do nothing.
  1415. return;
  1416. }
  1417. }
  1418. if (unknown) {
  1419. // we will not see a corresponding start tag
  1420. // on the the stack. If we are seeing an
  1421. // end tag, lets send this on as an empty
  1422. // tag with the end tag attribute set to
  1423. // true.
  1424. TagElement t = makeTag(elem);
  1425. handleText(t);
  1426. attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
  1427. handleEmptyTag(makeTag(elem));
  1428. unknown = false;
  1429. return;
  1430. }
  1431. // find the corresponding start tag
  1432. // A commonly occuring error appears to be the insertion
  1433. // of extra end tags in a table. The intent here is ignore
  1434. // such extra end tags.
  1435. //
  1436. if (!strict) {
  1437. String stackElem = stack.elem.getName();
  1438. if (stackElem.equals("table")) {
  1439. // If it isnt a valid end tag ignore it and return
  1440. //
  1441. if (!elem.getName().equals(stackElem)) {
  1442. error("tag.ignore", elem.getName());
  1443. return;
  1444. }
  1445. }
  1446. if (stackElem.equals("tr") ||
  1447. stackElem.equals("td")) {
  1448. if ((!elem.getName().equals("table")) &&
  1449. (!elem.getName().equals(stackElem))) {
  1450. error("tag.ignore", elem.getName());
  1451. return;
  1452. }
  1453. }
  1454. }
  1455. TagStack sp = stack;
  1456. while ((sp != null) && (elem != sp.elem)) {
  1457. sp = sp.next;
  1458. }
  1459. if (sp == null) {
  1460. error("unmatched.endtag", elem.getName());
  1461. return;
  1462. }
  1463. // People put font ending tags in the darndest places.
  1464. // Don't close other contexts based on them being between
  1465. // a font tag and the corresponding end tag. Instead,
  1466. // ignore the end tag like it doesn't exist and allow the end
  1467. // of the document to close us out.
  1468. String elemName = elem.getName();
  1469. if (stack != sp &&
  1470. (elemName.equals("font") ||
  1471. elemName.equals("center"))) {
  1472. // Since closing out a center tag can have real wierd
  1473. // effects on the formatting, make sure that tags
  1474. // for which omitting an end tag is legimitate
  1475. // get closed out.
  1476. //
  1477. if (elemName.equals("center")) {
  1478. while(stack.elem.omitEnd() && stack != sp) {
  1479. endTag(true);
  1480. }
  1481. if (stack.elem == elem) {
  1482. endTag(false);
  1483. }
  1484. }
  1485. return;
  1486. }
  1487. // People do the same thing with center tags. In this
  1488. // case we would like to close off the center tag but
  1489. // not necessarily all enclosing tags.
  1490. // end tags
  1491. while (stack != sp) {
  1492. endTag(true);
  1493. }
  1494. endTag(false);
  1495. return;
  1496. case -1:
  1497. error("eof");
  1498. return;
  1499. }
  1500. // start tag [14] 314:1
  1501. if (!parseIdentifier(true)) {
  1502. elem = recent;
  1503. if ((ch != '>') || (elem == null)) {
  1504. error("expected.tagname");
  1505. return;
  1506. }
  1507. } else {
  1508. String elemStr = getString(0);
  1509. if (elemStr.equals("image")) {
  1510. elemStr = new String("img");
  1511. }
  1512. /* determine if this element is part of the dtd. */
  1513. if (!dtd.elementExists(elemStr)) {
  1514. // parseInvalidTag();
  1515. error("tag.unrecognized ", elemStr);
  1516. elem = dtd.getElement("unknown");
  1517. elem.name = elemStr;
  1518. unknown = true;
  1519. } else {
  1520. elem = dtd.getElement(elemStr);
  1521. }
  1522. }
  1523. // Parse attributes
  1524. parseAttributeSpecificationList(elem);
  1525. switch (ch) {
  1526. case '/':
  1527. net = true;
  1528. case '>':
  1529. ch = readCh();
  1530. case '<':
  1531. break;
  1532. default:
  1533. error("expected", "'>'");
  1534. break;
  1535. }
  1536. if (!strict) {
  1537. if (elem.getName().equals("script")) {
  1538. error("javascript.unsupported");
  1539. }
  1540. }
  1541. // ignore RE after start tag
  1542. //
  1543. if (!elem.isEmpty()) {
  1544. if (ch == '\n') {
  1545. ln++;
  1546. lfCount++;
  1547. ch = readCh();
  1548. } else if (ch == '\r') {
  1549. ln++;
  1550. if ((ch = readCh()) == '\n') {
  1551. ch = readCh();
  1552. crlfCount++;
  1553. }
  1554. else {
  1555. crCount++;
  1556. }
  1557. }
  1558. }
  1559. // ensure a legal context for the tag
  1560. TagElement tag = makeTag(elem, false);
  1561. /** In dealing with forms, we have decided to treat
  1562. them as legal in any context. Also, even though
  1563. they do have a start and an end tag, we will
  1564. not put this tag on the stack. This is to deal
  1565. several pages in the web oasis that choose to
  1566. start and end forms in any possible location. **/
  1567. if (!strict && elem.getName().equals("form")) {
  1568. if (lastFormSent == null) {
  1569. lastFormSent = tag;
  1570. } else {
  1571. handleEndTag(lastFormSent);
  1572. lastFormSent = tag;
  1573. }
  1574. } else {
  1575. // Smlly, if a tag is unknown, we will apply
  1576. // no legalTagContext logic to it.
  1577. //
  1578. if (!unknown) {
  1579. legalTagContext(tag);
  1580. // If skip tag is true, this implies that
  1581. // the tag was illegal and that the error
  1582. // recovery strategy adopted is to ignore
  1583. // the tag.
  1584. if (!strict && skipTag) {
  1585. skipTag = false;
  1586. return;
  1587. }
  1588. }
  1589. }
  1590. startTag(tag);
  1591. if (!elem.isEmpty()) {
  1592. switch (elem.getType()) {
  1593. case CDATA:
  1594. parseLiteral(false);
  1595. break;
  1596. case RCDATA:
  1597. parseLiteral(true);
  1598. break;
  1599. default:
  1600. if (stack != null) {
  1601. stack.net = net;
  1602. }
  1603. break;
  1604. }
  1605. }
  1606. }
  1607. /**
  1608. * Parse Content. [24] 320:1
  1609. */
  1610. void parseContent() throws IOException {
  1611. Thread curThread = Thread.currentThread();
  1612. for (;;) {
  1613. if (curThread.isInterrupted()) {
  1614. curThread.interrupt(); // resignal the interrupt
  1615. break;
  1616. }
  1617. int c = ch;
  1618. currentBlockStartPos = currentPosition;
  1619. switch (c) {
  1620. case '<':
  1621. parseTag();
  1622. lastBlockStartPos = currentPosition;
  1623. continue;
  1624. case '/':
  1625. ch = readCh();
  1626. if ((stack != null) && stack.net) {
  1627. // null end tag.
  1628. endTag(false);
  1629. continue;
  1630. }
  1631. break;
  1632. case -1:
  1633. return;
  1634. case '&':
  1635. if (textpos == 0) {
  1636. if (!legalElementContext(dtd.pcdata)) {
  1637. error("unexpected.pcdata");
  1638. }
  1639. if (last.breaksFlow()) {
  1640. space = false;
  1641. }
  1642. }
  1643. char data[] = parseEntityReference();
  1644. if (textpos + data.length + 1 > text.length) {
  1645. char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
  1646. System.arraycopy(text, 0, newtext, 0, text.length);
  1647. text = newtext;
  1648. }
  1649. if (space) {
  1650. space = false;
  1651. text[textpos++] = ' ';
  1652. }
  1653. System.arraycopy(data, 0, text, textpos, data.length);
  1654. textpos += data.length;
  1655. continue;
  1656. case '\n':
  1657. ln++;
  1658. lfCount++;
  1659. ch = readCh();
  1660. if ((stack != null) && stack.pre) {
  1661. break;
  1662. }
  1663. space = true;
  1664. if (textpos == 0) {
  1665. lastBlockStartPos = currentPosition;
  1666. }
  1667. continue;
  1668. case '\r':
  1669. ln++;
  1670. c = '\n';
  1671. if ((ch = readCh()) == '\n') {
  1672. ch = readCh();
  1673. crlfCount++;
  1674. }
  1675. else {
  1676. crCount++;
  1677. }
  1678. if ((stack != null) && stack.pre) {
  1679. break;
  1680. }
  1681. if (textpos == 0) {
  1682. lastBlockStartPos = currentPosition;
  1683. }
  1684. space = true;
  1685. continue;
  1686. case '\t':
  1687. case ' ':
  1688. ch = readCh();
  1689. if ((stack != null) && stack.pre) {
  1690. break;
  1691. }
  1692. space = true;
  1693. if (textpos == 0) {
  1694. lastBlockStartPos = currentPosition;
  1695. }
  1696. continue;
  1697. default:
  1698. if (textpos == 0) {
  1699. if (!legalElementContext(dtd.pcdata)) {
  1700. error("unexpected.pcdata");
  1701. }
  1702. if (last.breaksFlow()) {
  1703. space = false;
  1704. }
  1705. }
  1706. ch = readCh();
  1707. break;
  1708. }
  1709. // enlarge buffer if needed
  1710. if (textpos + 2 > text.length) {
  1711. char newtext[] = new char[text.length + 128];
  1712. System.arraycopy(text, 0, newtext, 0, text.length);
  1713. text = newtext;
  1714. }
  1715. // output pending space
  1716. if (space) {
  1717. if (textpos == 0) {
  1718. lastBlockStartPos--;
  1719. }
  1720. text[textpos++] = ' ';
  1721. space = false;
  1722. }
  1723. text[textpos++] = (char)c;
  1724. }
  1725. }
  1726. /**
  1727. * Returns the end of line string. This will return the end of line
  1728. * string that has been encountered the most, one of \r, \n or \r\n.
  1729. */
  1730. String getEndOfLineString() {
  1731. if (crlfCount >= crCount) {
  1732. if (lfCount >= crlfCount) {
  1733. return "\n";
  1734. }
  1735. else {
  1736. return "\r\n";
  1737. }
  1738. }
  1739. else {
  1740. if (crCount > lfCount) {
  1741. return "\r";
  1742. }
  1743. else {
  1744. return "\n";
  1745. }
  1746. }
  1747. }
  1748. /**
  1749. * Parse an HTML stream, given a DTD.
  1750. */
  1751. public synchronized void parse(Reader in) throws IOException {
  1752. this.in = in;
  1753. this.ln = 1;
  1754. seenHtml = false;
  1755. seenHead = false;
  1756. seenBody = false;
  1757. crCount = lfCount = crlfCount = 0;
  1758. try {
  1759. try {
  1760. ch = readCh();
  1761. text = new char[1024];
  1762. str = new char[128];
  1763. parseContent();
  1764. // NOTE: interruption may have occurred. Control flows out
  1765. // of here normally.
  1766. while (stack != null) {
  1767. endTag(true);
  1768. }
  1769. } finally {
  1770. in.close();
  1771. }
  1772. } catch (IOException e) {
  1773. errorContext();
  1774. error("ioexception");
  1775. throw e;
  1776. } catch (Exception e) {
  1777. errorContext();
  1778. error("exception", e.getClass().getName(), e.getMessage());
  1779. e.printStackTrace();
  1780. } catch (ThreadDeath e) {
  1781. errorContext();
  1782. error("terminated");
  1783. e.printStackTrace();
  1784. throw e;
  1785. } finally {
  1786. for (; stack != null ; stack = stack.next) {
  1787. handleEndTag(stack.tag);
  1788. }
  1789. text = null;
  1790. str = null;
  1791. }
  1792. }
  /*
   * Input cache. This is much faster than calling down to a synchronized
   * method of BufferedReader for each byte. Measurements done 5/30/97
   * show that there's no point in having a bigger buffer: Increasing
   * the buffer to 8192 had no measurable impact for a program discarding
   * one character at a time (reading from an http URL to a local machine).
   */
  private char buf[] = new char[256];

  /* Index in buf of the next character to hand out. */
  private int pos;

  /* Result of the last in.read(buf) call. */
  private int len;

  /*
   * Tracks position relative to the beginning of the document: it is
   * incremented for every character handed out by readCh().
   */
  private int currentPosition;

  /**
   * Returns the next input character, refilling the cache from the reader
   * when all cached characters have been handed out.
   *
   * @return the next character, or -1 at end of input
   * @throws IOException if the reader fails; this includes an
   *         InterruptedIOException, which is passed on unchanged
   */
  private final int readCh() throws IOException {
    if (pos < len) {
      ++currentPosition;
      return buf[pos++];
    }

    // Cache exhausted: fetch the next chunk.
    len = in.read(buf);
    if (len <= 0) {
      return -1; // eof
    }
    pos = 0;

    ++currentPosition;
    return buf[pos++];
  }
  1828. protected int getCurrentPos() {
  1829. return currentPosition;
  1830. }
  1831. }