1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2004 The Apache Software Foundation.
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl;
  58. import java.io.IOException;
  59. import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  60. import com.sun.org.apache.xerces.internal.util.XML11Char;
  61. import com.sun.org.apache.xerces.internal.util.XMLChar;
  62. import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
  63. import com.sun.org.apache.xerces.internal.xni.XMLString;
  64. import com.sun.org.apache.xerces.internal.xni.XNIException;
  65. /**
  66. * This class is responsible for scanning XML document structure
  67. * and content. The scanner acts as the source for the document
  68. * information which is communicated to the document handler.
  69. * <p>
  70. * This component requires the following features and properties from the
  71. * component manager that uses it:
  72. * <ul>
  73. * <li>http://xml.org/sax/features/namespaces</li>
  74. * <li>http://xml.org/sax/features/validation</li>
  75. * <li>http://apache.org/xml/features/nonvalidating/load-external-dtd</li>
  76. * <li>http://apache.org/xml/features/scanner/notify-char-refs</li>
  77. * <li>http://apache.org/xml/features/scanner/notify-builtin-refs</li>
  78. * <li>http://apache.org/xml/properties/internal/symbol-table</li>
  79. * <li>http://apache.org/xml/properties/internal/error-reporter</li>
  80. * <li>http://apache.org/xml/properties/internal/entity-manager</li>
  81. * <li>http://apache.org/xml/properties/internal/dtd-scanner</li>
  82. * </ul>
  83. *
  84. * @author Glenn Marcy, IBM
  85. * @author Andy Clark, IBM
  86. * @author Arnaud Le Hors, IBM
  87. * @author Eric Ye, IBM
  88. *
  89. * @version $Id: XML11DocumentScannerImpl.java,v 1.19 2004/04/25 05:05:50 mrglavas Exp $
  90. */
  91. public class XML11DocumentScannerImpl
  92. extends XMLDocumentScannerImpl {
  93. /** Array of 3 strings. */
  94. private String[] fStrings = new String[3];
  95. /** String. */
  96. private XMLString fString = new XMLString();
  97. /** String buffer. */
  98. private XMLStringBuffer fStringBuffer = new XMLStringBuffer();
  99. private XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
  100. private XMLStringBuffer fStringBuffer3 = new XMLStringBuffer();
  101. //
  102. // Constructors
  103. //
  /** Constructs the scanner; the only work done is invoking the superclass constructor. */
  public XML11DocumentScannerImpl() {
    super();
  } // <init>()
  106. //
  107. // overridden methods
  108. //
  109. // XMLDocumentFragmentImpl methods
  110. /**
  111. * Scans element content.
  112. *
  113. * @return Returns the next character on the stream.
  114. */
  115. protected int scanContent() throws IOException, XNIException {
  116. XMLString content = fString;
  117. int c = fEntityScanner.scanContent(content);
  118. if (c == '\r' || c == 0x85 || c == 0x2028) {
  119. // happens when there is the character reference
  120. // but scanContent doesn't do entity expansions...
  121. // is this *really* necessary??? - NG
  122. fEntityScanner.scanChar();
  123. fStringBuffer.clear();
  124. fStringBuffer.append(fString);
  125. fStringBuffer.append((char)c);
  126. content = fStringBuffer;
  127. c = -1;
  128. }
  129. if (fDocumentHandler != null && content.length > 0) {
  130. fDocumentHandler.characters(content, null);
  131. }
  132. if (c == ']' && fString.length == 0) {
  133. fStringBuffer.clear();
  134. fStringBuffer.append((char)fEntityScanner.scanChar());
  135. // remember where we are in case we get an endEntity before we
  136. // could flush the buffer out - this happens when we're parsing an
  137. // entity which ends with a ]
  138. fInScanContent = true;
  139. //
  140. // We work on a single character basis to handle cases such as:
  141. // ']]]>' which we might otherwise miss.
  142. //
  143. if (fEntityScanner.skipChar(']')) {
  144. fStringBuffer.append(']');
  145. while (fEntityScanner.skipChar(']')) {
  146. fStringBuffer.append(']');
  147. }
  148. if (fEntityScanner.skipChar('>')) {
  149. reportFatalError("CDEndInContent", null);
  150. }
  151. }
  152. if (fDocumentHandler != null && fStringBuffer.length != 0) {
  153. fDocumentHandler.characters(fStringBuffer, null);
  154. }
  155. fInScanContent = false;
  156. c = -1;
  157. }
  158. return c;
  159. } // scanContent():int
  160. /**
  161. * Scans an attribute value and normalizes whitespace converting all
  162. * whitespace characters to space characters.
  163. *
  164. * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
  165. *
  166. * @param value The XMLString to fill in with the value.
  167. * @param nonNormalizedValue The XMLString to fill in with the
  168. * non-normalized value.
  169. * @param atName The name of the attribute being parsed (for error msgs).
  170. * @param checkEntities true if undeclared entities should be reported as VC violation,
  171. * false if undeclared entities should be reported as WFC violation.
  172. * @param eleName The name of element to which this attribute belongs.
  173. *
  174. * <strong>Note:</strong> This method uses fStringBuffer2, anything in it
  175. * at the time of calling is lost.
  176. **/
  177. protected void scanAttributeValue(XMLString value,
  178. XMLString nonNormalizedValue,
  179. String atName,
  180. boolean checkEntities,String eleName)
  181. throws IOException, XNIException
  182. {
  183. // quote
  184. int quote = fEntityScanner.peekChar();
  185. if (quote != '\'' && quote != '"') {
  186. reportFatalError("OpenQuoteExpected", new Object[]{eleName,atName});
  187. }
  188. fEntityScanner.scanChar();
  189. int entityDepth = fEntityDepth;
  190. int c = fEntityScanner.scanLiteral(quote, value);
  191. if (DEBUG_ATTR_NORMALIZATION) {
  192. System.out.println("** scanLiteral -> \""
  193. + value.toString() + "\"");
  194. }
  195. fStringBuffer2.clear();
  196. fStringBuffer2.append(value);
  197. normalizeWhitespace(value);
  198. if (DEBUG_ATTR_NORMALIZATION) {
  199. System.out.println("** normalizeWhitespace -> \""
  200. + value.toString() + "\"");
  201. }
  202. if (c != quote) {
  203. fScanningAttribute = true;
  204. fStringBuffer.clear();
  205. do {
  206. fStringBuffer.append(value);
  207. if (DEBUG_ATTR_NORMALIZATION) {
  208. System.out.println("** value2: \""
  209. + fStringBuffer.toString() + "\"");
  210. }
  211. if (c == '&') {
  212. fEntityScanner.skipChar('&');
  213. if (entityDepth == fEntityDepth) {
  214. fStringBuffer2.append('&');
  215. }
  216. if (fEntityScanner.skipChar('#')) {
  217. if (entityDepth == fEntityDepth) {
  218. fStringBuffer2.append('#');
  219. }
  220. int ch = scanCharReferenceValue(fStringBuffer, fStringBuffer2);
  221. if (ch != -1) {
  222. if (DEBUG_ATTR_NORMALIZATION) {
  223. System.out.println("** value3: \""
  224. + fStringBuffer.toString()
  225. + "\"");
  226. }
  227. }
  228. }
  229. else {
  230. String entityName = fEntityScanner.scanName();
  231. if (entityName == null) {
  232. reportFatalError("NameRequiredInReference", null);
  233. }
  234. else if (entityDepth == fEntityDepth) {
  235. fStringBuffer2.append(entityName);
  236. }
  237. if (!fEntityScanner.skipChar(';')) {
  238. reportFatalError("SemicolonRequiredInReference",
  239. new Object []{entityName});
  240. }
  241. else if (entityDepth == fEntityDepth) {
  242. fStringBuffer2.append(';');
  243. }
  244. if (entityName == fAmpSymbol) {
  245. fStringBuffer.append('&');
  246. if (DEBUG_ATTR_NORMALIZATION) {
  247. System.out.println("** value5: \""
  248. + fStringBuffer.toString()
  249. + "\"");
  250. }
  251. }
  252. else if (entityName == fAposSymbol) {
  253. fStringBuffer.append('\'');
  254. if (DEBUG_ATTR_NORMALIZATION) {
  255. System.out.println("** value7: \""
  256. + fStringBuffer.toString()
  257. + "\"");
  258. }
  259. }
  260. else if (entityName == fLtSymbol) {
  261. fStringBuffer.append('<');
  262. if (DEBUG_ATTR_NORMALIZATION) {
  263. System.out.println("** value9: \""
  264. + fStringBuffer.toString()
  265. + "\"");
  266. }
  267. }
  268. else if (entityName == fGtSymbol) {
  269. fStringBuffer.append('>');
  270. if (DEBUG_ATTR_NORMALIZATION) {
  271. System.out.println("** valueB: \""
  272. + fStringBuffer.toString()
  273. + "\"");
  274. }
  275. }
  276. else if (entityName == fQuotSymbol) {
  277. fStringBuffer.append('"');
  278. if (DEBUG_ATTR_NORMALIZATION) {
  279. System.out.println("** valueD: \""
  280. + fStringBuffer.toString()
  281. + "\"");
  282. }
  283. }
  284. else {
  285. if (fEntityManager.isExternalEntity(entityName)) {
  286. reportFatalError("ReferenceToExternalEntity",
  287. new Object[] { entityName });
  288. }
  289. else {
  290. if (!fEntityManager.isDeclaredEntity(entityName)) {
  291. //WFC & VC: Entity Declared
  292. if (checkEntities) {
  293. if (fValidation) {
  294. fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  295. "EntityNotDeclared",
  296. new Object[]{entityName},
  297. XMLErrorReporter.SEVERITY_ERROR);
  298. }
  299. }
  300. else {
  301. reportFatalError("EntityNotDeclared",
  302. new Object[]{entityName});
  303. }
  304. }
  305. fEntityManager.startEntity(entityName, true);
  306. }
  307. }
  308. }
  309. }
  310. else if (c == '<') {
  311. reportFatalError("LessthanInAttValue",
  312. new Object[] { eleName, atName });
  313. fEntityScanner.scanChar();
  314. if (entityDepth == fEntityDepth) {
  315. fStringBuffer2.append((char)c);
  316. }
  317. }
  318. else if (c == '%' || c == ']') {
  319. fEntityScanner.scanChar();
  320. fStringBuffer.append((char)c);
  321. if (entityDepth == fEntityDepth) {
  322. fStringBuffer2.append((char)c);
  323. }
  324. if (DEBUG_ATTR_NORMALIZATION) {
  325. System.out.println("** valueF: \""
  326. + fStringBuffer.toString() + "\"");
  327. }
  328. }
  329. // note that none of these characters should ever get through
  330. // XML11EntityScanner. Not sure why
  331. // this check was originally necessary. - NG
  332. else if (c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
  333. fEntityScanner.scanChar();
  334. fStringBuffer.append(' ');
  335. if (entityDepth == fEntityDepth) {
  336. fStringBuffer2.append('\n');
  337. }
  338. }
  339. else if (c != -1 && XMLChar.isHighSurrogate(c)) {
  340. fStringBuffer3.clear();
  341. if (scanSurrogates(fStringBuffer3)) {
  342. fStringBuffer.append(fStringBuffer3);
  343. if (entityDepth == fEntityDepth) {
  344. fStringBuffer2.append(fStringBuffer3);
  345. }
  346. if (DEBUG_ATTR_NORMALIZATION) {
  347. System.out.println("** valueI: \""
  348. + fStringBuffer.toString()
  349. + "\"");
  350. }
  351. }
  352. }
  353. else if (c != -1 && XML11Char.isXML11Invalid(c)) {
  354. reportFatalError("InvalidCharInAttValue",
  355. new Object[] {eleName, atName, Integer.toString(c, 16)});
  356. fEntityScanner.scanChar();
  357. if (entityDepth == fEntityDepth) {
  358. fStringBuffer2.append((char)c);
  359. }
  360. }
  361. c = fEntityScanner.scanLiteral(quote, value);
  362. if (entityDepth == fEntityDepth) {
  363. fStringBuffer2.append(value);
  364. }
  365. normalizeWhitespace(value);
  366. } while (c != quote || entityDepth != fEntityDepth);
  367. fStringBuffer.append(value);
  368. if (DEBUG_ATTR_NORMALIZATION) {
  369. System.out.println("** valueN: \""
  370. + fStringBuffer.toString() + "\"");
  371. }
  372. value.setValues(fStringBuffer);
  373. fScanningAttribute = false;
  374. }
  375. nonNormalizedValue.setValues(fStringBuffer2);
  376. // quote
  377. int cquote = fEntityScanner.scanChar();
  378. if (cquote != quote) {
  379. reportFatalError("CloseQuoteExpected", new Object[]{eleName,atName});
  380. }
  381. } // scanAttributeValue()
  382. //
  383. // XMLScanner methods
  384. //
  385. // NOTE: this is a carbon copy of the code in XML11DTDScannerImpl;
  386. // we need to override these methods in both places.
  387. // this needs to be refactored!!! - NG
  388. /**
  389. * Scans public ID literal.
  390. *
  391. * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  392. * [13] PubidChar::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
  393. *
  394. * The returned string is normalized according to the following rule,
  395. * from http://www.w3.org/TR/REC-xml#dt-pubid:
  396. *
  397. * Before a match is attempted, all strings of white space in the public
  398. * identifier must be normalized to single space characters (#x20), and
  399. * leading and trailing white space must be removed.
  400. *
  401. * @param literal The string to fill in with the public ID literal.
  402. * @return True on success.
  403. *
  404. * <strong>Note:</strong> This method uses fStringBuffer, anything in it at
  405. * the time of calling is lost.
  406. */
  407. protected boolean scanPubidLiteral(XMLString literal)
  408. throws IOException, XNIException
  409. {
  410. int quote = fEntityScanner.scanChar();
  411. if (quote != '\'' && quote != '"') {
  412. reportFatalError("QuoteRequiredInPublicID", null);
  413. return false;
  414. }
  415. fStringBuffer.clear();
  416. // skip leading whitespace
  417. boolean skipSpace = true;
  418. boolean dataok = true;
  419. while (true) {
  420. int c = fEntityScanner.scanChar();
  421. // REVISIT: none of these except \n and 0x20 should make it past the entity scanner
  422. if (c == ' ' || c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
  423. if (!skipSpace) {
  424. // take the first whitespace as a space and skip the others
  425. fStringBuffer.append(' ');
  426. skipSpace = true;
  427. }
  428. }
  429. else if (c == quote) {
  430. if (skipSpace) {
  431. // if we finished on a space let's trim it
  432. fStringBuffer.length--;
  433. }
  434. literal.setValues(fStringBuffer);
  435. break;
  436. }
  437. else if (XMLChar.isPubid(c)) {
  438. fStringBuffer.append((char)c);
  439. skipSpace = false;
  440. }
  441. else if (c == -1) {
  442. reportFatalError("PublicIDUnterminated", null);
  443. return false;
  444. }
  445. else {
  446. dataok = false;
  447. reportFatalError("InvalidCharInPublicID",
  448. new Object[]{Integer.toHexString(c)});
  449. }
  450. }
  451. return dataok;
  452. }
  453. /**
  454. * Normalize whitespace in an XMLString converting all whitespace
  455. * characters to space characters.
  456. */
  457. protected void normalizeWhitespace(XMLString value) {
  458. int end = value.offset + value.length;
  459. for (int i = value.offset; i < end; i++) {
  460. int c = value.ch[i];
  461. if (XMLChar.isSpace(c)) {
  462. value.ch[i] = ' ';
  463. }
  464. }
  465. }
  466. // returns true if the given character is not
  467. // valid with respect to the version of
  468. // XML understood by this scanner.
  469. protected boolean isInvalid(int value) {
  470. return (XML11Char.isXML11Invalid(value));
  471. } // isInvalid(int): boolean
  472. // returns true if the given character is not
  473. // valid or may not be used outside a character reference
  474. // with respect to the version of XML understood by this scanner.
  475. protected boolean isInvalidLiteral(int value) {
  476. return (!XML11Char.isXML11ValidLiteral(value));
  477. } // isInvalidLiteral(int): boolean
  478. // returns true if the given character is
  479. // a valid nameChar with respect to the version of
  480. // XML understood by this scanner.
  481. protected boolean isValidNameChar(int value) {
  482. return (XML11Char.isXML11Name(value));
  483. } // isValidNameChar(int): boolean
  484. // returns true if the given character is
  485. // a valid nameStartChar with respect to the version of
  486. // XML understood by this scanner.
  487. protected boolean isValidNameStartChar(int value) {
  488. return (XML11Char.isXML11NameStart(value));
  489. } // isValidNameStartChar(int): boolean
  490. // returns true if the given character is
  491. // a valid NCName character with respect to the version of
  492. // XML understood by this scanner.
  493. protected boolean isValidNCName(int value) {
  494. return (XML11Char.isXML11NCName(value));
  495. } // isValidNCName(int): boolean
  496. // returns true if the given character is
  497. // a valid high surrogate for a nameStartChar
  498. // with respect to the version of XML understood
  499. // by this scanner.
  500. protected boolean isValidNameStartHighSurrogate(int value) {
  501. return XML11Char.isXML11NameHighSurrogate(value);
  502. } // isValidNameStartHighSurrogate(int): boolean
  503. protected boolean versionSupported(String version) {
  504. return (version.equals("1.1") || version.equals("1.0"));
  505. } // versionSupported(String): boolean
  506. // returns the error message key for unsupported
  507. // versions of XML with respect to the version of
  508. // XML understood by this scanner.
  509. protected String getVersionNotSupportedKey () {
  510. return "VersionNotSupported11";
  511. } // getVersionNotSupportedKey: String
  512. } // class XML11DocumentScannerImpl