/*
 * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
package javax.mail.internet;

import java.util.*;
  7. /**
  8. * This class tokenizes RFC822 and MIME headers into the basic
  9. * symbols specified by RFC822 and MIME. <p>
  10. *
  11. * This class handles folded headers (ie headers with embedded
  12. * CRLF SPACE sequences). The folds are removed in the returned
  13. * tokens.
  14. *
  15. * @version 1.8, 99/12/06
  16. * @author John Mani
  17. */
  18. public class HeaderTokenizer {
  19. /**
  20. * The Token class represents tokens returned by the
  21. * HeaderTokenizer.
  22. */
  23. public static class Token {
  24. private int type;
  25. private String value;
  26. /**
  27. * Token type indicating an ATOM.
  28. */
  29. public static final int ATOM = -1;
  30. /**
  31. * Token type indicating a quoted string. The value
  32. * field contains the string without the quotes.
  33. */
  34. public static final int QUOTEDSTRING = -2;
  35. /**
  36. * Token type indicating a comment. The value field
  37. * contains the comment string without the comment
  38. * start and end symbols.
  39. */
  40. public static final int COMMENT = -3;
  41. /**
  42. * Token type indicating end of input.
  43. */
  44. public static final int EOF = -4;
  45. /**
  46. * Constructor.
  47. * @param type Token type
  48. * @param value Token value
  49. */
  50. public Token(int type, String value) {
  51. this.type = type;
  52. this.value = value;
  53. }
  54. /**
  55. * Return the type of the token. If the token represents a
  56. * delimiter or a control character, the type is that character
  57. * itself, converted to an integer. Otherwise, it's value is
  58. * one of the following:
  59. * <ul>
  60. * <li><code>ATOM</code> A sequence of ASCII characters
  61. * delimited by either SPACE, CTL, "(", <"> or the
  62. * specified SPECIALS
  63. * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
  64. * within quotes
  65. * <li><code>COMMENT</code> A sequence of ASCII characters
  66. * within "(" and ")".
  67. * <li><code>EOF</code> End of header
  68. * </ul>
  69. */
  70. public int getType() {
  71. return type;
  72. }
  73. /**
  74. * Returns the value of the token just read. When the current
  75. * token is a quoted string, this field contains the body of the
  76. * string, without the quotes. When the current token is a comment,
  77. * this field contains the body of the comment.
  78. *
  79. * @return token value
  80. */
  81. public String getValue() {
  82. return value;
  83. }
  84. }
  85. private String string; // the string to be tokenized
  86. private boolean skipComments; // should comments be skipped ?
  87. private String delimiters; // delimiter string
  88. private int currentPos; // current parse position
  89. private int maxPos; // string length
  90. private int nextPos; // track start of next Token for next()
  91. private int peekPos; // track start of next Token for peek()
  92. /**
  93. * RFC822 specials
  94. */
  95. public final static String RFC822 = "()<>@,;:\\\"\t .[]";
  96. /**
  97. * MIME specials
  98. */
  99. public final static String MIME = "()<>@,;:\\\"\t []/?=";
  100. // The EOF Token
  101. private final static Token EOFToken = new Token(Token.EOF, null);
  102. /**
  103. * Constructor that takes a rfc822 style header.
  104. *
  105. * @param header The rfc822 header to be tokenized
  106. * @param delimiters Set of delimiter characters
  107. * to be used to delimit ATOMS. These
  108. * are usually <code>RFC822</code> or
  109. * <code>MIME</code>
  110. * @param skipComments If true, comments are skipped and
  111. * not returned as tokens
  112. */
  113. public HeaderTokenizer(String header, String delimiters,
  114. boolean skipComments) {
  115. string = (header == null) ? "" : header; // paranoia ?!
  116. this.skipComments = skipComments;
  117. this.delimiters = delimiters;
  118. currentPos = nextPos = peekPos = 0;
  119. maxPos = string.length();
  120. }
  121. /**
  122. * Constructor. Comments are ignored and not returned as tokens
  123. *
  124. * @param header The header that is tokenized
  125. * @param delimiters The delimiters to be used
  126. */
  127. public HeaderTokenizer(String header, String delimiters) {
  128. this(header, delimiters, true);
  129. }
  130. /**
  131. * Constructor. The RFC822 defined delimiters - RFC822 - are
  132. * used to delimit ATOMS. Also comments are skipped and not
  133. * returned as tokens
  134. */
  135. public HeaderTokenizer(String header) {
  136. this(header, RFC822);
  137. }
  138. /**
  139. * Parses the next token from this String. <p>
  140. *
  141. * Clients sit in a loop calling next() to parse successive
  142. * tokens until an EOF Token is returned.
  143. *
  144. * @return the next Token
  145. * @exception ParseException if the parse fails
  146. */
  147. public Token next() throws ParseException {
  148. Token tk;
  149. currentPos = nextPos; // setup currentPos
  150. tk = getNext();
  151. nextPos = peekPos = currentPos; // update currentPos and peekPos
  152. return tk;
  153. }
  154. /**
  155. * Peek at the next token, without actually removing the token
  156. * from the parse stream. Invoking this method multiple times
  157. * will return successive tokens, until <code>next()</code> is
  158. * called. <p>
  159. *
  160. * @return the next Token
  161. * @exception ParseException if the parse fails
  162. */
  163. public Token peek() throws ParseException {
  164. Token tk;
  165. currentPos = peekPos; // setup currentPos
  166. tk = getNext();
  167. peekPos = currentPos; // update peekPos
  168. return tk;
  169. }
  170. /**
  171. * Return the rest of the Header.
  172. *
  173. * @return String rest of header. null is returned if we are
  174. * already at end of header
  175. */
  176. public String getRemainder() {
  177. return string.substring(nextPos);
  178. }
  179. /*
  180. * Return the next token starting from 'currentPos'. After the
  181. * parse, 'currentPos' is updated to point to the start of the
  182. * next token.
  183. */
  184. private Token getNext() throws ParseException {
  185. // If we're already at end of string, return EOF
  186. if (currentPos >= maxPos)
  187. return EOFToken;
  188. // Skip white-space, position currentPos beyond the space
  189. if (skipWhiteSpace() == Token.EOF)
  190. return EOFToken;
  191. char c;
  192. int start;
  193. boolean filter = false;
  194. c = string.charAt(currentPos);
  195. // Check or Skip comments and position currentPos
  196. // beyond the comment
  197. while (c == '(') {
  198. // Parsing comment ..
  199. int nesting;
  200. for (start = ++currentPos, nesting = 1;
  201. nesting > 0 && currentPos < maxPos;
  202. currentPos++) {
  203. c = string.charAt(currentPos);
  204. if (c == '\\') { // Escape sequence
  205. currentPos++; // skip the escaped character
  206. filter = true;
  207. } else if (c == '\r')
  208. filter = true;
  209. else if (c == '(')
  210. nesting++;
  211. else if (c == ')')
  212. nesting--;
  213. }
  214. if (nesting != 0)
  215. throw new ParseException("Unbalanced comments");
  216. if (!skipComments) {
  217. // Return the comment, if we are asked to.
  218. // Note that the comment start & end markers are ignored.
  219. String s;
  220. if (filter) // need to go thru the token again.
  221. s = filterToken(string, start, currentPos-1);
  222. else
  223. s = string.substring(start,currentPos-1);
  224. return new Token(Token.COMMENT, s);
  225. }
  226. // Skip any whitespace after the comment.
  227. if (skipWhiteSpace() == Token.EOF)
  228. return EOFToken;
  229. c = string.charAt(currentPos);
  230. }
  231. // Check for quoted-string and position currentPos
  232. // beyond the terminating quote
  233. if (c == '"') {
  234. for (start = ++currentPos; currentPos < maxPos; currentPos++) {
  235. c = string.charAt(currentPos);
  236. if (c == '\\') { // Escape sequence
  237. currentPos++;
  238. filter = true;
  239. } else if (c == '\r')
  240. filter = true;
  241. else if (c == '"') {
  242. currentPos++;
  243. String s;
  244. if (filter)
  245. s = filterToken(string, start, currentPos-1);
  246. else
  247. s = string.substring(start,currentPos-1);
  248. return new Token(Token.QUOTEDSTRING, s);
  249. }
  250. }
  251. throw new ParseException("Unbalanced quoted string");
  252. }
  253. // Check for SPECIAL or CTL
  254. if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
  255. currentPos++; // re-position currentPos
  256. char ch[] = new char[1];
  257. ch[0] = c;
  258. return new Token((int)c, new String(ch));
  259. }
  260. // Check for ATOM
  261. for (start = currentPos; currentPos < maxPos; currentPos++) {
  262. c = string.charAt(currentPos);
  263. // ATOM is delimited by either SPACE, CTL, "(", <">
  264. // or the specified SPECIALS
  265. if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
  266. c == '"' || delimiters.indexOf(c) >= 0)
  267. break;
  268. }
  269. return new Token(Token.ATOM, string.substring(start, currentPos));
  270. }
  271. // Skip SPACE, HT, CR and NL
  272. private int skipWhiteSpace() {
  273. char c;
  274. for (; currentPos < maxPos; currentPos++)
  275. if (((c = string.charAt(currentPos)) != ' ') &&
  276. (c != '\t') && (c != '\r') && (c != '\n'))
  277. return currentPos;
  278. return Token.EOF;
  279. }
  280. /* Process escape sequences and embedded LWSPs from a comment or
  281. * quoted string.
  282. */
  283. private static String filterToken(String s, int start, int end) {
  284. StringBuffer sb = new StringBuffer();
  285. char c;
  286. boolean gotEscape = false;
  287. boolean gotCR = false;
  288. for (int i = start; i < end; i++) {
  289. c = s.charAt(i);
  290. if (c == '\n' && gotCR) {
  291. // This LF is part of an unescaped
  292. // CRLF sequence (i.e, LWSP). Skip it.
  293. gotCR = false;
  294. continue;
  295. }
  296. gotCR = false;
  297. if (!gotEscape) {
  298. // Previous character was NOT '\'
  299. if (c == '\\') // skip this character
  300. gotEscape = true;
  301. else if (c == '\r') // skip this character
  302. gotCR = true;
  303. else // append this character
  304. sb.append(c);
  305. } else {
  306. // Previous character was '\'. So no need to
  307. // bother with any special processing, just
  308. // append this character
  309. sb.append(c);
  310. gotEscape = false;
  311. }
  312. }
  313. return sb.toString();
  314. }
  315. }