1. /*
  2. * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
  3. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  4. */
  5. package javax.mail.internet;
  6. import javax.mail.MessagingException;
  7. import javax.activation.*;
  8. import java.util.*;
  9. import java.io.*;
  10. import com.sun.mail.util.*;
  11. /**
  12. * This is a utility class that provides various MIME related
  13. * functionality. <p>
  14. *
  * There is a set of methods to encode and decode MIME headers as
  16. * per RFC 2047. A brief description on handling such headers is
  17. * given below: <p>
  18. *
  19. * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
  20. * characters. Headers that contain non US-ASCII characters must be
  21. * encoded so that they contain only US-ASCII characters. Basically,
  22. * this process involves using either BASE64 or QP to encode certain
  23. * characters. RFC 2047 describes this in detail. <p>
  24. *
  25. * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
  26. * subset of Unicode (and occupies the range 0 - 127). A String
  27. * that contains only ASCII characters is already mail-safe. If the
  28. * String contains non US-ASCII characters, it must be encoded. An
  29. * additional complexity in this step is that since Unicode is not
  30. * yet a widely used charset, one might want to first charset-encode
  31. * the String into another charset and then do the transfer-encoding.
  32. * <p>
  33. * Note that to get the actual bytes of a mail-safe String (say,
  34. * for sending over SMTP), one must do
  35. * <p><blockquote><pre>
  36. *
  37. * byte[] bytes = string.getBytes("iso-8859-1");
  38. *
  39. * </pre></blockquote><p>
  40. *
  41. * The <code>setHeader()</code> and <code>addHeader()</code> methods
  42. * on MimeMessage and MimeBodyPart assume that the given header values
  43. * are Unicode strings that contain only US-ASCII characters. Hence
  * the callers of those methods must ensure that the values they pass
  45. * do not contain non US-ASCII characters. The methods in this class
  46. * help do this. <p>
  47. *
  48. * The <code>getHeader()</code> family of methods on MimeMessage and
  49. * MimeBodyPart return the raw header value. These might be encoded
  50. * as per RFC 2047, and if so, must be decoded into Unicode Strings.
  51. * The methods in this class help to do this.
  52. *
  53. * @version 1.32, 00/10/17
  54. * @author John Mani
  55. */
  56. public class MimeUtility {
  // Static utility holder: the private constructor below prevents
  // this class from being instantiated.
  private MimeUtility() { }
  // Sentinel byte count. getEncoding(DataSource) passes it to
  // checkAscii(InputStream, int, boolean) to ask for all available
  // bytes of the stream to be examined.
  public static final int ALL = -1;
  60. /**
  61. * Get the content-transfer-encoding that should be applied
  62. * to the input stream of this datasource, to make it mailsafe. <p>
  63. *
  64. * The algorithm used here is: <br>
  65. * <ul>
  66. * <li>
  67. * If the primary type of this datasource is "text" and if all
  68. * the bytes in its input stream are US-ASCII, then the encoding
  69. * is "7bit". If more than half of the bytes are non-US-ASCII, then
  70. * the encoding is "base64". If less than half of the bytes are
  71. * non-US-ASCII, then the encoding is "quoted-printable".
  72. * <li>
  73. * If the primary type of this datasource is not "text", then if
  74. * all the bytes of its input stream are US-ASCII, the encoding
  75. * is "7bit". If there is even one non-US-ASCII character, the
  76. * encoding is "base64".
  77. * </ul>
  78. *
  79. * @param ds DataSource
  80. * @return the encoding. This is either "7bit",
  81. * "quoted-printable" or "base64"
  82. */
  83. public static String getEncoding(DataSource ds) {
  84. ContentType cType = null;
  85. InputStream is = null;
  86. String encoding = null;
  87. try {
  88. cType = new ContentType(ds.getContentType());
  89. is = ds.getInputStream();
  90. } catch (Exception ex) {
  91. return "base64"; // what else ?!
  92. }
  93. if (cType.match("text/*")) {
  94. // Check all of the available bytes
  95. int i = checkAscii(is, ALL, false);
  96. switch (i) {
  97. case ALL_ASCII:
  98. encoding = "7bit"; // all ascii
  99. break;
  100. case MOSTLY_ASCII:
  101. encoding = "quoted-printable"; // mostly ascii
  102. break;
  103. default:
  104. encoding = "base64"; // mostly binary
  105. break;
  106. }
  107. } else { // not "text"
  108. // Check all of available bytes, break out if we find
  109. // at least one non-US-ASCII character
  110. if (checkAscii(is, ALL, true) == ALL_ASCII) // all ascii
  111. encoding = "7bit";
  112. else // found atleast one non-ascii character, use b64
  113. encoding = "base64";
  114. }
  115. // Close the input stream
  116. try {
  117. is.close();
  118. } catch (IOException ioex) { }
  119. return encoding;
  120. }
  121. /**
  122. * Same as <code>getEncoding(DataSource)</code> except that instead
  123. * of reading the data from an <code>InputStream</code> it uses the
  124. * <code>writeTo</code> method to examine the data. This is more
  125. * efficient in the common case of a <code>DataHandler</code>
  126. * created with an object and a MIME type (for example, a
  127. * "text/plain" String) because all the I/O is done in this
  128. * thread. In the case requiring an <code>InputStream</code> the
  129. * <code>DataHandler</code> uses a thread, a pair of pipe streams,
  130. * and the <code>writeTo</code> method to produce the data. <p>
  131. *
  132. * @since JavaMail 1.2
  133. */
  134. public static String getEncoding(DataHandler dh) {
  135. ContentType cType = null;
  136. String encoding = null;
  137. /*
  138. * Try to pick the most efficient means of determining the
  139. * encoding. If this DataHandler was created using a DataSource,
  140. * the getEncoding(DataSource) method is typically faster. If
  141. * the DataHandler was created with an object, this method is
  142. * much faster. To distinguish the two cases, we use a heuristic.
  143. * A DataHandler created with an object will always have a null name.
  144. * A DataHandler created with a DataSource will usually have a
  145. * non-null name.
  146. *
  147. * XXX - This is actually quite a disgusting hack, but it makes
  148. * a common case run over twice as fast.
  149. */
  150. if (dh.getName() != null)
  151. return getEncoding(dh.getDataSource());
  152. try {
  153. cType = new ContentType(dh.getContentType());
  154. } catch (Exception ex) {
  155. return "base64"; // what else ?!
  156. }
  157. if (cType.match("text/*")) {
  158. // Check all of the available bytes
  159. AsciiOutputStream aos = new AsciiOutputStream(false);
  160. try {
  161. dh.writeTo(aos);
  162. } catch (IOException ex) { } // ignore it
  163. switch (aos.getAscii()) {
  164. case ALL_ASCII:
  165. encoding = "7bit"; // all ascii
  166. break;
  167. case MOSTLY_ASCII:
  168. encoding = "quoted-printable"; // mostly ascii
  169. break;
  170. default:
  171. encoding = "base64"; // mostly binary
  172. break;
  173. }
  174. } else { // not "text"
  175. // Check all of available bytes, break out if we find
  176. // at least one non-US-ASCII character
  177. AsciiOutputStream aos = new AsciiOutputStream(true);
  178. try {
  179. dh.writeTo(aos);
  180. } catch (IOException ex) { } // ignore it
  181. if (aos.getAscii() == ALL_ASCII) // all ascii
  182. encoding = "7bit";
  183. else // found atleast one non-ascii character, use b64
  184. encoding = "base64";
  185. }
  186. return encoding;
  187. }
  188. /**
  189. * Decode the given input stream. The Input stream returned is
  190. * the decoded input stream. All the encodings defined in RFC 2045
  191. * are supported here. They include "base64", "quoted-printable",
  192. * "7bit", "8bit", and "binary". In addition, "uuencode" is also
  193. * supported.
  194. *
  195. * @param is input stream
  196. * @param encoding the encoding of the stream.
  197. * @return decoded input stream.
  198. */
  199. public static InputStream decode(InputStream is, String encoding)
  200. throws MessagingException {
  201. if (encoding.equalsIgnoreCase("base64"))
  202. return new BASE64DecoderStream(is);
  203. else if (encoding.equalsIgnoreCase("quoted-printable"))
  204. return new QPDecoderStream(is);
  205. else if (encoding.equalsIgnoreCase("uuencode") ||
  206. encoding.equalsIgnoreCase("x-uuencode"))
  207. return new UUDecoderStream(is);
  208. else if (encoding.equalsIgnoreCase("binary") ||
  209. encoding.equalsIgnoreCase("7bit") ||
  210. encoding.equalsIgnoreCase("8bit"))
  211. return is;
  212. else
  213. throw new MessagingException("Unknown encoding: " + encoding);
  214. }
  215. /**
  216. * Wrap an encoder around the given output stream.
  217. * All the encodings defined in RFC 2045 are supported here.
  218. * They include "base64", "quoted-printable", "7bit", "8bit" and
  219. * "binary". In addition, "uuencode" is also supported.
  220. *
  221. * @param os output stream
  222. * @param encoding the encoding of the stream.
  223. * @return output stream that applies the
  224. * specified encoding.
  225. */
  226. public static OutputStream encode(OutputStream os, String encoding)
  227. throws MessagingException {
  228. if (encoding == null)
  229. return os;
  230. else if (encoding.equalsIgnoreCase("base64"))
  231. return new BASE64EncoderStream(os);
  232. else if (encoding.equalsIgnoreCase("quoted-printable"))
  233. return new QPEncoderStream(os);
  234. else if (encoding.equalsIgnoreCase("uuencode") ||
  235. encoding.equalsIgnoreCase("x-uuencode"))
  236. return new UUEncoderStream(os);
  237. else if (encoding.equalsIgnoreCase("binary") ||
  238. encoding.equalsIgnoreCase("7bit") ||
  239. encoding.equalsIgnoreCase("8bit"))
  240. return os;
  241. else
  242. throw new MessagingException("Unknown encoding: " +encoding);
  243. }
  244. /**
  245. * Wrap an encoder around the given output stream.
  246. * All the encodings defined in RFC 2045 are supported here.
  247. * They include "base64", "quoted-printable", "7bit", "8bit" and
  248. * "binary". In addition, "uuencode" is also supported.
  249. * The <code>filename</code> parameter is used with the "uuencode"
  250. * encoding and is included in the encoded output.
  251. *
  252. * @param os output stream
  253. * @param encoding the encoding of the stream.
  254. * @param filename name for the file being encoded (only used
  255. * with uuencode)
  256. * @return output stream that applies the
  257. * specified encoding.
  258. * @since JavaMail 1.2
  259. */
  260. public static OutputStream encode(OutputStream os, String encoding,
  261. String filename)
  262. throws MessagingException {
  263. if (encoding == null)
  264. return os;
  265. else if (encoding.equalsIgnoreCase("base64"))
  266. return new BASE64EncoderStream(os);
  267. else if (encoding.equalsIgnoreCase("quoted-printable"))
  268. return new QPEncoderStream(os);
  269. else if (encoding.equalsIgnoreCase("uuencode") ||
  270. encoding.equalsIgnoreCase("x-uuencode"))
  271. return new UUEncoderStream(os, filename);
  272. else if (encoding.equalsIgnoreCase("binary") ||
  273. encoding.equalsIgnoreCase("7bit") ||
  274. encoding.equalsIgnoreCase("8bit"))
  275. return os;
  276. else
  277. throw new MessagingException("Unknown encoding: " +encoding);
  278. }
  279. /**
  280. * Encode a RFC 822 "text" token into mail-safe form as per
  281. * RFC 2047. <p>
  282. *
  283. * The given Unicode string is examined for non US-ASCII
  284. * characters. If the string contains only US-ASCII characters,
  285. * it is returned as-is. If the string contains non US-ASCII
  286. * characters, it is first character-encoded using the platform's
  287. * default charset, then transfer-encoded using either the B or
  288. * Q encoding. The resulting bytes are then returned as a Unicode
  289. * string containing only ASCII characters. <p>
  290. *
  291. * Note that this method should be used to encode only
  292. * "unstructured" RFC 822 headers. <p>
  293. *
  294. * Example of usage:
  295. * <p><blockquote><pre>
  296. *
  297. * MimePart part = ...
  298. * String rawvalue = "FooBar Mailer, Japanese version 1.1"
  299. * try {
  300. * // If we know for sure that rawvalue contains only US-ASCII
  301. * // characters, we can skip the encoding part
  302. * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
  303. * } catch (UnsupportedEncodingException e) {
  304. * // encoding failure
  305. * } catch (MessagingException me) {
  306. * // setHeader() failure
  307. * }
  308. *
  309. * </pre></blockquote><p>
  310. *
  311. * @param text unicode string
  312. * @return Unicode string containing only US-ASCII characters
  313. * @exception UnsupportedEncodingException if the encoding fails
  314. */
  315. public static String encodeText(String text)
  316. throws UnsupportedEncodingException {
  317. return encodeText(text, null, null);
  318. }
  319. /**
  320. * Encode a RFC 822 "text" token into mail-safe form as per
  321. * RFC 2047. <p>
  322. *
  323. * The given Unicode string is examined for non US-ASCII
  324. * characters. If the string contains only US-ASCII characters,
  325. * it is returned as-is. If the string contains non US-ASCII
  326. * characters, it is first character-encoded using the specified
  327. * charset, then transfer-encoded using either the B or Q encoding.
  328. * The resulting bytes are then returned as a Unicode string
  329. * containing only ASCII characters. <p>
  330. *
  331. * Note that this method should be used to encode only
  332. * "unstructured" RFC 822 headers.
  333. *
  334. * @param text the header value
  335. * @param charset the charset. If this parameter is null, the
  * platform's default charset is used.
  337. * @param encoding the encoding to be used. Currently supported
  338. * values are "B" and "Q". If this parameter is null, then
  339. * the "Q" encoding is used if most of characters to be
  340. * encoded are in the ASCII charset, otherwise "B" encoding
  341. * is used.
  342. * @return Unicode string containing only US-ASCII characters
  343. */
  344. public static String encodeText(String text, String charset,
  345. String encoding)
  346. throws UnsupportedEncodingException {
  347. return encodeWord(text, charset, encoding, false);
  348. }
  349. /**
  350. * Decode "unstructured" headers, that is, headers that are defined
  351. * as '*text' as per RFC 822. <p>
  352. *
  353. * The string is decoded using the algorithm specified in
  354. * RFC 2047, Section 6.1.1. If the charset-conversion fails
  355. * for any sequence, an UnsupportedEncodingException is thrown.
  356. * If the String is not an RFC 2047 style encoded header, it is
  357. * returned as-is <p>
  358. *
  359. * Example of usage:
  360. * <p><blockquote><pre>
  361. *
  362. * MimePart part = ...
  363. * String rawvalue = null;
  364. * String value = null;
  365. * try {
  366. * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
  367. * value = MimeUtility.decodeText(rawvalue);
  368. * } catch (UnsupportedEncodingException e) {
  369. * // Don't care
  370. * value = rawvalue;
  371. * } catch (MessagingException me) { }
  372. *
  373. * return value;
  374. *
  375. * </pre></blockquote><p>
  376. *
  377. * @param etext the possibly encoded value
  378. * @exception UnsupportedEncodingException if the charset
  379. * conversion failed.
  380. */
  381. public static String decodeText(String etext)
  382. throws UnsupportedEncodingException {
  383. /*
  384. * We look for sequences separated by "linear-white-space".
  385. * (as per RFC 2047, Section 6.1.1)
  386. * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
  387. */
  388. String lwsp = " \t\n\r";
  389. StringTokenizer st;
  390. /*
  391. * First, lets do a quick run thru the string and check
  392. * whether the sequence "=?" exists at all. If none exists,
  393. * we know there are no encoded-words in here and we can just
  394. * return the string as-is, without suffering thru the later
  395. * decoding logic.
  396. * This handles the most common case of unencoded headers
  397. * efficiently.
  398. */
  399. if (etext.indexOf("=?") == -1)
  400. return etext;
  401. // Encoded words found. Start decoding ...
  402. st = new StringTokenizer(etext, lwsp, true);
  403. StringBuffer sb = new StringBuffer(); // decode buffer
  404. StringBuffer wsb = new StringBuffer(); // white space buffer
  405. boolean prevWasEncoded = false;
  406. while (st.hasMoreTokens()) {
  407. char c;
  408. String s = st.nextToken();
  409. // If whitespace, append it to the whitespace buffer
  410. if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
  411. (c == '\r') || (c == '\n'))
  412. wsb.append(c);
  413. else {
  414. // Check if token is an 'encoded-word' ..
  415. String word;
  416. try {
  417. word = decodeWord(s);
  418. // Yes, this IS an 'encoded-word'.
  419. if (!prevWasEncoded && wsb.length() > 0) {
  420. // if the previous word was also encoded, we
  421. // should ignore the collected whitespace. Else
  422. // we include the whitespace as well.
  423. sb.append(wsb);
  424. }
  425. prevWasEncoded = true;
  426. } catch (ParseException pex) {
  427. // This is NOT an 'encoded-word'.
  428. word = s;
  429. // include colleced whitespace ..
  430. if (wsb.length() > 0)
  431. sb.append(wsb);
  432. prevWasEncoded = false;
  433. }
  434. sb.append(word); // append the actual word
  435. wsb.setLength(0); // reset wsb for reuse
  436. }
  437. }
  438. return sb.toString();
  439. }
  440. /**
  441. * Encode a RFC 822 "word" token into mail-safe form as per
  442. * RFC 2047. <p>
  443. *
  444. * The given Unicode string is examined for non US-ASCII
  445. * characters. If the string contains only US-ASCII characters,
  446. * it is returned as-is. If the string contains non US-ASCII
  447. * characters, it is first character-encoded using the platform's
  448. * default charset, then transfer-encoded using either the B or
  449. * Q encoding. The resulting bytes are then returned as a Unicode
  450. * string containing only ASCII characters. <p>
  451. *
  452. * This method is meant to be used when creating RFC 822 "phrases".
  453. * The InternetAddress class, for example, uses this to encode
  * its 'phrase' component.
  455. *
  * @param word unicode string
  * @return Unicode string containing only US-ASCII
  * characters.
  459. * @exception UnsupportedEncodingException if the encoding fails
  460. */
  461. public static String encodeWord(String word)
  462. throws UnsupportedEncodingException {
  463. return encodeWord(word, null, null);
  464. }
  465. /**
  466. * Encode a RFC 822 "word" token into mail-safe form as per
  467. * RFC 2047. <p>
  468. *
  469. * The given Unicode string is examined for non US-ASCII
  470. * characters. If the string contains only US-ASCII characters,
  471. * it is returned as-is. If the string contains non US-ASCII
  472. * characters, it is first character-encoded using the specified
  473. * charset, then transfer-encoded using either the B or Q encoding.
  474. * The resulting bytes are then returned as a Unicode string
  475. * containing only ASCII characters. <p>
  476. *
  * @param word unicode string
  478. * @param charset the MIME charset
  479. * @param encoding the encoding to be used. Currently supported
  480. * values are "B" and "Q". If this parameter is null, then
  481. * the "Q" encoding is used if most of characters to be
  482. * encoded are in the ASCII charset, otherwise "B" encoding
  483. * is used.
  484. * @return Unicode string containing only US-ASCII characters
  485. * @exception UnsupportedEncodingException if the encoding fails
  486. */
  487. public static String encodeWord(String word, String charset,
  488. String encoding)
  489. throws UnsupportedEncodingException {
  490. return encodeWord(word, charset, encoding, true);
  491. }
  492. /*
  493. * Encode the given string. The parameter 'encodingWord' should
  494. * be true if a RFC 822 "word" token is being encoded and false if a
  495. * RFC 822 "text" token is being encoded. This is because the
  496. * "Q" encoding defined in RFC 2047 has more restrictions when
  497. * encoding "word" tokens. (Sigh)
  498. */
  499. private static String encodeWord(String string, String charset,
  500. String encoding, boolean encodingWord)
  501. throws UnsupportedEncodingException {
  502. // If 'string' contains only US-ASCII characters, just
  503. // return it.
  504. if (checkAscii(string) == ALL_ASCII)
  505. return string;
  506. // Else, apply the specified charset conversion.
  507. String jcharset;
  508. if (charset == null) { // use default charset
  509. jcharset = getDefaultJavaCharset(); // the java charset
  510. charset = getDefaultMIMECharset(); // the MIME equivalent
  511. } else // MIME charset -> java charset
  512. jcharset = javaCharset(charset);
  513. // If no transfer-encoding is specified, figure one out.
  514. if (encoding == null) {
  515. byte[] bytes = string.getBytes(jcharset);
  516. if (checkAscii(bytes) != MOSTLY_NONASCII)
  517. encoding = "Q";
  518. else
  519. encoding = "B";
  520. }
  521. boolean b64;
  522. if (encoding.equalsIgnoreCase("B"))
  523. b64 = true;
  524. else if (encoding.equalsIgnoreCase("Q"))
  525. b64 = false;
  526. else
  527. throw new UnsupportedEncodingException(
  528. "Unknown transfer encoding: " + encoding);
  529. StringBuffer outb = new StringBuffer(); // the output buffer
  530. doEncode(string, b64, jcharset,
  531. // As per RFC 2047, size of an encoded string should not
  532. // exceed 75 bytes.
  533. // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
  534. 75 - 7 - charset.length(), // the available space
  535. "=?" + charset + "?" + encoding + "?", // prefix
  536. true, encodingWord, outb);
  537. return outb.toString();
  538. }
  539. private static void doEncode(String string, boolean b64,
  540. String jcharset, int avail, String prefix,
  541. boolean first, boolean encodingWord, StringBuffer buf)
  542. throws UnsupportedEncodingException {
  543. // First find out what the length of the encoded version of
  544. // 'string' would be.
  545. byte[] bytes = string.getBytes(jcharset);
  546. int len;
  547. if (b64) // "B" encoding
  548. len = BEncoderStream.encodedLength(bytes);
  549. else // "Q"
  550. len = QEncoderStream.encodedLength(bytes, encodingWord);
  551. int size;
  552. if ((len > avail) && ((size = string.length()) > 1)) {
  553. // If the length is greater than 'avail', split 'string'
  554. // into two and recurse.
  555. doEncode(string.substring(0, size2), b64, jcharset,
  556. avail, prefix, first, encodingWord, buf);
  557. doEncode(string.substring(size2, size), b64, jcharset,
  558. avail, prefix, false, encodingWord, buf);
  559. } else {
  560. // length <= than 'avail'. Encode the given string
  561. ByteArrayOutputStream os = new ByteArrayOutputStream();
  562. OutputStream eos; // the encoder
  563. if (b64) // "B" encoding
  564. eos = new BEncoderStream(os);
  565. else // "Q" encoding
  566. eos = new QEncoderStream(os, encodingWord);
  567. try { // do the encoding
  568. eos.write(bytes);
  569. eos.close();
  570. } catch (IOException ioex) { }
  571. byte[] encodedBytes = os.toByteArray(); // the encoded stuff
  572. // Now write out the encoded (all ASCII) bytes into our
  573. // StringBuffer
  574. if (!first) // not the first line of this sequence
  575. buf.append("\r\n "); // start a continuation line
  576. buf.append(prefix);
  577. for (int i = 0; i < encodedBytes.length; i++)
  578. buf.append((char)encodedBytes[i]);
  579. buf.append("?="); // terminate the current sequence
  580. }
  581. }
  582. /**
  583. * The string is parsed using the rules in RFC 2047 for parsing
  584. * an "encoded-word". If the parse fails, a ParseException is
  585. * thrown. Otherwise, it is transfer-decoded, and then
  586. * charset-converted into Unicode. If the charset-conversion
  587. * fails, an UnsupportedEncodingException is thrown.<p>
  588. *
  589. * @param eword the possibly encoded value
  590. * @exception ParseException if the string is not an
  591. * encoded-word as per RFC 2047.
  592. * @exception UnsupportedEncodingException if the charset
  593. * conversion failed.
  594. */
  595. public static String decodeWord(String eword)
  596. throws ParseException, UnsupportedEncodingException {
  597. if (!eword.startsWith("=?")) // not an encoded word
  598. throw new ParseException();
  599. // get charset
  600. int start = 2; int pos;
  601. if ((pos = eword.indexOf('?', start)) == -1)
  602. throw new ParseException();
  603. String charset = javaCharset(eword.substring(start, pos));
  604. // get encoding
  605. start = pos+1;
  606. if ((pos = eword.indexOf('?', start)) == -1)
  607. throw new ParseException();
  608. String encoding = eword.substring(start, pos);
  609. // get encoded-sequence
  610. start = pos+1;
  611. if ((pos = eword.indexOf("?=", start)) == -1)
  612. throw new ParseException();
  613. String word = eword.substring(start, pos);
  614. try {
  615. // Extract the bytes from word
  616. ByteArrayInputStream bis =
  617. new ByteArrayInputStream(ASCIIUtility.getBytes(word));
  618. // Get the appropriate decoder
  619. InputStream is;
  620. if (encoding.equalsIgnoreCase("B"))
  621. is = new BASE64DecoderStream(bis);
  622. else if (encoding.equalsIgnoreCase("Q"))
  623. is = new QDecoderStream(bis);
  624. else
  625. throw new UnsupportedEncodingException(
  626. "unknown encoding: " + encoding);
  627. // For b64 & q, size of decoded word <= size of word. So
  628. // the decoded bytes must fit into the 'bytes' array. This
  629. // is certainly more efficient than writing bytes into a
  630. // ByteArrayOutputStream and then pulling out the byte[]
  631. // from it.
  632. int count = bis.available();
  633. byte[] bytes = new byte[count];
  634. // count is set to the actual number of decoded bytes
  635. count = is.read(bytes, 0, count);
  636. // Finally, convert the decoded bytes into a String using
  637. // the specified charset
  638. return new String(bytes, 0, count, charset);
  639. } catch (UnsupportedEncodingException uex) {
  640. // explicitly catch and rethrow this exception, otherwise
  641. // the below IOException catch will swallow this up!
  642. throw uex;
  643. } catch (IOException ioex) {
  644. // Shouldn't happen.
  645. throw new ParseException();
  646. } catch (IllegalArgumentException iex) {
  647. /* An unknown charset of the form ISO-XXX-XXX, will cause
  648. * the JDK to throw an IllegalArgumentException ... Since the
  649. * JDK will attempt to create a classname using this string,
  650. * but valid classnames must not contain the character '-',
  651. * and this results in an IllegalArgumentException, rather than
  652. * the expected UnsupportedEncodingException. Yikes
  653. */
  654. throw new UnsupportedEncodingException();
  655. }
  656. }
  657. /**
  658. * A utility method to quote a word, if the word contains any
  659. * characters from the specified 'specials' list.<p>
  660. *
  661. * The <code>HeaderTokenizer</code> class defines two special
  662. * sets of delimiters - MIME and RFC 822. <p>
  663. *
  664. * This method is typically used during the generation of
  665. * RFC 822 and MIME header fields.
  666. *
  667. * @param word word to be quoted
  668. * @param specials the set of special characters
  669. * @return the possibly quoted word
  670. * @see javax.mail.internet.HeaderTokenizer#MIME
  671. * @see javax.mail.internet.HeaderTokenizer#RFC822
  672. */
  673. public static String quote(String word, String specials) {
  674. int len = word.length();
  675. /*
  676. * Look for any "bad" characters, Escape and
  677. * quote the entire string if necessary.
  678. */
  679. boolean needQuoting = false;
  680. for (int i = 0; i < len; i++) {
  681. char c = word.charAt(i);
  682. if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
  683. // need to escape them and then quote the whole string
  684. StringBuffer sb = new StringBuffer(len + 3);
  685. sb.append('"');
  686. for (int j = 0; j < len; j++) {
  687. char cc = word.charAt(j);
  688. if ((cc == '"') || (cc == '\\') ||
  689. (cc == '\r') || (cc == '\n'))
  690. // Escape the character
  691. sb.append('\\');
  692. sb.append(cc);
  693. }
  694. sb.append('"');
  695. return sb.toString();
  696. } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
  697. // These characters cause the string to be quoted
  698. needQuoting = true;
  699. }
  700. if (needQuoting) {
  701. StringBuffer sb = new StringBuffer(len + 2);
  702. sb.append('"').append(word).append('"');
  703. return sb.toString();
  704. } else
  705. return word;
  706. }
  707. /**
  708. * Convert a MIME charset name into a valid Java charset name. <p>
  709. *
  710. * @param charset the MIME charset name
  711. * @return the Java charset equivalent. If a suitable mapping is
  712. * not available, the passed in charset is itself returned.
  713. */
  714. public static String javaCharset(String charset) {
  715. if (mime2java == null || charset == null)
  716. // no mapping table, or charset parameter is null
  717. return charset;
  718. String alias = (String)mime2java.get(charset.toLowerCase());
  719. return alias == null ? charset : alias;
  720. }
  721. /**
  722. * Convert a java charset into its MIME charset name. <p>
  723. *
  724. * Note that a future version of JDK (post 1.2) might provide
  725. * this functionality, in which case, we may deprecate this
  726. * method then.
  727. *
  728. * @param charset the JDK charset
  729. * @return the MIME/IANA equivalent. If a mapping
  730. * is not possible, the passed in charset itself
  731. * is returned.
  732. * @since JavaMail 1.1
  733. */
  734. public static String mimeCharset(String charset) {
  735. if (java2mime == null || charset == null)
  736. // no mapping table or charset param is null
  737. return charset;
  738. String alias = (String)java2mime.get(charset.toLowerCase());
  739. return alias == null ? charset : alias;
  740. }
  // Cached default Java charset name; null until getDefaultJavaCharset()
  // fills it in on its first call.
  private static String defaultJavaCharset;
  // Cached default MIME charset name; null until getDefaultMIMECharset()
  // fills it in on its first call.
  private static String defaultMIMECharset;
  743. /**
  744. * Get the default charset corresponding to the system's current
  745. * default locale. <p>
  746. *
  747. * @return the default charset of the system's default locale,
  748. * as a Java charset. (NOT a MIME charset)
  749. * @since JavaMail 1.1
  750. */
  751. public static String getDefaultJavaCharset() {
  752. if (defaultJavaCharset == null) {
  753. try {
  754. defaultJavaCharset = System.getProperty("file.encoding",
  755. "8859_1");
  756. } catch (SecurityException sex) {
  757. class NullInputStream extends InputStream {
  758. public int read() {
  759. return 0;
  760. }
  761. }
  762. InputStreamReader reader =
  763. new InputStreamReader(new NullInputStream());
  764. defaultJavaCharset = reader.getEncoding();
  765. if (defaultJavaCharset == null)
  766. defaultJavaCharset = "8859_1";
  767. }
  768. }
  769. return defaultJavaCharset;
  770. }
  771. /*
  772. * Get the default MIME charset for this locale.
  773. */
  774. static String getDefaultMIMECharset() {
  775. if (defaultMIMECharset == null)
  776. defaultMIMECharset = System.getProperty("mail.mime.charset");
  777. if (defaultMIMECharset == null)
  778. defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
  779. return defaultMIMECharset;
  780. }
  781. private static Hashtable mime2java;
  782. private static Hashtable java2mime;
  783. static {
  784. // Use this class's classloader to load the mapping file
  785. InputStream is =
  786. javax.mail.internet.MimeUtility.class.getResourceAsStream(
  787. "/META-INF/javamail.charset.map");
  788. if (is != null) {
  789. is = new LineInputStream(is);
  790. // Load the JDK-to-MIME charset mapping table
  791. java2mime = new Hashtable(20);
  792. loadMappings((LineInputStream)is, java2mime);
  793. // Load the MIME-to-JDK charset mapping table
  794. mime2java = new Hashtable(10);
  795. loadMappings((LineInputStream)is, mime2java);
  796. }
  797. }
  798. private static void loadMappings(LineInputStream is, Hashtable table) {
  799. String currLine;
  800. while (true) {
  801. try {
  802. currLine = is.readLine();
  803. } catch (IOException ioex) {
  804. break; // error in reading, stop
  805. }
  806. if (currLine == null) // end of file, stop
  807. break;
  808. if (currLine.startsWith("--") && currLine.endsWith("--"))
  809. // end of this table
  810. break;
  811. // ignore empty lines and comments
  812. if (currLine.trim().length() == 0 || currLine.startsWith("#"))
  813. continue;
  814. // A valid entry is of the form <key><separator><value>
  815. // where, <separator> := SPACE | HT. Parse this
  816. StringTokenizer tk = new StringTokenizer(currLine, " \t");
  817. try {
  818. String key = tk.nextToken();
  819. String value = tk.nextToken();
  820. table.put(key.toLowerCase(), value);
  821. } catch (NoSuchElementException nex) { }
  822. }
  823. }
  824. static final int ALL_ASCII = 1;
  825. static final int MOSTLY_ASCII = 2;
  826. static final int MOSTLY_NONASCII = 3;
  827. /**
  828. * Check if the given string contains non US-ASCII characters.
  829. * @param s string
  830. * @return ALL_ASCII if all characters in the string
  831. * belong to the US-ASCII charset. MOSTLY_NONASCII
  832. * if any one character is non-ascii.
  833. */
  834. static int checkAscii(String s) {
  835. int l = s.length();
  836. for (int i=0; i < l; i++) {
  837. if (nonascii((int)s.charAt(i))) // non-ascii
  838. return MOSTLY_NONASCII;
  839. }
  840. return ALL_ASCII; // all ascii
  841. }
  842. /**
  843. * Check if the given byte array contains non US-ASCII characters.
  844. * @param b byte array
  845. * @return ALL_ASCII if all characters in the string
  846. * belong to the US-ASCII charset. MOSTLY_ASCII
  847. * if more than half of the available characters
  848. * are US-ASCII characters. Else MOSTLY_NONASCII.
  849. */
  850. static int checkAscii(byte[] b) {
  851. int ascii = 0, non_ascii = 0;
  852. for (int i=0; i < b.length; i++) {
  853. // The '&' operator automatically causes b[i] to be promoted
  854. // to an int, and we mask out the higher bytes in the int
  855. // so that the resulting value is not a negative integer.
  856. if (nonascii(b[i] & 0xff)) // non-ascii
  857. non_ascii++;
  858. else
  859. ascii++;
  860. }
  861. if (non_ascii == 0)
  862. return ALL_ASCII;
  863. if (ascii > non_ascii)
  864. return MOSTLY_ASCII;
  865. return MOSTLY_NONASCII;
  866. }
  867. /**
  868. * Check if the given input stream contains non US-ASCII characters.
  869. * Upto <code>max</code> bytes are checked. If <code>max</code> is
  870. * set to <code>ALL</code>, then all the bytes available in this
  871. * input stream are checked. If <code>breakOnNonAscii</code> is true
  872. * the check terminates when the first non-US-ASCII character is
  873. * found and MOSTLY_NONASCII is returned. Else, the check continues
  874. * till <code>max</code> bytes or till the end of stream.
  875. *
  876. * @param is the input stream
  877. * @param max maximum bytes to check for. The special value
  878. * ALL indicates that all the bytes in this input
  879. * stream must be checked.
  880. * @param breakOnNonAscii if <code>true</code>, then terminate the
  881. * the check when the first non-US-ASCII character
  882. * is found.
  883. * @return ALL_ASCII if all characters in the string
  884. * belong to the US-ASCII charset. MOSTLY_ASCII
  885. * if more than half of the available characters
  886. * are US-ASCII characters. Else MOSTLY_NONASCII.
  887. */
  888. static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
  889. int ascii = 0, non_ascii = 0;
  890. int len;
  891. int block = 4096;
  892. int linelen = 0;
  893. boolean longLine = false;
  894. byte buf[] = null;
  895. if (max != 0) {
  896. block = (max == ALL) ? 4096 : Math.min(max, 4096);
  897. buf = new byte[block];
  898. }
  899. while (max != 0) {
  900. try {
  901. if ((len = is.read(buf, 0, block)) == -1)
  902. break;
  903. for (int i = 0; i < len; i++) {
  904. // The '&' operator automatically causes b[i] to
  905. // be promoted to an int, and we mask out the higher
  906. // bytes in the int so that the resulting value is
  907. // not a negative integer.
  908. int b = buf[i] & 0xff;
  909. if (b == '\r' || b == '\n')
  910. linelen = 0;
  911. else {
  912. linelen++;
  913. if (linelen > 998) // 1000 - CRLF
  914. longLine = true;
  915. }
  916. if (nonascii(b)) { // non-ascii
  917. if (breakOnNonAscii) // we are done
  918. return MOSTLY_NONASCII;
  919. else
  920. non_ascii++;
  921. } else
  922. ascii++;
  923. }
  924. } catch (IOException ioex) {
  925. break;
  926. }
  927. if (max != ALL)
  928. max -= len;
  929. }
  930. if (max == 0 && breakOnNonAscii)
  931. // We have been told to break on the first non-ascii character.
  932. // We haven't got any non-ascii character yet, but then we
  933. // have not checked all of the available bytes either. So we
  934. // cannot say for sure that this input stream is ALL_ASCII,
  935. // and hence we must play safe and return MOSTLY_NONASCII
  936. return MOSTLY_NONASCII;
  937. if (non_ascii == 0) { // no non-us-ascii characters so far
  938. // if we've seen a long line, we degrade to mostly ascii
  939. if (longLine)
  940. return MOSTLY_ASCII;
  941. else
  942. return ALL_ASCII;
  943. }
  944. if (ascii > non_ascii) // mostly ascii
  945. return MOSTLY_ASCII;
  946. return MOSTLY_NONASCII;
  947. }
  948. private static final boolean nonascii(int b) {
  949. return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
  950. }
  951. }
  952. /**
  953. * An OutputStream that determines whether the data written to
  954. * it is all ASCII, mostly ASCII, or mostly non-ASCII.
  955. */
  956. class AsciiOutputStream extends OutputStream {
  957. private boolean breakOnNonAscii;
  958. private int ascii = 0, non_ascii = 0;
  959. private int linelen = 0;
  960. private boolean longLine = false;
  961. private int ret = 0;
  962. public AsciiOutputStream(boolean breakOnNonAscii) {
  963. this.breakOnNonAscii = breakOnNonAscii;
  964. }
  965. public void write(int b) throws IOException {
  966. check(b);
  967. }
  968. public void write(byte b[]) throws IOException {
  969. write(b, 0, b.length);
  970. }
  971. public void write(byte b[], int off, int len) throws IOException {
  972. len += off;
  973. for (int i = off; i < len ; i++)
  974. check(b[i]);
  975. }
  976. private final void check(int b) throws IOException {
  977. b &= 0xff;
  978. if (b == '\r' || b == '\n')
  979. linelen = 0;
  980. else {
  981. linelen++;
  982. if (linelen > 998) // 1000 - CRLF
  983. longLine = true;
  984. }
  985. if (b > 0177) { // non-ascii
  986. non_ascii++;
  987. if (breakOnNonAscii) { // we are done
  988. ret = MimeUtility.MOSTLY_NONASCII;
  989. throw new EOFException();
  990. }
  991. } else
  992. ascii++;
  993. }
  994. /**
  995. * Return ASCII-ness of data stream.
  996. */
  997. public int getAscii() {
  998. if (ret != 0)
  999. return ret;
  1000. if (non_ascii == 0) { // no non-us-ascii characters so far
  1001. // if we've seen a long line, we degrade to mostly ascii
  1002. if (longLine)
  1003. return MimeUtility.MOSTLY_ASCII;
  1004. else
  1005. return MimeUtility.ALL_ASCII;
  1006. }
  1007. if (ascii > non_ascii) // mostly ascii
  1008. return MimeUtility.MOSTLY_ASCII;
  1009. return MimeUtility.MOSTLY_NONASCII;
  1010. }
  1011. }