1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 2000-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.io;
  58. import java.io.InputStream;
  59. import java.io.IOException;
  60. import java.io.Reader;
  61. /**
  62. * Reader for UCS-2 and UCS-4 encodings.
  63. * (i.e., encodings from ISO-10646-UCS-(2|4)).
  64. *
  65. * @author Neil Graham, IBM
  66. *
  67. * @version $Id: UCSReader.java,v 1.3 2002/07/08 16:24:03 neilg Exp $
  68. */
  69. public class UCSReader extends Reader {
  70. //
  71. // Constants
  72. //
  73. /** Default byte buffer size (8192, larger than that of ASCIIReader
  74. * since it's reasonable to surmise that the average UCS-4-encoded
  75. * file should be 4 times as large as the average ASCII-encoded file).
  76. */
  77. public static final int DEFAULT_BUFFER_SIZE = 8192;
  78. public static final short UCS2LE = 1;
  79. public static final short UCS2BE = 2;
  80. public static final short UCS4LE = 4;
  81. public static final short UCS4BE = 8;
  82. //
  83. // Data
  84. //
  85. /** Input stream. */
  86. protected InputStream fInputStream;
  87. /** Byte buffer. */
  88. protected byte[] fBuffer;
  89. // what kind of data we're dealing with
  90. protected short fEncoding;
  91. //
  92. // Constructors
  93. //
  94. /**
  95. * Constructs an ASCII reader from the specified input stream
  96. * using the default buffer size. The Endian-ness and whether this is
  97. * UCS-2 or UCS-4 needs also to be known in advance.
  98. *
  99. * @param inputStream The input stream.
  100. * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
  101. */
  102. public UCSReader(InputStream inputStream, short encoding) {
  103. this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
  104. } // <init>(InputStream, short)
  105. /**
  106. * Constructs an ASCII reader from the specified input stream
  107. * and buffer size. The Endian-ness and whether this is
  108. * UCS-2 or UCS-4 needs also to be known in advance.
  109. *
  110. * @param inputStream The input stream.
  111. * @param size The initial buffer size.
  112. * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
  113. */
  114. public UCSReader(InputStream inputStream, int size, short encoding) {
  115. fInputStream = inputStream;
  116. fBuffer = new byte[size];
  117. fEncoding = encoding;
  118. } // <init>(InputStream,int,short)
  119. //
  120. // Reader methods
  121. //
  122. /**
  123. * Read a single character. This method will block until a character is
  124. * available, an I/O error occurs, or the end of the stream is reached.
  125. *
  126. * <p> Subclasses that intend to support efficient single-character input
  127. * should override this method.
  128. *
  129. * @return The character read, as an integer in the range 0 to 127
  130. * (<tt>0x00-0x7f</tt>), or -1 if the end of the stream has
  131. * been reached
  132. *
  133. * @exception IOException If an I/O error occurs
  134. */
  135. public int read() throws IOException {
  136. int b0 = fInputStream.read() & 0xff;
  137. if (b0 == 0xff)
  138. return -1;
  139. int b1 = fInputStream.read() & 0xff;
  140. if (b1 == 0xff)
  141. return -1;
  142. if(fEncoding >=4) {
  143. int b2 = fInputStream.read() & 0xff;
  144. if (b2 == 0xff)
  145. return -1;
  146. int b3 = fInputStream.read() & 0xff;
  147. if (b3 == 0xff)
  148. return -1;
  149. System.err.println("b0 is " + (b0 & 0xff) + " b1 " + (b1 & 0xff) + " b2 " + (b2 & 0xff) + " b3 " + (b3 & 0xff));
  150. if (fEncoding == UCS4BE)
  151. return (b0<<24)+(b1<<16)+(b2<<8)+b3;
  152. else
  153. return (b3<<24)+(b2<<16)+(b1<<8)+b0;
  154. } else { // UCS-2
  155. if (fEncoding == UCS2BE)
  156. return (b0<<8)+b1;
  157. else
  158. return (b1<<8)+b0;
  159. }
  160. } // read():int
  161. /**
  162. * Read characters into a portion of an array. This method will block
  163. * until some input is available, an I/O error occurs, or the end of the
  164. * stream is reached.
  165. *
  166. * @param ch Destination buffer
  167. * @param offset Offset at which to start storing characters
  168. * @param length Maximum number of characters to read
  169. *
  170. * @return The number of characters read, or -1 if the end of the
  171. * stream has been reached
  172. *
  173. * @exception IOException If an I/O error occurs
  174. */
  175. public int read(char ch[], int offset, int length) throws IOException {
  176. int byteLength = length << ((fEncoding >= 4)?2:1);
  177. if (byteLength > fBuffer.length) {
  178. byteLength = fBuffer.length;
  179. }
  180. int count = fInputStream.read(fBuffer, 0, byteLength);
  181. if(count == -1) return -1;
  182. // try and make count be a multiple of the number of bytes we're looking for
  183. if(fEncoding >= 4) { // BigEndian
  184. // this looks ugly, but it avoids an if at any rate...
  185. int numToRead = (4 - (count & 3) & 3);
  186. for(int i=0; i<numToRead; i++) {
  187. int charRead = fInputStream.read();
  188. if(charRead == -1) { // end of input; something likely went wrong!A Pad buffer with nulls.
  189. for (int j = i;j<numToRead; j++)
  190. fBuffer[count+j] = 0;
  191. break;
  192. } else {
  193. fBuffer[count+i] = (byte)charRead;
  194. }
  195. }
  196. count += numToRead;
  197. } else {
  198. int numToRead = count & 1;
  199. if(numToRead != 0) {
  200. count++;
  201. int charRead = fInputStream.read();
  202. if(charRead == -1) { // end of input; something likely went wrong!A Pad buffer with nulls.
  203. fBuffer[count] = 0;
  204. } else {
  205. fBuffer[count] = (byte)charRead;
  206. }
  207. }
  208. }
  209. // now count is a multiple of the right number of bytes
  210. int numChars = count >> ((fEncoding >= 4)?2:1);
  211. int curPos = 0;
  212. for (int i = 0; i < numChars; i++) {
  213. int b0 = fBuffer[curPos++] & 0xff;
  214. int b1 = fBuffer[curPos++] & 0xff;
  215. if(fEncoding >=4) {
  216. int b2 = fBuffer[curPos++] & 0xff;
  217. int b3 = fBuffer[curPos++] & 0xff;
  218. if (fEncoding == UCS4BE)
  219. ch[offset+i] = (char)((b0<<24)+(b1<<16)+(b2<<8)+b3);
  220. else
  221. ch[offset+i] = (char)((b3<<24)+(b2<<16)+(b1<<8)+b0);
  222. } else { // UCS-2
  223. if (fEncoding == UCS2BE)
  224. ch[offset+i] = (char)((b0<<8)+b1);
  225. else
  226. ch[offset+i] = (char)((b1<<8)+b0);
  227. }
  228. }
  229. return numChars;
  230. } // read(char[],int,int)
  231. /**
  232. * Skip characters. This method will block until some characters are
  233. * available, an I/O error occurs, or the end of the stream is reached.
  234. *
  235. * @param n The number of characters to skip
  236. *
  237. * @return The number of characters actually skipped
  238. *
  239. * @exception IOException If an I/O error occurs
  240. */
  241. public long skip(long n) throws IOException {
  242. // charWidth will represent the number of bits to move
  243. // n leftward to get num of bytes to skip, and then move the result rightward
  244. // to get num of chars effectively skipped.
  245. // The trick with &'ing, as with elsewhere in this dcode, is
  246. // intended to avoid an expensive use of / that might not be optimized
  247. // away.
  248. int charWidth = (fEncoding >=4)?2:1;
  249. long bytesSkipped = fInputStream.skip(n<<charWidth);
  250. if((bytesSkipped & (charWidth | 1)) == 0) return bytesSkipped >> charWidth;
  251. return (bytesSkipped >> charWidth) + 1;
  252. } // skip(long):long
  253. /**
  254. * Tell whether this stream is ready to be read.
  255. *
  256. * @return True if the next read() is guaranteed not to block for input,
  257. * false otherwise. Note that returning false does not guarantee that the
  258. * next read will block.
  259. *
  260. * @exception IOException If an I/O error occurs
  261. */
  262. public boolean ready() throws IOException {
  263. return false;
  264. } // ready()
  265. /**
  266. * Tell whether this stream supports the mark() operation.
  267. */
  268. public boolean markSupported() {
  269. return fInputStream.markSupported();
  270. } // markSupported()
  271. /**
  272. * Mark the present position in the stream. Subsequent calls to reset()
  273. * will attempt to reposition the stream to this point. Not all
  274. * character-input streams support the mark() operation.
  275. *
  276. * @param readAheadLimit Limit on the number of characters that may be
  277. * read while still preserving the mark. After
  278. * reading this many characters, attempting to
  279. * reset the stream may fail.
  280. *
  281. * @exception IOException If the stream does not support mark(),
  282. * or if some other I/O error occurs
  283. */
  284. public void mark(int readAheadLimit) throws IOException {
  285. fInputStream.mark(readAheadLimit);
  286. } // mark(int)
  287. /**
  288. * Reset the stream. If the stream has been marked, then attempt to
  289. * reposition it at the mark. If the stream has not been marked, then
  290. * attempt to reset it in some way appropriate to the particular stream,
  291. * for example by repositioning it to its starting point. Not all
  292. * character-input streams support the reset() operation, and some support
  293. * reset() without supporting mark().
  294. *
  295. * @exception IOException If the stream has not been marked,
  296. * or if the mark has been invalidated,
  297. * or if the stream does not support reset(),
  298. * or if some other I/O error occurs
  299. */
  300. public void reset() throws IOException {
  301. fInputStream.reset();
  302. } // reset()
  303. /**
  304. * Close the stream. Once a stream has been closed, further read(),
  305. * ready(), mark(), or reset() invocations will throw an IOException.
  306. * Closing a previously-closed stream, however, has no effect.
  307. *
  308. * @exception IOException If an I/O error occurs
  309. */
  310. public void close() throws IOException {
  311. fInputStream.close();
  312. } // close()
  313. } // class UCSReader