BreakIterator

/*
 * @(#)BreakIterator.java	1.35 03/12/19
 *
 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */

/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */

package java.text;

import java.util.Vector;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
import sun.text.resources.LocaleData;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;

import java.net.URL;
import java.io.InputStream;
import java.io.IOException;

import java.lang.ref.SoftReference;
import java.security.AccessController;
import java.security.PrivilegedAction;

/**
 * The <code>BreakIterator</code> class implements methods for finding
 * the location of boundaries in text. Instances of <code>BreakIterator</code>
 * maintain a current position and scan over text
 * returning the index of characters where boundaries occur.
 * Internally, <code>BreakIterator</code> scans text using a
 * <code>CharacterIterator</code>, and is thus able to scan text held
 * by any object implementing that protocol. A <code>StringCharacterIterator</code>
 * is used to scan <code>String</code> objects passed to <code>setText</code>.
 *
 * <p>
 * You use the factory methods provided by this class to create
 * instances of various types of break iterators. In particular,
 * use <code>getWordIterator</code>, <code>getLineIterator</code>,
 * <code>getSentenceIterator</code>, and <code>getCharacterIterator</code>
 * to create <code>BreakIterator</code>s that perform
 * word, line, sentence, and character boundary analysis respectively.
 * A single <code>BreakIterator</code> can work only on one unit
 * (word, line, sentence, and so on). You must use a different iterator
 * for each unit boundary analysis you wish to perform.
 *
 * <p>
 * Line boundary analysis determines where a text string can be
 * broken when line-wrapping. The mechanism correctly handles
 * punctuation and hyphenated words.
 *
 * <p>
 * Sentence boundary analysis allows selection with correct interpretation
 * of periods within numbers and abbreviations, and trailing punctuation
 * marks such as quotation marks and parentheses.
 *
 * <p>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols
 * or punctuation marks, have word-breaks on both sides.
 *
 * <p>
 * Character boundary analysis allows users to interact with characters
 * as they expect to, for example, when moving the cursor through a text
 * string. Character boundary analysis provides correct navigation of
 * through character strings, regardless of how the character is stored.
 * For example, an accented character might be stored as a base character
 * and a diacritical mark. What users consider to be a character can
 * differ between languages.
 *
 * <p>
 * <code>BreakIterator</code> is intended for use with natural
 * languages only. Do not use this class to tokenize a programming language.
 *
 * <P>
 * <strong>Examples</strong>:<P>
 * Creating and using text boundaries
 * <blockquote>
 * <pre>
 * public static void main(String args[]) {
 *      if (args.length == 1) {
 *          String stringToExamine = args[0];
 *          //print each word in order
 *          BreakIterator boundary = BreakIterator.getWordInstance();
 *          boundary.setText(stringToExamine);
 *          printEachForward(boundary, stringToExamine);
 *          //print each sentence in reverse order
 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
 *          boundary.setText(stringToExamine);
 *          printEachBackward(boundary, stringToExamine);
 *          printFirst(boundary, stringToExamine);
 *          printLast(boundary, stringToExamine);
 *      }
 * }
 * </pre>
 * </blockquote>
 *
 * Print each element in order
 * <blockquote>
 * <pre>
 * public static void printEachForward(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     for (int end = boundary.next();
 *          end != BreakIterator.DONE;
 *          start = end, end = boundary.next()) {
 *          System.out.println(source.substring(start,end));
 *     }
 * }
 * </pre>
 * </blockquote>
 *
 * Print each element in reverse order
 * <blockquote>
 * <pre>
 * public static void printEachBackward(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     for (int start = boundary.previous();
 *          start != BreakIterator.DONE;
 *          end = start, start = boundary.previous()) {
 *         System.out.println(source.substring(start,end));
 *     }
 * }
 * </pre>
 * </blockquote>
 *
 * Print first element
 * <blockquote>
 * <pre>
 * public static void printFirst(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     int end = boundary.next();
 *     System.out.println(source.substring(start,end));
 * }
 * </pre>
 * </blockquote>
 *
 * Print last element
 * <blockquote>
 * <pre>
 * public static void printLast(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 * </pre>
 * </blockquote>
 *
 * Print the element at a specified position
 * <blockquote>
 * <pre>
 * public static void printAt(BreakIterator boundary, int pos, String source) {
 *     int end = boundary.following(pos);
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 * </pre>
 * </blockquote>
 *
 * Find the next word
 * <blockquote>
 * <pre>
 * public static int nextWordStartAfter(int pos, String text) {
 *     BreakIterator wb = BreakIterator.getWordInstance();
 *     wb.setText(text);
 *     int last = wb.following(pos);
 *     int current = wb.next();
 *     while (current != BreakIterator.DONE) {
 *         for (int p = last; p < current; p++) {
 *             if (Character.isLetter(text.codePointAt(p))
 *                 return last;
 *         }
 *         last = current;
 *         current = wb.next();
 *     }
 *     return BreakIterator.DONE;
 * }
 * </pre>
 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
 * the break positions it returns don't represent both the start and end of the
 * thing being iterated over.  That is, a sentence-break iterator returns breaks
 * that each represent the end of one sentence and the beginning of the next.
 * With the word-break iterator, the characters between two boundaries might be a
 * word, or they might be the punctuation or whitespace between two words.  The
 * above code uses a simple heuristic to determine which boundary is the beginning
 * of a word: If the characters between this boundary and the next boundary
 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
 * and the next is a word; otherwise, it's the material between words.)
 * </blockquote>
 *
 * @see CharacterIterator
 *
 */

public abstract class BreakIterator implements Cloneable
{
    /**
     * Constructor. BreakIterator is stateless and has no default behavior.
     */
    protected BreakIterator()
    {
    }

    /**
     * Create a copy of this iterator
     * @return A copy of this
     */
    public Object clone()
    {
        try {
            return super.clone();
        }
        catch (CloneNotSupportedException e) {
            throw new InternalError();
        }
    }

    /**
     * DONE is returned by previous() and next() after all valid
     * boundaries have been returned.
     */
    public static final int DONE = -1;

    /**
     * Return the first boundary. The iterator's current position is set
     * to the first boundary.
     * @return The character index of the first text boundary.
     */
    public abstract int first();

    /**
     * Return the last boundary. The iterator's current position is set
     * to the last boundary.
     * @return The character index of the last text boundary.
     */
    public abstract int last();

    /**
     * Return the nth boundary from the current boundary
     * @param n which boundary to return.  A value of 0
     * does nothing.  Negative values move to previous boundaries
     * and positive values move to later boundaries.
     * @return The index of the nth boundary from the current position.
     */
    public abstract int next(int n);

    /**
     * Return the boundary following the current boundary.
     * @return The character index of the next text boundary or DONE if all
     * boundaries have been returned.  Equivalent to next(1).
     */
    public abstract int next();

    /**
     * Return the boundary preceding the current boundary.
     * @return The character index of the previous text boundary or DONE if all
     * boundaries have been returned.
     */
    public abstract int previous();

    /**
     * Return the first boundary following the specified offset.
     * The value returned is always greater than the offset or
     * the value BreakIterator.DONE
     * @param offset the offset to begin scanning. Valid values
     * are determined by the CharacterIterator passed to
     * setText().  Invalid values cause
     * an IllegalArgumentException to be thrown.
     * @return The first boundary after the specified offset.
     */
    public abstract int following(int offset);

    /**
     * Return the last boundary preceding the specfied offset.
     * The value returned is always less than the offset or the value
     * BreakIterator.DONE.
     * @param offset the offset to begin scanning.  Valid values are
     * determined by the CharacterIterator passed to setText().
     * Invalid values cause an IllegalArgumentException to be thrown.
     * @return The last boundary before the specified offset.
     * @since 1.2
     */
    public int preceding(int offset) {
        // NOTE:  This implementation is here solely because we can't add new
        // abstract methods to an existing class.  There is almost ALWAYS a
        // better, faster way to do this.
        int pos = following(offset);
        while (pos >= offset && pos != DONE)
            pos = previous();
        return pos;
    }

    /**
     * Return true if the specified position is a boundary position.
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @since 1.2
     */
    public boolean isBoundary(int offset) {
        // NOTE: This implementation probably is wrong for most situations
        // because it fails to take into account the possibility that a
        // CharacterIterator passed to setText() may not have a begin offset
        // of 0.  But since the abstract BreakIterator doesn't have that
        // knowledge, it assumes the begin offset is 0.  If you subclass
        // BreakIterator, copy the SimpleTextBoundary implementation of this
        // function into your subclass.  [This should have been abstract at
        // this level, but it's too late to fix that now.]
        if (offset == 0)
            return true;
        else
            return following(offset - 1) == offset;
    }

    /**
     * Return character index of the text boundary that was most recently
     * returned by next(), previous(), first(), or last()
     * @return The boundary most recently returned.
     */
    public abstract int current();

    /**
     * Get the text being scanned
     * @return the text being scanned
     */
    public abstract CharacterIterator getText();

    /**
     * Set a new text string to be scanned.  The current scan
     * position is reset to first().
     * @param newText new text to scan.
     */
    public void setText(String newText)
    {
        setText(new StringCharacterIterator(newText));
    }

    /**
     * Set a new text for scanning.  The current scan
     * position is reset to first().
     * @param newText new text to scan.
     */
    public abstract void setText(CharacterIterator newText);

    private static final int CHARACTER_INDEX = 0;
    private static final int WORD_INDEX = 1;
    private static final int LINE_INDEX = 2;
    private static final int SENTENCE_INDEX = 3;
    private static final SoftReference[] iterCache = new SoftReference[4];

    /**
     * Create BreakIterator for word-breaks using default locale.
     * Returns an instance of a BreakIterator implementing word breaks.
     * WordBreak  is usefull for word selection (ex. double click)
     * @return A BreakIterator for word-breaks
     * @see java.util.Locale#getDefault
     */
    public static BreakIterator getWordInstance()
    {
        return getWordInstance(Locale.getDefault());
    }

    /**
     * Create BreakIterator for word-breaks using specified locale.
     * Returns an instance of a BreakIterator implementing word breaks.
     * WordBreak is usefull for word selection (ex. double click)
     * @param where the local.  If a specific WordBreak is not
     * avaliable for the specified locale, a default WordBreak is returned.
     * @return A BreakIterator for word-breaks
     */
    public static BreakIterator getWordInstance(Locale where)
    {
        return getBreakInstance(where,
                                WORD_INDEX,
                                "WordData",
                                "WordDictionary");
    }

    /**
     * Create BreakIterator for line-breaks using default locale.
     * Returns an instance of a BreakIterator implementing line breaks. Line
     * breaks are logically possible line breaks, actual line breaks are
     * usually determined based on display width.
     * LineBreak is useful for word wrapping text.
     * @return A BreakIterator for line-breaks
     * @see java.util.Locale#getDefault
     */
    public static BreakIterator getLineInstance()
    {
        return getLineInstance(Locale.getDefault());
    }

    /**
     * Create BreakIterator for line-breaks using specified locale.
     * Returns an instance of a BreakIterator implementing line breaks. Line
     * breaks are logically possible line breaks, actual line breaks are
     * usually determined based on display width.
     * LineBreak is useful for word wrapping text.
     * @param where the local.  If a specific LineBreak is not
     * avaliable for the specified locale, a default LineBreak is returned.
     * @return A BreakIterator for line-breaks
     */
    public static BreakIterator getLineInstance(Locale where)
    {
        return getBreakInstance(where,
                                LINE_INDEX,
                                "LineData",
                                "LineDictionary");
    }

    /**
     * Create BreakIterator for character-breaks using default locale
     * Returns an instance of a BreakIterator implementing character breaks.
     * Character breaks are boundaries of combining character sequences.
     * @return A BreakIterator for character-breaks
     * @see Locale#getDefault
     */
    public static BreakIterator getCharacterInstance()
    {
        return getCharacterInstance(Locale.getDefault());
    }

    /**
     * Create BreakIterator for character-breaks using specified locale
     * Returns an instance of a BreakIterator implementing character breaks.
     * Character breaks are boundaries of combining character sequences.
     * @param where the local.  If a specific character break is not
     * avaliable for the specified local, a default character break is returned.
     * @return A BreakIterator for character-breaks
     */
    public static BreakIterator getCharacterInstance(Locale where)
    {
        return getBreakInstance(where,
                                CHARACTER_INDEX,
                                "CharacterData",
                                "CharacterDictionary");
    }

    /**
     * Create BreakIterator for sentence-breaks using default locale
     * Returns an instance of a BreakIterator implementing sentence breaks.
     * @return A BreakIterator for sentence-breaks
     * @see java.util.Locale#getDefault
     */
    public static BreakIterator getSentenceInstance()
    {
        return getSentenceInstance(Locale.getDefault());
    }

    /**
     * Create BreakIterator for sentence-breaks using specified locale
     * Returns an instance of a BreakIterator implementing sentence breaks.
     * @param where the local.  If a specific SentenceBreak is not
     * avaliable for the specified local, a default SentenceBreak is returned.
     * @return A BreakIterator for sentence-breaks
     */
    public static BreakIterator getSentenceInstance(Locale where)
    {
        return getBreakInstance(where,
                                SENTENCE_INDEX,
                                "SentenceData",
                                "SentenceDictionary");
    }

    private static BreakIterator getBreakInstance(Locale where,
                                                  int type,
                                                  String dataName,
                                                  String dictionaryName) {
        if (iterCache[type] != null) {
            BreakIteratorCache cache = (BreakIteratorCache) iterCache[type].get();
            if (cache != null) {
                if (cache.getLocale().equals(where)) {
                    return cache.createBreakInstance();
                }
            }
        }

        BreakIterator result = createBreakInstance(where,
                                                   type,
                                                   dataName,
                                                   dictionaryName);
        BreakIteratorCache cache = new BreakIteratorCache(where, result);
        iterCache[type] = new SoftReference(cache);
        return result;
    }

    private static ResourceBundle getBundle(final String baseName, final Locale locale) {
         return (ResourceBundle) AccessController.doPrivileged(new PrivilegedAction() {
            public Object run() {
                return ResourceBundle.getBundle(baseName, locale);
            }
        });
    }

    private static BreakIterator createBreakInstance(Locale where,
                                                     int type,
                                                     String dataName,
                                                     String dictionaryName) {

        ResourceBundle bundle = getBundle(
                        "sun.text.resources.BreakIteratorInfo", where);
        String[] classNames = bundle.getStringArray("BreakIteratorClasses");

        String dataFile = bundle.getString(dataName);

        try {
            if (classNames[type].equals("RuleBasedBreakIterator")) {
                return new RuleBasedBreakIterator(dataFile);
            }
            else if (classNames[type].equals("DictionaryBasedBreakIterator")) {
                String dictionaryFile = bundle.getString(dictionaryName);
                return new DictionaryBasedBreakIterator(dataFile, dictionaryFile);
            }
            else {
                throw new IllegalArgumentException("Invalid break iterator class \"" +
                                classNames[type] + "\"");
            }
        }
        catch (Exception e) {
            throw new InternalError(e.toString()); 
        }
    }

    /**
     * Returns an array of all locales for which the
     * <code>get*Instance</code> methods of this class can return
     * localized instances.
     * The array returned must contain at least a <code>Locale</code>
     * instance equal to {@link java.util.Locale#US Locale.US}.
     *
     * @return An array of locales for which localized
     *         <code>BreakIterator</code> instances are available.
     */
    public static synchronized Locale[] getAvailableLocales()
    {
        //FIX ME - this is a known bug.  It should return
        //all locales.
        return LocaleData.getAvailableLocales("NumberPatterns");
    }

    private static final class BreakIteratorCache {

        private BreakIterator iter;
        private Locale where;

        BreakIteratorCache(Locale where, BreakIterator iter) {
            this.where = where;
            this.iter = (BreakIterator) iter.clone();
        }

        Locale getLocale() {
            return where;
        }

        BreakIterator createBreakInstance() {
            return (BreakIterator) iter.clone();
        }
    }

    protected static long getLong(byte[] buf, int offset) {
        long num = buf[offset]&0xFF;
        for (int i = 1; i < 8; i++) {
            num = num<<8 | (buf[offset+i]&0xFF);
        }
        return num;
    }

    protected static int getInt(byte[] buf, int offset) {
        int num = buf[offset]&0xFF;
        for (int i = 1; i < 4; i++) {
            num = num<<8 | (buf[offset+i]&0xFF);
        }
        return num;
    }

    protected static short getShort(byte[] buf, int offset) {
        short num = (short)(buf[offset]&0xFF);
        num = (short)(num<<8 | (buf[offset+1]&0xFF));
        return num;
    }
}