- /*
- * @(#)SentenceBreakData.java 1.18 01/11/29
- *
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
- * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
- */
-
- /*
- * @(#)SentenceBreakData.java 1.18 01/11/29
- *
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
- *
- * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
- * All Rights Reserved.
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- *
- * Permission to use, copy, modify, and distribute this software
- * and its documentation for NON-COMMERCIAL purposes and without
- * fee is hereby granted provided that this copyright notice
- * appears in all copies. Please refer to the file "copyright.html"
- * for further important copyright and licensing information.
- *
- * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
- * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
- * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
- * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
- * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- */
-
- package java.text;
-
- /**
- * The SentenceBreakData contains data used by SimpleTextBoundary
- * to determine sentence breaks.
- *
- * It defines a set of character categories, a forward and a backward
- * state table whose columns are those categories, and a mapping from
- * characters to categories. All three are handed to the
- * TextBoundaryData superclass by the constructor.
- * @see BreakIterator
- */
- final class SentenceBreakData extends TextBoundaryData
- {
- private static final byte other = 0;
- // everything not covered by a more specific category below
- private static final byte space = 1;
- // spaces, tabs, line feeds, carriage returns...
- private static final byte terminator = 2;
- // unambiguous terminators: exclamation mark, question mark...
- private static final byte ambiguosTerm = 3;
- // ambiguous terminator: ASCII and fullwidth full stop
- private static final byte openBracket = 4;
- // open brackets
- private static final byte closeBracket = 5;
- // close brackets
- private static final byte cjk = 6;
- // Characters where the previous sentence does not have a space
- // after a terminator. Common in Japanese, Chinese, and Korean
- private static final byte paragraphBreak = 7;
- // paragraph break (PUNCTUATION_PARAGRAPH_SEPARATOR)
- private static final byte lowerCase = 8;
- // lower case letters
- private static final byte upperCase = 9; // upper case letters
- private static final byte number = 10; // digits and other numbers
-
- private static final byte quote = 11; // quotation mark and apostrophe
- private static final byte nsm = 12; // non-spacing and enclosing marks
- private static final byte EOS = 13; // END_OF_STRING marker
-
- // number of categories above; passed to WordBreakTable with each table
- private static final int COL_COUNT = 14;
-
- private static final byte SI = (byte)0x80; // high bit, added to table entries as a flag; its meaning is defined by WordBreakTable (not visible here)
- private static final byte STOP = (byte) 0; // entry value 0; presumably "no further state" (confirm in WordBreakTable)
- private static final byte SI_STOP = (byte)SI + STOP; // STOP with the SI flag set; evaluates to (byte)0x80
- /** Passes the forward table, backward table and character map defined below to the superclass. */
- public SentenceBreakData() {
- super(kSentenceForward, kSentenceBackward, kSentenceMap);
- }
- /*
- * Forward state table, COL_COUNT entries per row. The column order is
- * given by the header comment inside the array and matches the category
- * constants above. Rows are labelled 0 to 9.
- * NOTE(review): each entry is presumably the row (state) to move to
- * next, optionally flagged with SI; confirm against WordBreakTable.
- */
- private static final byte kSentenceForwardData[] =
- {
- // other space terminator ambTerm
- // open close CJK PB
- // lower upper digit Quote
- // nsm EOS
-
- // 0
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
- STOP, STOP,
-
- // 1
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+5),
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+4),
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+1),
- (byte)(SI+1), SI_STOP,
-
- // 2
- SI_STOP, (byte)(SI+3), (byte)(SI+2), (byte)(SI+5),
- SI_STOP, (byte)(SI+2), SI_STOP, (byte)(SI+4),
- SI_STOP, SI_STOP, SI_STOP, (byte)(SI+2),
- (byte)(SI+2), SI_STOP,
-
- // 3
- SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, (byte)(SI+4),
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- (byte)(SI+3), SI_STOP,
-
- // 4
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP,
-
- // 5
- (byte)(SI+1), (byte)(SI+6), (byte)(SI+2), (byte)(SI+5),
- (byte)(SI+1), (byte)(SI+5), SI_STOP, (byte)(SI+4),
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+1), (byte)(SI+5),
- (byte)(SI+5), SI_STOP,
-
- // 6
- SI_STOP, (byte)(SI+6), SI_STOP, SI_STOP,
- (byte)(SI+7), (byte)(SI+1), SI_STOP, (byte)(SI+4),
- (byte)(SI+1), SI_STOP, (byte)(SI+1), SI_STOP,
- (byte)(SI+6), SI_STOP,
-
- // 7
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- (byte)(7), SI_STOP, SI_STOP, SI_STOP, // NOTE(review): the openBracket entry has no SI flag, unlike every other non-STOP entry in this table; confirm intentional
- (byte)(SI+1), STOP, SI_STOP, SI_STOP, // NOTE(review): bare STOP (no SI) in the upperCase column; confirm intentional
- (byte)(SI+7), SI_STOP,
-
- // 8
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+8),
- (byte)(SI+1), (byte)(SI+5), (byte)(SI+1), (byte)(SI+4),
- (byte)(SI+1), (byte)(SI+8), (byte)(SI+9), (byte)(SI+5),
- (byte)(SI+8), SI_STOP,
-
- // 9
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+2), (byte)(SI+9),
- (byte)(SI+1), (byte)(SI+5), (byte)(SI+1), (byte)(SI+4),
- (byte)(SI+1), (byte)(SI+1), (byte)(SI+9), (byte)(SI+5),
- (byte)(SI+9), SI_STOP
- };
-
- private static final WordBreakTable kSentenceForward
- = new WordBreakTable(COL_COUNT, kSentenceForwardData);
- /*
- * Backward state table, COL_COUNT entries per row, laid out like the
- * forward table: same column order, rows labelled 0 to 4.
- * NOTE(review): presumably used when iterating backwards through the
- * text; confirm against the TextBoundaryData superclass.
- */
- private static final byte kSentenceBackwardData[] =
- {
- // other space terminator ambTerm
- // open close CJK PB
- // lower upper digit quote
- // nsm EOS
-
- // 0
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
- STOP, STOP,
-
- // 1
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
- (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+1), STOP,
-
- // 2
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
- (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+2), STOP,
-
- // 3
- (byte)(SI+2), (byte)(SI+4), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
- (byte)(SI+3), (byte)(SI+2), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+3), STOP,
-
- // 4
- (byte)(SI+2), (byte)(SI+4), SI_STOP, SI_STOP,
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), STOP,
- (byte)(SI+2), (byte)(SI+3), (byte)(SI+2), (byte)(SI+2),
- (byte)(SI+4), STOP
- };
-
- private static final WordBreakTable kSentenceBackward
- = new WordBreakTable(COL_COUNT, kSentenceBackwardData);
- /*
- * Default sentence-break category for each Unicode general category.
- * The trailing comments name the java.lang.Character type constant
- * for each index.
- * NOTE(review): presumably the array is indexed by Character.getType()
- * and single characters are overridden by kExceptionChar; confirm
- * against UnicodeClassMapping.
- */
- private static final int kRawMapping[] =
- {
- other, // UNASSIGNED = 0,
- upperCase, // UPPERCASE_LETTER = 1,
- lowerCase, // LOWERCASE_LETTER = 2,
- other, // TITLECASE_LETTER = 3,
- other, // MODIFIER_LETTER = 4,
- other, // OTHER_LETTER = 5,
- nsm, // NON_SPACING_MARK = 6,
- nsm, // ENCLOSING_MARK = 7,
- other, // COMBINING_SPACING_MARK = 8,
- number, // DECIMAL_DIGIT_NUMBER = 9,
- number, // LETTER_NUMBER = 10,
- number, // OTHER_NUMBER = 11,
- space, // SPACE_SEPARATOR = 12,
- space, // LINE_SEPARATOR = 13,
- space, // PARAGRAPH_SEPARATOR = 14, flagged "???????" by the original author; kExceptionChar maps PUNCTUATION_PARAGRAPH_SEPARATOR to paragraphBreak
- other, // CONTROL = 15,
- other, // FORMAT = 16,
- other, // (unused) = 17,
- other, // PRIVATE_USE = 18,
- other, // SURROGATE = 19,
- other, // DASH_PUNCTUATION = 20,
- openBracket, // START_PUNCTUATION = 21,
- closeBracket, // END_PUNCTUATION = 22,
- other, // CONNECTOR_PUNCTUATION = 23,
- other, // OTHER_PUNCTUATION = 24,
- other, // MATH_SYMBOL = 25,
- other, // CURRENCY_SYMBOL = 26,
- other, // MODIFIER_SYMBOL = 27,
- other, // OTHER_SYMBOL = 28;
- };
- /*
- * Category assignments for individual characters (two-argument form)
- * and for character ranges (three-argument form: first, last,
- * category). The character constants are defined outside this file.
- */
- private static final SpecialMapping kExceptionChar[] =
- {
- //note: the ranges in this table must be sorted in ascending order
- //as required by the UnicodeClassMapping class.
- new SpecialMapping(ASCII_HORIZONTAL_TABULATION, space),
- new SpecialMapping(ASCII_LINEFEED, space),
- new SpecialMapping(ASCII_FORM_FEED, terminator),
- new SpecialMapping(ASCII_CARRIAGE_RETURN, space),
-
- new SpecialMapping(ASCII_EXCLAMATION_MARK, terminator),
- new SpecialMapping(ASCII_QUOTATION_MARK, quote),
-
- new SpecialMapping(ASCII_APOSTROPHE, quote),
-
- new SpecialMapping(ASCII_FULL_STOP, ambiguosTerm),
- new SpecialMapping(ASCII_QUESTION_MARK, terminator),
- new SpecialMapping(ASCII_NONBREAKING_SPACE, other),
- new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, space),
- new SpecialMapping(PUNCTUATION_PARAGRAPH_SEPARATOR, paragraphBreak),
- new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_FULL_STOP, terminator),
- new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, cjk),
- new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
- HIRAGANA_SEMIVOICED_SOUND_MARK, cjk), // cjk
- new SpecialMapping(KATAKANA_LETTER_SMALL_A, KATAKANA_LETTER_SMALL_KE,
- cjk), // cjk
- new SpecialMapping(UNICODE_LOW_BOUND_HAN, UNICODE_HIGH_BOUND_HAN, cjk),
- new SpecialMapping(CJK_COMPATIBILITY_F900, CJK_COMPATIBILITY_FA2D,cjk),
- new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, other),
- new SpecialMapping(FULLWIDTH_EXCLAMATION_MARK, terminator),
- new SpecialMapping(FULLWIDTH_FULL_STOP, ambiguosTerm),
- new SpecialMapping(FULLWIDTH_QUESTION_MARK, terminator),
- new SpecialMapping(END_OF_STRING, EOS)
- };
- /*
- * One flag per Unicode general category, in the same index order as
- * kRawMapping.
- * NOTE(review): presumably true marks the categories that contain
- * characters listed in kExceptionChar, so only those need the
- * exception lookup; confirm against UnicodeClassMapping.
- */
- private static final boolean SentenceExceptionFlags[] = {
- false, // kNonCharacter = 0,
- false, // kUppercaseLetter = 1,
- false, // kLowercaseLetter = 2,
- false, // kTitlecaseLetter = 3,
- false, // kModifierLetter = 4,
- true, // kOtherLetter = 5,
- true, // kNonSpacingMark = 6,
- false, // kEnclosingMark = 7,
- false, // kCombiningSpacingMark = 8,
- false, // kDecimalNumber = 9,
- false, // kLetterNumber = 10,
- false, // kOtherNumber = 11,
- true, // kSpaceSeparator = 12,
- true, // kLineSeparator = 13,
- true, // kParagraphSeparator = 14,
- true, // kControlCharacter = 15,
- true, // kFormatCharacter = 16,
- false, // UNDEFINED = 17,
- false, // kPrivateUseCharacter = 18,
- false, // kSurrogate = 19,
- false, // kDashPunctuation = 20,
- false, // kOpenPunctuation = 21,
- false, // kClosePunctuation = 22,
- false, // kConnectorPunctuation = 23,
- true, // kOtherPunctuation = 24,
- false, // kMathSymbol = 25,
- false, // kCurrencySymbol = 26,
- false, // kModifierSymbol = 27,
- false // kOtherSymbol = 28
- };
- /*
- * Category of each of the 256 characters 0x00 to 0xFF, eight per row.
- * The comment above each row names the characters in that row.
- * NOTE(review): presumably a direct lookup that bypasses kRawMapping
- * and kExceptionChar for these characters; confirm against
- * UnicodeClassMapping.
- */
- private static final int kSentenceAsciiValues[] = {
- // null soh stx etx eot enq ack bell
- other, other, other, other, other, other, other, other,
- // bs ht lf vt ff cr so si
- other, space, space, other, terminator, space, other, other,
- // dle dc1 dc2 dc3 dc4 nak syn etb
- other, other, other, other, other, other, other, other,
- // can em sub esc fs gs rs us
- other, other, other, other, other, other, other, other,
- // sp ! " # $ % & '
- space, terminator, quote, other, other, other, other, quote,
- // ( ) * + , - . /
- openBracket, closeBracket, other, other, other, other, ambiguosTerm, other,
- // 0 1 2 3 4 5 6 7
- number, number, number, number, number, number, number, number,
- // 8 9 : ; < = > ?
- number, number, other, other, other, other, other, terminator,
- // @ A B C D E F G
- other, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
- // H I J K L M N O
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
- // P Q R S T U V W
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
- // X Y Z [ \ ] ^ _
- upperCase, upperCase, upperCase, openBracket, other, closeBracket, other, other,
- // ` a b c d e f g
- other, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
- // h i j k l m n o
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
- // p q r s t u v w
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
- // x y z { | } ~ del
- lowerCase, lowerCase, lowerCase, openBracket, other, closeBracket, other, other,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- other, other, other, other, other, other, other, other,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- other, other, other, other, other, other, other, other,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- other, other, other, other, other, other, other, other,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- other, other, other, other, other, other, other, other,
- // nbsp inv-! cents pounds currency yen broken-bar section
- other, other, other, other, other, other, other, other,
- // umlaut copyright super-a gui-left not soft-hyph registered macron
- other, other, lowerCase, openBracket, other, other, other, other,
- // degree +/- super-2 super-3 acute micro paragraph bullet
- other, other, number, number, other, lowerCase, other, other,
- // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
- other, lowerCase, other, closeBracket, number, number, number, other, // NOTE(review): super-1 is lowerCase and super-o is other here, while super-2/3 are number and super-a is lowerCase; the two entries may be misaligned, confirm
- // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
- // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
- // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, other,
- // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
- upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, lowerCase,
- // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
- // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
- // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut divide
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, other,
- // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
- lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase
- };
- /* Character-to-category map built from the four tables above; passed to the superclass by the constructor. */
- private static final UnicodeClassMapping kSentenceMap
- = new UnicodeClassMapping(kRawMapping, kExceptionChar, SentenceExceptionFlags,
- kSentenceAsciiValues);
- }
-