1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.codec.language;
  17. import org.apache.commons.codec.EncoderException;
  18. import org.apache.commons.codec.StringEncoder;
  19. /**
  20. * Encodes a string into a Refined Soundex value. A refined soundex code is
  21. * optimized for spell checking words. Soundex method originally developed by
  22. * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
  23. *
  24. * @author Apache Software Foundation
  25. * @version $Id: RefinedSoundex.java,v 1.21 2004/06/05 18:32:04 ggregory Exp $
  26. */
  27. public class RefinedSoundex implements StringEncoder {
  28. /**
  29. * This static variable contains an instance of the RefinedSoundex using
  30. * the US_ENGLISH mapping.
  31. */
  32. public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
  33. /**
  34. * RefinedSoundex is *refined* for a number of reasons one being that the
  35. * mappings have been altered. This implementation contains default
  36. * mappings for US English.
  37. */
  38. public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
  39. /**
  40. * Every letter of the alphabet is "mapped" to a numerical value. This char
  41. * array holds the values to which each letter is mapped. This
  42. * implementation contains a default map for US_ENGLISH
  43. */
  44. private char[] soundexMapping;
  45. /**
  46. * Creates an instance of the RefinedSoundex object using the default US
  47. * English mapping.
  48. */
  49. public RefinedSoundex() {
  50. this(US_ENGLISH_MAPPING);
  51. }
  52. /**
  53. * Creates a refined soundex instance using a custom mapping. This
  54. * constructor can be used to customize the mapping, and/or possibly
  55. * provide an internationalized mapping for a non-Western character set.
  56. *
  57. * @param mapping
  58. * Mapping array to use when finding the corresponding code for
  59. * a given character
  60. */
  61. public RefinedSoundex(char[] mapping) {
  62. this.soundexMapping = mapping;
  63. }
  64. /**
  65. * Returns the number of characters in the two encoded Strings that are the
  66. * same. This return value ranges from 0 to the length of the shortest
  67. * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
  68. * example) indicates strong similarity or identical values. For refined
  69. * Soundex, the return value can be greater than 4.
  70. *
  71. * @param s1
  72. * A String that will be encoded and compared.
  73. * @param s2
  74. * A String that will be encoded and compared.
  75. * @return The number of characters in the two encoded Strings that are the
  76. * same from 0 to to the length of the shortest encoded String.
  77. *
  78. * @see SoundexUtils#difference(StringEncoder,String,String)
  79. * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  80. * MS T-SQL DIFFERENCE</a>
  81. *
  82. * @throws EncoderException
  83. * if an error occurs encoding one of the strings
  84. * @since 1.3
  85. */
  86. public int difference(String s1, String s2) throws EncoderException {
  87. return SoundexUtils.difference(this, s1, s2);
  88. }
  89. /**
  90. * Encodes an Object using the refined soundex algorithm. This method is
  91. * provided in order to satisfy the requirements of the Encoder interface,
  92. * and will throw an EncoderException if the supplied object is not of type
  93. * java.lang.String.
  94. *
  95. * @param pObject
  96. * Object to encode
  97. * @return An object (or type java.lang.String) containing the refined
  98. * soundex code which corresponds to the String supplied.
  99. * @throws EncoderException
  100. * if the parameter supplied is not of type java.lang.String
  101. */
  102. public Object encode(Object pObject) throws EncoderException {
  103. if (!(pObject instanceof java.lang.String)) {
  104. throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
  105. }
  106. return soundex((String) pObject);
  107. }
  108. /**
  109. * Encodes a String using the refined soundex algorithm.
  110. *
  111. * @param pString
  112. * A String object to encode
  113. * @return A Soundex code corresponding to the String supplied
  114. */
  115. public String encode(String pString) {
  116. return soundex(pString);
  117. }
  118. /**
  119. * Returns the mapping code for a given character. The mapping codes are
  120. * maintained in an internal char array named soundexMapping, and the
  121. * default values of these mappings are US English.
  122. *
  123. * @param c
  124. * char to get mapping for
  125. * @return A character (really a numeral) to return for the given char
  126. */
  127. char getMappingCode(char c) {
  128. if (!Character.isLetter(c)) {
  129. return 0;
  130. }
  131. return this.soundexMapping[Character.toUpperCase(c) - 'A'];
  132. }
  133. /**
  134. * Retreives the Refined Soundex code for a given String object.
  135. *
  136. * @param str
  137. * String to encode using the Refined Soundex algorithm
  138. * @return A soundex code for the String supplied
  139. */
  140. public String soundex(String str) {
  141. if (str == null) {
  142. return null;
  143. }
  144. str = SoundexUtils.clean(str);
  145. if (str.length() == 0) {
  146. return str;
  147. }
  148. StringBuffer sBuf = new StringBuffer();
  149. sBuf.append(str.charAt(0));
  150. char last, current;
  151. last = '*';
  152. for (int i = 0; i < str.length(); i++) {
  153. current = getMappingCode(str.charAt(i));
  154. if (current == last) {
  155. continue;
  156. } else if (current != 0) {
  157. sBuf.append(current);
  158. }
  159. last = current;
  160. }
  161. return sBuf.toString();
  162. }
  163. }