1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.codec.language;
  17. import org.apache.commons.codec.EncoderException;
  18. import org.apache.commons.codec.StringEncoder;
  19. /**
  20. * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
  21. * general purpose scheme to find word with similar phonemes.
  22. *
  23. * @author Apache Software Foundation
  24. * @version $Id: Soundex.java,v 1.26 2004/07/07 23:15:24 ggregory Exp $
  25. */
  26. public class Soundex implements StringEncoder {
  27. /**
  28. * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
  29. *
  30. * @see #US_ENGLISH_MAPPING
  31. */
  32. public static final Soundex US_ENGLISH = new Soundex();
  33. /**
  34. * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
  35. * means do not encode.
  36. * <p>
  37. * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
  38. * up the value for the constant values page.)
  39. * </p>
  40. *
  41. * @see #US_ENGLISH_MAPPING
  42. */
  43. public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
  44. /**
  45. * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
  46. * means do not encode.
  47. *
  48. * @see Soundex#Soundex(char[])
  49. */
  50. public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
  51. /**
  52. * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
  53. * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
  54. * identical values.
  55. *
  56. * @param s1
  57. * A String that will be encoded and compared.
  58. * @param s2
  59. * A String that will be encoded and compared.
  60. * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
  61. *
  62. * @see SoundexUtils#difference(StringEncoder,String,String)
  63. * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
  64. * T-SQL DIFFERENCE </a>
  65. *
  66. * @throws EncoderException
  67. * if an error occurs encoding one of the strings
  68. * @since 1.3
  69. */
  70. public int difference(String s1, String s2) throws EncoderException {
  71. return SoundexUtils.difference(this, s1, s2);
  72. }
  73. /**
  74. * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
  75. *
  76. * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  77. */
  78. private int maxLength = 4;
  79. /**
  80. * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  81. * letter is mapped. This implementation contains a default map for US_ENGLISH
  82. */
  83. private char[] soundexMapping;
  84. /**
  85. * Creates an instance using US_ENGLISH_MAPPING
  86. *
  87. * @see Soundex#Soundex(char[])
  88. * @see Soundex#US_ENGLISH_MAPPING
  89. */
  90. public Soundex() {
  91. this(US_ENGLISH_MAPPING);
  92. }
  93. /**
  94. * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
  95. * mapping for a non-Western character set.
  96. *
  97. * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  98. * letter is mapped. This implementation contains a default map for US_ENGLISH
  99. *
  100. * @param mapping
  101. * Mapping array to use when finding the corresponding code for a given character
  102. */
  103. public Soundex(char[] mapping) {
  104. this.setSoundexMapping(mapping);
  105. }
  106. /**
  107. * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
  108. * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
  109. *
  110. * @param pObject
  111. * Object to encode
  112. * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
  113. * supplied.
  114. * @throws EncoderException
  115. * if the parameter supplied is not of type java.lang.String
  116. * @throws IllegalArgumentException
  117. * if a character is not mapped
  118. */
  119. public Object encode(Object pObject) throws EncoderException {
  120. if (!(pObject instanceof String)) {
  121. throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
  122. }
  123. return soundex((String) pObject);
  124. }
  125. /**
  126. * Encodes a String using the soundex algorithm.
  127. *
  128. * @param pString
  129. * A String object to encode
  130. * @return A Soundex code corresponding to the String supplied
  131. * @throws IllegalArgumentException
  132. * if a character is not mapped
  133. */
  134. public String encode(String pString) {
  135. return soundex(pString);
  136. }
  137. /**
  138. * Used internally by the SoundEx algorithm.
  139. *
  140. * Consonants from the same code group separated by W or H are treated as one.
  141. *
  142. * @param str
  143. * the cleaned working string to encode (in upper case).
  144. * @param index
  145. * the character position to encode
  146. * @return Mapping code for a particular character
  147. * @throws IllegalArgumentException
  148. * if the character is not mapped
  149. */
  150. private char getMappingCode(String str, int index) {
  151. char mappedChar = this.map(str.charAt(index));
  152. // HW rule check
  153. if (index > 1 && mappedChar != '0') {
  154. char hwChar = str.charAt(index - 1);
  155. if ('H' == hwChar || 'W' == hwChar) {
  156. char preHWChar = str.charAt(index - 2);
  157. char firstCode = this.map(preHWChar);
  158. if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
  159. return 0;
  160. }
  161. }
  162. }
  163. return mappedChar;
  164. }
  165. /**
  166. * Returns the maxLength. Standard Soundex
  167. *
  168. * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  169. * @return int
  170. */
  171. public int getMaxLength() {
  172. return this.maxLength;
  173. }
  174. /**
  175. * Returns the soundex mapping.
  176. *
  177. * @return soundexMapping.
  178. */
  179. private char[] getSoundexMapping() {
  180. return this.soundexMapping;
  181. }
  182. /**
  183. * Maps the given upper-case character to it's Soudex code.
  184. *
  185. * @param ch
  186. * An upper-case character.
  187. * @return A Soundex code.
  188. * @throws IllegalArgumentException
  189. * Thrown if <code>ch</code> is not mapped.
  190. */
  191. private char map(char ch) {
  192. int index = ch - 'A';
  193. if (index < 0 || index >= this.getSoundexMapping().length) {
  194. throw new IllegalArgumentException("The character is not mapped: " + ch);
  195. }
  196. return this.getSoundexMapping()[index];
  197. }
  198. /**
  199. * Sets the maxLength.
  200. *
  201. * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  202. * @param maxLength
  203. * The maxLength to set
  204. */
  205. public void setMaxLength(int maxLength) {
  206. this.maxLength = maxLength;
  207. }
  208. /**
  209. * Sets the soundexMapping.
  210. *
  211. * @param soundexMapping
  212. * The soundexMapping to set.
  213. */
  214. private void setSoundexMapping(char[] soundexMapping) {
  215. this.soundexMapping = soundexMapping;
  216. }
  217. /**
  218. * Retreives the Soundex code for a given String object.
  219. *
  220. * @param str
  221. * String to encode using the Soundex algorithm
  222. * @return A soundex code for the String supplied
  223. * @throws IllegalArgumentException
  224. * if a character is not mapped
  225. */
  226. public String soundex(String str) {
  227. if (str == null) {
  228. return null;
  229. }
  230. str = SoundexUtils.clean(str);
  231. if (str.length() == 0) {
  232. return str;
  233. }
  234. char out[] = {'0', '0', '0', '0'};
  235. char last, mapped;
  236. int incount = 1, count = 1;
  237. out[0] = str.charAt(0);
  238. last = getMappingCode(str, 0);
  239. while ((incount < str.length()) && (count < out.length)) {
  240. mapped = getMappingCode(str, incount++);
  241. if (mapped != 0) {
  242. if ((mapped != '0') && (mapped != last)) {
  243. out[count++] = mapped;
  244. }
  245. last = mapped;
  246. }
  247. }
  248. return new String(out);
  249. }
  250. }