1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.codec.language;
  17. import org.apache.commons.codec.EncoderException;
  18. import org.apache.commons.codec.StringEncoder;
  19. /**
  20. * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
  21. *
  22. * @author Apache Software Foundation
  23. * @version $Id: SoundexUtils.java,v 1.5 2004/03/17 18:31:35 ggregory Exp $
  24. * @since 1.3
  25. */
  26. final class SoundexUtils {
  27. /**
  28. * Cleans up the input string before Soundex processing by only returning
  29. * upper case letters.
  30. *
  31. * @param str
  32. * The String to clean.
  33. * @return A clean String.
  34. */
  35. static String clean(String str) {
  36. if (str == null || str.length() == 0) {
  37. return str;
  38. }
  39. int len = str.length();
  40. char[] chars = new char[len];
  41. int count = 0;
  42. for (int i = 0; i < len; i++) {
  43. if (Character.isLetter(str.charAt(i))) {
  44. chars[count++] = str.charAt(i);
  45. }
  46. }
  47. if (count == len) {
  48. return str.toUpperCase();
  49. }
  50. return new String(chars, 0, count).toUpperCase();
  51. }
  52. /**
  53. * Encodes the Strings and returns the number of characters in the two
  54. * encoded Strings that are the same.
  55. * <ul>
  56. * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  57. * little or no similarity, and 4 indicates strong similarity or identical
  58. * values.</li>
  59. * <li>For refined Soundex, the return value can be greater than 4.</li>
  60. * </ul>
  61. *
  62. * @param encoder
  63. * The encoder to use to encode the Strings.
  64. * @param s1
  65. * A String that will be encoded and compared.
  66. * @param s2
  67. * A String that will be encoded and compared.
  68. * @return The number of characters in the two Soundex encoded Strings that
  69. * are the same.
  70. *
  71. * @see #differenceEncoded(String,String)
  72. * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  73. * MS T-SQL DIFFERENCE</a>
  74. *
  75. * @throws EncoderException
  76. * if an error occurs encoding one of the strings
  77. */
  78. static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
  79. return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
  80. }
  81. /**
  82. * Returns the number of characters in the two Soundex encoded Strings that
  83. * are the same.
  84. * <ul>
  85. * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  86. * little or no similarity, and 4 indicates strong similarity or identical
  87. * values.</li>
  88. * <li>For refined Soundex, the return value can be greater than 4.</li>
  89. * </ul>
  90. *
  91. * @param es1
  92. * An encoded String.
  93. * @param es2
  94. * An encoded String.
  95. * @return The number of characters in the two Soundex encoded Strings that
  96. * are the same.
  97. *
  98. * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  99. * MS T-SQL DIFFERENCE</a>
  100. */
  101. static int differenceEncoded(String es1, String es2) {
  102. if (es1 == null || es2 == null) {
  103. return 0;
  104. }
  105. int lengthToMatch = Math.min(es1.length(), es2.length());
  106. int diff = 0;
  107. for (int i = 0; i < lengthToMatch; i++) {
  108. if (es1.charAt(i) == es2.charAt(i)) {
  109. diff++;
  110. }
  111. }
  112. return diff;
  113. }
  114. }