· The US census uses the Soundex system to catalog names for proximity searching. An explanantion of the principles of the algorithm can be found here: [http://www.archives.gov/research/census/soundex.html]
1.) Direct SQL is classified as a dangerous API and requires code-access-security assertion on the server. This often causes ISV’s problems with Microsoft software certification.
2.) It will limit the soundex-codes to the language that SQL Server was installed in (US-EN). Enhancing the algorithm below (manually) will allow you to tailor language specific pronounciation.
Soundex |
using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace Soundex { publicclassSoundexClass { publicstaticstring createSoundexCodeEN(string word) { // The length of the returned code. int length = 4; // Value to return. string value = ""; // The size of the word to process. int size = word.Length; // The word must be at least two characters in length. if (size > 1) { // Convert the word to uppercase characters. word = word.ToUpper(System.Globalization.CultureInfo.InvariantCulture); // Convert the word to a character array. char[] chars = word.ToCharArray(); // Buffer to hold the character codes. StringBuilder buffer = newStringBuilder(); buffer.Length = 0; // The current and previous character codes. int prevCode = 0; int currCode = 0; // Add the first character to the buffer. buffer.Append(chars[0]); // Loop through all the characters and convert them to the proper character code. for (int i = 1; i < size; i++) { switch (chars[i]) { case'A': case'E': case'I': case'O': case'U': case'H': case'W': case'Y': currCode = 0; break; case'B': case'F': case'P': case'V': currCode = 1; break; case'C': case'G': case'J': case'K': case'Q': case'S': case'X': case'Z': currCode = 2; break; case'D': case'T': currCode = 3; break; case'L': currCode = 4; break; case'M': case'N': currCode = 5; break; case'R': currCode = 6; break; } // Check if the current code is the same as the previous code. if (currCode != prevCode) { // Check to see if the current code is 0 (a vowel); do not process vowels. if (currCode != 0) buffer.Append(currCode); } // Set the previous character code. prevCode = currCode; // If the buffer size meets the length limit, exit the loop. if (buffer.Length == length) break; } // Pad the buffer, if required. size = buffer.Length; if (size < length) buffer.Append('0', (length - size)); // Set the value to return. value = buffer.ToString(); } // Return the value. return value; } } } |
staticvoid checkSoundex(Args _args) { info(Soundex.SoundexClass::createSoundexCodeEN("fischer")); info(Soundex.SoundexClass::createSoundexCodeEN("fisher")); } |
· The result of the above should be a soundex code of (F260) for both.
· Again, these noise-words are language specific and the ultimate solution needs to cater for language variants.
Soundex Table | ||
Column | Data type (size or EDT) | Description |
ContextRecId | Int64 (RefRecId) | Record being indexed. |
ContextTableId | Int64 (RefRecId) | Table being indexed. |
LanguageId | String (LanguageId) | Language of soundex. |
Position | Integer | Position in text. |
SoundexCode | Int32 | Soundex code. |
Word | String (50) | Word found at specified position. |
Index | AllowDuplicates | Column | Purpose |
GlobalIdx | Yes | LanguageId | Search for a word pattern across all tables. |
SoundexCode | |||
TargetedIdx | Yes | LanguageId | Search for a word pattern within a specific table. |
ContextTableId | |||
SoundexCode |
· (Global and targeted searches for similar sounding words).
· Bulk populate (clear down, re-calculate, re-populate) the entire soundex word index using X++.
· Same as above using a SQL agent or SSIS job.
· Iterative updates using “post-event handlers” (either X++ or .Net)
Job to bulk populate Soundex Table (names + addresses) |
staticvoid PopulateSoundexEN(Args _args) { DirPartyTable objDirPartyTable; LogisticsPostalAddress objLogisticsPostalAddress; Soundex objSoundex; List list; ListIterator iterator; str word, text, code; int position; // noise words for entities (EN) container noiseWords1 = ["Ltd", "Inc", "Bank"]; // noise word list for addresses (EN) container noiseWords2 = ["St", "Street", "Rd", "Road", "Cl", "Close", "Ave", "Avenue"]; // format for a valid word (alpha chars only) System.Text.RegularExpressions.Regex objRegex1 = new System.Text.RegularExpressions.Regex("^[A-Z][a-z]+$"); // format for an acceptable soundex code (alpha followed by 3 digits) (EN) System.Text.RegularExpressions.Regex objRegex2 = new System.Text.RegularExpressions.Regex("^[A-Z][0-9]{3}$"); // clear [Soundex] table delete_from objSoundex where objSoundex.LanguageId == "en"; // process entity names whileselect RecId, Name from objDirPartyTable { list = new List(Types::String); text = objDirPartyTable.Name; // replace any punctuation and formatting with a word seperator text = strReplace(text, '\n', ' '); text = strReplace(text, '-', ' '); text = strReplace(text, '/', ' '); text = strReplace(text, ',', ' '); text = strReplace(text, '.', ' '); // split text on space list = Global::strSplit(text, " "); iterator = new ListIterator(list); position = 1; while(iterator.more()) { word = iterator.value(); if (objRegex1.IsMatch(word) && conFind(noiseWords1, word) == 0) { objSoundex.initValue(); objSoundex.ContextTableId = tableName2id("DirPartyTable"); objSoundex.ContextRecId = objDirPartyTable.RecId; objSoundex.LanguageId = "en"; objSoundex.Position = position; objSoundex.Word = word; // only add valid soundex codes code = Soundex.SoundexClass::createSoundexCodeEN(word); if (objRegex2.IsMatch(code)) { objSoundex.SoundexCode = (char2num(subStr(code, 1, 2), 1) * 1000) + str2int(subStr(code, 2, 3)); objSoundex.insert(); } } position++; iterator.next(); } } // process entity addresses whileselect RecId, Address from objLogisticsPostalAddress { list = new List(Types::String); text = objLogisticsPostalAddress.Address; // replace any punctuation and formatting with a word seperator text = strReplace(text, '\n', ' '); text = strReplace(text, '-', ' '); text = strReplace(text, '/', ' '); text = strReplace(text, ',', ' '); text = strReplace(text, '.', ' '); // split text on space list = Global::strSplit(text, " "); iterator = new ListIterator(list); position = 1; while(iterator.more()) { word = iterator.value(); if (objRegex1.IsMatch(word) && conFind(noiseWords2, word) == 0) { objSoundex.initValue(); objSoundex.ContextTableId = tableName2id("LogisticsPostalAddress"); objSoundex.ContextRecId = objLogisticsPostalAddress.RecId; objSoundex.LanguageId = "en"; objSoundex.Position = position; objSoundex.Word = word; // only add valid soundex codes code = Soundex.SoundexClass::createSoundexCodeEN(word); if (objRegex2.IsMatch(code)) { objSoundex.SoundexCode = (char2num(subStr(code, 1, 2), 1) * 1000) + str2int(subStr(code, 2, 3)); objSoundex.insert(); } } position++; iterator.next(); } } } |
Populate [Soundex] table from direct SQL |
DECLARE @RecId BIGINT DECLARE @ContextTableId BIGINT DECLARE @ContextRecId BIGINT DECLARE @Str1 NVARCHAR(4000) SET @RecId = 0 SET @ContextTableId =(SELECTTOP 1 [TABLEID] FROM [SQLDICTIONARY] WHERE [NAME] ='DirPartyTable'AND [FIELDID] = 0) TRUNCATETABLE [Soundex] DECLARE cursor1 CURSORLOCALFOR SELECT RECID, NAME FROM DIRPARTYTABLE OPEN cursor1 FETCHNEXTFROM cursor1 INTO @ContextRecId, @Str1 WHILE@@FETCH_STATUS= 0 BEGIN PRINT @Str1 INSERTINTO [SOUNDEX](CONTEXTRECID, CONTEXTTABLEID, LANGUAGEID, WORD, POSITION, SOUNDEXCODE, DATAAREAID, RECVERSION, RECID) SELECT @ContextRecId, @ContextTableId,'en', Word, Position, SoundexCode,'LNL1', 1,(@RecId + Position) FROM dbo.Split(' ', @Str1) SET @RecId =(SELECTMAX(RECID)FROM [SOUNDEX]) FETCHNEXTFROM cursor1 INTO @ContextRecId, @Str1 END CLOSE cursor1 DEALLOCATE cursor1 |
Split string UDF |
CREATEFUNCTION dbo.Split(@sep char(1), @s nvarchar(4000)) RETURNStable AS RETURN ( WITH Pieces(pn, start,stop)AS ( SELECT 1, 1,CHARINDEX(@sep, @s) UNIONALL SELECT pn + 1,stop+ 1,CHARINDEX(@sep, @s,stop+ 1) FROM Pieces WHEREstop> 0 ) SELECT pn AS Position, SUBSTRING(@s, start,CASEWHENstop> 0 THENstop-start ELSE 4000 END)AS Word, SOUNDEX(SUBSTRING(@s, start,CASEWHENstop> 0 THENstop-start ELSE 4000 END))AS SoundexCode FROM Pieces ) |
· Note, when populating Dynamics tables using this approach you must ensure that they will only ever be used as lookup/reference tables. It really doesn’t matter what you put into the RecId column, just so long its unique.
Clik here to view.