Thursday, August 26, 2004

Spell Checking Code

Well, I ran through a bunch of scenarios for performance of my Spell Checking code. I got some amazing increases in speed. My algorithm is doubly blessed by the Python Cookbook... it uses the Soundex fxn, as well as mentioning that using a dictionary's has_key function is faster than checking for membership in a list. Here's the code, as well as the results.

For the sake of a good stress test, I used Guttenberg's Henry V text.


"""
The SpellChecker module contains various logic for
determing if a word is misspelled.
"""
import shelve, os
from re import compile

re_word = compile(r'[\w]+') #includes numbers
ordA = ord('A')

#Hooray for python cookbook!
def GetSoundex(name, len=4):
"""GetSoundex conforming to Odell-Russell algorithm"""
# digists hold the soundex values for the alphabet
soundex_digits = '01230120022455012623010202'
soundex_result = ''
first_letter = ''
for character in name.upper():
if character.isalpha():
current_digit = soundex_digits[ord(character) - ordA]

if not soundex_result:
soundex_result = character
elif current_digit != soundex_result[-1]:
#duplicate consecutive soundex digits are skipped
soundex_result += current_digit

#Remove all 0s from the soundex code
soundex_result = soundex_result.replace('0','')

#Return soundex code truncated or 0-padded to len characters
return (soundex_result + (len * '0'))[:len]

#Algorithm:

#For a given word, determines its soundex value
#Retrieve the list of words with the same soundex value (dictionary
#will have soundex values as its keys).
#If the word is in the list of words then it is spelled correctly,
#Else: flag the words as misspelled / unknown.

#Problems: if the values for soundex_digits change, the entire word
#list will need to be reevaluated.
#NOTE: Keep backup of original words list!

class SpellChecker(object):
def __init__(self, word_file='words.dat'):
self.word_file = word_file
self.InitializeWordList()

def InitializeWordList(self):
err = "Specified word file %s does not exist." % self.word_file
assert os.path.exists(self.word_file), err
self.word_shelf = shelve.open(self.word_file)

def __del__(self):
if self.__dict__.has_key('word_shelf'):
self.word_shelf.close()

def IsKnownWord(self,word):
"""
IsKnownWord returns a tuple,
True,None
or
False,
IsKnownWord determines whether or not the given
word is in the dictionary.
"""
snd_word = GetSoundex(word)
possible_matches = self.GetWordDictBySoundex(snd_word)
soundex_matches_list = [word_match.lower() for word_match in possible_matches.keys()]
if word.lower() in soundex_matches_list:
return True,None
else:
soundex_matches_list.sort()
return False, soundex_matches_list


def GetWordDictBySoundex(self, snd_word):
return self.word_shelf.get(snd_word,{})

def SpellCheckWordList(self, text):
results = {}
for word_match in re_word.finditer(file_data):
word = word_match.group(0)
results[word] = self.IsKnownWord(word)

return results

def SpellCheckText(self, text):
unknown_words = {}
for word_match in re_word.finditer(text):
word = word_match.group(0)
isKnown, altWords = self.IsKnownWord(word)
if not isKnown:
unknown_words[word_match] = altWords
return unknown_words


def SpellCheckTextUsingCache(self, text):
misspelled_list = []
correct_list = []
unknown_words = {}
for word_match in re_word.finditer(text):
word = word_match.group(0)
if word in correct_list:
continue
elif word in misspelled_list:
unknown_words[word_match] = altWords
else:
isKnown, altWords = self.IsKnownWord(word)
if not isKnown:
unknown_words[word_match] = altWords
misspelled_list.append(word)
else:
correct_list.append(word)
return unknown_words

def OptimizedSpellCheckTextUsingCache(self, text):
misspelled_list = {}
correct_list = {}
unknown_words = {}
for word_match in re_word.finditer(text):
word = word_match.group(0)
if correct_list.has_key(word):
continue
elif misspelled_list.has_key(word):
unknown_words[word_match] = altWords
else:
isKnown, altWords = self.IsKnownWord(word)
if not isKnown:
unknown_words[word_match] = altWords
misspelled_list[word] = 1
else:
correct_list[word] = 1
return unknown_words

def UglyOptimizedSpellCheckTextUsingCache(self, text):
misspelled_list = {}
correct_list = {}
unknown_words = {}
#Ugly optimizations
finditer = re_word.finditer
correct_has_key = correct_list.has_key
misspelled_has_key = misspelled_list.has_key
IsKnownWord = self.IsKnownWord
unknown_setitem = unknown_words.__setitem__
misspelled_setitem = misspelled_list.__setitem__
correct_setitem = correct_list.__setitem__

for word_match in finditer(text):
word = word_match.group(0)
if correct_has_key(word):
continue
elif misspelled_has_key(word):
unknown_setitem(word_match, altWords)
else:
isKnown, altWords = IsKnownWord(word)
if not isKnown:
unknown_setitem(word_match, altWords)
misspelled_setitem(word, 1)
else:
correct_setitem(word, 1)
return unknown_words

if __name__ == '__main__':
sc = SpellChecker()

import time
hv = open("G:\\HenryV.txt")
hvtext = hv.read()
hv.close()

def timeo(fun, text, n=10):
start = time.clock()
for i in range(n): fun(text)
stend = time.clock()
thetime = stend-start
return fun.__name__, thetime
fxns = [sc.SpellCheckText]
fxns.append(sc.SpellCheckTextUsingCache)
fxns.append(sc.OptimizedSpellCheckTextUsingCache)
fxns.append(sc.UglyOptimizedSpellCheckTextUsingCache)
for f in fxns:
print "%s: %.2f"%timeo(f, hvtext)



Against HenryV.txt round 1
SpellCheckText: 130.34
SpellCheckTextUsingCache: 85.46

Against HenryV.txt round 2 w/ optimizations [in list -> dict.has_key] from cookbook
SpellCheckText: 130.17
SpellCheckTextUsingCache: 79.23
OptimizedSpellCheckTextUsingCache: 29.79

Against HenryV.txt round 3 w/ additional ugly optimizations
SpellCheckText: 127.53
SpellCheckTextUsingCache: 78.27
OptimizedSpellCheckTextUsingCache: 29.29
UglyOptimizedSpellCheckTextUsingCache: 29.19

No comments: