Funny I did a similar thing recently, specifically for domain names. The code to train a Markov chain (without using any libraries outside standard Python) is this:
import re, collections
from itertools import *
def words(text):
return (w.group() for w in re.finditer(r"\w+", text.lower()) if w.group().isalpha())
F=collections.defaultdict(lambda:collections.defaultdict(lambda:1))
M = 5
def add_spaces(word):
return ' ' * M + word.strip() + ' ' * M
N = 0
from operator import add
def train(words):
for word in words:
w = add_spaces(word)
for i in range(len(w)-M):
F[w[i:i+M]][w[i+M]] += 1
global N
N = reduce(add,(n for d in F.itervalues() for n in d.itervalues()))
train(words(file('big.txt').read()))
Then, I tried to measure how "good" a particular word is (experimenting with different measures):
from math import log
def goodness(word):
w = add_spaces(word)
res = 0
for i in range(len(w)-M):
res += F[w[i:i+M]][w[i+M]]
# res += log(F[w[i:i+M]][w[i+M]])
global N
return float(res) / ((len(w)-M)*N)
print 'test score:', reduce(add, (goodness(w) for w in words(file('test.txt').read())))
Then, tried to find the top N words at less than 2 edits distance:
alphabet = 'abcdefghijklmnopqrstuvwxyz'
import types
def edits1(word):
if type(word)==types.StringType:
n = len(word)
return set(chain((word[0:i]+word[i+1:] for i in range(n)), # deletion
(word[0:i]+word[i+1]+word[i]+word[i+2:] for i in range(n-1)), # transposition
(word[0:i]+c+word[i+1:] for i in range(n) for c in alphabet), # alteration
(word[0:i]+c+word[i:] for i in range(n+1) for c in alphabet))) # insertion
else: #Assume word is actualy an iterable of words
return set(reduce(chain,(edits1(e) for e in word)))
def edits2(word):
return edits1(edits1(word))
def topn(n, word):
words = [(x, goodness(x)) for x in edits2(word)]
words.sort(cmp = lambda x,y: -cmp(x[1],y[1]))
return words[:n]
I have borrowed some code from Peter Norvig's spelling checker, and the style is very much influenced by it.
Try closer to something like 800 lines in the library. It comes with demos, a test suite, etc. And I think you're skipping the base class in your naive count.
I'm not really sure how the line count of a library is relevant to anything?
http://search.cpan.org/~rclamp/Algorithm-MarkovChain-0.06/li...