I wrote up a quick program to look at trigram density using the FreeBSD wordlist on OS X:
#!/usr/bin/env python
"""Measure trigram density in a word list.

Every adjacent letter pair (bigram) and letter triple (trigram) occurring in
the word list is collected.  Chaining the observed bigrams ("ab" followed by
"bc" gives "abc") yields the set of trigrams the bigrams alone would allow;
the density is the fraction of those predicted trigrams that actually occur.

Usage: run with no arguments to read DEFAULT_WORDLIST, or pass the path of a
one-word-per-line file as the first argument.
"""
from collections import defaultdict
import sys

DEFAULT_WORDLIST = "/usr/share/dict/words"

# The newline that ends each line is replaced by this marker, so that
# "letter followed by end of word" is counted as an n-gram too.
END_OF_WORD = "#"


def collect_ngrams(lines):
    """Return ``(bigrams, trigrams)`` observed in *lines*.

    *lines* is an iterable of strings, one word per item, normally each
    ending in a newline (e.g. an open text file).

    ``bigrams`` maps each first character to the set of characters seen
    directly after it; ``trigrams`` is the set of 3-character substrings.
    """
    bigrams = defaultdict(set)
    trigrams = set()
    for line in lines:
        word = line.replace("\n", END_OF_WORD)
        for first, second in zip(word, word[1:]):
            bigrams[first].add(second)
        # range() is empty for words shorter than three characters.
        trigrams.update(word[i:i + 3] for i in range(len(word) - 2))
    return bigrams, trigrams


def predict_trigrams(bigrams):
    """Return every trigram obtainable by chaining two bigrams.

    A trigram ``xyz`` is predicted when ``xy`` and ``yz`` are both known
    bigrams.
    """
    predicted = set()
    for first, second_set in bigrams.items():
        for second in second_set:
            # Use .get() rather than indexing: *bigrams* may be a
            # defaultdict, and indexing a character that never starts a
            # bigram (such as the end-of-word marker) would insert a key
            # while the mapping is being iterated, which raises
            # RuntimeError on Python 3.
            for third in bigrams.get(second, ()):
                predicted.add(first + second + third)
    return predicted


def trigram_density(trigrams, predicted_trigrams):
    """Return ``len(trigrams) / len(predicted_trigrams)`` as a float.

    Returns 0.0 when nothing was predicted (e.g. an empty word list)
    instead of dividing by zero.
    """
    if not predicted_trigrams:
        return 0.0
    return float(len(trigrams)) / len(predicted_trigrams)


def main(path=DEFAULT_WORDLIST):
    """Print bigram/trigram counts and the trigram density for *path*."""
    with open(path) as words:
        bigrams, trigrams = collect_ngrams(words)
    predicted_trigrams = predict_trigrams(bigrams)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("{} bigrams".format(sum(len(b) for b in bigrams.values())))
    print("{} predicted trigrams".format(len(predicted_trigrams)))
    print("{} trigrams".format(len(trigrams)))
    print("{} density".format(trigram_density(trigrams, predicted_trigrams)))


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_WORDLIST)