-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathEditModel.py
More file actions
146 lines (128 loc) · 4.88 KB
/
Copy pathEditModel.py
File metadata and controls
146 lines (128 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import collections
class EditModel:
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def __init__(self, edit_file='../data/count_1edit.txt', corpus=None):
self.edit_file = edit_file
self.edit_table = self.read_edit_table(self.edit_file)
if corpus:
self.initVocabulary(corpus)
def initVocabulary(self, corpus):
self.vocabulary = set()
for sentence in corpus.corpus:
for datum in sentence.data:
self.vocabulary.add(datum.word)
def editProbabilities(self, word):
"""Computes p(x|word) edit model for a given word. Returns a dictionary mapping x -> p(x|word)."""
s = [(word[:i], word[i:]) for i in range(len(word) + 1)]
counts = collections.defaultdict(lambda: 0)
# deletes
for a, b in s:
if b and a + b[1:] in self.vocabulary:
tail = ''
if len(a) > 0:
tail = a[-1]
original = tail + b[0]
replacement = tail
count = self.edit_count(original, replacement)
if count:
counts[a + b[1:]] += count
# transposes
for a, b in s:
if len(b)>1 and a + b[1] + b[0] + b[2:] in self.vocabulary:
# new word is a + b[1] + b[0] + b[2:]
# edit is b[0]b[1] -> b[1]b[0]
original = b[0] + b[1]
replacement = b[1] + b[0]
count = self.edit_count(original, replacement)
if count:
counts[a + b[1] + b[0] + b[2:]] += count
# replaces
for a, b in s:
if b:
for c in self.alphabet:
if a + c + b[1:] in self.vocabulary:
# new word is a + c + b[1:].
original = b[0]
replacement = c
count = self.edit_count(original, replacement)
if count:
counts[a + c + b[1:]] += count
# inserts
for a, b in s:
for c in self.alphabet:
if a + c + b in self.vocabulary:
# new word is a + c + b.
tail = ''
if len(a) > 0:
tail = a[-1]
original = tail
replacement = tail + c
count = self.edit_count(original, replacement)
if count:
counts[a + c + b] += count
# normalize counts. sum over them all, divide each entry by sum.
total = 0.0
for a,b in counts.iteritems():
total += b
# self count
selfCount = max(9*total, 1)
counts[word] = selfCount
total += selfCount
probs = {}
if(total != 0.0):
for a,b in counts.iteritems():
probs[a] = float(b)/total
return probs
def read_edit_table(self, file_name):
"""Reads in the string edit counts file. Stores a dictionary of tuples
(s1,s2) -> count."""
edit_table = collections.defaultdict(lambda: 0)
f = file(file_name)
for line in f:
contents = line.split('\t')
edit_table[contents[0]] = int(contents[1])
return edit_table
def edit_count(self, s1, s2):
"""Returns how many times substring s1 is edited as s2."""
return self.edit_table[s1 + "|" + s2]
# taken from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
# MIT license.
def dameraulevenshtein(seq1, seq2):
"""Calculate the Damerau-Levenshtein distance between sequences.
This distance is the number of additions, deletions, substitutions,
and transpositions needed to transform the first sequence into the
second. Although generally used with strings, any sequences of
comparable objects will work.
Transpositions are exchanges of *consecutive* characters; all other
operations are self-explanatory.
This implementation is O(N*M) time and O(M) space, for N and M the
lengths of the two sequences.
>>> dameraulevenshtein('ba', 'abc')
2
>>> dameraulevenshtein('fee', 'deed')
2
It works with arbitrary sequences too:
>>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
2
"""
# codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
# Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
# However, only the current and two previous rows are needed at once,
# so we only store those.
oneago = None
thisrow = range(1, len(seq2) + 1) + [0]
for x in xrange(len(seq1)):
# Python lists wrap around for negative indices, so put the
# leftmost column at the *end* of the list. This matches with
# the zero-indexed strings and saves extra calculation.
twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
for y in xrange(len(seq2)):
delcost = oneago[y] + 1
addcost = thisrow[y - 1] + 1
subcost = oneago[y - 1] + (seq1[x] != seq2[y])
thisrow[y] = min(delcost, addcost, subcost)
# This block deals with transpositions
if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
return thisrow[len(seq2) - 1]