-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathUniformLanguageModel.py
More file actions
29 lines (25 loc) · 1.09 KB
/
Copy pathUniformLanguageModel.py
File metadata and controls
29 lines (25 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import math, collections
class UniformLanguageModel:
    """Language model that gives every vocabulary word the same probability.

    The vocabulary is the set of distinct words collected during training.
    When scoring, each token contributes log(1 / vocabulary size), no matter
    which token it is (tokens are never looked up in the vocabulary).
    """

    def __init__(self, corpus):
        """Start with an empty vocabulary and train it on ``corpus``."""
        self.words = set()
        self.train(corpus)

    def train(self, corpus):
        """Add every word found in ``corpus`` to the vocabulary.

        ``corpus.corpus`` is iterated as the collection of sentences, each
        sentence exposes its datums through ``.data``, and each datum exposes
        its word through ``.word``.
        """
        for sentence in corpus.corpus:
            self.words.update(datum.word for datum in sentence.data)

    def score(self, sentence):
        """Return the log-probability of ``sentence`` under the uniform model.

        ``sentence`` is an iterable of tokens (a list of strings); only the
        number of tokens matters. An empty sentence scores ``0.0``. If the
        vocabulary is empty, every token has probability zero, so any
        non-empty sentence scores ``float("-inf")`` instead of raising
        ``ZeroDivisionError``.
        """
        if self.words:
            log_probability = math.log(1.0 / len(self.words))
        else:
            # No words were seen in training: log(0) is treated as -infinity
            # (math.log(0) itself would raise, so it is spelled out here).
            log_probability = float("-inf")

        # NOTE: mathematically this equals
        # len(sentence) * -math.log(len(self.words)); the explicit loop is
        # kept for instructive purposes.
        score = 0.0
        for _ in sentence:
            score += log_probability
        return score