-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdocument.py
More file actions
125 lines (107 loc) · 4.14 KB
/
Copy pathdocument.py
File metadata and controls
125 lines (107 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from __future__ import division
import os
import re
import math
import string
import operator
from collections import Counter
class Document(object):
    """A named body of text together with its n-gram counts.

    The text is either raw prose or tab-delimited ``ngram<TAB>count`` data;
    the format is detected from the first line.

    Attributes:
        name: display name, with a trailing ``.txt`` extension removed.
        text: the text exactly as passed to the constructor.
        max_ngram_size: largest n-gram size (in words) that is counted.
        NGRAM_COUNTS: list of dicts; the dict at index ``n - 1`` maps each
            n-gram of ``n`` words to its count.
        TOTALS: list where index ``n - 1`` is the sum of all counts of
            n-grams of ``n`` words.
    """

    # A word: word characters with optional surrounding apostrophes and
    # optional hyphen-joined continuations.
    _WORD_RE = r"'?\w[\w']*(?:-\w+)*'?"
    # A single whitespace character separates the words of an n-gram.
    _SPACE_RE = r"\s"
    # Sentence boundaries: a period followed by a newline or a space, or a
    # question mark, or an exclamation point.
    _SENTENCE_BREAK_RE = re.compile(r"\.[\n ]|[?!]")
    # Every ASCII punctuation character except the apostrophe.
    _PUNCTUATION_RE = re.compile(
        "[%s]" % re.escape(string.punctuation.replace("'", "")))

    def __init__(self, name, text, max_ngram_size=3):
        """Store the name and text, then compute n-gram counts and totals."""
        self.name = self.simple_name(name)
        self.text = text
        self.max_ngram_size = max_ngram_size
        self.NGRAM_COUNTS = self.ngram_counts(text)
        self.TOTALS = self.totals(self.NGRAM_COUNTS)

    def simple_name(self, name):
        """Return ``name`` without a trailing ``.txt`` extension."""
        suffix = '.txt'
        # endswith() keeps a bare name such as 'txt' intact instead of
        # reducing it to an empty string.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    def _ngram_index(self, term):
        """Return the NGRAM_COUNTS index for ``term``, or None if none fits.

        The index is the whitespace-separated word count minus one. None is
        returned for an empty term or one with more words than there are
        count dictionaries.
        """
        index = len(term.split()) - 1
        if 0 <= index < len(self.NGRAM_COUNTS):
            return index
        return None

    def term_count(self, term):
        """Return how many times ``term`` (one or more words) occurs.

        Returns 0 for a term that is empty, unseen, or longer than the
        n-gram sizes that were counted.
        """
        index = self._ngram_index(term)
        if index is None:
            return 0
        return self.NGRAM_COUNTS[index].get(term, 0)

    def ngram_counts(self, text):
        """Build the per-size n-gram count dictionaries for ``text``.

        If the first line has exactly two tab-separated fields, the text is
        parsed as ``ngram<TAB>count`` data; otherwise it is treated as raw
        text.
        """
        lines = text.split('\n')
        if len(lines[0].split('\t')) == 2:
            return self.ngram_counts_from_data(lines, self.max_ngram_size)
        return self.ngram_counts_from_text(text, self.max_ngram_size)

    def ngram_counts_from_data(self, lines, max_ngram_size):
        """Read n-gram counts from ``ngram<TAB>count`` lines.

        Lines without exactly two tab-separated fields are ignored, as are
        n-grams with more than ``max_ngram_size`` space-separated words.
        Counts for a repeated n-gram are summed. Counts are stored as
        floats; a non-numeric count raises ValueError.
        """
        counters = [{} for _ in range(max_ngram_size)]
        for line in lines:
            fields = line.split('\t')
            if len(fields) != 2:
                continue
            ngram, count = fields
            ngram_size = len(ngram.split(' '))
            # There is no dictionary for n-grams above the configured
            # maximum, so skip them rather than fail with IndexError.
            if ngram_size > max_ngram_size:
                continue
            bucket = counters[ngram_size - 1]
            bucket[ngram] = bucket.get(ngram, 0.0) + float(count)
        return counters

    def ngram_counts_from_text(self, text, max_ngram_size):
        """Count n-grams of 1..``max_ngram_size`` words in raw text.

        The text is lowercased, periods and apostrophes are removed, and
        newlines become spaces before matching.
        """
        # Normalise once; the result does not depend on the n-gram size.
        normalized = (text.lower()
                      .replace(".", "")
                      .replace("\n", " ")
                      .replace("'", ""))
        counters = []
        for ngram_size in range(1, max_ngram_size + 1):
            ngram_re = (self._WORD_RE
                        + (self._SPACE_RE + self._WORD_RE) * (ngram_size - 1))
            # The lookahead captures without consuming, so overlapping
            # n-grams are all found; the lookbehind rejects matches that
            # would start in the middle of a word.
            pattern = r"(?=(%s))(?<!\w)" % ngram_re
            counters.append(dict(Counter(re.findall(pattern, normalized))))
        return counters

    def make_sentences(self, str):
        """Split text into lowercased sentences with punctuation removed.

        Sentences end at a period followed by a newline or a space, at a
        question mark, or at an exclamation point. Surrounding newlines are
        stripped from each sentence and all ASCII punctuation except the
        apostrophe is deleted.
        """
        # NOTE: the parameter name shadows the builtin ``str``; it is kept
        # so that existing keyword callers continue to work.
        sentences = []
        for piece in self._SENTENCE_BREAK_RE.split(str):
            piece = piece.strip('\n')
            piece = self._PUNCTUATION_RE.sub('', piece)
            sentences.append(piece.lower())
        return sentences

    def inside_quotes(self):
        """Return the text found between double quotes.

        Each line of the document contributes one output line that starts
        with a newline and a space, followed by that line's quoted passages
        joined by single spaces.
        """
        return ''.join(
            "\n " + " ".join(re.findall('"([^"]*)"', paragraph))
            for paragraph in self.text.split('\n'))

    def outside_quotes(self):
        """Return the text with double-quoted passages removed.

        Each line of the document contributes one output line that starts
        with a newline and a space.
        """
        return ''.join(
            "\n " + re.sub('"([^"]*)"', '', paragraph)
            for paragraph in self.text.split('\n'))

    def totals(self, ngram_counts):
        """Return, for each count dictionary, the sum of its counts."""
        return [sum(counts.values()) for counts in ngram_counts]

    def get_ngrams(self, n):
        """Return the dictionary mapping n-grams of ``n`` words to counts."""
        return self.NGRAM_COUNTS[n-1]

    def tf(self, term):
        """Return the term frequency of ``term``.

        This is the term's count divided by the total count of n-grams of
        the same size. Returns 0.0 when the term is empty, unseen, longer
        than the counted n-gram sizes, or when that total is zero.
        """
        index = self._ngram_index(term)
        if index is None:
            return 0.0
        total = self.TOTALS[index]
        if not total:
            return 0.0
        return self.NGRAM_COUNTS[index].get(term, 0) / total

    def sigscore(self, term, baseline):
        """Return the term frequency of ``term`` divided by ``baseline``.

        Raises ZeroDivisionError if ``baseline`` is zero.
        """
        return self.tf(term) / baseline
# Disabled example usage, kept as a bare string literal so it never runs:
# it reads a text file, builds a Document counting n-grams up to size 2, and
# prints the text that falls outside double quotes.
# NOTE(review): the `print` statement inside is Python 2 syntax.
"""
path = 'texts/ncaa_recaps'
with open(path, 'r') as f:
text = f.read()
d = Document('ncaa', text, 2)
print d.outside_quotes()
"""