Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@ setuptools>=1.3.2
six>=1.9.0
tornado
urllib3
weighwords
3 changes: 1 addition & 2 deletions xtas/_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@
import os
import os.path
from tempfile import NamedTemporaryFile
from urllib import urlretrieve
from zipfile import ZipFile

from six.moves.urllib.request import urlretrieve


logger = logging.getLogger(__name__)

Expand Down
3 changes: 1 addition & 2 deletions xtas/tasks/_emotion.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
import os.path
from shutil import copyfileobj, move
from tempfile import NamedTemporaryFile

from six.moves.urllib.request import urlopen
from urllib2 import urlopen

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
Expand Down
4 changes: 2 additions & 2 deletions xtas/tasks/_nl_conll_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
# Contributed by Daan Odijk (UvA), based on an example in the seqlearn
# package (https://github.com/larsmans/seqlearn).

from urllib2 import urlopen

from seqlearn.datasets import load_conll
from seqlearn.perceptron import StructuredPerceptron
from sklearn.feature_extraction import FeatureHasher

from six.moves.urllib.request import urlopen


_BASE_URL = 'http://www.cnts.ua.ac.be/conll2002/ner/data/ned.'

Expand Down
3 changes: 1 addition & 2 deletions xtas/tasks/_polarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
from pprint import pprint
import tarfile
from tempfile import NamedTemporaryFile

from six.moves.urllib.request import urlretrieve
from urllib import urlretrieve

from sklearn.datasets import load_files
from sklearn.externals.joblib import dump, load
Expand Down
1 change: 1 addition & 0 deletions xtas/tasks/_weighwords/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .parsimonious import ParsimoniousLM
30 changes: 30 additions & 0 deletions xtas/tasks/_weighwords/logsum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Safe addition in log-space, taken from scikit-learn.
#
# Authors: G. Varoquaux, A. Gramfort, A. Passos, O. Grisel
# License: BSD

import numpy as np


def logsum(x):
    """Compute the sum of x assuming x is in the log domain.

    Returns log(sum(exp(x))) along the first axis while minimizing the
    possibility of over/underflow.

    Parameters
    ----------
    x : array-like
        Values in the log domain. The reduction runs over axis 0.

    Returns
    -------
    out : float or ndarray
        log(sum(exp(x), axis=0)). When the maximum along the axis is not
        finite the result is that limit (-inf for a sum of zeros, +inf
        when an entry is +inf) instead of NaN.

    Examples
    ========

    >>> import numpy as np
    >>> a = np.arange(10)
    >>> bool(np.allclose(logsum(a), np.log(np.sum(np.exp(a)))))
    True
    >>> float(logsum(np.array([-np.inf, -np.inf])))
    -inf
    """
    x = np.asarray(x, dtype=float)

    # Shift by the maximum before exponentiating: every exponent is then
    # <= 0, so exp() cannot overflow and the largest term is exactly 1.
    vmax = x.max(axis=0)

    # A non-finite maximum is useless as a shift (-inf - -inf and inf - inf
    # are both NaN); shifting by zero gives the correct limit there.
    shift = np.where(np.isfinite(vmax), vmax, 0.0)

    # log(0) = -inf is the intended answer for an all-zero sum, and an
    # overflow can only occur when the answer is +inf anyway.
    with np.errstate(divide='ignore', over='ignore'):
        out = np.log(np.sum(np.exp(x - shift), axis=0))
    return out + shift
175 changes: 175 additions & 0 deletions xtas/tasks/_weighwords/parsimonious.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#!/usr/bin/env python

# Copyright 2011-2013 University of Amsterdam
# Author: Lars Buitinck

from collections import defaultdict
from heapq import nlargest
import logging
import numpy as np

from .logsum import logsum


logger = logging.getLogger(__name__)


class ParsimoniousLM(object):
    """Language model for a set of documents.

    Constructing an object of this class fits a background model. The top
    method can then be used to fit document-specific models, also for unseen
    documents (with the same vocabulary as the background corpus).

    Parameters
    ----------
    documents : iterable over iterable over terms
    w : float
        Weight of document model (1 - weight of corpus model)
    thresh : int
        Don't include words that occur < thresh times

    Attributes
    ----------
    vocab : dict of term -> int
        Mapping of terms to numeric indices
    p_corpus : array of float
        Log prob of terms
    """

    def __init__(self, documents, w, thresh=0):
        logger.info('Building corpus model')

        self.w = w
        self.vocab = vocab = {}     # Vocabulary: maps terms to numeric indices
        count = defaultdict(int)    # Corpus frequency, keyed by term index

        for d in documents:
            for tok in d:
                i = vocab.setdefault(tok, len(vocab))
                count[i] += 1

        # The builtin float is used as dtype; the np.float alias has been
        # removed from recent NumPy releases.
        cf = np.zeros(len(count), dtype=float)
        for i, f in count.items():
            cf[i] = f

        # Rare terms keep their vocabulary slot but get zero corpus mass.
        cf[cf < thresh] = 0.

        # Zeroed counts yield log(0) = -inf on purpose; silence the warning.
        with np.errstate(divide='ignore'):
            # lg P(t|C)
            self.p_corpus = np.log(cf) - np.log(np.sum(cf))

    def top(self, k, d, max_iter=50, eps=1e-5, w=None):
        """Get the top k terms of a document d and their log probabilities.

        Uses the Expectation Maximization (EM) algorithm to estimate term
        probabilities.

        Parameters
        ----------
        k : int
            Number of terms to return.
        d : iterable of terms
            The document; every term must occur in the vocabulary built by
            __init__ (an unknown term raises KeyError).
        max_iter : int, optional
            Maximum number of iterations of EM algorithm to run.
        eps : float, optional
            Convergence threshold for EM algorithm.
        w : float, optional
            Weight of document model; overrides value given to __init__

        Returns
        -------
        t_p : list of (str, float)
            At most k (term, log probability) pairs, most probable first.
        """
        tf, p_term = self._document_model(d)
        p_term = self._EM(tf, p_term, w, max_iter, eps)

        terms = [(t, p_term[i]) for t, i in self.vocab.items()]
        return nlargest(k, terms, key=lambda tp: tp[1])

    def _document_model(self, d):
        """Build document model.

        Parameters
        ----------
        d : array of terms

        Returns
        -------
        tf : array of float
            Term frequencies
        p_term : array of float
            Term log probabilities

        Initial p_term is log(1/n_distinct) for terms with non-zero tf,
        -inf (probability 0) for terms with 0 tf.
        """
        logger.info('Gathering term probabilities')

        tf = np.zeros(len(self.vocab), dtype=float)     # Term frequency

        for tok in d:
            tf[self.vocab[tok]] += 1

        present = tf > 0
        n_distinct = present.sum()

        # Build the float64 array explicitly: np.log applied to a boolean
        # array would produce a half-precision result. The errstate guard
        # covers log(0) for an empty document.
        with np.errstate(divide='ignore'):
            p_term = np.where(present, -np.log(n_distinct), -np.inf)

        return tf, p_term

    def _EM(self, tf, p_term, w, max_iter, eps):
        """Expectation maximization.

        Parameters
        ----------
        tf : array of float
            Term frequencies, as returned by _document_model
        p_term : array of float
            Term log probabilities, as returned by _document_model.
            Not modified.
        w : float or None
            Weight of document model; None means use self.w.
        max_iter : int
            Maximum number of iterations to run.
        eps : float
            Stop as soon as no term's log probability rises by eps or more
            in one iteration.

        Returns
        -------
        p_term : array of float
            A posteriori term log probabilities.
        """
        logger.info('EM with max_iter=%d, eps=%g', max_iter, eps)

        if w is None:
            w = self.w
        log_w = np.log(w)

        absent = np.asarray(tf) == 0

        # log(0) = -inf and -inf - -inf = NaN are expected for terms that do
        # not occur in the document; both are handled explicitly below.
        with np.errstate(divide='ignore', invalid='ignore'):
            p_corpus = self.p_corpus + np.log(1 - w)
            tf = np.log(tf)
            p_term = np.asarray(p_term, dtype=float)

            for i in range(1, max_iter + 1):
                # E-step. The weighted copy is a new array, so neither the
                # caller's p_term nor the previous estimate is modified.
                weighted = p_term + log_w
                E = tf + weighted - np.logaddexp(p_corpus, weighted)
                # A term absent from the document has zero expected count,
                # even when its corpus probability is zero as well (which
                # would otherwise give -inf - -inf = NaN).
                E[absent] = -np.inf

                # M-step
                new_p_term = E - logsum(E)

                # Compare with the previous *unweighted* estimate. Entries
                # that are -inf on both sides give NaN and carry no
                # information. The comparison is signed on purpose: terms
                # explained by the corpus model keep losing probability,
                # so only increases indicate that EM is still moving.
                diff = new_p_term - p_term
                p_term = new_p_term
                if (diff[~np.isnan(diff)] < eps).all():
                    logger.info('EM: convergence reached after %d iterations',
                                i)
                    break

        return p_term
8 changes: 4 additions & 4 deletions xtas/tasks/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def lsa(docs, k, random_state=None):


@app.task
def lda(docs, k):
def lda(docs, k, random_state=None):
"""Latent Dirichlet allocation topic model.

Uses scikit-learn's TfidfVectorizer and LatentDirichletAllocation.
Expand All @@ -178,7 +178,7 @@ def lda(docs, k):
from sklearn.pipeline import make_pipeline

vect = _vectorizer()
lda = LatentDirichletAllocation(n_topics=k)
lda = LatentDirichletAllocation(n_topics=k, random_state=random_state)
pipe = make_pipeline(vect, lda).fit(docs)

vocab = vect.vocabulary_
Expand All @@ -202,7 +202,7 @@ def parsimonious_wordcloud(docs, w=.5, k=10):
Parameters
----------
docs : list
List of documents.
List of documents. Each document should be a list of terms.

w : float
Weight assigned to the document terms when fitting individual models,
Expand All @@ -218,7 +218,7 @@ def parsimonious_wordcloud(docs, w=.5, k=10):
For each document in docs, a top-k list of most probable words and
their log-probabilities.
"""
from weighwords import ParsimoniousLM
from ._weighwords import ParsimoniousLM

model = ParsimoniousLM(docs, w=w)
return [model.top(k, d) for d in docs]
5 changes: 2 additions & 3 deletions xtas/tasks/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@
from __future__ import absolute_import

import json

from six.moves.urllib.parse import urlencode
from six.moves.urllib.request import urlopen
from urllib import urlencode
from urllib2 import urlopen

from cytoolz import identity, pipe
import nltk
Expand Down
19 changes: 19 additions & 0 deletions xtas/tests/test_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numbers

from nose.tools import assert_equal, assert_true

from xtas.tasks.cluster import parsimonious_wordcloud


def test_parsimonious_wordcloud():
    """Check the shape and element types of parsimonious_wordcloud output.

    Three tokenized documents with k=4 must give three result lists of four
    (term, weight) pairs each, where term is a string and weight is a real
    number.
    """
    # basestring only exists on Python 2; fall back to str elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Build a real list rather than using map(): on Python 3 map() returns
    # a one-shot iterator, and the task walks the documents more than once.
    docs = [s.split() for s in ["an apple a day keeps the doctor away",
                                "orange is the new black",
                                "comparing apples and oranges"]]
    lm = parsimonious_wordcloud(docs, k=4)
    assert_equal(3, len(lm))
    for x in lm:
        assert_equal(4, len(x))
        for term, weight in x:
            assert_true(isinstance(term, string_types))
            assert_true(isinstance(weight, numbers.Real))