diff --git a/requirements.txt b/requirements.txt index 84c2bff..484e589 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,3 @@ setuptools>=1.3.2 six>=1.9.0 tornado urllib3 -weighwords diff --git a/xtas/_downloader.py b/xtas/_downloader.py index 6241881..91d3c7d 100644 --- a/xtas/_downloader.py +++ b/xtas/_downloader.py @@ -18,10 +18,9 @@ import os import os.path from tempfile import NamedTemporaryFile +from urllib import urlretrieve from zipfile import ZipFile -from six.moves.urllib.request import urlretrieve - logger = logging.getLogger(__name__) diff --git a/xtas/tasks/_emotion.py b/xtas/tasks/_emotion.py index a45caac..73c8450 100644 --- a/xtas/tasks/_emotion.py +++ b/xtas/tasks/_emotion.py @@ -18,8 +18,7 @@ import os.path from shutil import copyfileobj, move from tempfile import NamedTemporaryFile - -from six.moves.urllib.request import urlopen +from urllib2 import urlopen from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.grid_search import GridSearchCV diff --git a/xtas/tasks/_nl_conll_ner.py b/xtas/tasks/_nl_conll_ner.py index 7c3eb06..7d610a2 100644 --- a/xtas/tasks/_nl_conll_ner.py +++ b/xtas/tasks/_nl_conll_ner.py @@ -16,12 +16,12 @@ # Contributed by Daan Odijk (UvA), based on an example in the seqlearn # package (https://github.com/larsmans/seqlearn). +from urllib2 import urlopen + from seqlearn.datasets import load_conll from seqlearn.perceptron import StructuredPerceptron from sklearn.feature_extraction import FeatureHasher -from six.moves.urllib.request import urlopen - _BASE_URL = 'http://www.cnts.ua.ac.be/conll2002/ner/data/ned.' 
# Parsimonious language models, vendored from the weighwords package.
#
# Copyright 2011-2013 University of Amsterdam
# Author: Lars Buitinck
#
# logsum: safe addition in log-space, taken from scikit-learn.
# Authors: G. Varoquaux, A. Gramfort, A. Passos, O. Grisel
# License: BSD

from collections import defaultdict
from heapq import nlargest
import logging
from operator import itemgetter

import numpy as np


logger = logging.getLogger(__name__)


def logsum(x):
    """Compute log(sum(exp(x))) along axis 0 without over/underflow.

    Parameters
    ----------
    x : array-like
        Values in the log domain.

    Returns
    -------
    out : float or ndarray
        log(sum(exp(x), axis=0)). If every entry along the axis is -inf
        (i.e. a sum of zeros), the result is -inf rather than nan.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(10)
    >>> round(float(np.log(np.sum(np.exp(a)))), 6)
    9.45863
    >>> round(float(logsum(a)), 6)
    9.45863
    """
    x = np.asarray(x)
    # Normalize by the maximum: with the log this is what accumulates the
    # fewest errors. A non-finite maximum must not be subtracted, since
    # (-inf) - (-inf) is nan; shifting by 0 is always safe in that case.
    vmax = x.max(axis=0)
    vmax = np.where(np.isfinite(vmax), vmax, 0)
    with np.errstate(divide='ignore'):
        out = np.log(np.sum(np.exp(x - vmax), axis=0))
    return out + vmax


class ParsimoniousLM(object):
    """Language model for a set of documents.

    Constructing an object of this class fits a background model. The top
    method can then be used to fit document-specific models, also for unseen
    documents (with the same vocabulary as the background corpus).

    Parameters
    ----------
    documents : iterable over iterable over terms
        The corpus; it is iterated exactly once.
    w : float
        Weight of document model (1 - weight of corpus model)
    thresh : int
        Don't include words that occur < thresh times

    Attributes
    ----------
    vocab : dict of term -> int
        Mapping of terms to numeric indices
    p_corpus : array of float
        Log prob of terms
    """

    def __init__(self, documents, w, thresh=0):
        logger.info('Building corpus model')

        self.w = w
        self.vocab = vocab = {}     # Vocabulary: maps terms to indices
        count = defaultdict(int)    # Corpus frequency, keyed by term index

        for doc in documents:
            for tok in doc:
                count[vocab.setdefault(tok, len(vocab))] += 1

        cf = np.zeros(len(vocab), dtype=np.float64)
        for idx, freq in count.items():
            cf[idx] = freq
        # Terms rarer than the threshold get zero corpus mass.
        cf[cf < thresh] = 0

        # lg P(t|C); zero counts legitimately map to -inf.
        with np.errstate(divide='ignore'):
            self.p_corpus = np.log(cf) - np.log(np.sum(cf))

    def top(self, k, d, max_iter=50, eps=1e-5, w=None):
        """Get the top k terms of a document d and their log probabilities.

        Uses the Expectation Maximization (EM) algorithm to estimate term
        probabilities.

        Parameters
        ----------
        k : int
            Number of terms to return.
        d : iterable of terms
            The document; every term must occur in the vocabulary.
        max_iter : int, optional
            Maximum number of iterations of EM algorithm to run.
        eps : float, optional
            Convergence threshold for EM algorithm.
        w : float, optional
            Weight of document model; overrides value given to __init__

        Returns
        -------
        t_p : list of (str, float)
            At most k (term, log probability) pairs, most probable first.

        Raises
        ------
        KeyError
            If d contains a term that is not in the vocabulary.
        """
        tf, p_term = self._document_model(d)
        p_term = self._EM(tf, p_term, w, max_iter, eps)

        terms = ((t, p_term[i]) for t, i in self.vocab.items())
        return nlargest(k, terms, key=itemgetter(1))

    def _document_model(self, d):
        """Build document model.

        Parameters
        ----------
        d : iterable of terms

        Returns
        -------
        tf : array of float
            Term frequencies
        p_term : array of float
            Term log probabilities

        Initial p_term is log(1/n_distinct) for terms with non-zero tf,
        -inf (log of 0) for terms with 0 tf.

        Raises
        ------
        KeyError
            If d contains a term that is not in the vocabulary.
        """
        logger.info('Gathering term probabilities')

        tf = np.zeros(len(self.vocab), dtype=np.float64)  # Term frequency

        for tok in d:
            tf[self.vocab[tok]] += 1

        present = tf > 0
        n_distinct = present.sum()

        # Build the initial model in float64 explicitly: np.log applied to
        # a boolean mask computes in half precision.
        with np.errstate(divide='ignore'):
            p_term = np.where(present, -np.log(n_distinct), -np.inf)

        return tf, p_term

    def _EM(self, tf, p_term, w, max_iter, eps):
        """Expectation maximization.

        Parameters
        ----------
        tf : array of float
            Term frequencies, as returned by _document_model
        p_term : array of float
            Term log probabilities, as returned by _document_model.
            The array is not modified.
        w : float or None
            Weight of document model; None means use self.w.
        max_iter : int
            Maximum number of iterations to run.
        eps : float
            Stop as soon as no log probability of a term that occurs in
            the document changes by eps or more between iterations.

        Returns
        -------
        p_term : array of float
            A posteriori term log probabilities; -inf for terms that do
            not occur in the document.
        """
        logger.info('EM with max_iter=%d, eps=%g', max_iter, eps)

        if w is None:
            w = self.w

        tf = np.asarray(tf, dtype=np.float64)
        present = tf > 0
        if not present.any():
            # Empty document (or empty vocabulary): nothing to estimate.
            return np.full(tf.shape, -np.inf)

        # Work on a float64 copy so the caller's array stays untouched.
        p_term = np.array(p_term, dtype=np.float64)

        with np.errstate(divide='ignore', invalid='ignore'):
            log_w = np.log(w)
            p_corpus = self.p_corpus + np.log(1 - w)
            log_tf = np.log(tf)

            for i in range(1, max_iter + 1):
                # E-step
                weighted = p_term + log_w
                E = log_tf + weighted - np.logaddexp(p_corpus, weighted)
                # Terms absent from the document have zero expected count.
                # Setting this explicitly avoids (-inf) - (-inf) = nan for
                # terms that also have zero corpus probability.
                E[~present] = -np.inf

                # M-step
                new_p_term = E - logsum(E)

                # Compare with the previous, unweighted estimate, and only
                # over terms in the document (others stay at -inf).
                delta = np.abs(new_p_term[present] - p_term[present])
                p_term = new_p_term
                if (delta < eps).all():
                    logger.info('EM: convergence reached after %d iterations',
                                i)
                    break

        return p_term
import numbers

from nose.tools import assert_equal, assert_true

from xtas.tasks.cluster import parsimonious_wordcloud

try:
    string_types = basestring   # Python 2
except NameError:
    string_types = str          # Python 3


def test_parsimonious_wordcloud():
    """Check the shape and element types of parsimonious_wordcloud output.

    Expects one top-k list per input document, each holding k
    (term, weight) pairs with a string term and a real-valued weight.
    """
    # Build a real list: the task iterates over the documents twice (once
    # to fit the corpus model, once per document), which would exhaust a
    # lazy map object.
    docs = [s.split() for s in ["an apple a day keeps the doctor away",
                                "orange is the new black",
                                "comparing apples and oranges"]]
    lm = parsimonious_wordcloud(docs, k=4)
    assert_equal(3, len(lm))
    for x in lm:
        assert_equal(4, len(x))
        for term, weight in x:
            assert_true(isinstance(term, string_types))
            assert_true(isinstance(weight, numbers.Real))