NLeSC · larsmans · May 6, 2016 · May 6, 2016 · May 6, 2016
diff --git a/requirements.txt b/requirements.txt
@@ -19,4 +19,3 @@ setuptools>=1.3.2
 six>=1.9.0
 tornado
 urllib3
-weighwords
diff --git a/xtas/_downloader.py b/xtas/_downloader.py
@@ -18,10 +18,9 @@
 import os
 import os.path
 from tempfile import NamedTemporaryFile
+from urllib import urlretrieve
 from zipfile import ZipFile
 
-from six.moves.urllib.request import urlretrieve
-
 
 logger = logging.getLogger(__name__)
 

diff --git a/xtas/tasks/_emotion.py b/xtas/tasks/_emotion.py
@@ -18,8 +18,7 @@
 import os.path
 from shutil import copyfileobj, move
 from tempfile import NamedTemporaryFile
-
-from six.moves.urllib.request import urlopen
+from urllib2 import urlopen
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.grid_search import GridSearchCV

diff --git a/xtas/tasks/_nl_conll_ner.py b/xtas/tasks/_nl_conll_ner.py
@@ -16,12 +16,12 @@
 # Contributed by Daan Odijk (UvA), based on an example in the seqlearn
 # package (https://github.com/larsmans/seqlearn).
 
+from urllib2 import urlopen
+
 from seqlearn.datasets import load_conll
 from seqlearn.perceptron import StructuredPerceptron
 from sklearn.feature_extraction import FeatureHasher
 
-from six.moves.urllib.request import urlopen
-
 
 _BASE_URL = 'http://www.cnts.ua.ac.be/conll2002/ner/data/ned.'
 

diff --git a/xtas/tasks/_polarity.py b/xtas/tasks/_polarity.py
@@ -17,8 +17,7 @@
 from pprint import pprint
 import tarfile
 from tempfile import NamedTemporaryFile
-
-from six.moves.urllib.request import urlretrieve
+from urllib import urlretrieve
 
 from sklearn.datasets import load_files
 from sklearn.externals.joblib import dump, load

diff --git a/xtas/tasks/_weighwords/__init__.py b/xtas/tasks/_weighwords/__init__.py
@@ -0,0 +1 @@
+from .parsimonious import ParsimoniousLM
diff --git a/xtas/tasks/_weighwords/logsum.py b/xtas/tasks/_weighwords/logsum.py
@@ -0,0 +1,46 @@
+# Safe addition in log-space, taken from scikit-learn.
+#
+# Authors: G. Varoquaux, A. Gramfort, A. Passos, O. Grisel
+# License: BSD
+
+import numpy as np
+
+
+def logsum(x):
+    """Computes the sum of x assuming x is in the log domain.
+
+    Returns log(sum(exp(x))) along the first axis while minimizing the
+    possibility of over/underflow.
+
+    Parameters
+    ----------
+    x : array-like
+        Values in the log domain. The reduction runs over axis 0.
+
+    Returns
+    -------
+    out : float or ndarray
+        log(sum(exp(x), axis=0)). If every entry along the axis is -inf
+        the result is -inf (the log of a zero sum) instead of NaN.
+
+    Examples
+    ========
+
+    >>> import numpy as np
+    >>> a = np.arange(10)
+    >>> np.log(np.sum(np.exp(a)))
+    9.4586297444267107
+    >>> logsum(a)
+    9.4586297444267107
+    """
+    x = np.asarray(x)
+    # Shift by the maximum before exponentiating so the largest term is
+    # exp(0) = 1; this keeps the sum within floating-point range.
+    vmax = x.max(axis=0)
+    # A non-finite maximum would turn x - vmax into NaN (inf - inf), so
+    # fall back to no shift in that case.
+    shift = np.where(np.isfinite(vmax), vmax, 0)
+    with np.errstate(divide='ignore'):
+        # log(0) = -inf is the intended answer when all terms are -inf.
+        out = np.log(np.sum(np.exp(x - shift), axis=0))
+    return out + shift
diff --git a/xtas/tasks/_weighwords/parsimonious.py b/xtas/tasks/_weighwords/parsimonious.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+
+# Copyright 2011-2013 University of Amsterdam
+# Author: Lars Buitinck
+
+from collections import defaultdict
+from heapq import nlargest
+import logging
+import numpy as np
+
+from .logsum import logsum
+
+
+logger = logging.getLogger(__name__)
+
+
+class ParsimoniousLM(object):
+    """Language model for a set of documents.
+
+    Constructing an object of this class fits a background model. The top
+    method can then be used to fit document-specific models, also for unseen
+    documents (with the same vocabulary as the background corpus).
+
+    Parameters
+    ----------
+    documents : iterable over iterable over terms
+    w : float
+        Weight of document model (1 - weight of corpus model)
+    thresh : int
+        Don't include words that occur < thresh times
+
+    Attributes
+    ----------
+    vocab : dict of term -> int
+        Mapping of terms to numeric indices
+    p_corpus : array of float
+        Log prob of terms
+    """
+
+    def __init__(self, documents, w, thresh=0):
+        logger.info('Building corpus model')
+
+        self.w = w
+        self.vocab = vocab = {}     # Vocabulary: maps terms to numeric indices
+        count = defaultdict(int)    # Corpus frequency, keyed by term index
+
+        for d in documents:
+            for tok in d:
+                i = vocab.setdefault(tok, len(vocab))
+                count[i] += 1
+
+        # Builtin float rather than the np.float alias, which newer NumPy
+        # releases no longer provide.
+        cf = np.zeros(len(vocab), dtype=float)
+        for i, f in count.items():
+            cf[i] = f
+        # Rare terms stay in the vocabulary but get zero corpus mass.
+        cf[cf < thresh] = 0
+
+        # lg P(t|C); log(0) = -inf is expected for the zeroed-out terms.
+        with np.errstate(divide='ignore'):
+            self.p_corpus = np.log(cf) - np.log(np.sum(cf))
+
+    def top(self, k, d, max_iter=50, eps=1e-5, w=None):
+        '''Get the top k terms of a document d and their log probabilities.
+
+        Uses the Expectation Maximization (EM) algorithm to estimate term
+        probabilities.
+
+        Parameters
+        ----------
+        k : int
+            Number of terms to return.
+        d : iterable over terms
+            Document to model; every term must be in the vocabulary.
+        max_iter : int, optional
+            Maximum number of iterations of EM algorithm to run.
+        eps : float, optional
+            Convergence threshold for EM algorithm.
+        w : float, optional
+            Weight of document model; overrides value given to __init__
+
+        Returns
+        -------
+        t_p : list of (str, float)
+            The k most probable terms, most probable first.
+        '''
+
+        tf, p_term = self._document_model(d)
+        p_term = self._EM(tf, p_term, w, max_iter, eps)
+
+        terms = [(t, p_term[i]) for t, i in self.vocab.items()]
+        return nlargest(k, terms, key=lambda tp: tp[1])
+
+    def _document_model(self, d):
+        '''Build document model.
+
+        Parameters
+        ----------
+        d : iterable over terms
+            Every term must be in the vocabulary.
+
+        Returns
+        -------
+        tf : array of float
+            Term frequencies
+        p_term : array of float
+            Term log probabilities
+
+        Initial p_term is log(1/n_distinct) for terms with non-zero tf,
+        -inf (probability 0) for terms with 0 tf.
+        '''
+
+        logger.info('Gathering term probabilities')
+
+        tf = np.zeros(len(self.vocab), dtype=float)  # Term frequency
+
+        for tok in d:
+            tf[self.vocab[tok]] += 1
+
+        present = tf > 0
+        n_distinct = present.sum()
+
+        # Cast the mask to float before taking the log: np.log of a boolean
+        # array yields float16, which can drag the initial probabilities
+        # down to half precision.
+        with np.errstate(divide='ignore'):
+            p_term = np.log(present.astype(float)) - np.log(n_distinct)
+
+        return tf, p_term
+
+    def _EM(self, tf, p_term, w, max_iter, eps):
+        '''Expectation maximization.
+
+        Parameters
+        ----------
+        tf : array of float
+            Term frequencies, as returned by _document_model
+        p_term : array of float
+            Term log probabilities, as returned by _document_model
+        w : float or None
+            Weight of document model; None means use self.w
+        max_iter : int
+            Maximum number of iterations to run.
+        eps : float
+            Stop as soon as no term's log probability rises by eps or more
+            within one iteration.
+
+        Returns
+        -------
+        p_term : array of float
+            A posteriori term log probabilities.
+        '''
+
+        logger.info('EM with max_iter=%d, eps=%g', max_iter, eps)
+
+        if w is None:
+            w = self.w
+        w_ = np.log(1 - w)
+        w = np.log(w)
+
+        p_corpus = self.p_corpus + w_
+        p_term = np.asarray(p_term)
+
+        # log(0) = -inf is expected for terms absent from the document.
+        with np.errstate(divide='ignore'):
+            tf = np.log(tf)
+
+            for i in range(1, max_iter + 1):
+                # E-step. Weighting out of place leaves the caller's array
+                # untouched and keeps the unweighted estimate for the
+                # convergence check.
+                weighted = p_term + w
+                E = tf + weighted - np.logaddexp(p_corpus, weighted)
+
+                # M-step
+                new_p_term = E - logsum(E)
+
+                # Compare with the unweighted previous estimate: the
+                # weighted one is offset by log(w), which for typical w
+                # keeps the difference from ever dropping below eps.
+                # Terms at -inf before and after give NaN and count as
+                # unchanged.
+                with np.errstate(invalid='ignore'):
+                    diff = new_p_term - p_term
+                    converged = (np.isnan(diff) | (diff < eps)).all()
+
+                p_term = new_p_term
+                if converged:
+                    logger.info('EM: convergence reached after %d iterations',
+                                i)
+                    break
+
+        return p_term
diff --git a/xtas/tasks/cluster.py b/xtas/tasks/cluster.py
@@ -164,7 +164,7 @@ def lsa(docs, k, random_state=None):
 
 
 @app.task
-def lda(docs, k):
+def lda(docs, k, random_state=None):
     """Latent Dirichlet allocation topic model.
 
     Uses scikit-learn's TfidfVectorizer and LatentDirichletAllocation.
@@ -178,7 +178,7 @@ def lda(docs, k):
     from sklearn.pipeline import make_pipeline
 
     vect = _vectorizer()
-    lda = LatentDirichletAllocation(n_topics=k)
+    lda = LatentDirichletAllocation(n_topics=k, random_state=random_state)
     pipe = make_pipeline(vect, lda).fit(docs)
 
     vocab = vect.vocabulary_
@@ -202,7 +202,7 @@ def parsimonious_wordcloud(docs, w=.5, k=10):
     Parameters
     ----------
     docs : list
-        List of documents.
+        List of documents. Each document should be a list of terms.
 
     w : float
         Weight assigned to the document terms when fitting individual models,
@@ -218,7 +218,7 @@ def parsimonious_wordcloud(docs, w=.5, k=10):
         For each document in docs, a top-k list of most probable words and
         their log-probabilities.
     """
-    from weighwords import ParsimoniousLM
+    from ._weighwords import ParsimoniousLM
 
     model = ParsimoniousLM(docs, w=w)
     return [model.top(k, d) for d in docs]
diff --git a/xtas/tasks/single.py b/xtas/tasks/single.py
@@ -24,9 +24,8 @@
 from __future__ import absolute_import
 
 import json
-
-from six.moves.urllib.parse import urlencode
-from six.moves.urllib.request import urlopen
+from urllib import urlencode
+from urllib2 import urlopen
 
 from cytoolz import identity, pipe
 import nltk

diff --git a/xtas/tests/test_cluster.py b/xtas/tests/test_cluster.py
@@ -0,0 +1,22 @@
+import numbers
+
+from nose.tools import assert_equal, assert_true
+
+from xtas.tasks.cluster import parsimonious_wordcloud
+
+
+def test_parsimonious_wordcloud():
+    # The task takes pre-tokenized documents, so split each sentence into
+    # a list of terms first.
+    sentences = ["an apple a day keeps the doctor away",
+                 "orange is the new black",
+                 "comparing apples and oranges"]
+    docs = [sentence.split() for sentence in sentences]
+    clouds = parsimonious_wordcloud(docs, k=4)
+    # One word cloud per input document, each holding k (term, weight) pairs.
+    assert_equal(3, len(clouds))
+    for cloud in clouds:
+        assert_equal(4, len(cloud))
+        for term, weight in cloud:
+            assert_true(isinstance(term, basestring))
+            assert_true(isinstance(weight, numbers.Real))
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,4 +19,3 @@ setuptools>=1.3.2 @@
     six>=1.9.0
     tornado
     urllib3
-    weighwords