diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 60c22a4845..61987ab8b5 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,6 +1,7 @@ Changes ======= 0.13.0, 2016 +* Added Distance Metrics to matutils.py (@bhargavvader, #656) * Tutorials migrated from website to ipynb (@j9chan, #721), (@jesford, #733), (@jesford, #725), (@jesford, #716) * New doc2vec intro tutorial (@seanlaw, #730) * Gensim Quick Start Tutorial (@andrewjlm, #727) diff --git a/README.rst b/README.rst index bba6830b6c..a6480fd848 100644 --- a/README.rst +++ b/README.rst @@ -3,13 +3,9 @@ gensim -- Topic Modelling in Python ============================================== |Travis|_ -|Downloads|_ |Wheel|_ -|License|_ .. |Travis| image:: https://img.shields.io/travis/piskvorky/gensim/develop.svg -.. |Downloads| image:: https://img.shields.io/pypi/dm/gensim.svg -.. |License| image:: https://img.shields.io/pypi/l/gensim.svg .. |Wheel| image:: https://img.shields.io/pypi/wheel/gensim.svg .. _Travis: https://travis-ci.org/piskvorky/gensim diff --git a/docs/notebooks/similarity_metrics.ipynb b/docs/notebooks/similarity_metrics.ipynb new file mode 100644 index 0000000000..1840ab3bcc --- /dev/null +++ b/docs/notebooks/similarity_metrics.ipynb @@ -0,0 +1,633 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## New Distance Metrics for Probability Distributions and Bags of Words " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A small tutorial to illustrate the new distance functions.\n", + "\n", + "These are needed mostly when comparing how similar two probability distributions are; in gensim, that usually means LSI or LDA topic distributions obtained from a trained LDA model.\n", + "\n", + "Gensim already has functionality for this, in the sense of retrieving the most similar documents - [this](http://radimrehurek.com/topic_modeling_tutorial/3%20-%20Indexing%20and%20Retrieval.html), [this](https://radimrehurek.com/gensim/tut3.html) and [this](https://radimrehurek.com/gensim/similarities/docsim.html) are examples of such documentation and tutorials.\n", + "\n", + "This tutorial shows a building block of those larger methods: a small suite of distance metrics.\n", + "We'll start by setting up a small corpus and showing off the methods."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.word2vec:Slow version of gensim.models.word2vec is being used\n", + "WARNING:gensim.models.doc2vec:Slow version of gensim.models.doc2vec is being used\n" + ] + } + ], + "source": [ + "from gensim.corpora import Dictionary\n", + "from gensim.models import ldamodel\n", + "from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full\n", + "import numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# you can use any corpus; this one is just illustrative\n", + "\n", + "texts = [['bank','river','shore','water'],\n", + " ['river','water','flow','fast','tree'],\n", + " ['bank','water','fall','flow'],\n", + " ['bank','bank','water','rain','river'],\n", + " ['river','water','mud','tree'],\n", + " ['money','transaction','bank','finance'],\n", + " ['bank','borrow','money'], \n", + " ['bank','finance'],\n", + " ['finance','money','sell','bank'],\n", + " ['borrow','sell'],\n", + " ['bank','loan','sell']]\n", + "\n", + "dictionary = Dictionary(texts)\n", + "corpus = [dictionary.doc2bow(text) for text in texts]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" + ] + }, + { + "data": { + "text/plain": [ + "[(0,\n", + " u'0.164*bank + 0.142*water + 0.108*river + 0.076*flow + 0.067*borrow + 0.063*sell + 0.060*tree + 0.048*money + 0.046*fast + 0.044*rain'),\n", + " (1,\n", + " u'0.196*bank + 0.120*finance + 0.100*money + 0.082*sell + 0.067*river + 0.065*water + 0.056*transaction + 0.049*loan + 0.046*tree + 0.040*mud')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numpy.random.seed(1) # setting random seed to get the same results each time.\n", + "model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)\n", + "\n", + "model.show_topics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a few sample documents and get them ready to test similarity. Let's call the first topic the water topic and the second topic the finance topic.\n", + "\n", + "Note: these are all distance metrics, so values closer to 0 indicate a smaller 'distance' and therefore a larger similarity. Hellinger and Jaccard return values in the range [0,1]; Kullback–Leibler divergence is unbounded above." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "doc_water = ['river', 'water', 'shore']\n", + "doc_finance = ['finance', 'money', 'sell']\n", + "doc_bank = ['finance', 'bank', 'tree', 'water']\n", + "\n", + "# now let's make these into a bag of words format\n", + "\n", + "bow_water = model.id2word.doc2bow(doc_water) \n", + "bow_finance = model.id2word.doc2bow(doc_finance) \n", + "bow_bank = model.id2word.doc2bow(doc_bank) \n", + "\n", + "# we can now get the LDA topic distributions for these\n", + "lda_bow_water = model[bow_water]\n", + "lda_bow_finance = model[bow_finance]\n", + "lda_bow_bank = model[bow_bank]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hellinger and Kullback–Leibler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're now ready to apply our distance metrics.\n", + "\n", + "Let's start with the popular Hellinger distance. \n", + "The Hellinger distance metric gives an output in the range [0,1] for two probability distributions, with values closer to 0 meaning they are more similar." + ] + },
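+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Concretely, for two discrete probability distributions $p$ and $q$ over the same $n$ features, the Hellinger distance computed here is\n", + "\n", + "$$H(p, q) = \\sqrt{\\frac{1}{2} \\sum_{i=1}^{n} (\\sqrt{p_i} - \\sqrt{q_i})^2}$$\n", + "\n", + "This matches the dense-vector branch of the new `hellinger` implementation in `matutils.py`; the bag of words branch evaluates the same sum using dictionaries of (id, value) pairs." + ] + },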
+ { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.51251199778753564" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hellinger(lda_bow_water, lda_bow_finance)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2340730527221043" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hellinger(lda_bow_finance, lda_bow_bank)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Makes sense, right? In the first example, the water and finance documents are hardly similar, so we get a value of roughly 0.5. \n", + "\n", + "In the second case, the documents are semantically a lot more similar, so the trained model gives them a much smaller distance value." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run through similar examples with Kullback–Leibler." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.30823547" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kullback_leibler(lda_bow_water, lda_bow_bank)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.19881117" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kullback_leibler(lda_bow_finance, lda_bow_bank)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `bank` document is a combination of both water and finance related terms, but as `bank` in this context is more likely to belong to the finance topic, the distance value between the finance and bank bags of words is smaller." + ] + },
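+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For reference, `kullback_leibler` delegates to `scipy.stats.entropy`, which for two discrete distributions $p$ and $q$ computes\n", + "\n", + "$$D_{KL}(p \\| q) = \\sum_{i} p_i \\log \\frac{p_i}{q_i}$$\n", + "\n", + "Unlike Hellinger, this quantity is not symmetric: $D_{KL}(p \\| q)$ and $D_{KL}(q \\| p)$ generally differ, so the order of the arguments matters." + ] + },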
+ { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, 0.44146764073708356), (1, 0.55853235926291644)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# just to confirm our suspicion that the bank bow has more to do with finance:\n", + "\n", + "model.get_document_topics(bow_bank)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's evident that while it isn't too skewed, it is more towards the finance topic." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Distance metrics (also referred to as similarity metrics) are, as the examples above suggest, mainly for probability distributions, but the methods can accept several input formats. You can do some further reading on [Kullback Leibler](https://en.wikipedia.org/wiki/Kullback–Leibler_divergence) and [Hellinger](https://en.wikipedia.org/wiki/Hellinger_distance) to figure out what suits your needs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Jaccard " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us now look at the [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) metric for similarity between bags of words (i.e., documents)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8571428571428572" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard(bow_water, bow_bank)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8333333333333334" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard(doc_water, doc_bank)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard(['word'], ['word'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The three examples above feature two different input formats. \n", + "\n", + "In the first case, we pass jaccard document vectors that are already in bag of words format. The distance is defined as 1 minus the size of the intersection divided by the size of the union of the vectors. \n", + "\n", + "We can see, on manual inspection as well, that the distance is likely to be high, and it is. \n", + "\n", + "The last two examples illustrate jaccard's ability to accept even lists (i.e., documents) as inputs.\n", + "In the last case, because the two vectors are identical, the value returned is 0: the distance is 0 and the documents are maximally similar. " + ] + },
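+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a sketch of what the bag of words branch of `jaccard` computes: the intersection sums the minimum weight for each id shared by the two bags, and the 'union' is the total weight of both bags, so\n", + "\n", + "$$J(a, b) = 1 - \\frac{\\sum_i \\min(a_i, b_i)}{\\sum_i a_i + \\sum_i b_i}$$\n", + "\n", + "For plain lists and dense vectors, the classical set-based intersection-over-union definition is used instead." + ] + },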
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distance Metrics for Topic Distributions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While there are already standard methods to identify similarity of documents, our distance metrics have one more interesting use case: topic distributions. \n", + "\n", + "Let's say we want to find out how similar our two topics are, water and finance." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3, 0.196),\n", + " (12, 0.12),\n", + " (10, 0.1),\n", + " (14, 0.082),\n", + " (2, 0.067),\n", + " (0, 0.065),\n", + " (11, 0.056),\n", + " (15, 0.049),\n", + " (5, 0.046),\n", + " (9, 0.04)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topic_water, topic_finance = model.show_topics()\n", + "\n", + "# some pre processing to get the topics in a format acceptable to our distance metrics\n", + "\n", + "def make_topics_bow(topic):\n", + " # takes the string returned by model.show_topics()\n", + " # split on '+' to get the words and their probabilities\n", + " topic = topic.split('+')\n", + " # list to store topic bows\n", + " topic_bow = []\n", + " for word in topic:\n", + " # split probability and word\n", + " prob, word = word.split('*')\n", + " # get rid of spaces\n", + " word = word.replace(\" \",\"\")\n", + " # convert to word_type\n", + " word = model.id2word.doc2bow([word])[0][0]\n", + " topic_bow.append((word, float(prob)))\n", + " return topic_bow\n", + "\n", + "finance_distribution = make_topics_bow(topic_finance[1])\n", + "water_distribution = make_topics_bow(topic_water[1])\n", + "\n", + "# the finance topic in bag of words format looks like this:\n", + "finance_distribution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we've got our topics in a format acceptable to our functions, let's use a distance metric to see how similar the word distributions in the topics are." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.36453028040240248" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hellinger(water_distribution, finance_distribution)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our value of roughly 0.36 means that the topics are not TOO distant with respect to their word distributions.\n", + "This makes sense again, because of overlapping words like `bank` and the small dictionary." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Some things to take care of " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our previous example we didn't use Kullback–Leibler to test for similarity, for a reason: KL is not a distance 'metric' in the technical sense (you can see what a metric is [here](https://en.wikipedia.org/wiki/Metric_(mathematics))). Its mathematical nature also means we must be a little careful before using it: since it involves the log function, a zero can mess things up. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "inf" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 16 here is the number of features the probability distribution draws from\n", + "kullback_leibler(water_distribution, finance_distribution, 16) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That wasn't very helpful, right? This just means that we have to be a bit careful about our inputs. Our old example didn't work out because there were missing values for some words (because `show_topics()` only returned the top 10 words for each topic). \n", + "\n", + "This can be remedied, though." + ] + },
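+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To see the problem in isolation, here is a minimal illustration (an aside that calls `scipy.stats.entropy` directly, which is what `kullback_leibler` uses under the hood). Whenever the second distribution assigns zero probability to an event that the first does not, the log ratio blows up:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from scipy.stats import entropy\n", + "\n", + "# q gives the second event probability 0 while p does not,\n", + "# so the divergence is infinite\n", + "entropy([0.5, 0.5], [1.0, 0.0])" + ] + },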
+ { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.19781515" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# return ALL the words in the dictionary for the topic-word distribution.\n", + "topic_water, topic_finance = model.show_topics(num_words=len(model.id2word))\n", + "\n", + "# do our bag of words transformation again\n", + "finance_distribution = make_topics_bow(topic_finance[1])\n", + "water_distribution = make_topics_bow(topic_water[1])\n", + "\n", + "# and voila!\n", + "kullback_leibler(water_distribution, finance_distribution)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may notice that this distance is quite small, indicating a high similarity. This may be a bit off because of the small size of the corpus, where all topics are likely to contain a decent overlap of word probabilities. You will likely get a better value for a bigger corpus.\n", + "\n", + "So, just remember: if you intend to use KL as a metric to measure similarity or distance between two distributions, avoid zeros by returning the ENTIRE distribution. Since it's unlikely any probability distribution will ever have absolute zeros for any feature/word, returning all the values like we did will make you good to go." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That brings us to the end of this small tutorial.\n", + "The scope for adding new similarity metrics is large, as there exists an even larger suite of metrics and methods that could be added to the matutils.py file. ([This](http://nzcsrsc08.canterbury.ac.nz/site/proceedings/Individual_Papers/pg049_Similarity_Measures_for_Text_Document_Clustering.pdf) is one paper which discusses some of them.)\n", + "\n", + "Looking forward to more PRs towards this functionality in Gensim! :)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/src/conf.py b/docs/src/conf.py index 42fa3c71a0..5a9ad2b635 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -52,9 +52,9 @@ # built documents. # # The short X.Y version. -version = '0.13.0rc1' +version = '0.13.0' # The full version, including alpha/beta/rc tags. -release = '0.13.0rc1' +release = '0.13.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages.
diff --git a/gensim/matutils.py b/gensim/matutils.py index 02e5e2cc18..6045c28b14 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -18,6 +18,7 @@ import numpy import scipy.sparse +from scipy.stats import entropy import scipy.linalg from scipy.linalg.lapack import get_lapack_funcs @@ -395,6 +396,112 @@ def cossim(vec1, vec2): return result + +def isbow(vec): + """ + Checks if the vector passed is in bag of words representation or not. + Vec is considered to be in bag of words format if its entries are 2-tuples of the form (id, value). + """ + if scipy.sparse.issparse(vec): + vec = vec.todense().tolist() + try: + id_, val_ = vec[0] # checking the first value to see if the vector is in bag of words format, by unpacking + id_, val_ = int(id_), float(val_) + except IndexError: + return True # this is to handle the empty input case + except Exception: + return False + return True + + +def kullback_leibler(vec1, vec2, num_features=None): + """ + A distance metric between two probability distributions. + Returns a non-negative distance value, where values closer to 0 mean less distance (and higher similarity). + Note that unlike hellinger, the result is neither symmetric nor bounded above by 1. + Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value. + If the distributions draw from a certain number of features, that number should be passed as num_features.
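+ + Example (vectors and the expected value are taken from the accompanying unit tests):: + + >>> vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)] + >>> vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)] + >>> kullback_leibler(vec_2, vec_1, 8) # ~0.55451775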
+ """ + if scipy.sparse.issparse(vec1): + vec1 = vec1.toarray() + if scipy.sparse.issparse(vec2): + vec2 = vec2.toarray() # convert the vectors to dense in case they were sparse matrices + if isbow(vec1) and isbow(vec2): # if they are in bag of words format, we make them dense + if num_features is not None: # if num_features is given, make the dense vectors that large + dense1 = sparse2full(vec1, num_features) + dense2 = sparse2full(vec2, num_features) + return entropy(dense1, dense2) + else: + max_len = max(len(vec1), len(vec2)) + dense1 = sparse2full(vec1, max_len) + dense2 = sparse2full(vec2, max_len) + return entropy(dense1, dense2) + else: + # this conversion is made because if it is not in bow format, it might be a list within a list after conversion; + # the scipy implementation of Kullback-Leibler fails in such a case, so we pick out only the nested list. + if len(vec1) == 1: + vec1 = vec1[0] + if len(vec2) == 1: + vec2 = vec2[0] + return entropy(vec1, vec2) + + +def hellinger(vec1, vec2): + """ + Hellinger distance is a distance metric used to quantify the similarity between two probability distributions. + The distance between distributions is a number in the range [0, 1], where 0 is minimum distance (maximum similarity) and 1 is maximum distance (minimum similarity). + """ + if scipy.sparse.issparse(vec1): + vec1 = vec1.toarray() + if scipy.sparse.issparse(vec2): + vec2 = vec2.toarray() + if isbow(vec1) and isbow(vec2): + # if it is in bag of words format, instead of converting to dense we use dictionaries to calculate the appropriate distance + vec1, vec2 = dict(vec1), dict(vec2) + if len(vec2) < len(vec1): + vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector + sim = numpy.sqrt(0.5 * sum((numpy.sqrt(value) - numpy.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + return sim + else: + sim = numpy.sqrt(0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum()) + return sim + + +def jaccard(vec1, vec2): + """ + A distance metric between two bags of words representations. + Returns 1 minus the intersection divided by the union, where the union is the sum of the total weights of the two bags. + If the inputs are not in bag of words representation, the union and intersection are calculated in the traditional set-based manner. + Returns a value in the range [0, 1], where values closer to 0 mean less distance and thus higher similarity. + + """ + + # converting from sparse for easier manipulation + if scipy.sparse.issparse(vec1): + vec1 = vec1.toarray() + if scipy.sparse.issparse(vec2): + vec2 = vec2.toarray() + if isbow(vec1) and isbow(vec2): + # if it's in bow format, we use the following definitions: + # union = sum of the 'weights' of both the bags + # intersection = lowest weight for a particular id; basically the number of common words or items + union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2) + vec1, vec2 = dict(vec1), dict(vec2) + intersection = 0.0 + for feature_id, feature_weight in iteritems(vec1): + intersection += min(feature_weight, vec2.get(feature_id, 0.0)) + return 1 - float(intersection) / float(union) + else: + # if it isn't in bag of words format, we can use sets to calculate intersection and union + if isinstance(vec1, numpy.ndarray): + vec1 = vec1.tolist() + if isinstance(vec2, numpy.ndarray): + vec2 = vec2.tolist() + vec1 = set(vec1) + vec2 = set(vec2) + intersection = vec1 & vec2 + union = vec1 | vec2 + return 1 - float(len(intersection)) / float(len(union)) + + def qr_destroy(la): """ Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process. diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 21e0eebd2b..af8702be50 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -293,19 +293,25 @@ def testTermTopics(self): numpy.random.seed(0) model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) - # check with id + # check with word_type result = model.get_term_topics(2) - expected = [(1, 0.1066)] - # FIXME: fails on win and osx - # self.assertEqual(result[0][0], expected[0][0]) - # self.assertAlmostEqual(result[0][1], expected[0][1], places=2) + for topic_no, probability in result: + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(probability, float)) + + # checks that topic '1' appears in the first result tuple + self.assertTrue(1 in result[0]) + # if the user has entered a word instead, check with the word result = model.get_term_topics(str(model.id2word[2])) - expected = [(1, 0.1066)] - # FIXME: fails on win and osx - # self.assertEqual(result[0][0], expected[0][0]) - # self.assertAlmostEqual(result[0][1], expected[0][1], places=2) + for topic_no, probability in result: + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(probability, float)) + + # checks that topic '1' appears in the first result tuple + self.assertTrue(1 in result[0]) + def testPasses(self): # long message includes the original error message with a custom one diff --git a/gensim/test/test_similarity_metrics.py b/gensim/test/test_similarity_metrics.py new file mode 100644 index 0000000000..0d7290472b --- /dev/null +++ b/gensim/test/test_similarity_metrics.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests to check the similarity functions and the isbow function.
+ +""" + + +import logging +import unittest + +from gensim import matutils +from scipy.sparse import csr_matrix +import numpy +import math +import os +from gensim.corpora import mmcorpus, Dictionary +from gensim.models import ldamodel + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + +# set up vars used in testing ("Deerwester" from the web tutorial) +texts = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] +dictionary = Dictionary(texts) +corpus = [dictionary.doc2bow(text) for text in texts] + + + +class TestIsBow(unittest.TestCase): + def test_None(self): + # test None + result = matutils.isbow(None) + expected = False + self.assertEqual(expected, result) + + def test_bow(self): + # test list words + + # one bag of words + potentialbow = [(0, 0.4)] + result = matutils.isbow(potentialbow) + expected = True + self.assertEqual(expected, result) + + # multiple bags + potentialbow = [(0, 4.), (1, 2.), (2, 5.), (3, 8.)] + result = matutils.isbow(potentialbow) + expected = True + self.assertEqual(expected, result) + + # checking empty input + potentialbow = [] + result = matutils.isbow(potentialbow) + expected = True + self.assertEqual(expected, result) + + # checking corpus; should return false + potentialbow = [[(2, 1), (3, 1), (4, 1), (5, 1), (1, 1), (7, 1)]] + result = matutils.isbow(potentialbow) + expected = False + self.assertEqual(expected, result) + + # not a bag of words, should return false + potentialbow = [(1, 3, 6)] + result = matutils.isbow(potentialbow) + expected = False + self.assertEqual(expected, result) + + # checking sparse matrix format bag of words + potentialbow = csr_matrix([[1, 0.4], [0, 0.3], [2, 0.1]]) + result = matutils.isbow(potentialbow) + expected = True + self.assertEqual(expected, result) + + # checking numpy array format bag of words + potentialbow = numpy.array([[1, 0.4], [0, 0.2],[2, 0.2]]) + result = matutils.isbow(potentialbow) + expected = True + self.assertEqual(expected, result) + +class TestHellinger(unittest.TestCase): + def setUp(self): + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.class_ = ldamodel.LdaModel + self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + + def test_inputs(self): + + # checking empty inputs + vec_1 = [] + vec_2 = [] + result = matutils.hellinger(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + # checking numpy array and list input + vec_1 = numpy.array([]) + vec_2 = [] + result = matutils.hellinger(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + # checking scipy csr matrix and list input + vec_1 = csr_matrix([]) + vec_2 = [] + result = matutils.hellinger(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + def test_distributions(self): + + # checking bag of words as inputs + vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)] + vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)] + result = matutils.hellinger(vec_1, vec_2) + expected = 0.185241936534 + self.assertAlmostEqual(expected, result) + + + # checking ndarray, csr_matrix as inputs + vec_1 = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]]) + vec_2 = csr_matrix([[1, 
0.4], [0, 0.2], [2, 0.2]]) + result = matutils.hellinger(vec_1, vec_2) + expected = 0.160618030536 + self.assertAlmostEqual(expected, result) + + # checking ndarray, list as inputs + vec_1 = numpy.array([0.6, 0.1, 0.1, 0.2]) + vec_2 = [0.2, 0.2, 0.1, 0.5] + result = matutils.hellinger(vec_1, vec_2) + expected = 0.309742984153 + self.assertAlmostEqual(expected, result) + + # testing LDA distribution vectors + numpy.random.seed(0) + model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) + lda_vec1 = model[[(1, 2), (2, 3)]] + lda_vec2 = model[[(2, 2), (1, 3)]] + result = matutils.hellinger(lda_vec1, lda_vec2) + expected = 1.0406845281146034e-06 + self.assertAlmostEqual(expected, result) + +class TestKL(unittest.TestCase): + def setUp(self): + self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.class_ = ldamodel.LdaModel + self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + + def test_inputs(self): + + # checking empty inputs + vec_1 = [] + vec_2 = [] + result = matutils.kullback_leibler(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + # checking numpy array and list input + vec_1 = numpy.array([]) + vec_2 = [] + result = matutils.kullback_leibler(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + # checking scipy csr matrix and list input + vec_1 = csr_matrix([]) + vec_2 = [] + result = matutils.kullback_leibler(vec_1, vec_2) + expected = 0.0 + self.assertEqual(expected, result) + + def test_distributions(self): + + # checking bag of words as inputs + vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)] + vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)] + result = matutils.kullback_leibler(vec_2, vec_1, 8) + expected = 0.55451775 + self.assertAlmostEqual(expected, result) + + # KL is not symmetric; comparing vec_1 with vec_2 involves the log of zeros and returns infinity + vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)] + vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)] + result = matutils.kullback_leibler(vec_1, vec_2, 8) + self.assertTrue(math.isinf(result)) + + # checking ndarray, csr_matrix as inputs + vec_1 = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]]) + vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]]) + result = matutils.kullback_leibler(vec_1, vec_2, 3) + expected = 0.0894502 + self.assertAlmostEqual(expected, result) + + # checking ndarray, list as inputs + vec_1 = numpy.array([0.6, 0.1, 0.1, 0.2]) + vec_2 = [0.2, 0.2, 0.1, 0.5] + result = matutils.kullback_leibler(vec_1, vec_2) + expected = 0.40659450877 + self.assertAlmostEqual(expected, result) + + # testing LDA distribution vectors + numpy.random.seed(0) + model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) + lda_vec1 = model[[(1, 2), (2, 3)]] + lda_vec2 = model[[(2, 2), (1, 3)]] + result = matutils.kullback_leibler(lda_vec1, lda_vec2) + expected = 4.283407e-12 + self.assertAlmostEqual(expected, result) + +class TestJaccard(unittest.TestCase): + def test_inputs(self): + + # all empty inputs will give a divide by zero exception + vec_1 = [] + vec_2 = [] + self.assertRaises(ZeroDivisionError, matutils.jaccard, vec_1, vec_2) + + def test_distributions(self): + + # checking bag of words as inputs + vec_1 = [(2, 1), (3, 4), (4, 1), (5, 1), (1, 1), (7, 2)] + vec_2 = [(1, 1), (3, 8), (4, 1)] + result = matutils.jaccard(vec_2, vec_1) + expected = 1 - 0.3 + self.assertAlmostEqual(expected, result) + + # checking ndarray, csr_matrix as inputs + vec_1 = numpy.array([[1, 3], [0, 4], [2, 
3]]) + vec_2 = csr_matrix([[1, 4], [0, 2], [2, 2]]) + result = matutils.jaccard(vec_1, vec_2) + expected = 1 - 0.388888888889 + self.assertAlmostEqual(expected, result) + + # checking ndarray, list as inputs + vec_1 = numpy.array([6, 1, 2, 3]) + vec_2 = [4, 3, 2, 5] + result = matutils.jaccard(vec_1, vec_2) + expected = 1 - 0.333333333333 + self.assertAlmostEqual(expected, result) + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() diff --git a/setup.py b/setup.py index f492a949cc..a616bb97d3 100644 --- a/setup.py +++ b/setup.py @@ -123,7 +123,7 @@ def readfile(fname): setup( name='gensim', - version='0.13.0rc1', + version='0.13.0', description='Python framework for fast Vector Space Modelling', long_description=readfile('README.rst'),