Skip to content

Commit

Permalink
polishing up after #3169
Browse files Browse the repository at this point in the history
The repo wasn't accepting maintainer commits, so I'm taking care of this
here.
  • Loading branch information
mpenkov committed Jun 29, 2021
1 parent 2b9b1b3 commit 2a41200
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 149 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Changes
* [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar)
* [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0)
* [#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro)
* [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec, by [@M-Demay](https://github.com/M-Demay)

### :books: Documentation

Expand Down
290 changes: 141 additions & 149 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys

import numpy as np
import pytest

from gensim import utils
from gensim.models.word2vec import LineSentence
Expand Down Expand Up @@ -397,156 +398,7 @@ def test_wm_distance(self):
dist = self.test_model.wv.wmdistance(doc, oov_doc)
self.assertNotEqual(float('inf'), dist)

def test_cbow_hs_training(self, shrink_windows=True):
    # Train an FT_gensim model (sg=0, hs=1, negative=0) on the sentences read
    # from 'lee_background.cor', then check two things:
    #   1. the first word vector is no longer identical to its pre-training copy;
    #   2. at least two of the ten nearest neighbours of 'night' appear in the
    #      expected list below.
    # ``shrink_windows`` is forwarded unchanged to the model constructor.
    reference_neighbours = [
        u'night,',
        u'night.',
        u'rights',
        u'kilometres',
        u'in',
        u'eight',
        u'according',
        u'flights',
        u'during',
        u'comes',
    ]

    model = FT_gensim(
        vector_size=48,
        sg=0,
        cbow_mean=1,
        alpha=0.05,
        window=5,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        batch_words=1000,
        word_ngrams=1,
        sample=1e-3,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        workers=1,
        min_alpha=0.0,
        bucket=BUCKET,
        shrink_windows=shrink_windows,
    )

    corpus = LineSentence(datapath('lee_background.cor'))
    model.build_vocab(corpus)
    vector_before = np.copy(model.wv.vectors[0])
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Training must have modified the first vector.
    vector_unchanged = (vector_before == model.wv.vectors[0]).all()
    self.assertFalse(vector_unchanged)

    # Keep only the words; the similarity scores are not checked.
    neighbours = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
    shared_count = len(set(neighbours) & set(reference_neighbours))
    self.assertGreaterEqual(
        shared_count, 2,
        "only %i overlap in expected %s & actual %s" % (shared_count, reference_neighbours, neighbours))

def test_cbow_hs_training_fromfile(self, shrink_windows=True):
    # Write the sentences from 'lee_background.cor' to a temporary line-sentence
    # file, build the vocabulary and train an FT_gensim model (sg=0, hs=1,
    # negative=0, bucket=BUCKET * 4) through the ``corpus_file`` arguments, then
    # check that the first word vector changed and that at least two of the ten
    # nearest neighbours of 'night' appear in the expected list below.
    # ``shrink_windows`` is forwarded unchanged to the model constructor.
    reference_neighbours = [
        u'night,',
        u'night.',
        u'rights',
        u'kilometres',
        u'in',
        u'eight',
        u'according',
        u'flights',
        u'during',
        u'comes',
    ]

    with temporary_file('gensim_fasttext.tst') as corpus_file:
        model = FT_gensim(
            vector_size=48,
            sg=0,
            cbow_mean=1,
            alpha=0.05,
            window=5,
            hs=1,
            negative=0,
            min_count=5,
            epochs=10,
            batch_words=1000,
            word_ngrams=1,
            sample=1e-3,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            workers=1,
            min_alpha=0.0,
            bucket=BUCKET * 4,
            shrink_windows=shrink_windows,
        )

        # Materialise the corpus on disk so the file-based code path is used.
        sentences = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(sentences, corpus_file)

        model.build_vocab(corpus_file=corpus_file)
        vector_before = np.copy(model.wv.vectors[0])
        model.train(
            corpus_file=corpus_file,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        # Training must have modified the first vector.
        vector_unchanged = (vector_before == model.wv.vectors[0]).all()
        self.assertFalse(vector_unchanged)

        # Keep only the words; the similarity scores are not checked.
        neighbours = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
        shared_count = len(set(neighbours) & set(reference_neighbours))
        self.assertGreaterEqual(
            shared_count, 2,
            "only %i overlap in expected %s & actual %s" % (shared_count, reference_neighbours, neighbours))

def test_sg_hs_training(self, shrink_windows=True):
    # Train an FT_gensim model (sg=1, hs=1, negative=0) on the sentences read
    # from 'lee_background.cor', then check two things:
    #   1. the first word vector is no longer identical to its pre-training copy;
    #   2. at least two of the ten nearest neighbours of 'night' appear in the
    #      expected list below.
    # ``shrink_windows`` is forwarded unchanged to the model constructor.
    reference_neighbours = [
        u'night,',
        u'night.',
        u'eight',
        u'nine',
        u'overnight',
        u'crew',
        u'overnight.',
        u'manslaughter',
        u'north',
        u'flight',
    ]

    model = FT_gensim(
        vector_size=48,
        sg=1,
        cbow_mean=1,
        alpha=0.025,
        window=5,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        batch_words=1000,
        word_ngrams=1,
        sample=1e-3,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        workers=1,
        min_alpha=0.0,
        bucket=BUCKET,
        shrink_windows=shrink_windows,
    )

    corpus = LineSentence(datapath('lee_background.cor'))
    model.build_vocab(corpus)
    vector_before = np.copy(model.wv.vectors[0])
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Training must have modified the first vector.
    vector_unchanged = (vector_before == model.wv.vectors[0]).all()
    self.assertFalse(vector_unchanged)

    # Keep only the words; the similarity scores are not checked.
    neighbours = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
    shared_count = len(set(neighbours) & set(reference_neighbours))
    self.assertGreaterEqual(
        shared_count, 2,
        "only %i overlap in expected %s & actual %s" % (shared_count, reference_neighbours, neighbours))

def test_sg_hs_training_fromfile(self, shrink_windows=True):
    # Write the sentences from 'lee_background.cor' to a temporary line-sentence
    # file, build the vocabulary and train an FT_gensim model (sg=1, hs=1,
    # negative=0) through the ``corpus_file`` arguments, then check that the
    # first word vector changed and that at least two of the ten nearest
    # neighbours of 'night' appear in the expected list below.
    # ``shrink_windows`` is forwarded unchanged to the model constructor.
    reference_neighbours = [
        u'night,',
        u'night.',
        u'eight',
        u'nine',
        u'overnight',
        u'crew',
        u'overnight.',
        u'manslaughter',
        u'north',
        u'flight',
    ]

    with temporary_file('gensim_fasttext.tst') as corpus_file:
        model = FT_gensim(
            vector_size=48,
            sg=1,
            cbow_mean=1,
            alpha=0.025,
            window=5,
            hs=1,
            negative=0,
            min_count=5,
            epochs=10,
            batch_words=1000,
            word_ngrams=1,
            sample=1e-3,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            workers=1,
            min_alpha=0.0,
            bucket=BUCKET,
            shrink_windows=shrink_windows,
        )

        # Materialise the corpus on disk so the file-based code path is used.
        sentences = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(sentences, corpus_file)

        model.build_vocab(corpus_file=corpus_file)
        vector_before = np.copy(model.wv.vectors[0])
        model.train(
            corpus_file=corpus_file,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        # Training must have modified the first vector.
        vector_unchanged = (vector_before == model.wv.vectors[0]).all()
        self.assertFalse(vector_unchanged)

        # Keep only the words; the similarity scores are not checked.
        neighbours = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
        shared_count = len(set(neighbours) & set(reference_neighbours))
        self.assertGreaterEqual(
            shared_count, 2,
            "only %i overlap in expected %s & actual %s" % (shared_count, reference_neighbours, neighbours))

def test_cbow_hs_training_fixedwindowsize(self):
    # Re-run test_cbow_hs_training with shrink_windows=False instead of its
    # default of True.
    run_check = self.test_cbow_hs_training
    run_check(shrink_windows=False)

def test_cbow_hs_training_fixedwindowsize_fromfile(self):
    # Re-run test_cbow_hs_training_fromfile with shrink_windows=False instead
    # of its default of True.
    run_check = self.test_cbow_hs_training_fromfile
    run_check(shrink_windows=False)

def test_sg_hs_training_fixedwindowsize(self):
    # Re-run test_sg_hs_training with shrink_windows=False instead of its
    # default of True.
    run_check = self.test_sg_hs_training
    run_check(shrink_windows=False)

def test_sg_hs_training_fixedwindowsize_fromfile(self):
    # Re-run test_sg_hs_training_fromfile with shrink_windows=False instead of
    # its default of True.
    run_check = self.test_sg_hs_training_fromfile
    run_check(shrink_windows=False)

def test_cbow_neg_training(self):

model_gensim = FT_gensim(
vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
Expand Down Expand Up @@ -863,6 +715,146 @@ def obsolete_testLoadOldModel(self):
self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100))


@pytest.mark.parametrize('shrink_windows', [True, False])
def test_cbow_hs_training(shrink_windows):
    """Train an ``FT_gensim`` model (sg=0, hs=1, negative=0) and sanity-check it.

    The model is trained on the sentences read from 'lee_background.cor'.
    The test passes when the first word vector differs from its pre-training
    copy and at least two of the ten nearest neighbours of 'night' appear in
    the expected list. Runs once per ``shrink_windows`` value, which is
    forwarded unchanged to the model constructor.
    """
    expected_sims_words = [
        u'night,',
        u'night.',
        u'rights',
        u'kilometres',
        u'in',
        u'eight',
        u'according',
        u'flights',
        u'during',
        u'comes',
    ]

    model = FT_gensim(
        vector_size=48,
        sg=0,
        cbow_mean=1,
        alpha=0.05,
        window=5,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        batch_words=1000,
        word_ngrams=1,
        sample=1e-3,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        workers=1,
        min_alpha=0.0,
        bucket=BUCKET,
        shrink_windows=shrink_windows,
    )

    corpus = LineSentence(datapath('lee_background.cor'))
    model.build_vocab(corpus)
    vector_before = np.copy(model.wv.vectors[0])
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Training must have modified the first vector.
    vector_unchanged = (vector_before == model.wv.vectors[0]).all()
    assert not vector_unchanged

    # Keep only the words; the similarity scores are not checked.
    sims_gensim_words = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
    overlap_count = len(set(sims_gensim_words) & set(expected_sims_words))

    message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}"
    assert overlap_count >= 2, message


@pytest.mark.parametrize('shrink_windows', [True, False])
def test_cbow_hs_training_fromfile(shrink_windows):
    """Train an ``FT_gensim`` model (sg=0, hs=1, negative=0) from a corpus file.

    The sentences from 'lee_background.cor' are first saved to a temporary
    line-sentence file; vocabulary building and training then go through the
    ``corpus_file`` arguments, with ``bucket=BUCKET * 4``. The test passes when
    the first word vector differs from its pre-training copy and at least two
    of the ten nearest neighbours of 'night' appear in the expected list. Runs
    once per ``shrink_windows`` value, which is forwarded unchanged to the
    model constructor.
    """
    expected_sims_words = [
        u'night,',
        u'night.',
        u'rights',
        u'kilometres',
        u'in',
        u'eight',
        u'according',
        u'flights',
        u'during',
        u'comes',
    ]

    with temporary_file('gensim_fasttext.tst') as corpus_file:
        model = FT_gensim(
            vector_size=48,
            sg=0,
            cbow_mean=1,
            alpha=0.05,
            window=5,
            hs=1,
            negative=0,
            min_count=5,
            epochs=10,
            batch_words=1000,
            word_ngrams=1,
            sample=1e-3,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            workers=1,
            min_alpha=0.0,
            bucket=BUCKET * 4,
            shrink_windows=shrink_windows,
        )

        # Materialise the corpus on disk so the file-based code path is used.
        sentences = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(sentences, corpus_file)

        model.build_vocab(corpus_file=corpus_file)
        vector_before = np.copy(model.wv.vectors[0])
        model.train(
            corpus_file=corpus_file,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        # Training must have modified the first vector.
        vector_unchanged = (vector_before == model.wv.vectors[0]).all()
        assert not vector_unchanged

        # Keep only the words; the similarity scores are not checked.
        sims_gensim_words = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
        overlap_count = len(set(sims_gensim_words) & set(expected_sims_words))
        message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}"
        assert overlap_count >= 2, message


@pytest.mark.parametrize('shrink_windows', [True, False])
def test_sg_hs_training(shrink_windows):
    """Train an ``FT_gensim`` model (sg=1, hs=1, negative=0) and sanity-check it.

    The model is trained on the sentences read from 'lee_background.cor'.
    The test passes when the first word vector differs from its pre-training
    copy and at least two of the ten nearest neighbours of 'night' appear in
    the expected list. Runs once per ``shrink_windows`` value, which is
    forwarded unchanged to the model constructor.
    """
    expected_sims_words = [
        u'night,',
        u'night.',
        u'eight',
        u'nine',
        u'overnight',
        u'crew',
        u'overnight.',
        u'manslaughter',
        u'north',
        u'flight',
    ]

    model = FT_gensim(
        vector_size=48,
        sg=1,
        cbow_mean=1,
        alpha=0.025,
        window=5,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        batch_words=1000,
        word_ngrams=1,
        sample=1e-3,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        workers=1,
        min_alpha=0.0,
        bucket=BUCKET,
        shrink_windows=shrink_windows,
    )

    corpus = LineSentence(datapath('lee_background.cor'))
    model.build_vocab(corpus)
    vector_before = np.copy(model.wv.vectors[0])
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Training must have modified the first vector.
    vector_unchanged = (vector_before == model.wv.vectors[0]).all()
    assert not vector_unchanged

    # Keep only the words; the similarity scores are not checked.
    sims_gensim_words = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
    overlap_count = len(set(sims_gensim_words) & set(expected_sims_words))

    message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}"
    assert overlap_count >= 2, message


@pytest.mark.parametrize('shrink_windows', [True, False])
def test_sg_hs_training_fromfile(shrink_windows):
    """Train an ``FT_gensim`` model (sg=1, hs=1, negative=0) from a corpus file.

    The sentences from 'lee_background.cor' are first saved to a temporary
    line-sentence file; vocabulary building and training then go through the
    ``corpus_file`` arguments. The test passes when the first word vector
    differs from its pre-training copy and at least two of the ten nearest
    neighbours of 'night' appear in the expected list. Runs once per
    ``shrink_windows`` value, which is forwarded unchanged to the model
    constructor.
    """
    expected_sims_words = [
        u'night,',
        u'night.',
        u'eight',
        u'nine',
        u'overnight',
        u'crew',
        u'overnight.',
        u'manslaughter',
        u'north',
        u'flight',
    ]

    with temporary_file('gensim_fasttext.tst') as corpus_file:
        model = FT_gensim(
            vector_size=48,
            sg=1,
            cbow_mean=1,
            alpha=0.025,
            window=5,
            hs=1,
            negative=0,
            min_count=5,
            epochs=10,
            batch_words=1000,
            word_ngrams=1,
            sample=1e-3,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            workers=1,
            min_alpha=0.0,
            bucket=BUCKET,
            shrink_windows=shrink_windows,
        )

        # Materialise the corpus on disk so the file-based code path is used.
        sentences = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(sentences, corpus_file)

        model.build_vocab(corpus_file=corpus_file)
        vector_before = np.copy(model.wv.vectors[0])
        model.train(
            corpus_file=corpus_file,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        # Training must have modified the first vector.
        vector_unchanged = (vector_before == model.wv.vectors[0]).all()
        assert not vector_unchanged

        # Keep only the words; the similarity scores are not checked.
        sims_gensim_words = [word for word, _similarity in model.wv.most_similar('night', topn=10)]
        overlap_count = len(set(sims_gensim_words) & set(expected_sims_words))
        message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}"
        assert overlap_count >= 2, message


# Read 'toy-data.txt', strip surrounding whitespace and split on single spaces;
# the resulting token list is wrapped in an outer list, giving a one-sentence
# corpus.
with open(datapath('toy-data.txt')) as fin:
    TOY_SENTENCES = [fin.read().strip().split(' ')]

Expand Down

0 comments on commit 2a41200

Please sign in to comment.