Implement shrink_windows argument for Word2Vec. #3169

Merged · 14 commits · Jun 29, 2021
Changes from 1 commit
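For readers skimming the diff, here is a minimal usage sketch of the new keyword as it appears in this commit. The argument is named `reduced_windows` here; the PR title suggests it was later renamed `shrink_windows`, and the surrounding gensim 4.x constructor names (`vector_size`, `min_count`) are assumptions, not part of this diff:

from gensim.models import Word2Vec

sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
]

# Disable the dynamic window: every target word uses the full `window` span
# instead of a per-word size sampled from [1, window].
model = Word2Vec(
    sentences,
    vector_size=24,
    window=5,
    min_count=1,
    reduced_windows=False,
)
print(model.wv.most_similar("computer", topn=3))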
43 changes: 30 additions & 13 deletions gensim/models/word2vec.py
@@ -223,13 +223,13 @@

def train_epoch_sg(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss,
_work, _neu1, compute_loss, reduced_windows,
):
raise RuntimeError("Training with corpus_file argument is not supported")

def train_epoch_cbow(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss,
_work, _neu1, compute_loss, reduced_windows,
):
raise RuntimeError("Training with corpus_file argument is not supported")

@@ -240,7 +240,7 @@ def __init__(
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
comment=None, max_final_vocab=None,
comment=None, max_final_vocab=None, reduced_windows=True
):
"""Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.

@@ -345,6 +345,9 @@ def __init__(
:meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
reduced_windows : bool, optional
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training. Otherwise, it is fixed to `window`.

Examples
--------
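The `reduced_windows` parameter documented above drives the dynamic window. As a rough, numpy-only sketch of the sampling this commit implements (mirroring the `randint(0, window)` draw in the batch routines further down; the subtraction happens in the unchanged fast-sentence routines, which are not part of this diff):

import numpy as np

rng = np.random.RandomState(1)
window = 5

# Per-word offsets, drawn the same way as in train_batch_sg / train_batch_cbow.
offsets = rng.randint(0, window, size=8)

# The training routines subtract the offset from `window`, so the effective
# window per target word lies in [1, window].
effective = window - offsets
print(offsets, effective)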
@@ -377,6 +380,7 @@ def __init__(
self.min_alpha = float(min_alpha)

self.window = int(window)
self.reduced_windows = bool(reduced_windows)
self.random = np.random.RandomState(seed)

self.hs = int(hs)
@@ -419,7 +423,8 @@ def __init__(
self.train(
corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count,
total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks)
end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks,
reduced_windows=self.reduced_windows)
else:
if trim_rule is not None:
logger.warning(
@@ -910,12 +915,14 @@ def _do_train_epoch(
if self.sg:
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
)
else:
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
)

return examples, tally, raw_tally
@@ -941,20 +948,26 @@ def _do_train_job(self, sentences, alpha, inits):
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
tally += train_batch_sg(
self, sentences, alpha, work,
self.compute_loss, self.reduced_windows,
)
else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
tally += train_batch_cbow(
self, sentences, alpha, work, neu1,
self.compute_loss, self.reduced_windows,
)
return tally, self._raw_word_count(sentences)

def _clear_post_train(self):
"""Clear any cached values that training may have invalidated."""
self.wv.norms = None

def train(
self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None, word_count=0,
queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
**kwargs,
self, corpus_iterable=None, corpus_file=None, total_examples=None,
total_words=None, epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
reduced_windows=True, callbacks=(), **kwargs,
):
"""Update the model's neural weights from a sequence of sentences.

@@ -1011,6 +1024,9 @@ def train(
compute_loss: bool, optional
If True, computes and stores loss value which can be retrieved using
:meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
reduced_windows : bool, optional
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training. Otherwise, it is fixed to `window`.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
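A hedged sketch of passing the new keyword to `train()` directly, using the argument name from this commit (the merged API may use a different name):

from gensim.models import Word2Vec

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] * 20

model = Word2Vec(vector_size=16, window=4, min_count=1, seed=1, workers=1)
model.build_vocab(sentences)
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=5,
    reduced_windows=False,  # keep the full window for every target word
)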

@@ -1030,6 +1046,7 @@
self.alpha = start_alpha or self.alpha
self.min_alpha = end_alpha or self.min_alpha
self.epochs = epochs
self.reduced_windows = reduced_windows

self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)
@@ -1039,7 +1056,7 @@
msg=(
f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
f"negative={self.negative} window={self.window}"
f"negative={self.negative} window={self.window} reduced_windows={self.reduced_windows}"
),
)

24 changes: 18 additions & 6 deletions gensim/models/word2vec_corpusfile.pyx
@@ -186,7 +186,8 @@ cdef void prepare_c_structures_for_batch(
vector[vector[string]] &sentences, int sample, int hs, int window, long long *total_words,
int *effective_words, int *effective_sentences, unsigned long long *next_random,
cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens,
np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows) nogil:
np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows,
bint do_reduced_windows) nogil:
cdef VocabItem word
cdef string token
cdef vector[string] sent
@@ -225,7 +226,10 @@

# precompute "reduced window" offsets in a single randint() call
for i in range(effective_words[0]):
reduced_windows[i] = random_int32(next_random) % window
if do_reduced_windows:
reduced_windows[i] = random_int32(next_random) % window
else:
reduced_windows[i] = 0  # no shrinking: keep the full window


cdef REAL_t get_alpha(REAL_t alpha, REAL_t end_alpha, int cur_epoch, int num_epochs) nogil:
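For reference, the offsets written into `reduced_windows` above are consumed by the unchanged fast-sentence routines (not shown in this diff) roughly as in the following illustrative Python re-statement; this is a sketch, not code from the PR:

def context_span(i, window, reduced, idx_start, idx_end):
    """Return the [start, end) context range around position i."""
    start = max(idx_start, i - window + reduced)
    end = min(idx_end, i + window + 1 - reduced)
    return start, end

# reduced == 0 keeps the full window; reduced == window - 1 shrinks it to 1.
print(context_span(10, window=5, reduced=0, idx_start=0, idx_end=20))  # (5, 16)
print(context_span(10, window=5, reduced=4, idx_start=0, idx_end=20))  # (9, 12)

This also shows why the "no shrinking" branch must store 0 rather than `window`: an offset equal to `window` would collapse the span to the target word alone.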
@@ -250,7 +254,7 @@ cdef REAL_t get_next_alpha(


def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss):
_neu1, compute_loss, reduced_windows):
"""Train Skipgram model for one epoch by training on an input stream. This function is used only in multistream mode.

Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -269,6 +273,9 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word; otherwise the full `window` is used.

Returns
-------
@@ -295,6 +302,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows

init_w2v_config(&c, model, _alpha, compute_loss, _work)

@@ -311,7 +319,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
prepare_c_structures_for_batch(
sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences,
&c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes,
c.codelens, c.codes, c.points, c.reduced_windows)
c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows)

for sent_idx in range(effective_sentences):
idx_start = c.sentence_idx[sent_idx]
@@ -350,7 +358,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec


def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss):
_neu1, compute_loss, reduced_windows):
"""Train CBOW model for one epoch by training on an input stream. This function is used only in multistream mode.

Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -369,6 +377,9 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word; otherwise the full `window` is used.

Returns
-------
@@ -395,6 +406,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows

init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1)

@@ -411,7 +423,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
prepare_c_structures_for_batch(
sentences, c.sample, c.hs, c.window, &total_words, &effective_words,
&effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx,
c.indexes, c.codelens, c.codes, c.points, c.reduced_windows)
c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows)

for sent_idx in range(effective_sentences):
idx_start = c.sentence_idx[sent_idx]
26 changes: 20 additions & 6 deletions gensim/models/word2vec_inner.pyx
@@ -71,10 +71,10 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con
for i from 0 <= i < N[0] by 1:
Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])]

cdef long long _mul(const np.uint32_t a, const int b) nogil:
cdef long long _mul(const np.uint32_t a, const int b) nogil:
"""Safe multiplication of ints with explict typecasting"""
return <long long>a * <long long>b

cdef void w2v_fast_sentence_sg_hs(
const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
REAL_t *syn0, REAL_t *syn1, const int size,
Expand Down Expand Up @@ -502,7 +502,7 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
c[0].neu1 = <REAL_t *>np.PyArray_DATA(_neu1)


def train_batch_sg(model, sentences, alpha, _work, compute_loss):
def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows):
"""Update skip-gram model by training on a batch of sentences.

Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -519,6 +519,9 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word; otherwise the full `window` is used.

Returns
-------
Expand Down Expand Up @@ -570,7 +573,11 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item

# release GIL & train on all sentences
@@ -597,7 +604,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
return effective_words


def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduced_windows):
"""Update CBOW model by training on a batch of sentences.

Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
Expand All @@ -616,6 +623,9 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word; otherwise the full `window` is used.

Returns
-------
@@ -666,7 +676,11 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item

# release GIL & train on all sentences
Expand Down