Fix computation of Word2Vec loss & add loss value to logging string #2135

Open · wants to merge 20 commits into base: develop · Changes from 7 of 20 commits
74 changes: 54 additions & 20 deletions gensim/models/base_any2vec.py
@@ -124,7 +124,21 @@ def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
"""Train a single batch.
Collaborator: I think we should document the input parameters as well.

Author: I agree, though there are no clear constraints on what the inputs actually are. They can be pretty much anything, depending on how the derived class implements _get_thread_working_mem and _get_job_params; it is up to the derived class to decide what to do with them. I'll figure something out.

Returns
-------
(int, int, int)
effective_word_count: int
The number of words processed after ignoring unknown words and sentence length trimming.
total_word_count: int
The total number of words in this batch.
total_samples_used: int
The total number of samples used while training on this data. This is the same as the effective word
count when using CBOW, but it can differ with Skip-Gram, since a random number of positive examples
(one per context word within the randomly reduced window) is used for each center word.

"""
raise NotImplementedError()
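For illustration only (not part of the diff): a standalone toy showing the 3-tuple shape a derived class is expected to return. The vocabulary filtering and pair-counting rule below are simplified assumptions, not gensim's actual logic.

def toy_do_train_job(data_iterable, vocab):
    # effective words: kept after vocabulary filtering; total words: raw batch size;
    # samples: training examples generated from the kept words (toy rule: all ordered pairs).
    effective_word_count = 0
    total_word_count = 0
    total_samples_used = 0
    for sentence in data_iterable:
        total_word_count += len(sentence)
        kept = [w for w in sentence if w in vocab]
        effective_word_count += len(kept)
        total_samples_used += len(kept) * (len(kept) - 1)
    return effective_word_count, total_word_count, total_samples_used

# toy_do_train_job([["the", "cat", "sat"]], {"cat", "sat"}) -> (2, 3, 2)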

def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
@@ -167,12 +181,23 @@ def _worker_loop(self, job_queue, progress_queue):
for callback in self.callbacks:
callback.on_batch_begin(self)

tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
stats_tuple = self._do_train_job(
data_iterable, job_parameters, thread_private_mem)
Collaborator: I think there is no need to break the line here. We have a hard limit of 120 characters per line in gensim, and this is well within that limit.

if len(stats_tuple) == 3:
tally, raw_tally, effective_samples = stats_tuple
else:
# TODO: Some models haven't yet updated their _do_train_job method to return a 3-tuple (which also
# reports the number of samples used while processing the batch) and still return a 2-tuple.
# For models that don't implement sample tallying, we assume the number of samples equals the
# effective word tally, which keeps the output consistent with previous implementations.
tally, raw_tally = stats_tuple
effective_samples = tally

for callback in self.callbacks:
callback.on_batch_end(self)

progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress
# report back progress
progress_queue.put((len(data_iterable), tally, raw_tally, effective_samples))
jobs_processed += 1
logger.debug("worker exiting, processed %i jobs", jobs_processed)

@@ -260,6 +285,7 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_

def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
report_delay=1.0):

Contributor: Please revert.

Contributor: Still here.

"""Get the progress report for a single training epoch.

Parameters
@@ -294,7 +320,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
* Total word count used in training.

"""
example_count, trained_word_count, raw_word_count = 0, 0, 0
example_count, trained_word_count, raw_word_count, samples_count = 0, 0, 0, 0
start, next_report = default_timer() - 0.00001, 1.0
job_tally = 0
unfinished_worker_count = self.workers
@@ -305,20 +331,20 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
unfinished_worker_count -= 1
logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
continue
examples, trained_words, raw_words = report
examples, trained_words, raw_words, effective_samples = report
job_tally += 1

# update progress stats
example_count += examples
trained_word_count += trained_words # only words in vocab & sampled
raw_word_count += raw_words

samples_count += effective_samples
# log progress once every report_delay seconds
elapsed = default_timer() - start
if elapsed >= next_report:
self._log_progress(
job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed)
raw_word_count, total_words, trained_word_count, samples_count, elapsed)
next_report = elapsed + report_delay
# all done; report the final stats
elapsed = default_timer() - start
@@ -361,6 +387,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
* Total word count used in training.

"""
self.running_training_loss = 0.
self._check_input_data_sanity(data_iterable, data_iterables)
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
@@ -966,6 +993,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

def get_latest_training_loss(self):
raise NotImplementedError("To compute the loss for a model, you must implement get_latest_training_loss")
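For reference, the Word2Vec subclass already satisfies this contract by exposing its accumulated counter; roughly (paraphrased from gensim's word2vec.py, not part of this diff):

def get_latest_training_loss(self):
    # Return the loss accumulated in `running_training_loss` during training.
    return self.running_training_loss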

def _get_job_params(self, cur_epoch):
"""Get the learning rate used in the current epoch.

@@ -1146,7 +1176,7 @@ def load(cls, *args, **kwargs):
return model

def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed):
raw_word_count, total_words, trained_word_count, total_samples, elapsed):
"""Callback used to log progress for long running jobs.

Parameters
@@ -1172,24 +1202,28 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
trained_word_count : int
Number of effective words used in training until now (after ignoring unknown words and trimming
the sentence length).
total_samples : int
Number of effective samples used in training until now (equals trained_word_count for CBOW, but differs for Skip-Gram).
elapsed : int
Elapsed time since the beginning of training in seconds.

"""
if total_examples:
# examples-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
div = total_examples
else:
# words-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
div = total_words

msg = "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i"
Contributor: This can be PROGRESS: at %.2f%% words (not only examples).

Author: You are right, good catch. I'll fix it.

args = (cur_epoch + 1, 100.0 * example_count / div, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue))
if self.compute_loss:
if total_samples == 0:
loss = -1
else:
loss = self.get_latest_training_loss() / total_samples
msg += ", current_loss %.3f"
mpenkov marked this conversation as resolved.
args += (loss,)
logger.info(msg, *args)
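For context, a minimal usage sketch of the loss reporting end to end (the toy corpus and logging setup are illustrative; with this patch the EPOCH progress lines gain a current_loss field equal to the running loss divided by the samples processed so far, and the raw counter is reset at the start of every epoch):

import logging
from gensim.models import Word2Vec

logging.basicConfig(level=logging.INFO)  # make the "EPOCH ... current_loss ..." lines visible

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
model = Word2Vec(sentences, min_count=1, compute_loss=True)  # compute_loss enables loss tallying
print(model.get_latest_training_loss())  # un-normalized loss accumulated over the last epoch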

def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
trained_word_count, elapsed):
18 changes: 12 additions & 6 deletions gensim/models/word2vec.py
@@ -178,9 +178,12 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
int
Number of words in the vocabulary actually used for training (that already existed in the vocabulary
and were not discarded by negative sampling).
int
Number of samples used for training. A sample is a positive/negative example.

"""
result = 0
effective_samples = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
@@ -192,12 +195,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
# don't train on the `word` itself
if pos2 != pos:
effective_samples += 1
train_sg_pair(
model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
)

result += len(word_vocabs)
return result
return result, effective_samples
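To see why effective_samples is not simply the effective word count, here is a standalone toy (not gensim code) that mimics the reduced-window pair counting above; random.randint stands in for model.random, and the sentence is assumed to contain only in-vocabulary words:

import random

def count_sg_samples(sentence_len, window):
    samples = 0
    for pos in range(sentence_len):
        reduced_window = random.randint(0, window - 1)  # random window shrink, as in train_batch_sg
        start = max(0, pos - window + reduced_window)
        end = min(sentence_len, pos + window + 1 - reduced_window)
        samples += (end - start) - 1  # every position in the window except the center word itself
    return samples

# The pair count varies from run to run, while the effective word count stays at 10.
print(count_sg_samples(10, 5), count_sg_samples(10, 5))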

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
"""Update CBOW model by training on a sequence of sentences.
@@ -229,6 +233,9 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
int
Number of words in the vocabulary actually used for training (that already existed in the vocabulary
and were not discarded by negative sampling).
int
Number of samples used for training. A sample is a positive/negative example. In the case of CBOW
this is the same as the effective number of words.

"""
result = 0
@@ -247,7 +254,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
l1 /= len(word2_indices)
train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
result += len(word_vocabs)
return result
return result, result

def score_sentence_sg(model, sentence, work=None):
"""Obtain likelihood score for a single sentence in a fitted skip-gram representation.
@@ -771,12 +778,11 @@ def _do_train_job(self, sentences, alpha, inits):

"""
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
(tally, effective_samples) = train_batch_sg(self, sentences, alpha, work, self.compute_loss)
Contributor: () not needed here (and same below).

else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
return tally, self._raw_word_count(sentences)
(tally, effective_samples) = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
return tally, self._raw_word_count(sentences), effective_samples
Contributor: Need to update the docstrings everywhere when you change the return type.

Collaborator: @alreadytaikeune Still not done, please check.


def _clear_post_train(self):
"""Remove all L2-normalized word vectors from the model."""