Fix computation of Word2Vec loss & add loss value to logging string #2135

Open · wants to merge 20 commits into base: develop
Changes shown from 6 of 20 commits
86 changes: 67 additions & 19 deletions gensim/models/base_any2vec.py
@@ -124,7 +124,17 @@ def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
"""Train a single batch. Return 3-tuple
Contributor: Please follow numpy-style for docstrings, more links available here

`(effective word count, total word count, total samples used)`.

The total samples used is the same as the effective word count when
using CBOW, but it can differ with Skip-Gram, since a random number of
positive examples is used for each effective word.

Knowing the effective number of samples used allows us to compute the
average loss for an epoch.

"""
raise NotImplementedError()

def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
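
As an aside on the reviewer's numpy-style request above, a possible rendering of the new return description might look like the following sketch (illustrative only, not part of this diff):

    def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
        """Train a single batch.

        Returns
        -------
        (int, int, int)
            3-tuple of (effective word count, total word count, total samples used).
            The sample count equals the effective word count for CBOW, but can
            differ for Skip-Gram, where a random number of positive examples is
            drawn per effective word; tracking it allows averaging the loss over
            an epoch.

        """
        raise NotImplementedError()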
@@ -167,12 +177,23 @@ def _worker_loop(self, job_queue, progress_queue):
for callback in self.callbacks:
callback.on_batch_begin(self)

tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
stats_tuple = self._do_train_job(
data_iterable, job_parameters, thread_private_mem)
Collaborator: I think there is no need to break the line here. We have a hard limit of 120 characters per line in gensim, and this is well within that limit.

if len(stats_tuple) == 3:
tally, raw_tally, effective_samples = stats_tuple
else:
# Model doesn't implement the samples tally. We assume
# that the number of samples is the effective words tally. This
# gives output consistent with previous implementations.
Contributor: please add a TODO here - the if/else should be removed once compute_loss is implemented for all models

tally, raw_tally = stats_tuple
effective_samples = tally

for callback in self.callbacks:
callback.on_batch_end(self)

progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress
# report back progress
progress_queue.put(
Contributor: no need to break line (we use 120 char limit for gensim), here and everywhere

(len(data_iterable), tally, raw_tally, effective_samples))
jobs_processed += 1
logger.debug("worker exiting, processed %i jobs", jobs_processed)

@@ -248,7 +269,7 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No
logger.debug("job loop exiting, total %i jobs", job_no)

def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed):
raw_word_count, total_words, trained_word_count, total_samples, elapsed):
raise NotImplementedError()

def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
@@ -260,6 +281,7 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_

def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
report_delay=1.0):

Contributor: please revert

Contributor: still here

"""Get the progress report for a single training epoch.

Parameters
@@ -294,7 +316,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
* Total word count used in training.

"""
example_count, trained_word_count, raw_word_count = 0, 0, 0
example_count, trained_word_count, raw_word_count, samples_count = 0, 0, 0, 0
start, next_report = default_timer() - 0.00001, 1.0
job_tally = 0
unfinished_worker_count = self.workers
Expand All @@ -305,20 +327,20 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
unfinished_worker_count -= 1
logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
continue
examples, trained_words, raw_words = report
examples, trained_words, raw_words, effective_samples = report
job_tally += 1

# update progress stats
example_count += examples
trained_word_count += trained_words # only words in vocab & sampled
raw_word_count += raw_words

samples_count += effective_samples
# log progress once every report_delay seconds
elapsed = default_timer() - start
if elapsed >= next_report:
self._log_progress(
job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed)
raw_word_count, total_words, trained_word_count, samples_count, elapsed)
next_report = elapsed + report_delay
# all done; report the final stats
elapsed = default_timer() - start
@@ -361,6 +383,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
* Total word count used in training.

"""
self.running_training_loss = 0.
self._check_input_data_sanity(data_iterable, data_iterables)
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
@@ -966,6 +989,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

def get_latest_training_loss(self):
return 0
Contributor: should raise NotImplementedError (loss feature works only for w2v now, not for d2v/fasttext/etc) or return -1 maybe?

Author: I changed it to raise an exception. I think -1 will be confusing, since it is not the value that will be displayed. The value that is displayed is the running training loss divided by the number of samples. That is why I chose to return 0 rather than -1: -1 would look like the loss is decreasing as the number of processed words increases.
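
To illustrate the point above (a hypothetical snippet, not code from the PR): the logged value is the running loss divided by the sample count, so a constant -1 sentinel would shrink toward zero as training progresses and read like a falling loss curve.

    # Hypothetical illustration of how a -1 sentinel would render in the progress log.
    running_loss = -1.0  # constant value returned by a model that does not track loss

    for total_samples in (1000, 10000, 100000):
        displayed = running_loss / total_samples  # what _log_progress would print
        print("current_loss %.6f after %d samples" % (displayed, total_samples))

    # Prints -0.001000, -0.000100, -0.000010: the magnitude keeps shrinking, which
    # falsely suggests the loss is improving; returning 0 (or raising) avoids that.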


def _get_job_params(self, cur_epoch):
"""Get the learning rate used in the current epoch.

@@ -1146,7 +1172,7 @@ def load(cls, *args, **kwargs):
return model

def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed):
raw_word_count, total_words, trained_word_count, total_samples, elapsed):
"""Callback used to log progress for long running jobs.

Parameters
@@ -1172,24 +1198,46 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
trained_word_count : int
Number of effective words used in training until now (after ignoring unknown words and trimming
the sentence length).
total_samples : int
Number of effective samples used in training until now (equals trained_word_count for CBOW, but can differ for Skip-Gram).
elapsed : int
Elapsed time since the beginning of training in seconds.

"""
if total_samples == 0:
loss = -1
else:
loss = self.get_latest_training_loss() / total_samples
if total_examples:
# examples-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
if self.compute_loss:
Contributor: can you fully refactor this function please (make it clearer & shorter, not if { if { } else { } } else { if { } else { } })?

Hint:
  • generate pattern (logging template first)
  • collect needed parameters to list/tuple
  • use *my_parameters
(A rough sketch of this pattern is included after the hunk below.)

logger.info(
("EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, "
"in_qsize %i, out_qsize %i, current_loss %.3f"),
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue), loss
)
else:
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
else:
# words-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
if self.compute_loss:
logger.info(
("EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, "
"in_qsize %i, out_qsize %i, current_loss %.3f"),
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue), loss
)
else:
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
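
As referenced in the review comment above, a rough sketch of the hinted refactor might look as follows. It is illustrative only, reuses the names already in scope in _log_progress (example_count, raw_word_count, loss, and so on), and is not the PR's implementation:

    # Sketch of the reviewer's hint: build the template first, collect parameters, unpack.
    if total_examples:
        msg = "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i"
        progress = 100.0 * example_count / total_examples
    else:
        msg = "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i"
        progress = 100.0 * raw_word_count / total_words
    params = [
        cur_epoch + 1, progress, trained_word_count / elapsed,
        utils.qsize(job_queue), utils.qsize(progress_queue),
    ]
    if self.compute_loss:
        msg += ", current_loss %.3f"
        params.append(loss)
    logger.info(msg, *params)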

def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
trained_word_count, elapsed):
13 changes: 7 additions & 6 deletions gensim/models/word2vec.py
@@ -181,6 +181,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):

"""
result = 0
effective_samples = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
Expand All @@ -192,12 +193,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
# don't train on the `word` itself
if pos2 != pos:
effective_samples += 1
train_sg_pair(
model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
)

result += len(word_vocabs)
return result
return result, effective_samples

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
"""Update CBOW model by training on a sequence of sentences.
@@ -247,7 +249,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
l1 /= len(word2_indices)
train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
result += len(word_vocabs)
return result
return result, result

def score_sentence_sg(model, sentence, work=None):
"""Obtain likelihood score for a single sentence in a fitted skip-gram representation.
Expand Down Expand Up @@ -771,12 +773,11 @@ def _do_train_job(self, sentences, alpha, inits):

"""
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
(tally, effective_samples) = train_batch_sg(self, sentences, alpha, work, self.compute_loss)
Contributor: () not needed here (and same below)

else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
return tally, self._raw_word_count(sentences)
(tally, effective_samples) = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
return tally, self._raw_word_count(sentences), effective_samples
Contributor: Need to update the docstrings everywhere when you change the return type

Collaborator: @alreadytaikeune Still not done, please check


def _clear_post_train(self):
"""Remove all L2-normalized word vectors from the model."""
Expand Down