-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix computation of Word2Vec
loss & add loss value to logging string
#2135
base: develop
Are you sure you want to change the base?
Changes from 6 commits
e96798c
f447df0
a6548c4
a2fd340
1bdd4a5
7b457d6
18735e2
0bcae41
eb4b14d
00e7b7d
995b5f8
6b46f64
f6a5cc5
3a453a9
a8e4a66
854c8fd
5e21a85
0f4d572
3eec299
aaf9ed9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,7 +124,17 @@ def _clear_post_train(self): | |
raise NotImplementedError() | ||
|
||
def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): | ||
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`.""" | ||
"""Train a single batch. Return 3-tuple | ||
`(effective word count, total word count, total samples used)`. | ||
|
||
The total samples used is the same as the effective word count when | ||
using CBOW, but it can differ with Skip-Gram, since a random number of | ||
positive examples are used for each effective word. | ||
|
||
Knowing the effective number of samples used allows us to compute the | ||
average loss for an epoch. | ||
|
||
""" | ||
raise NotImplementedError() | ||
|
||
def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): | ||
|
@@ -167,12 +177,23 @@ def _worker_loop(self, job_queue, progress_queue): | |
for callback in self.callbacks: | ||
callback.on_batch_begin(self) | ||
|
||
tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) | ||
stats_tuple = self._do_train_job( | ||
data_iterable, job_parameters, thread_private_mem) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is no need to break the line here. We have a hard limit of 120 characters per line in gensim, and this is well within that limit. |
||
if len(stats_tuple) == 3: | ||
tally, raw_tally, effective_samples = stats_tuple | ||
else: | ||
# Model doesn't implement the samples tallying. We assume | ||
# that the number of samples is the effective words tally. This | ||
# gives coherent outputs with previous implementations | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please add TODO here - if/else should be removed when compute_loss implemented for all models |
||
tally, raw_tally = stats_tuple | ||
effective_samples = tally | ||
|
||
for callback in self.callbacks: | ||
callback.on_batch_end(self) | ||
|
||
progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress | ||
# report back progress | ||
progress_queue.put( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to break line (we use 120 char limit for gensim), here and everywhere |
||
(len(data_iterable), tally, raw_tally, effective_samples)) | ||
jobs_processed += 1 | ||
logger.debug("worker exiting, processed %i jobs", jobs_processed) | ||
|
||
|
@@ -248,7 +269,7 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No | |
logger.debug("job loop exiting, total %i jobs", job_no) | ||
|
||
def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, | ||
raw_word_count, total_words, trained_word_count, elapsed): | ||
raw_word_count, total_words, trained_word_count, total_samples, elapsed): | ||
raise NotImplementedError() | ||
|
||
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, | ||
|
@@ -260,6 +281,7 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_ | |
|
||
def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None, | ||
report_delay=1.0): | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please revert There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. still here |
||
"""Get the progress report for a single training epoch. | ||
|
||
Parameters | ||
|
@@ -294,7 +316,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam | |
* Total word count used in training. | ||
|
||
""" | ||
example_count, trained_word_count, raw_word_count = 0, 0, 0 | ||
example_count, trained_word_count, raw_word_count, samples_count = 0, 0, 0, 0 | ||
start, next_report = default_timer() - 0.00001, 1.0 | ||
job_tally = 0 | ||
unfinished_worker_count = self.workers | ||
|
@@ -305,20 +327,20 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam | |
unfinished_worker_count -= 1 | ||
logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) | ||
continue | ||
examples, trained_words, raw_words = report | ||
examples, trained_words, raw_words, effective_samples = report | ||
job_tally += 1 | ||
|
||
# update progress stats | ||
example_count += examples | ||
trained_word_count += trained_words # only words in vocab & sampled | ||
raw_word_count += raw_words | ||
|
||
samples_count += effective_samples | ||
# log progress once every report_delay seconds | ||
elapsed = default_timer() - start | ||
if elapsed >= next_report: | ||
self._log_progress( | ||
job_queue, progress_queue, cur_epoch, example_count, total_examples, | ||
raw_word_count, total_words, trained_word_count, elapsed) | ||
raw_word_count, total_words, trained_word_count, samples_count, elapsed) | ||
next_report = elapsed + report_delay | ||
# all done; report the final stats | ||
elapsed = default_timer() - start | ||
|
@@ -361,6 +383,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot | |
* Total word count used in training. | ||
|
||
""" | ||
self.running_training_loss = 0. | ||
self._check_input_data_sanity(data_iterable, data_iterables) | ||
job_queue = Queue(maxsize=queue_factor * self.workers) | ||
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) | ||
|
@@ -966,6 +989,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w | |
total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, | ||
queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) | ||
|
||
def get_latest_training_loss(self): | ||
return 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should raise There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed it to raise an exception. I think -1 will be confusing since it is not the value that will be displayed. The value that is displayed is the running training loss divided by the number of samples. That is why I chose to return 0 and -1. -1 will look like the loss is decreasing as the number of processed words increases. |
||
|
||
def _get_job_params(self, cur_epoch): | ||
"""Get the learning rate used in the current epoch. | ||
|
||
|
@@ -1146,7 +1172,7 @@ def load(cls, *args, **kwargs): | |
return model | ||
|
||
def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, | ||
raw_word_count, total_words, trained_word_count, elapsed): | ||
raw_word_count, total_words, trained_word_count, total_samples, elapsed): | ||
"""Callback used to log progress for long running jobs. | ||
|
||
Parameters | ||
|
@@ -1172,24 +1198,46 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot | |
trained_word_count : int | ||
Number of effective words used in training until now (after ignoring unknown words and trimming | ||
the sentence length). | ||
total_samples : int | ||
Number of effective samples used in training until now (differs from total_examples for Skip-Gram) | ||
elapsed : int | ||
Elapsed time since the beginning of training in seconds. | ||
|
||
""" | ||
if total_samples == 0: | ||
loss = -1 | ||
else: | ||
loss = self.get_latest_training_loss() / total_samples | ||
if total_examples: | ||
# examples-based progress % | ||
logger.info( | ||
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", | ||
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue) | ||
) | ||
if self.compute_loss: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you fully refactor this function please (make it clearer & shorter, not duplicated)? Hint
|
||
logger.info( | ||
("EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, " | ||
"in_qsize %i, out_qsize %i, current_loss %.3f"), | ||
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue), loss | ||
) | ||
else: | ||
logger.info( | ||
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", | ||
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue) | ||
) | ||
else: | ||
# words-based progress % | ||
logger.info( | ||
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", | ||
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue) | ||
) | ||
if self.compute_loss: | ||
logger.info( | ||
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", | ||
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue) | ||
) | ||
else: | ||
logger.info( | ||
("EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, " | ||
"in_qsize %i, out_qsize %i, current_loss %.3f"), | ||
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, | ||
utils.qsize(job_queue), utils.qsize(progress_queue), loss | ||
) | ||
|
||
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, | ||
trained_word_count, elapsed): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,6 +181,7 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): | |
|
||
""" | ||
result = 0 | ||
effective_samples = 0 | ||
for sentence in sentences: | ||
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and | ||
model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32] | ||
|
@@ -192,12 +193,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): | |
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): | ||
# don't train on the `word` itself | ||
if pos2 != pos: | ||
effective_samples += 1 | ||
train_sg_pair( | ||
model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss | ||
) | ||
|
||
result += len(word_vocabs) | ||
return result | ||
return result, effective_samples | ||
|
||
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): | ||
"""Update CBOW model by training on a sequence of sentences. | ||
|
@@ -247,7 +249,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss | |
l1 /= len(word2_indices) | ||
train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss) | ||
result += len(word_vocabs) | ||
return result | ||
return result, result | ||
|
||
def score_sentence_sg(model, sentence, work=None): | ||
"""Obtain likelihood score for a single sentence in a fitted skip-gram representation. | ||
|
@@ -771,12 +773,11 @@ def _do_train_job(self, sentences, alpha, inits): | |
|
||
""" | ||
work, neu1 = inits | ||
tally = 0 | ||
if self.sg: | ||
tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) | ||
(tally, effective_samples) = train_batch_sg(self, sentences, alpha, work, self.compute_loss) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
else: | ||
tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) | ||
return tally, self._raw_word_count(sentences) | ||
(tally, effective_samples) = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) | ||
return tally, self._raw_word_count(sentences), effective_samples | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to update an docstrings everywhere when you change returning type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alreadytaikeune Still not done, please check |
||
|
||
def _clear_post_train(self): | ||
"""Remove all L2-normalized word vectors from the model.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please follow numpy-style for docstrings, more links available here