From 3d342287f06d0ab4c0f09eb77dc46ba4410d7e5b Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 15 Mar 2019 19:04:46 +0100 Subject: [PATCH 01/14] refactor rnn encoder --- neuralmonkey/encoders/__init__.py | 1 - neuralmonkey/encoders/recurrent.py | 172 ++++++++--------------------- 2 files changed, 49 insertions(+), 124 deletions(-) diff --git a/neuralmonkey/encoders/__init__.py b/neuralmonkey/encoders/__init__.py index 5db833b0e..2f58debb5 100644 --- a/neuralmonkey/encoders/__init__.py +++ b/neuralmonkey/encoders/__init__.py @@ -4,6 +4,5 @@ from .recurrent import FactoredEncoder from .recurrent import RecurrentEncoder from .recurrent import SentenceEncoder -from .recurrent import DeepSentenceEncoder from .sentence_cnn_encoder import SentenceCNNEncoder from .sequence_cnn_encoder import SequenceCNNEncoder diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 687910169..3b5c267a5 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -7,13 +7,13 @@ TemporalStatefulWithOutput, TemporalStateful) from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.logging import warn from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell, NematusGRUCell from neuralmonkey.nn.utils import dropout from neuralmonkey.vocabulary import Vocabulary from neuralmonkey.decorators import tensor from neuralmonkey.model.sequence import ( EmbeddedSequence, EmbeddedFactorSequence) +from neuralmonkey.tf_utils import layer_norm RNN_CELL_TYPES = { "NematusGRU": NematusGRUCell, @@ -70,8 +70,7 @@ def _make_rnn_cell(spec: RNNSpec) -> Callable[[], tf.nn.rnn_cell.RNNCell]: def rnn_layer(rnn_input: tf.Tensor, lengths: tf.Tensor, - rnn_spec: RNNSpec, - add_residual: bool) -> Tuple[tf.Tensor, tf.Tensor]: + rnn_spec: RNNSpec) -> Tuple[tf.Tensor, tf.Tensor]: """Construct a RNN layer given its inputs and specs. Arguments: @@ -108,17 +107,6 @@ def rnn_layer(rnn_input: tf.Tensor, if rnn_spec.cell_type == "LSTM": final_state = final_state.h - if add_residual: - if outputs.get_shape()[-1].value != rnn_input.get_shape()[-1].value: - warn("Size of the RNN layer input ({}) and layer output ({}) " - "must match when applying residual connection. Reshaping " - "the rnn output using linear projection.".format( - outputs.get_shape(), rnn_input.get_shape())) - # pylint: disable=redefined-variable-type - outputs = tf.layers.dense(outputs, rnn_input.shape.as_list()[-1]) - # pylint: enable=redefined-variable-type - outputs += rnn_input - return outputs, final_state @@ -128,10 +116,9 @@ class RecurrentEncoder(ModelPart, TemporalStatefulWithOutput): def __init__(self, name: str, input_sequence: TemporalStateful, - rnn_size: int, - rnn_cell: str = "GRU", - rnn_direction: str = "bidirectional", + rnn_layers: List[RNNSpecTuple], add_residual: bool = False, + add_layer_norm: bool = False, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, save_checkpoint: str = None, @@ -150,6 +137,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. 
@@ -161,8 +149,9 @@ def __init__(self, self.input_sequence = input_sequence self.dropout_keep_prob = dropout_keep_prob - self.rnn_spec = _make_rnn_spec(rnn_size, rnn_direction, rnn_cell) + self.rnn_specs = [_make_rnn_spec(*r) for r in rnn_layers] self.add_residual = add_residual + self.add_layer_norm = add_layer_norm if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0: raise ValueError("Dropout keep prob must be inside (0,1].") @@ -178,8 +167,40 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - return rnn_layer(self.rnn_input, self.input_sequence.lengths, - self.rnn_spec, self.add_residual) + layer_input = self.rnn_input # type: tf.Tensor + layer_final = None + + for i, rnn_spec in enumerate(self.rnn_specs): + with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), + reuse=tf.AUTO_REUSE): + + if self.add_layer_norm: + layer_input = layer_norm(layer_input) + + layer_output, layer_final_output = rnn_layer( + layer_input, self.input_sequence.lengths, rnn_spec) + + layer_output = dropout( + layer_output, self.dropout_keep_prob, self.train_mode) + layer_final_output = dropout( + layer_final_output, self.dropout_keep_prob, + self.train_mode) + + in_dim = layer_input.get_shape()[-1] + out_dim = layer_output.get_shape()[-1] + + if self.add_residual and in_dim == out_dim: + assert layer_final is not None + layer_input += layer_output + layer_final += layer_final_output + else: + # pylint: disable=redefined-variable-type + layer_input = layer_output + layer_final = layer_final_output + # pylint: enable=redefined-variable-type + + assert layer_final is not None + return layer_input, layer_final @tensor def temporal_states(self) -> tf.Tensor: @@ -209,6 +230,7 @@ def __init__(self, rnn_cell: str = "GRU", rnn_direction: str = "bidirectional", add_residual: bool = False, + add_layer_norm: bool = False, max_input_len: int = None, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, @@ -234,6 +256,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. @@ -266,10 +289,9 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_size=rnn_size, - rnn_cell=rnn_cell, - rnn_direction=rnn_direction, + rnn_layers=[(rnn_size, rnn_direction, rnn_cell)], add_residual=add_residual, + add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, reuse=reuse, save_checkpoint=save_checkpoint, @@ -289,6 +311,7 @@ def __init__(self, rnn_cell: str = "GRU", rnn_direction: str = "bidirectional", add_residual: bool = False, + add_layer_norm: bool = False, max_input_len: int = None, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, @@ -314,6 +337,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. 
@@ -336,110 +360,12 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_size=rnn_size, - rnn_cell=rnn_cell, - rnn_direction=rnn_direction, + rnn_layers=[(rnn_size, rnn_cell, rnn_direction)], add_residual=add_residual, + add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, reuse=reuse, save_checkpoint=save_checkpoint, load_checkpoint=load_checkpoint, initializers=initializers) # pylint: enable=too-many-arguments,too-many-locals - - -class DeepSentenceEncoder(SentenceEncoder): - # pylint: disable=too-many-arguments,too-many-locals - def __init__(self, - name: str, - vocabulary: Vocabulary, - data_id: str, - embedding_size: int, - rnn_sizes: List[int], - rnn_directions: List[str], - rnn_cell: str = "GRU", - add_residual: bool = False, - max_input_len: int = None, - dropout_keep_prob: float = 1.0, - reuse: ModelPart = None, - save_checkpoint: str = None, - load_checkpoint: str = None, - initializers: InitializerSpecs = None, - embedding_initializer: Callable = None) -> None: - """Create a new instance of the deep sentence encoder. - - Arguments: - name: ModelPart name. - vocabulary: The input vocabulary. - data_id: The input sequence data ID. - embedding_size: The dimension of the embedding vectors in the input - sequence. - max_input_len: Maximum length of the input sequence (disregard - tokens after this position). - rnn_sizes: The list of dimensions of the RNN hidden state vectors - in respective layers. - rnn_cell: One of "GRU", "NematusGRU", "LSTM". Which kind of memory - cell to use. - rnn_directions: The list of rnn directions in the respective - layers. Should be equally long as `rnn_sizes`. Each item must - be one of "forward", "backward", "bidirectional". Determines in - what order to process the input sequence. Note that choosing - "bidirectional" will double the resulting vector dimension as - well as the number of the parameters in the given layer. - add_residual: Add residual connections to each RNN layer output. - dropout_keep_prob: 1 - dropout probability. - save_checkpoint: ModelPart save checkpoint file. - load_checkpoint: ModelPart load checkpoint file. - """ - check_argument_types() - - if len(rnn_sizes) != len(rnn_directions): - raise ValueError("Different number of rnn sizes and directions.") - - self.rnn_sizes = rnn_sizes - self.rnn_directions = rnn_directions - self.rnn_cell = rnn_cell - - SentenceEncoder.__init__( - self, - name=name, - vocabulary=vocabulary, - data_id=data_id, - embedding_size=embedding_size, - rnn_size=rnn_sizes[-1], - rnn_direction=rnn_directions[-1], - rnn_cell=rnn_cell, - add_residual=add_residual, - max_input_len=max_input_len, - dropout_keep_prob=dropout_keep_prob, - reuse=reuse, - save_checkpoint=save_checkpoint, - load_checkpoint=load_checkpoint, - initializers=initializers, - embedding_initializer=embedding_initializer) - - @tensor - def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - """Run stacked RNN given sizes and directions. - - Inputs of the first RNN are the RNN inputs to the encoder. Outputs from - each layer are used as inputs to the next one. As a final state of the - stacked RNN, the final state of the final layer is used. 
- """ - rnn_input_local = self.rnn_input - - for level, (rnn_size, rnn_dir) in enumerate( - zip(self.rnn_sizes, self.rnn_directions)): - rnn_spec = _make_rnn_spec(rnn_size, rnn_dir, self.rnn_cell) - - with tf.variable_scope("layer_{}".format(level)): - outputs, state = rnn_layer( - rnn_input_local, self.input_sequence.lengths, - rnn_spec, self.add_residual) - - # pylint - redefinition from instancemethod to list - # pylint: disable=redefined-variable-type - rnn_input_local = outputs - # pylint: enable=redefined-variable-type - - return outputs, state From 229de60a089c7c8e252386fe931db77f842a22ee Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 15 Mar 2019 20:44:07 +0100 Subject: [PATCH 02/14] adjust tests and bugfix --- neuralmonkey/encoders/recurrent.py | 5 ++--- tests/labeler.ini | 16 ++++++++-------- tests/nematus.ini | 20 +++++++++++--------- tests/post-edit.ini | 3 +-- tests/str.ini | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 3b5c267a5..f633289bd 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -168,7 +168,7 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: layer_input = self.rnn_input # type: tf.Tensor - layer_final = None + layer_final = self.rnn_input[:, -1] for i, rnn_spec in enumerate(self.rnn_specs): with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), @@ -190,7 +190,6 @@ def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: out_dim = layer_output.get_shape()[-1] if self.add_residual and in_dim == out_dim: - assert layer_final is not None layer_input += layer_output layer_final += layer_final_output else: @@ -360,7 +359,7 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_layers=[(rnn_size, rnn_cell, rnn_direction)], + rnn_layers=[(rnn_size, rnn_direction, rnn_cell)], add_residual=add_residual, add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, diff --git a/tests/labeler.ini b/tests/labeler.ini index 768447eab..6fcc4e62a 100644 --- a/tests/labeler.ini +++ b/tests/labeler.ini @@ -44,18 +44,18 @@ path="tests/data/factored_decoder_vocab.tsv" class=vocabulary.from_wordlist path="tests/data/factored_tag_vocab.tsv" - -[encoder] -class=encoders.DeepSentenceEncoder -name="sentence_encoder" -rnn_sizes=[10,9,8] -rnn_directions=["forward", "backward", "bidirectional"] -rnn_cell="NematusGRU" +[encoder_input] +class=model.sequence.EmbeddedSequence embedding_size=7 -dropout_keep_prob=0.5 data_id="source" vocabulary= +[encoder] +class=encoders.RecurrentEncoder +rnn_layers=[(10, "forward", "LSTM"), (9, "backward", "NematusGRU"), (8, "bidirectional", "NematusGRU")] +dropout_keep_prob=0.5 +input_sequence= + [decoder] class=decoders.sequence_labeler.SequenceLabeler name="tagger" diff --git a/tests/nematus.ini b/tests/nematus.ini index cbb10809f..67ef04993 100644 --- a/tests/nematus.ini +++ b/tests/nematus.ini @@ -34,18 +34,20 @@ data=["tests/data/val.tc.en", "tests/data/val.tc.de"] class=vocabulary.from_wordlist path="tests/data/encoder_vocab.tsv" -[encoder] -class=encoders.DeepSentenceEncoder -name="sentence_encoder" -rnn_sizes=[14,7,14] -rnn_directions=["forward","bidirectional","backward"] -max_input_len=5 -embedding_size=14 -dropout_keep_prob=0.5 +[encoder_input] +class=model.sequence.EmbeddedSequence +embedding_size=7 +max_length=10 data_id="source" vocabulary= -rnn_cell="NematusGRU" + +[encoder] +class=encoders.RecurrentEncoder +rnn_layers=[(7, 
"forward", "LSTM"), (7, "backward", "NematusGRU"), (7, "bidirectional", "NematusGRU")] add_residual=True +add_layer_norm=True +dropout_keep_prob=0.5 +input_sequence= [attention] class=attention.Attention diff --git a/tests/post-edit.ini b/tests/post-edit.ini index 5c6b46548..76036d8ea 100644 --- a/tests/post-edit.ini +++ b/tests/post-edit.ini @@ -44,9 +44,8 @@ keys_encoder= [trans_encoder] class=encoders.recurrent.RecurrentEncoder input_sequence= -rnn_size=15 +rnn_layers=[(15, "bidirectional", "LSTM")] dropout_keep_prob=0.8 -rnn_cell="LSTM" name="trans_encoder" [trans_attention] diff --git a/tests/str.ini b/tests/str.ini index 7d8f857d4..09530e692 100644 --- a/tests/str.ini +++ b/tests/str.ini @@ -68,7 +68,7 @@ cnn= class=encoders.RecurrentEncoder name="encoder" input_sequence= -rnn_size=256 +rnn_layers=[(256)] [attention] class=attention.Attention From 96f99efa14433ae5e6276527066a20c85deace6f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 18 Mar 2019 11:45:25 +0100 Subject: [PATCH 03/14] pylint fix --- neuralmonkey/encoders/recurrent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index f633289bd..8a7d4e9aa 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -168,7 +168,9 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: layer_input = self.rnn_input # type: tf.Tensor + # pylint: disable=unsubscriptable-object layer_final = self.rnn_input[:, -1] + # pylint: enable=unsubscriptable-object for i, rnn_spec in enumerate(self.rnn_specs): with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), From 68b4fb2e80f01f5c71fbc6cf811dba6166f27b95 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 18 Mar 2019 11:45:43 +0100 Subject: [PATCH 04/14] get rid of raw rnn encoder --- neuralmonkey/encoders/__init__.py | 1 - .../encoders/numpy_stateful_filler.py | 79 +++++++- neuralmonkey/encoders/raw_rnn_encoder.py | 183 ------------------ tests/audio-classifier.ini | 10 +- tests/ctc.ini | 9 +- 5 files changed, 90 insertions(+), 192 deletions(-) delete mode 100644 neuralmonkey/encoders/raw_rnn_encoder.py diff --git a/neuralmonkey/encoders/__init__.py b/neuralmonkey/encoders/__init__.py index 2f58debb5..124ffa81d 100644 --- a/neuralmonkey/encoders/__init__.py +++ b/neuralmonkey/encoders/__init__.py @@ -1,6 +1,5 @@ from .cnn_encoder import CNNEncoder from .cnn_encoder import CNNTemporalView -from .raw_rnn_encoder import RawRNNEncoder from .recurrent import FactoredEncoder from .recurrent import RecurrentEncoder from .recurrent import SentenceEncoder diff --git a/neuralmonkey/encoders/numpy_stateful_filler.py b/neuralmonkey/encoders/numpy_stateful_filler.py index e7abd4841..c77b40e03 100644 --- a/neuralmonkey/encoders/numpy_stateful_filler.py +++ b/neuralmonkey/encoders/numpy_stateful_filler.py @@ -1,6 +1,7 @@ # TODO untested module from typing import Dict, List +import numpy as np import tensorflow as tf from typeguard import check_argument_types @@ -9,7 +10,8 @@ from neuralmonkey.model.feedable import FeedDict from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.model.stateful import Stateful, SpatialStatefulWithOutput +from neuralmonkey.model.stateful import ( + Stateful, SpatialStatefulWithOutput, TemporalStateful) # pylint: disable=too-few-public-methods @@ -77,6 +79,81 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> 
FeedDict: return fd +class TemporalFiller(ModelPart, TemporalStateful): + """Placeholder class for 2D numerical input. + + This model part is used to feed 2D tensors (e.g., audio input). + """ + + # pylint: disable=too-many-arguments + def __init__(self, + name: str, + data_id: str, + input_size: int, + max_input_len: int = None, + dropout_keep_prob: float = 1.0, + reuse: ModelPart = None, + save_checkpoint: str = None, + load_checkpoint: str = None, + initializers: InitializerSpecs = None) -> None: + check_argument_types() + ModelPart.__init__( + self, name, reuse, save_checkpoint, load_checkpoint, initializers) + + self.data_id = data_id + self.input_size = input_size + self.max_input_len = max_input_len + self.dropout_keep_prob = dropout_keep_prob + # pylint: enable=too-many-arguments + + @property + def input_types(self) -> Dict[str, tf.DType]: + return {self.data_id: tf.float32} + + @property + def input_shapes(self) -> Dict[str, tf.TensorShape]: + return {self.data_id: tf.TensorShape([None, None, self.input_size])} + + @tensor + def temporal_states(self) -> tf.Tensor: + return self.dataset[self.data_id] + + # pylint: disable=no-self-use + @tensor + def _input_lengths(self) -> tf.Tensor: + return tf.placeholder(tf.int32, [None], "encoder_padding_lengths") + # pylint: enable=no-self-use + + @tensor + def temporal_mask(self) -> tf.Tensor: + return tf.sequence_mask(self._input_lengths, dtype=tf.float32) + + def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: + fd = ModelPart.feed_dict(self, dataset, train) + + series = list(dataset.get_series(self.data_id)) + lengths = [] + inputs = [] + + max_len = max(x.shape[0] for x in series) + if self.max_input_len is not None: + max_len = min(self.max_input_len, max_len) + + for x in series: + length = min(max_len, x.shape[0]) + x_padded = np.zeros(shape=(max_len,) + x.shape[1:], + dtype=x.dtype) + x_padded[:length] = x[:length] + + lengths.append(length) + inputs.append(x_padded) + + fd[self.temporal_states] = inputs + fd[self._input_lengths] = lengths + + return fd + + class SpatialFiller(ModelPart, SpatialStatefulWithOutput): """Placeholder class for 3D numerical input. 
diff --git a/neuralmonkey/encoders/raw_rnn_encoder.py b/neuralmonkey/encoders/raw_rnn_encoder.py deleted file mode 100644 index 9f45b1571..000000000 --- a/neuralmonkey/encoders/raw_rnn_encoder.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import List, Dict, Tuple - -import numpy as np -import tensorflow as tf -from typeguard import check_argument_types - -from neuralmonkey.dataset import Dataset -from neuralmonkey.decorators import tensor -# pylint: disable=protected-access -from neuralmonkey.encoders.recurrent import ( - RNNSpecTuple, _make_rnn_spec, _make_rnn_cell) -# pylint: enable=protected-access -from neuralmonkey.model.feedable import FeedDict -from neuralmonkey.model.parameterized import InitializerSpecs -from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.model.stateful import TemporalStatefulWithOutput -from neuralmonkey.nn.utils import dropout - - -# pylint: disable=too-many-instance-attributes -class RawRNNEncoder(ModelPart, TemporalStatefulWithOutput): - """A raw RNN encoder that gets input as a tensor.""" - - # pylint: disable=too-many-arguments,too-many-locals - def __init__(self, - name: str, - data_id: str, - input_size: int, - rnn_layers: List[RNNSpecTuple], - max_input_len: int = None, - dropout_keep_prob: float = 1.0, - reuse: ModelPart = None, - save_checkpoint: str = None, - load_checkpoint: str = None, - initializers: InitializerSpecs = None) -> None: - """Create a new instance of the encoder. - - Arguments: - data_id: Identifier of the data series fed to this encoder - name: An unique identifier for this encoder - rnn_layers: A list of tuples specifying the size and, optionally, - the direction ('forward', 'backward' or 'bidirectional') - and cell type ('GRU' or 'LSTM') of each RNN layer. - dropout_keep_prob: The dropout keep probability - (default 1.0) - """ - check_argument_types() - ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint, - initializers) - - self.data_id = data_id - - self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers] - self.max_input_len = max_input_len - self.input_size = input_size - self.dropout_keep_prob = dropout_keep_prob - - @property - def input_types(self) -> Dict[str, tf.DType]: - return {self.data_id: tf.float32} - - @property - def input_shapes(self) -> Dict[str, tf.TensorShape]: - return {self.data_id: tf.TensorShape([None, None, self.input_size])} - - @tensor - def inputs(self) -> tf.Tensor: - return self.dataset[self.data_id] - - # pylint: disable=no-self-use - @tensor - def _input_lengths(self) -> tf.Tensor: - return tf.placeholder(tf.int32, [None], "encoder_padding_lengths") - # pylint: enable=no-self-use - - @tensor - def states_mask(self) -> tf.Tensor: - return tf.sequence_mask(self._input_lengths, dtype=tf.float32) - - @tensor - def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - states = self.inputs - states_reversed = False - - def reverse_states(): - nonlocal states, states_reversed - states = tf.reverse_sequence( - states, self._input_lengths, batch_axis=0, seq_axis=1) - states_reversed = not states_reversed - - for i, layer in enumerate(self._rnn_layers): - with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)): - if layer.direction == "bidirectional": - fw_cell = _make_rnn_cell(layer) - bw_cell = _make_rnn_cell(layer) - outputs_tup, encoded_tup = ( - tf.nn.bidirectional_dynamic_rnn( - fw_cell, bw_cell, states, self._input_lengths, - dtype=tf.float32)) - - if states_reversed: - # treat forward as backward and vice versa - outputs_tup = tuple(reversed(outputs_tup)) - 
encoded_tup = tuple(reversed(encoded_tup)) - states_reversed = False - - states = tf.concat(outputs_tup, 2) - encoded = tf.concat(encoded_tup, 1) - elif layer.direction in ["forward", "backward"]: - should_be_reversed = (layer.direction == "backward") - if states_reversed != should_be_reversed: - reverse_states() - - cell = _make_rnn_cell(layer) - states, encoded = tf.nn.dynamic_rnn( - cell, states, - sequence_length=self._input_lengths, - dtype=tf.float32) - else: - raise ValueError( - "Unknown RNN direction {}".format(layer.direction)) - - if i < len(self._rnn_layers) - 1: - states = dropout(states, self.dropout_keep_prob, - self.train_mode) - - if states_reversed: - reverse_states() - - return states, encoded - - # pylint: disable=unsubscriptable-object - @tensor - def hidden_states(self) -> tf.Tensor: - return self.rnn[0] - - @tensor - def encoded(self) -> tf.Tensor: - return self.rnn[1] - # pylint: enable=unsubscriptable-object - - @property - def output(self) -> tf.Tensor: - return self.encoded - - @property - def temporal_states(self) -> tf.Tensor: - return self.hidden_states - - @property - def temporal_mask(self) -> tf.Tensor: - return self.states_mask - - def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: - """Populate the feed dictionary with the encoder inputs. - - Arguments: - dataset: The dataset to use - train: Boolean flag telling whether it is training time - """ - fd = ModelPart.feed_dict(self, dataset, train) - - series = list(dataset.get_series(self.data_id)) - lengths = [] - inputs = [] - - max_len = max(x.shape[0] for x in series) - if self.max_input_len is not None: - max_len = min(self.max_input_len, max_len) - - for x in series: - length = min(max_len, x.shape[0]) - x_padded = np.zeros(shape=(max_len,) + x.shape[1:], - dtype=x.dtype) - x_padded[:length] = x[:length] - - lengths.append(length) - inputs.append(x_padded) - - fd[self.inputs] = inputs - fd[self._input_lengths] = lengths - - return fd diff --git a/tests/audio-classifier.ini b/tests/audio-classifier.ini index 255078174..455e7331b 100644 --- a/tests/audio-classifier.ini +++ b/tests/audio-classifier.ini @@ -50,12 +50,14 @@ path="tests/data/dtmf/labels.vocab" contains_header=False contains_frequencies=False - -[encoder] -class=encoders.raw_rnn_encoder.RawRNNEncoder -name="encoder" +[input_seq] +class=encoders.numpy_stateful_filler.TemporalFiller data_id="features" input_size=26 + +[encoder] +class=encoders.RecurrentEncoder +input_sequence= rnn_layers=[(7)] dropout_keep_prob=0.5 diff --git a/tests/ctc.ini b/tests/ctc.ini index 34d6f689d..2a8635ddc 100644 --- a/tests/ctc.ini +++ b/tests/ctc.ini @@ -52,11 +52,14 @@ path="tests/data/yesno/yesno.vocab" contains_header=False contains_frequencies=False -[audio_encoder] -class=encoders.raw_rnn_encoder.RawRNNEncoder -name="audio_encoder" +[input_seq] +class=encoders.numpy_stateful_filler.TemporalFiller data_id="source" input_size=39 + +[audio_encoder] +class=encoders.RecurrentEncoder +input_sequence= rnn_layers=[(50,"bidirectional"),(100,"forward"),(100,"backward")] dropout_keep_prob=0.5 From 2088ca2b92b79c274a3a5142feac0c1c65d1d29e Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Wed, 3 Apr 2019 16:19:02 +0200 Subject: [PATCH 05/14] address review --- neuralmonkey/encoders/recurrent.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 8a7d4e9aa..b544f10d5 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ 
-119,6 +119,7 @@ def __init__,
                  rnn_layers: List[RNNSpecTuple],
                  add_residual: bool = False,
                  add_layer_norm: bool = False,
+                 include_final_layer_norm: bool = True,
                  dropout_keep_prob: float = 1.0,
                  reuse: ModelPart = None,
                  save_checkpoint: str = None,
@@ -138,6 +139,7 @@ def __init__(self,
                 well as the number of encoder parameters.
             add_residual: Add residual connections to the RNN layer output.
             add_layer_norm: Add layer normalization after each RNN layer.
+            include_final_layer_norm: Also normalize the network output.
             dropout_keep_prob: 1 - dropout probability.
             save_checkpoint: ModelPart save checkpoint file.
             load_checkpoint: ModelPart load checkpoint file.
@@ -152,10 +154,19 @@ def __init__(self,
         self.rnn_specs = [_make_rnn_spec(*r) for r in rnn_layers]
         self.add_residual = add_residual
         self.add_layer_norm = add_layer_norm
+        self.include_final_layer_norm = include_final_layer_norm

         if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
             raise ValueError("Dropout keep prob must be inside (0,1].")

+        layer_sizes = [
+            2 * layer.size if layer.direction == "bidirectional"
+            else layer.size for layer in self.rnn_specs]
+        if add_residual and len(set(layer_sizes)) > 1:
+            raise ValueError(
+                "When using residual connections, all layers must have "
+                "the same size, but their sizes are {}.".format(layer_sizes))
+
         self._variable_scope.set_initializer(
             tf.random_normal_initializer(stddev=0.001))
         # pylint: enable=too-many-arguments
@@ -201,6 +212,8 @@ def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]:
                 # pylint: enable=redefined-variable-type

         assert layer_final is not None
+        if self.include_final_layer_norm:
+            return layer_norm(layer_input), layer_norm(layer_final)
         return layer_input, layer_final

     @tensor

From 98accc60a00ed61c7d199e7860c386470e873959 Mon Sep 17 00:00:00 2001
From: Jindra Helcl
Date: Fri, 3 May 2019 15:23:26 +0200
Subject: [PATCH 06/14] fix mypy for new version

---
 neuralmonkey/runners/beamsearch_runner.py | 2 +-
 neuralmonkey/runners/ctc_debug_runner.py | 2 +-
 neuralmonkey/runners/label_runner.py | 2 +-
 neuralmonkey/runners/logits_runner.py | 2 +-
 neuralmonkey/runners/plain_runner.py | 2 +-
 neuralmonkey/runners/regression_runner.py | 2 +-
 neuralmonkey/runners/runner.py | 3 +--
 neuralmonkey/runners/tensor_runner.py | 3 +--
 neuralmonkey/runners/word_alignment_runner.py | 2 +-
 neuralmonkey/runners/xent_runner.py | 3 +--
 neuralmonkey/trainers/objective.py | 2 +-
 neuralmonkey/trainers/rl_trainer.py | 2 +-
 neuralmonkey/trainers/self_critical_objective.py | 2 +-
 neuralmonkey/trainers/test_multitask_trainer.py | 3 ---
 14 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/neuralmonkey/runners/beamsearch_runner.py b/neuralmonkey/runners/beamsearch_runner.py
index 14a6fe486..2fd313f21 100644
--- a/neuralmonkey/runners/beamsearch_runner.py
+++ b/neuralmonkey/runners/beamsearch_runner.py
@@ -132,7 +132,7 @@ def __init__(self,
             postprocess: The postprocessor to apply to the output data.
""" check_argument_types() - BaseRunner[BeamSearchDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) if rank < 1 or rank > decoder.beam_size: raise ValueError( diff --git a/neuralmonkey/runners/ctc_debug_runner.py b/neuralmonkey/runners/ctc_debug_runner.py index 1e393619e..ca62209d8 100644 --- a/neuralmonkey/runners/ctc_debug_runner.py +++ b/neuralmonkey/runners/ctc_debug_runner.py @@ -43,7 +43,7 @@ def __init__(self, output_series: str, decoder: CTCDecoder) -> None: check_argument_types() - BaseRunner[CTCDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) @tensor def fetches(self) -> Dict[str, tf.Tensor]: diff --git a/neuralmonkey/runners/label_runner.py b/neuralmonkey/runners/label_runner.py index 2a286be26..4e7297e5a 100644 --- a/neuralmonkey/runners/label_runner.py +++ b/neuralmonkey/runners/label_runner.py @@ -53,7 +53,7 @@ def __init__(self, decoder: SequenceLabeler, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SequenceLabeler].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/logits_runner.py b/neuralmonkey/runners/logits_runner.py index 9868fa3a0..86dd82b10 100644 --- a/neuralmonkey/runners/logits_runner.py +++ b/neuralmonkey/runners/logits_runner.py @@ -74,7 +74,7 @@ def __init__(self, vocabulary whose logit or probability should be on output. """ check_argument_types() - BaseRunner[Classifier].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) if pick_index is not None and pick_value is not None: raise ValueError("Either a pick index or a vocabulary value can " diff --git a/neuralmonkey/runners/plain_runner.py b/neuralmonkey/runners/plain_runner.py index e2ef389d0..e90ffe0b4 100644 --- a/neuralmonkey/runners/plain_runner.py +++ b/neuralmonkey/runners/plain_runner.py @@ -49,7 +49,7 @@ def __init__(self, decoder: SupportedDecoder, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SupportedDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/regression_runner.py b/neuralmonkey/runners/regression_runner.py index 788f50b34..1cd91306b 100644 --- a/neuralmonkey/runners/regression_runner.py +++ b/neuralmonkey/runners/regression_runner.py @@ -43,7 +43,7 @@ def __init__(self, decoder: SequenceRegressor, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SequenceRegressor].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/runner.py b/neuralmonkey/runners/runner.py index dc3982fe4..1e75428ff 100644 --- a/neuralmonkey/runners/runner.py +++ b/neuralmonkey/runners/runner.py @@ -67,8 +67,7 @@ def __init__(self, decoder: SupportedDecoder, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[AutoregressiveDecoder].__init__( - self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess self.vocabulary = self.decoder.vocabulary diff --git a/neuralmonkey/runners/tensor_runner.py b/neuralmonkey/runners/tensor_runner.py index b5404e83d..b5f6b9255 100644 --- a/neuralmonkey/runners/tensor_runner.py +++ b/neuralmonkey/runners/tensor_runner.py @@ -110,8 +110,7 @@ def __init__(self, if not modelparts: raise ValueError("At 
least one model part is expected") - BaseRunner[GenericModelPart].__init__( - self, output_series, modelparts[0]) + super().__init__(output_series, modelparts[0]) if len(modelparts) != len(tensors): raise ValueError("TensorRunner: 'modelparts' and 'tensors' lists " diff --git a/neuralmonkey/runners/word_alignment_runner.py b/neuralmonkey/runners/word_alignment_runner.py index 7a157b412..9bbc0d5aa 100644 --- a/neuralmonkey/runners/word_alignment_runner.py +++ b/neuralmonkey/runners/word_alignment_runner.py @@ -24,7 +24,7 @@ def __init__(self, attention: BaseAttention, decoder: Decoder) -> None: check_argument_types() - BaseRunner[BaseAttention].__init__(self, output_series, attention) + super().__init__(output_series, attention) self._key = "{}_run".format(decoder.name) diff --git a/neuralmonkey/runners/xent_runner.py b/neuralmonkey/runners/xent_runner.py index 4086a2b8a..163f1c2dc 100644 --- a/neuralmonkey/runners/xent_runner.py +++ b/neuralmonkey/runners/xent_runner.py @@ -28,8 +28,7 @@ def __init__(self, output_series: str, decoder: SupportedDecoders) -> None: check_argument_types() - BaseRunner[SupportedDecoders].__init__( - self, output_series, decoder) + super().__init__(output_series, decoder) @tensor def fetches(self) -> Dict[str, tf.Tensor]: diff --git a/neuralmonkey/trainers/objective.py b/neuralmonkey/trainers/objective.py index ad49c02a7..8fb2a21d0 100644 --- a/neuralmonkey/trainers/objective.py +++ b/neuralmonkey/trainers/objective.py @@ -84,7 +84,7 @@ def __init__(self, decoder: GenericModelPart, name = "{} - cost".format(str(decoder)) - Objective[GenericModelPart].__init__(self, name, decoder) + super().__init__(name, decoder) self._weight = weight @tensor diff --git a/neuralmonkey/trainers/rl_trainer.py b/neuralmonkey/trainers/rl_trainer.py index 98436c7f6..304165271 100644 --- a/neuralmonkey/trainers/rl_trainer.py +++ b/neuralmonkey/trainers/rl_trainer.py @@ -64,7 +64,7 @@ def __init__(self, """ check_argument_types() name = "{}_rl".format(decoder.name) - Objective[Decoder].__init__(self, name, decoder) + super().__init__(name, decoder) self.reward_function = reward_function self.subtract_baseline = subtract_baseline diff --git a/neuralmonkey/trainers/self_critical_objective.py b/neuralmonkey/trainers/self_critical_objective.py index 1e703ca27..161866c3a 100644 --- a/neuralmonkey/trainers/self_critical_objective.py +++ b/neuralmonkey/trainers/self_critical_objective.py @@ -41,7 +41,7 @@ def __init__(self, decoder: Decoder, reward_function: RewardFunction, """ check_argument_types() name = "{}_self_critical".format(decoder.name) - Objective[Decoder].__init__(self, name, decoder) + super().__init__(name, decoder) self.reward_function = reward_function self._weight = weight diff --git a/neuralmonkey/trainers/test_multitask_trainer.py b/neuralmonkey/trainers/test_multitask_trainer.py index 2fa6194e5..84c2e9344 100644 --- a/neuralmonkey/trainers/test_multitask_trainer.py +++ b/neuralmonkey/trainers/test_multitask_trainer.py @@ -27,9 +27,6 @@ def loss(self) -> tf.Tensor: # pylint: disable=too-few-public-methods class DummyObjective(Objective[TestMP]): - def __init__(self, name: str, decoder: TestMP) -> None: - Objective[TestMP].__init__(self, name, decoder) - @tensor def loss(self) -> tf.Tensor: return self.decoder.loss From 0542e006d3111e40b9802b1c4812f7e122d4003c Mon Sep 17 00:00:00 2001 From: varisd Date: Tue, 23 Apr 2019 16:17:19 +0200 Subject: [PATCH 07/14] Fixed Dataset.subset when buffer_size is None --- neuralmonkey/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 
1 deletion(-) diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py index ae963d75d..cfebdd48d 100644 --- a/neuralmonkey/dataset.py +++ b/neuralmonkey/dataset.py @@ -602,6 +602,12 @@ def subset(self, start: int, length: int) -> "Dataset": start, start + length) for s_id in self.iterators} + # Workaround since self.buffer_size and self.buffer_min_size + # may not be initialized + buffer_size = None + if hasattr(self, "buffer_min_size") and hasattr(self, "buffer_size"): + buffer_size = (self.buffer_min_size, self.buffer_size) + # Here, the type: ignore is because of the tied argument to the lambda # function above, which made it Callable[[Any], ...] instead of just # Callable[[], ...]. @@ -610,5 +616,5 @@ def subset(self, start: int, length: int) -> "Dataset": iterators=slices, batching=self.batching, outputs=outputs, - buffer_size=self.buffer_size, + buffer_size=buffer_size, shuffled=self.shuffled) From 109f9b699522246b7f9f84f282a688a0d5409a95 Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 2 May 2019 15:48:29 +0200 Subject: [PATCH 08/14] froze the required versions of mypy and sacrebleu --- requirements-gpu.txt | 2 +- requirements.txt | 2 +- tests/mypy_requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index ebc6da3ab..6b927aa97 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -9,5 +9,5 @@ python_speech_features pygments rouge==0.2.1 typeguard -sacrebleu +sacrebleu==1.3.1 tensorflow-gpu>=1.12.0,<1.13 diff --git a/requirements.txt b/requirements.txt index 27341cc05..60332e25f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ python_speech_features pygments rouge==0.2.1 typeguard -sacrebleu +sacrebleu==1.3.1 tensorflow>=1.12.0,<1.13 diff --git a/tests/mypy_requirements.txt b/tests/mypy_requirements.txt index f0aa93ac8..87d25c2ea 100644 --- a/tests/mypy_requirements.txt +++ b/tests/mypy_requirements.txt @@ -1 +1 @@ -mypy +mypy==0.660 From 02071ec3b65b72d78df3c40d5cb3a99b6cd21942 Mon Sep 17 00:00:00 2001 From: varisd Date: Wed, 24 Apr 2019 14:14:15 +0200 Subject: [PATCH 09/14] Fixed Dataset.subset when buffer_size is None --- neuralmonkey/dataset.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py index cfebdd48d..5aac44130 100644 --- a/neuralmonkey/dataset.py +++ b/neuralmonkey/dataset.py @@ -602,12 +602,6 @@ def subset(self, start: int, length: int) -> "Dataset": start, start + length) for s_id in self.iterators} - # Workaround since self.buffer_size and self.buffer_min_size - # may not be initialized - buffer_size = None - if hasattr(self, "buffer_min_size") and hasattr(self, "buffer_size"): - buffer_size = (self.buffer_min_size, self.buffer_size) - # Here, the type: ignore is because of the tied argument to the lambda # function above, which made it Callable[[Any], ...] instead of just # Callable[[], ...]. 
@@ -616,5 +610,6 @@ def subset(self, start: int, length: int) -> "Dataset":
             iterators=slices,
             batching=self.batching,
             outputs=outputs,
-            buffer_size=buffer_size,
+            buffer_size=((self.buffer_min_size, self.buffer_size)
+                         if self.lazy else None),
             shuffled=self.shuffled)

From dbe69e0fd9ccb95ff620a1e2dc31ddd99b66d5ef Mon Sep 17 00:00:00 2001
From: varisd
Date: Thu, 14 Mar 2019 17:42:09 +0100
Subject: [PATCH 10/14] created inheritable decoders.Attentive class providing
 interface for encoder attention

---
 neuralmonkey/decoders/attentive.py | 167 +++++++++++++++++++
 neuralmonkey/decoders/autoregressive.py | 5 +-
 neuralmonkey/decoders/beam_search_decoder.py | 22 ++-
 neuralmonkey/decoders/transformer.py | 117 +++----------
 4 files changed, 201 insertions(+), 110 deletions(-)
 create mode 100644 neuralmonkey/decoders/attentive.py

diff --git a/neuralmonkey/decoders/attentive.py b/neuralmonkey/decoders/attentive.py
new file mode 100644
index 000000000..a1dcbe0e0
--- /dev/null
+++ b/neuralmonkey/decoders/attentive.py
@@ -0,0 +1,167 @@
+"""TODO."""
+from typing import Callable, List, Union
+
+import tensorflow as tf
+
+from neuralmonkey.attention.base_attention import (
+    Attendable, get_attention_states, get_attention_mask)
+from neuralmonkey.attention.transformer_cross_layer import (
+    serial, parallel, flat, hierarchical)
+from neuralmonkey.logging import warn
+from neuralmonkey.model.model_part import ModelPart
+from neuralmonkey.model.parameterized import InitializerSpecs
+from neuralmonkey.nn.utils import dropout
+
+STRATEGIES = ["serial", "parallel", "flat", "hierarchical"]
+
+
+# We inherit from ModelPart to access self.train_mode, potentially creating
+# a diamond inheritance pattern in the derived class. However, this should
+# be fine since we do not override any of the class methods/attributes.
+# pylint: disable=too-few-public-methods
+class Attentive(ModelPart):
+
+    # pylint: disable=too-many-arguments
+    def __init__(self,
+                 name: str,
+                 encoders: List[Attendable],
+                 n_heads_enc: Union[List[int], int],
+                 n_heads_hier: int = None,
+                 attention_combination_strategy: str = "serial",
+                 dropout_keep_prob: float = 1.0,
+                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
+                 use_att_transform_bias: bool = False,
+                 reuse: ModelPart = None,
+                 save_checkpoint: str = None,
+                 load_checkpoint: str = None,
+                 initializers: InitializerSpecs = None) -> None:
+        """Initialize the common parameters.
+
+        Provides methods and attributes necessary for computing attention
+        across the input encoders.
+
+        Arguments:
+            name: Name of the decoder. Should be unique across all Neural
+                Monkey objects.
+            encoders: Input encoders for the decoder to attend to.
+            n_heads_enc: Number of the attention heads over each encoder.
+                Either a list of the same length as ``encoders``, or a
+                single integer. In the latter case, the number of heads is
+                equal for all encoders.
+            n_heads_hier: Number of the attention heads for the second
+                attention in the ``hierarchical`` attention combination.
+            attention_combination_strategy: One of ``serial``, ``parallel``,
+                ``flat``, ``hierarchical``. Controls the attention combination
+                strategy for enc-dec attention.
+            dropout_keep_prob: Probability of keeping a value during dropout.
+            attention_dropout_keep_prob: Probability of keeping a value
+                during dropout on the attention output.
+            use_att_transform_bias: Add bias to the feed-forward layers in
+                the attention.
+
+        TODO:
+            Generalize the attention.
+ """ + ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint, + initializers) + + self.encoders = encoders + self.n_heads_hier = n_heads_hier + self.attention_combination_strategy = attention_combination_strategy + self.dropout_keep_prob = dropout_keep_prob + self.use_att_transform_bias = use_att_transform_bias + + if isinstance(n_heads_enc, int): + if attention_combination_strategy == "flat": + self.n_heads_enc = [n_heads_enc] + else: + self.n_heads_enc = [n_heads_enc for _ in self.encoders] + else: + self.n_heads_enc = n_heads_enc + + if isinstance(attention_dropout_keep_prob, float): + self.attention_dropout_keep_prob = [ + attention_dropout_keep_prob for _ in encoders] + else: + self.attention_dropout_keep_prob = attention_dropout_keep_prob + + self.encoder_states = lambda: [get_attention_states(e) + for e in self.encoders] + self.encoder_masks = lambda: [get_attention_mask(e) + for e in self.encoders] + + if self.attention_combination_strategy not in STRATEGIES: + raise ValueError( + "Unknown attention combination strategy '{}'. " + "Allowed: {}.".format(self.attention_combination_strategy, + ", ".join(STRATEGIES))) + + if (self.attention_combination_strategy == "hierarchical" + and self.n_heads_hier is None): + raise ValueError( + "You must provide n_heads_hier when using the hierarchical " + "attention combination strategy.") + + if (self.attention_combination_strategy != "hierarchical" + and self.n_heads_hier is not None): + warn("Ignoring n_heads_hier parameter -- use the hierarchical " + "attention combination strategy instead.") + + if (self.attention_combination_strategy == "flat" + and len(self.n_heads_enc) != 1): + raise ValueError( + "For the flat attention combination strategy, only a single " + "value is permitted in n_heads_enc.") + + if any((val < 0.0 or val > 1.0) + for val in self.attention_dropout_keep_prob): + raise ValueError( + "Attention dropout keep probabilities must be " + "a real number in the interval [0,1].") + # pylint: enable=too-many-arguments + + def encoder_attention(self, queries: tf.Tensor) -> tf.Tensor: + """Compute attention context vectors over encoders using queries.""" + enc_states = self.encoder_states() + enc_masks = self.encoder_masks() + assert enc_states is not None + assert enc_masks is not None + + # Attention dropout callbacks are created in a loop so we need to + # use a factory function to prevent late binding. 
+ def make_dropout_callback( + prob: float) -> Callable[[tf.Tensor], tf.Tensor]: + def callback(x: tf.Tensor) -> tf.Tensor: + return dropout(x, prob, self.train_mode) + return callback + + dropout_cb = make_dropout_callback(self.dropout_keep_prob) + attn_dropout_cbs = [make_dropout_callback(prob) + for prob in self.attention_dropout_keep_prob] + + if self.attention_combination_strategy == "serial": + return serial(queries, enc_states, enc_masks, self.n_heads_enc, + attn_dropout_cbs, dropout_cb) + + if self.attention_combination_strategy == "parallel": + return parallel(queries, enc_states, enc_masks, self.n_heads_enc, + attn_dropout_cbs, dropout_cb) + + if self.attention_combination_strategy == "flat": + assert len(set(self.n_heads_enc)) == 1 + assert len(set(self.attention_dropout_keep_prob)) == 1 + + return flat(queries, enc_states, enc_masks, self.n_heads_enc[0], + attn_dropout_cbs[0], dropout_cb) + + if self.attention_combination_strategy == "hierarchical": + assert self.n_heads_hier is not None + + return hierarchical( + queries, enc_states, enc_masks, self.n_heads_enc, + self.n_heads_hier, attn_dropout_cbs, dropout_cb) + + # TODO: remove this - this is already checked in the constructor + raise NotImplementedError( + "Unknown attention combination strategy: {}" + .format(self.attention_combination_strategy)) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 393e71695..1c121afad 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -6,7 +6,7 @@ The autoregressive decoder uses the while loop to get the outputs. Descendants should only specify the initial state and the while loop body. """ -from typing import NamedTuple, Callable, Optional, Any, List, Dict, Tuple +from typing import NamedTuple, Callable, Optional, Any, Dict import tensorflow as tf @@ -163,9 +163,6 @@ def __init__(self, self.tie_embeddings = tie_embeddings self.supress_unk = supress_unk - self.encoder_states = lambda: [] # type: Callable[[], List[tf.Tensor]] - self.encoder_masks = lambda: [] # type: Callable[[], List[tf.Tensor]] - # Check the values of the parameters (max_output_len, ...) if self.max_output_len <= 0: raise ValueError( diff --git a/neuralmonkey/decoders/beam_search_decoder.py b/neuralmonkey/decoders/beam_search_decoder.py index afe84ddfc..f4b6318a7 100644 --- a/neuralmonkey/decoders/beam_search_decoder.py +++ b/neuralmonkey/decoders/beam_search_decoder.py @@ -31,6 +31,7 @@ from neuralmonkey.decoders.autoregressive import ( AutoregressiveDecoder, LoopState) +from neuralmonkey.decoders.attentive import Attentive from neuralmonkey.decorators import tensor from neuralmonkey.model.model_part import ModelPart from neuralmonkey.tf_utils import ( @@ -171,22 +172,25 @@ def outputs(self) -> tf.Tensor: # the graph, replace them with beam-size-times copied originals, create # the beam search graph, and then replace the inner states back. 
- enc_states = self.parent_decoder.encoder_states - enc_masks = self.parent_decoder.encoder_masks + if isinstance(self, Attentive): + enc_states = self.parent_decoder.encoder_states + enc_masks = self.parent_decoder.encoder_masks - setattr(self.parent_decoder, "encoder_states", - lambda: [self.expand_to_beam(sts) for sts in enc_states()]) - setattr(self.parent_decoder, "encoder_masks", - lambda: [self.expand_to_beam(mask) for mask in enc_masks()]) + setattr(self.parent_decoder, "encoder_states", + lambda: [self.expand_to_beam(sts) for sts in enc_states()]) + setattr(self.parent_decoder, "encoder_masks", + lambda: [self.expand_to_beam(mask) + for mask in enc_masks()]) # Create the beam search symbolic graph. with self.use_scope(): self._initial_loop_state = self.get_initial_loop_state() outputs = self.decoding_loop() - # Reassign the original encoder states and mask back - setattr(self.parent_decoder, "encoder_states", enc_states) - setattr(self.parent_decoder, "encoder_masks", enc_masks) + if isinstance(self, Attentive): + # Reassign the original encoder states and mask back + setattr(self.parent_decoder, "encoder_states", enc_states) + setattr(self.parent_decoder, "encoder_masks", enc_masks) return outputs diff --git a/neuralmonkey/decoders/transformer.py b/neuralmonkey/decoders/transformer.py index d0a74bf9a..ed9317ebf 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -11,16 +11,13 @@ from typeguard import check_argument_types from neuralmonkey.attention.scaled_dot_product import attention -from neuralmonkey.attention.base_attention import ( - Attendable, get_attention_states, get_attention_mask) -from neuralmonkey.attention.transformer_cross_layer import ( - serial, parallel, flat, hierarchical) +from neuralmonkey.attention.base_attention import Attendable from neuralmonkey.decorators import tensor from neuralmonkey.decoders.autoregressive import ( AutoregressiveDecoder, LoopState, DecoderFeedables, DecoderHistories) +from neuralmonkey.decoders.attentive import Attentive from neuralmonkey.encoders.transformer import ( TransformerLayer, position_signal) -from neuralmonkey.logging import warn from neuralmonkey.model.sequence import EmbeddedSequence from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart @@ -30,9 +27,6 @@ from neuralmonkey.tf_utils import append_tensor, layer_norm -STRATEGIES = ["serial", "parallel", "flat", "hierarchical"] - - class TransformerFeedables(NamedTuple( "TransformerFeedables", [("input_sequence", tf.Tensor), @@ -69,7 +63,7 @@ class TransformerHistories(NamedTuple( # pylint: disable=too-many-instance-attributes -class TransformerDecoder(AutoregressiveDecoder): +class TransformerDecoder(AutoregressiveDecoder, Attentive): # pylint: disable=too-many-arguments,too-many-locals,too-many-branches def __init__(self, @@ -151,56 +145,25 @@ def __init__(self, save_checkpoint=save_checkpoint, load_checkpoint=load_checkpoint) - self.encoders = encoders + Attentive.__init__( + self, + name=name, + encoders=encoders, + n_heads_enc=n_heads_enc, + n_heads_hier=n_heads_hier, + attention_combination_strategy=attention_combination_strategy, + dropout_keep_prob=dropout_keep_prob, + attention_dropout_keep_prob=attention_dropout_keep_prob, + use_att_transform_bias=use_att_transform_bias, + reuse=reuse, + save_checkpoint=save_checkpoint, + load_checkpoint=load_checkpoint, + initializers=initializers) + self.ff_hidden_size = ff_hidden_size self.n_heads_self = n_heads_self - - if 
isinstance(n_heads_enc, int): - if attention_combination_strategy == "flat": - self.n_heads_enc = [n_heads_enc] - else: - self.n_heads_enc = [n_heads_enc for _ in self.encoders] - else: - self.n_heads_enc = n_heads_enc - self.depth = depth - if isinstance(attention_dropout_keep_prob, float): - self.attention_dropout_keep_prob = [ - attention_dropout_keep_prob for _ in encoders] - else: - self.attention_dropout_keep_prob = attention_dropout_keep_prob self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob - self.use_att_transform_bias = use_att_transform_bias - self.attention_combination_strategy = attention_combination_strategy - self.n_heads_hier = n_heads_hier - - self.encoder_states = lambda: [get_attention_states(e) - for e in self.encoders] - self.encoder_masks = lambda: [get_attention_mask(e) - for e in self.encoders] - - if self.attention_combination_strategy not in STRATEGIES: - raise ValueError( - "Unknown attention combination strategy '{}'. " - "Allowed: {}.".format(self.attention_combination_strategy, - ", ".join(STRATEGIES))) - - if (self.attention_combination_strategy == "hierarchical" - and self.n_heads_hier is None): - raise ValueError( - "You must provide n_heads_hier when using the hierarchical " - "attention combination strategy.") - - if (self.attention_combination_strategy != "hierarchical" - and self.n_heads_hier is not None): - warn("Ignoring n_heads_hier parameter -- use the hierarchical " - "attention combination strategy instead.") - - if (self.attention_combination_strategy == "flat" - and len(self.n_heads_enc) != 1): - raise ValueError( - "For the flat attention combination strategy, only a single " - "value is permitted in n_heads_enc.") self._variable_scope.set_initializer(tf.variance_scaling_initializer( mode="fan_avg", distribution="uniform")) @@ -296,48 +259,8 @@ def self_attention_sublayer( def encoder_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor: """Create the encoder-decoder attention sublayer.""" - enc_states = self.encoder_states() - enc_masks = self.encoder_masks() - assert enc_states is not None - assert enc_masks is not None - - # Attention dropout callbacks are created in a loop so we need to - # use a factory function to prevent late binding. 
- def make_attn_callback( - prob: float) -> Callable[[tf.Tensor], tf.Tensor]: - def callback(x: tf.Tensor) -> tf.Tensor: - return dropout(x, prob, self.train_mode) - return callback - - dropout_cb = make_attn_callback(self.dropout_keep_prob) - attn_dropout_cbs = [make_attn_callback(prob) - for prob in self.attention_dropout_keep_prob] - - if self.attention_combination_strategy == "serial": - return serial(queries, enc_states, enc_masks, self.n_heads_enc, - attn_dropout_cbs, dropout_cb) - - if self.attention_combination_strategy == "parallel": - return parallel(queries, enc_states, enc_masks, self.n_heads_enc, - attn_dropout_cbs, dropout_cb) - - if self.attention_combination_strategy == "flat": - assert len(set(self.n_heads_enc)) == 1 - assert len(set(self.attention_dropout_keep_prob)) == 1 - - return flat(queries, enc_states, enc_masks, self.n_heads_enc[0], - attn_dropout_cbs[0], dropout_cb) - - if self.attention_combination_strategy == "hierarchical": - assert self.n_heads_hier is not None - - return hierarchical( - queries, enc_states, enc_masks, self.n_heads_enc, - self.n_heads_hier, attn_dropout_cbs, dropout_cb) - - raise NotImplementedError( - "Unknown attention combination strategy: {}" - .format(self.attention_combination_strategy)) + + return self.encoder_attention(queries) def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor: """Create the feed-forward network sublayer.""" From 2057e0fd2197ed76a7be9b0d9e97bd31b48aef3c Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 14 Mar 2019 17:56:25 +0100 Subject: [PATCH 11/14] bs_decoder: checking correct object for Attentive --- neuralmonkey/decoders/beam_search_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralmonkey/decoders/beam_search_decoder.py b/neuralmonkey/decoders/beam_search_decoder.py index f4b6318a7..1d566295d 100644 --- a/neuralmonkey/decoders/beam_search_decoder.py +++ b/neuralmonkey/decoders/beam_search_decoder.py @@ -172,7 +172,7 @@ def outputs(self) -> tf.Tensor: # the graph, replace them with beam-size-times copied originals, create # the beam search graph, and then replace the inner states back. 
- if isinstance(self, Attentive): + if isinstance(self.parent_decoder, Attentive): enc_states = self.parent_decoder.encoder_states enc_masks = self.parent_decoder.encoder_masks @@ -187,7 +187,7 @@ def outputs(self) -> tf.Tensor: self._initial_loop_state = self.get_initial_loop_state() outputs = self.decoding_loop() - if isinstance(self, Attentive): + if isinstance(self.parent_decoder, Attentive): # Reassign the original encoder states and mask back setattr(self.parent_decoder, "encoder_states", enc_states) setattr(self.parent_decoder, "encoder_masks", enc_masks) From 36e9e1bc643b387ee511d516b57b5df1602fb316 Mon Sep 17 00:00:00 2001 From: varisd Date: Mon, 18 Mar 2019 17:46:53 +0100 Subject: [PATCH 12/14] renamed attentive encoder to structured encoder --- neuralmonkey/encoders/{attentive.py => structured.py} | 2 +- tests/classifier.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename neuralmonkey/encoders/{attentive.py => structured.py} (98%) diff --git a/neuralmonkey/encoders/attentive.py b/neuralmonkey/encoders/structured.py similarity index 98% rename from neuralmonkey/encoders/attentive.py rename to neuralmonkey/encoders/structured.py index 0c902b9d3..dc795b940 100644 --- a/neuralmonkey/encoders/attentive.py +++ b/neuralmonkey/encoders/structured.py @@ -10,7 +10,7 @@ get_attention_states, get_attention_mask, Attendable) -class AttentiveEncoder(ModelPart, TemporalStatefulWithOutput): +class StructuredEncoder(ModelPart, TemporalStatefulWithOutput): """An encoder with attention over the input and a fixed-dimension output. Based on "A Structured Self-attentive Sentence Embedding", diff --git a/tests/classifier.ini b/tests/classifier.ini index b287a80d1..b2e54f84b 100644 --- a/tests/classifier.ini +++ b/tests/classifier.ini @@ -46,7 +46,7 @@ data_id="source" vocabulary= [encoder_attentive] -class=encoders.attentive.AttentiveEncoder +class=encoders.attentive.StructuredEncoder name="attentive_encoder" input_sequence= hidden_size=9 From cfdeb423cce37c9a1b16e98ff5afac14fbf64986 Mon Sep 17 00:00:00 2001 From: varisd Date: Tue, 19 Mar 2019 14:57:11 +0100 Subject: [PATCH 13/14] fixing tests --- neuralmonkey/decoders/autoregressive.py | 2 +- neuralmonkey/decoders/transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 1c121afad..b970ae89f 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -6,7 +6,7 @@ The autoregressive decoder uses the while loop to get the outputs. Descendants should only specify the initial state and the while loop body. 
""" -from typing import NamedTuple, Callable, Optional, Any, Dict +from typing import NamedTuple, Callable, Optional, Any, Dict, Tuple import tensorflow as tf diff --git a/neuralmonkey/decoders/transformer.py b/neuralmonkey/decoders/transformer.py index ed9317ebf..85e97b020 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -4,7 +4,7 @@ """ # TODO make this code simpler # pylint: disable=too-many-lines -from typing import Any, Callable, NamedTuple, List, Union, Tuple +from typing import Any, NamedTuple, List, Union, Tuple import math import tensorflow as tf From 38c0bf8e8a3610291545302494a6f8f161db8db0 Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 2 May 2019 15:36:26 +0200 Subject: [PATCH 14/14] fixed tests/classifier.ini --- tests/classifier.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/classifier.ini b/tests/classifier.ini index b2e54f84b..9a5342887 100644 --- a/tests/classifier.ini +++ b/tests/classifier.ini @@ -45,9 +45,9 @@ dropout_keep_prob=0.5 data_id="source" vocabulary= -[encoder_attentive] -class=encoders.attentive.StructuredEncoder -name="attentive_encoder" +[encoder_structured] +class=encoders.structured.StructuredEncoder +name="structured_encoder" input_sequence= hidden_size=9 num_heads=5 @@ -77,7 +77,7 @@ contains_frequencies=False [decoder] class=decoders.classifier.Classifier name="decoder" -encoders=[, , ] +encoders=[, , ] dropout_keep_prob=0.5 layers=[10,5] data_id="classification"