From 3d342287f06d0ab4c0f09eb77dc46ba4410d7e5b Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 15 Mar 2019 19:04:46 +0100 Subject: [PATCH 01/14] refactor rnn encoder --- neuralmonkey/encoders/__init__.py | 1 - neuralmonkey/encoders/recurrent.py | 172 ++++++++--------------------- 2 files changed, 49 insertions(+), 124 deletions(-) diff --git a/neuralmonkey/encoders/__init__.py b/neuralmonkey/encoders/__init__.py index 5db833b0e..2f58debb5 100644 --- a/neuralmonkey/encoders/__init__.py +++ b/neuralmonkey/encoders/__init__.py @@ -4,6 +4,5 @@ from .recurrent import FactoredEncoder from .recurrent import RecurrentEncoder from .recurrent import SentenceEncoder -from .recurrent import DeepSentenceEncoder from .sentence_cnn_encoder import SentenceCNNEncoder from .sequence_cnn_encoder import SequenceCNNEncoder diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 687910169..3b5c267a5 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -7,13 +7,13 @@ TemporalStatefulWithOutput, TemporalStateful) from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.logging import warn from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell, NematusGRUCell from neuralmonkey.nn.utils import dropout from neuralmonkey.vocabulary import Vocabulary from neuralmonkey.decorators import tensor from neuralmonkey.model.sequence import ( EmbeddedSequence, EmbeddedFactorSequence) +from neuralmonkey.tf_utils import layer_norm RNN_CELL_TYPES = { "NematusGRU": NematusGRUCell, @@ -70,8 +70,7 @@ def _make_rnn_cell(spec: RNNSpec) -> Callable[[], tf.nn.rnn_cell.RNNCell]: def rnn_layer(rnn_input: tf.Tensor, lengths: tf.Tensor, - rnn_spec: RNNSpec, - add_residual: bool) -> Tuple[tf.Tensor, tf.Tensor]: + rnn_spec: RNNSpec) -> Tuple[tf.Tensor, tf.Tensor]: """Construct a RNN layer given its inputs and specs. Arguments: @@ -108,17 +107,6 @@ def rnn_layer(rnn_input: tf.Tensor, if rnn_spec.cell_type == "LSTM": final_state = final_state.h - if add_residual: - if outputs.get_shape()[-1].value != rnn_input.get_shape()[-1].value: - warn("Size of the RNN layer input ({}) and layer output ({}) " - "must match when applying residual connection. Reshaping " - "the rnn output using linear projection.".format( - outputs.get_shape(), rnn_input.get_shape())) - # pylint: disable=redefined-variable-type - outputs = tf.layers.dense(outputs, rnn_input.shape.as_list()[-1]) - # pylint: enable=redefined-variable-type - outputs += rnn_input - return outputs, final_state @@ -128,10 +116,9 @@ class RecurrentEncoder(ModelPart, TemporalStatefulWithOutput): def __init__(self, name: str, input_sequence: TemporalStateful, - rnn_size: int, - rnn_cell: str = "GRU", - rnn_direction: str = "bidirectional", + rnn_layers: List[RNNSpecTuple], add_residual: bool = False, + add_layer_norm: bool = False, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, save_checkpoint: str = None, @@ -150,6 +137,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. 
@@ -161,8 +149,9 @@ def __init__(self, self.input_sequence = input_sequence self.dropout_keep_prob = dropout_keep_prob - self.rnn_spec = _make_rnn_spec(rnn_size, rnn_direction, rnn_cell) + self.rnn_specs = [_make_rnn_spec(*r) for r in rnn_layers] self.add_residual = add_residual + self.add_layer_norm = add_layer_norm if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0: raise ValueError("Dropout keep prob must be inside (0,1].") @@ -178,8 +167,40 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - return rnn_layer(self.rnn_input, self.input_sequence.lengths, - self.rnn_spec, self.add_residual) + layer_input = self.rnn_input # type: tf.Tensor + layer_final = None + + for i, rnn_spec in enumerate(self.rnn_specs): + with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), + reuse=tf.AUTO_REUSE): + + if self.add_layer_norm: + layer_input = layer_norm(layer_input) + + layer_output, layer_final_output = rnn_layer( + layer_input, self.input_sequence.lengths, rnn_spec) + + layer_output = dropout( + layer_output, self.dropout_keep_prob, self.train_mode) + layer_final_output = dropout( + layer_final_output, self.dropout_keep_prob, + self.train_mode) + + in_dim = layer_input.get_shape()[-1] + out_dim = layer_output.get_shape()[-1] + + if self.add_residual and in_dim == out_dim: + assert layer_final is not None + layer_input += layer_output + layer_final += layer_final_output + else: + # pylint: disable=redefined-variable-type + layer_input = layer_output + layer_final = layer_final_output + # pylint: enable=redefined-variable-type + + assert layer_final is not None + return layer_input, layer_final @tensor def temporal_states(self) -> tf.Tensor: @@ -209,6 +230,7 @@ def __init__(self, rnn_cell: str = "GRU", rnn_direction: str = "bidirectional", add_residual: bool = False, + add_layer_norm: bool = False, max_input_len: int = None, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, @@ -234,6 +256,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. @@ -266,10 +289,9 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_size=rnn_size, - rnn_cell=rnn_cell, - rnn_direction=rnn_direction, + rnn_layers=[(rnn_size, rnn_direction, rnn_cell)], add_residual=add_residual, + add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, reuse=reuse, save_checkpoint=save_checkpoint, @@ -289,6 +311,7 @@ def __init__(self, rnn_cell: str = "GRU", rnn_direction: str = "bidirectional", add_residual: bool = False, + add_layer_norm: bool = False, max_input_len: int = None, dropout_keep_prob: float = 1.0, reuse: ModelPart = None, @@ -314,6 +337,7 @@ def __init__(self, "bidirectional" will double the resulting vector dimension as well as the number of encoder parameters. add_residual: Add residual connections to the RNN layer output. + add_layer_norm: Add layer normalization after each RNN layer. dropout_keep_prob: 1 - dropout probability. save_checkpoint: ModelPart save checkpoint file. load_checkpoint: ModelPart load checkpoint file. 
@@ -336,110 +360,12 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_size=rnn_size, - rnn_cell=rnn_cell, - rnn_direction=rnn_direction, + rnn_layers=[(rnn_size, rnn_cell, rnn_direction)], add_residual=add_residual, + add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, reuse=reuse, save_checkpoint=save_checkpoint, load_checkpoint=load_checkpoint, initializers=initializers) # pylint: enable=too-many-arguments,too-many-locals - - -class DeepSentenceEncoder(SentenceEncoder): - # pylint: disable=too-many-arguments,too-many-locals - def __init__(self, - name: str, - vocabulary: Vocabulary, - data_id: str, - embedding_size: int, - rnn_sizes: List[int], - rnn_directions: List[str], - rnn_cell: str = "GRU", - add_residual: bool = False, - max_input_len: int = None, - dropout_keep_prob: float = 1.0, - reuse: ModelPart = None, - save_checkpoint: str = None, - load_checkpoint: str = None, - initializers: InitializerSpecs = None, - embedding_initializer: Callable = None) -> None: - """Create a new instance of the deep sentence encoder. - - Arguments: - name: ModelPart name. - vocabulary: The input vocabulary. - data_id: The input sequence data ID. - embedding_size: The dimension of the embedding vectors in the input - sequence. - max_input_len: Maximum length of the input sequence (disregard - tokens after this position). - rnn_sizes: The list of dimensions of the RNN hidden state vectors - in respective layers. - rnn_cell: One of "GRU", "NematusGRU", "LSTM". Which kind of memory - cell to use. - rnn_directions: The list of rnn directions in the respective - layers. Should be equally long as `rnn_sizes`. Each item must - be one of "forward", "backward", "bidirectional". Determines in - what order to process the input sequence. Note that choosing - "bidirectional" will double the resulting vector dimension as - well as the number of the parameters in the given layer. - add_residual: Add residual connections to each RNN layer output. - dropout_keep_prob: 1 - dropout probability. - save_checkpoint: ModelPart save checkpoint file. - load_checkpoint: ModelPart load checkpoint file. - """ - check_argument_types() - - if len(rnn_sizes) != len(rnn_directions): - raise ValueError("Different number of rnn sizes and directions.") - - self.rnn_sizes = rnn_sizes - self.rnn_directions = rnn_directions - self.rnn_cell = rnn_cell - - SentenceEncoder.__init__( - self, - name=name, - vocabulary=vocabulary, - data_id=data_id, - embedding_size=embedding_size, - rnn_size=rnn_sizes[-1], - rnn_direction=rnn_directions[-1], - rnn_cell=rnn_cell, - add_residual=add_residual, - max_input_len=max_input_len, - dropout_keep_prob=dropout_keep_prob, - reuse=reuse, - save_checkpoint=save_checkpoint, - load_checkpoint=load_checkpoint, - initializers=initializers, - embedding_initializer=embedding_initializer) - - @tensor - def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - """Run stacked RNN given sizes and directions. - - Inputs of the first RNN are the RNN inputs to the encoder. Outputs from - each layer are used as inputs to the next one. As a final state of the - stacked RNN, the final state of the final layer is used. 
- """ - rnn_input_local = self.rnn_input - - for level, (rnn_size, rnn_dir) in enumerate( - zip(self.rnn_sizes, self.rnn_directions)): - rnn_spec = _make_rnn_spec(rnn_size, rnn_dir, self.rnn_cell) - - with tf.variable_scope("layer_{}".format(level)): - outputs, state = rnn_layer( - rnn_input_local, self.input_sequence.lengths, - rnn_spec, self.add_residual) - - # pylint - redefinition from instancemethod to list - # pylint: disable=redefined-variable-type - rnn_input_local = outputs - # pylint: enable=redefined-variable-type - - return outputs, state From 229de60a089c7c8e252386fe931db77f842a22ee Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 15 Mar 2019 20:44:07 +0100 Subject: [PATCH 02/14] adjust tests and bugfix --- neuralmonkey/encoders/recurrent.py | 5 ++--- tests/labeler.ini | 16 ++++++++-------- tests/nematus.ini | 20 +++++++++++--------- tests/post-edit.ini | 3 +-- tests/str.ini | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 3b5c267a5..f633289bd 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -168,7 +168,7 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: layer_input = self.rnn_input # type: tf.Tensor - layer_final = None + layer_final = self.rnn_input[:, -1] for i, rnn_spec in enumerate(self.rnn_specs): with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), @@ -190,7 +190,6 @@ def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: out_dim = layer_output.get_shape()[-1] if self.add_residual and in_dim == out_dim: - assert layer_final is not None layer_input += layer_output layer_final += layer_final_output else: @@ -360,7 +359,7 @@ def __init__(self, self, name=name, input_sequence=input_sequence, - rnn_layers=[(rnn_size, rnn_cell, rnn_direction)], + rnn_layers=[(rnn_size, rnn_direction, rnn_cell)], add_residual=add_residual, add_layer_norm=add_layer_norm, dropout_keep_prob=dropout_keep_prob, diff --git a/tests/labeler.ini b/tests/labeler.ini index 768447eab..6fcc4e62a 100644 --- a/tests/labeler.ini +++ b/tests/labeler.ini @@ -44,18 +44,18 @@ path="tests/data/factored_decoder_vocab.tsv" class=vocabulary.from_wordlist path="tests/data/factored_tag_vocab.tsv" - -[encoder] -class=encoders.DeepSentenceEncoder -name="sentence_encoder" -rnn_sizes=[10,9,8] -rnn_directions=["forward", "backward", "bidirectional"] -rnn_cell="NematusGRU" +[encoder_input] +class=model.sequence.EmbeddedSequence embedding_size=7 -dropout_keep_prob=0.5 data_id="source" vocabulary= +[encoder] +class=encoders.RecurrentEncoder +rnn_layers=[(10, "forward", "LSTM"), (9, "backward", "NematusGRU"), (8, "bidirectional", "NematusGRU")] +dropout_keep_prob=0.5 +input_sequence= + [decoder] class=decoders.sequence_labeler.SequenceLabeler name="tagger" diff --git a/tests/nematus.ini b/tests/nematus.ini index cbb10809f..67ef04993 100644 --- a/tests/nematus.ini +++ b/tests/nematus.ini @@ -34,18 +34,20 @@ data=["tests/data/val.tc.en", "tests/data/val.tc.de"] class=vocabulary.from_wordlist path="tests/data/encoder_vocab.tsv" -[encoder] -class=encoders.DeepSentenceEncoder -name="sentence_encoder" -rnn_sizes=[14,7,14] -rnn_directions=["forward","bidirectional","backward"] -max_input_len=5 -embedding_size=14 -dropout_keep_prob=0.5 +[encoder_input] +class=model.sequence.EmbeddedSequence +embedding_size=7 +max_length=10 data_id="source" vocabulary= -rnn_cell="NematusGRU" + +[encoder] +class=encoders.RecurrentEncoder +rnn_layers=[(7, 
"forward", "LSTM"), (7, "backward", "NematusGRU"), (7, "bidirectional", "NematusGRU")] add_residual=True +add_layer_norm=True +dropout_keep_prob=0.5 +input_sequence= [attention] class=attention.Attention diff --git a/tests/post-edit.ini b/tests/post-edit.ini index 5c6b46548..76036d8ea 100644 --- a/tests/post-edit.ini +++ b/tests/post-edit.ini @@ -44,9 +44,8 @@ keys_encoder= [trans_encoder] class=encoders.recurrent.RecurrentEncoder input_sequence= -rnn_size=15 +rnn_layers=[(15, "bidirectional", "LSTM")] dropout_keep_prob=0.8 -rnn_cell="LSTM" name="trans_encoder" [trans_attention] diff --git a/tests/str.ini b/tests/str.ini index 7d8f857d4..09530e692 100644 --- a/tests/str.ini +++ b/tests/str.ini @@ -68,7 +68,7 @@ cnn= class=encoders.RecurrentEncoder name="encoder" input_sequence= -rnn_size=256 +rnn_layers=[(256)] [attention] class=attention.Attention From 96f99efa14433ae5e6276527066a20c85deace6f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 18 Mar 2019 11:45:25 +0100 Subject: [PATCH 03/14] pylint fix --- neuralmonkey/encoders/recurrent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index f633289bd..8a7d4e9aa 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ -168,7 +168,9 @@ def rnn_input(self) -> tf.Tensor: @tensor def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: layer_input = self.rnn_input # type: tf.Tensor + # pylint: disable=unsubscriptable-object layer_final = self.rnn_input[:, -1] + # pylint: enable=unsubscriptable-object for i, rnn_spec in enumerate(self.rnn_specs): with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction), From 68b4fb2e80f01f5c71fbc6cf811dba6166f27b95 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 18 Mar 2019 11:45:43 +0100 Subject: [PATCH 04/14] get rid of raw rnn encoder --- neuralmonkey/encoders/__init__.py | 1 - .../encoders/numpy_stateful_filler.py | 79 +++++++- neuralmonkey/encoders/raw_rnn_encoder.py | 183 ------------------ tests/audio-classifier.ini | 10 +- tests/ctc.ini | 9 +- 5 files changed, 90 insertions(+), 192 deletions(-) delete mode 100644 neuralmonkey/encoders/raw_rnn_encoder.py diff --git a/neuralmonkey/encoders/__init__.py b/neuralmonkey/encoders/__init__.py index 2f58debb5..124ffa81d 100644 --- a/neuralmonkey/encoders/__init__.py +++ b/neuralmonkey/encoders/__init__.py @@ -1,6 +1,5 @@ from .cnn_encoder import CNNEncoder from .cnn_encoder import CNNTemporalView -from .raw_rnn_encoder import RawRNNEncoder from .recurrent import FactoredEncoder from .recurrent import RecurrentEncoder from .recurrent import SentenceEncoder diff --git a/neuralmonkey/encoders/numpy_stateful_filler.py b/neuralmonkey/encoders/numpy_stateful_filler.py index e7abd4841..c77b40e03 100644 --- a/neuralmonkey/encoders/numpy_stateful_filler.py +++ b/neuralmonkey/encoders/numpy_stateful_filler.py @@ -1,6 +1,7 @@ # TODO untested module from typing import Dict, List +import numpy as np import tensorflow as tf from typeguard import check_argument_types @@ -9,7 +10,8 @@ from neuralmonkey.model.feedable import FeedDict from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.model.stateful import Stateful, SpatialStatefulWithOutput +from neuralmonkey.model.stateful import ( + Stateful, SpatialStatefulWithOutput, TemporalStateful) # pylint: disable=too-few-public-methods @@ -77,6 +79,81 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> 
FeedDict: return fd +class TemporalFiller(ModelPart, TemporalStateful): + """Placeholder class for 2D numerical input. + + This model part is used to feed 2D tensors (e.g., audio input). + """ + + # pylint: disable=too-many-arguments + def __init__(self, + name: str, + data_id: str, + input_size: int, + max_input_len: int = None, + dropout_keep_prob: float = 1.0, + reuse: ModelPart = None, + save_checkpoint: str = None, + load_checkpoint: str = None, + initializers: InitializerSpecs = None) -> None: + check_argument_types() + ModelPart.__init__( + self, name, reuse, save_checkpoint, load_checkpoint, initializers) + + self.data_id = data_id + self.input_size = input_size + self.max_input_len = max_input_len + self.dropout_keep_prob = dropout_keep_prob + # pylint: enable=too-many-arguments + + @property + def input_types(self) -> Dict[str, tf.DType]: + return {self.data_id: tf.float32} + + @property + def input_shapes(self) -> Dict[str, tf.TensorShape]: + return {self.data_id: tf.TensorShape([None, None, self.input_size])} + + @tensor + def temporal_states(self) -> tf.Tensor: + return self.dataset[self.data_id] + + # pylint: disable=no-self-use + @tensor + def _input_lengths(self) -> tf.Tensor: + return tf.placeholder(tf.int32, [None], "encoder_padding_lengths") + # pylint: enable=no-self-use + + @tensor + def temporal_mask(self) -> tf.Tensor: + return tf.sequence_mask(self._input_lengths, dtype=tf.float32) + + def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: + fd = ModelPart.feed_dict(self, dataset, train) + + series = list(dataset.get_series(self.data_id)) + lengths = [] + inputs = [] + + max_len = max(x.shape[0] for x in series) + if self.max_input_len is not None: + max_len = min(self.max_input_len, max_len) + + for x in series: + length = min(max_len, x.shape[0]) + x_padded = np.zeros(shape=(max_len,) + x.shape[1:], + dtype=x.dtype) + x_padded[:length] = x[:length] + + lengths.append(length) + inputs.append(x_padded) + + fd[self.temporal_states] = inputs + fd[self._input_lengths] = lengths + + return fd + + class SpatialFiller(ModelPart, SpatialStatefulWithOutput): """Placeholder class for 3D numerical input. 
diff --git a/neuralmonkey/encoders/raw_rnn_encoder.py b/neuralmonkey/encoders/raw_rnn_encoder.py deleted file mode 100644 index 9f45b1571..000000000 --- a/neuralmonkey/encoders/raw_rnn_encoder.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import List, Dict, Tuple - -import numpy as np -import tensorflow as tf -from typeguard import check_argument_types - -from neuralmonkey.dataset import Dataset -from neuralmonkey.decorators import tensor -# pylint: disable=protected-access -from neuralmonkey.encoders.recurrent import ( - RNNSpecTuple, _make_rnn_spec, _make_rnn_cell) -# pylint: enable=protected-access -from neuralmonkey.model.feedable import FeedDict -from neuralmonkey.model.parameterized import InitializerSpecs -from neuralmonkey.model.model_part import ModelPart -from neuralmonkey.model.stateful import TemporalStatefulWithOutput -from neuralmonkey.nn.utils import dropout - - -# pylint: disable=too-many-instance-attributes -class RawRNNEncoder(ModelPart, TemporalStatefulWithOutput): - """A raw RNN encoder that gets input as a tensor.""" - - # pylint: disable=too-many-arguments,too-many-locals - def __init__(self, - name: str, - data_id: str, - input_size: int, - rnn_layers: List[RNNSpecTuple], - max_input_len: int = None, - dropout_keep_prob: float = 1.0, - reuse: ModelPart = None, - save_checkpoint: str = None, - load_checkpoint: str = None, - initializers: InitializerSpecs = None) -> None: - """Create a new instance of the encoder. - - Arguments: - data_id: Identifier of the data series fed to this encoder - name: An unique identifier for this encoder - rnn_layers: A list of tuples specifying the size and, optionally, - the direction ('forward', 'backward' or 'bidirectional') - and cell type ('GRU' or 'LSTM') of each RNN layer. - dropout_keep_prob: The dropout keep probability - (default 1.0) - """ - check_argument_types() - ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint, - initializers) - - self.data_id = data_id - - self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers] - self.max_input_len = max_input_len - self.input_size = input_size - self.dropout_keep_prob = dropout_keep_prob - - @property - def input_types(self) -> Dict[str, tf.DType]: - return {self.data_id: tf.float32} - - @property - def input_shapes(self) -> Dict[str, tf.TensorShape]: - return {self.data_id: tf.TensorShape([None, None, self.input_size])} - - @tensor - def inputs(self) -> tf.Tensor: - return self.dataset[self.data_id] - - # pylint: disable=no-self-use - @tensor - def _input_lengths(self) -> tf.Tensor: - return tf.placeholder(tf.int32, [None], "encoder_padding_lengths") - # pylint: enable=no-self-use - - @tensor - def states_mask(self) -> tf.Tensor: - return tf.sequence_mask(self._input_lengths, dtype=tf.float32) - - @tensor - def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]: - states = self.inputs - states_reversed = False - - def reverse_states(): - nonlocal states, states_reversed - states = tf.reverse_sequence( - states, self._input_lengths, batch_axis=0, seq_axis=1) - states_reversed = not states_reversed - - for i, layer in enumerate(self._rnn_layers): - with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)): - if layer.direction == "bidirectional": - fw_cell = _make_rnn_cell(layer) - bw_cell = _make_rnn_cell(layer) - outputs_tup, encoded_tup = ( - tf.nn.bidirectional_dynamic_rnn( - fw_cell, bw_cell, states, self._input_lengths, - dtype=tf.float32)) - - if states_reversed: - # treat forward as backward and vice versa - outputs_tup = tuple(reversed(outputs_tup)) - 
encoded_tup = tuple(reversed(encoded_tup)) - states_reversed = False - - states = tf.concat(outputs_tup, 2) - encoded = tf.concat(encoded_tup, 1) - elif layer.direction in ["forward", "backward"]: - should_be_reversed = (layer.direction == "backward") - if states_reversed != should_be_reversed: - reverse_states() - - cell = _make_rnn_cell(layer) - states, encoded = tf.nn.dynamic_rnn( - cell, states, - sequence_length=self._input_lengths, - dtype=tf.float32) - else: - raise ValueError( - "Unknown RNN direction {}".format(layer.direction)) - - if i < len(self._rnn_layers) - 1: - states = dropout(states, self.dropout_keep_prob, - self.train_mode) - - if states_reversed: - reverse_states() - - return states, encoded - - # pylint: disable=unsubscriptable-object - @tensor - def hidden_states(self) -> tf.Tensor: - return self.rnn[0] - - @tensor - def encoded(self) -> tf.Tensor: - return self.rnn[1] - # pylint: enable=unsubscriptable-object - - @property - def output(self) -> tf.Tensor: - return self.encoded - - @property - def temporal_states(self) -> tf.Tensor: - return self.hidden_states - - @property - def temporal_mask(self) -> tf.Tensor: - return self.states_mask - - def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: - """Populate the feed dictionary with the encoder inputs. - - Arguments: - dataset: The dataset to use - train: Boolean flag telling whether it is training time - """ - fd = ModelPart.feed_dict(self, dataset, train) - - series = list(dataset.get_series(self.data_id)) - lengths = [] - inputs = [] - - max_len = max(x.shape[0] for x in series) - if self.max_input_len is not None: - max_len = min(self.max_input_len, max_len) - - for x in series: - length = min(max_len, x.shape[0]) - x_padded = np.zeros(shape=(max_len,) + x.shape[1:], - dtype=x.dtype) - x_padded[:length] = x[:length] - - lengths.append(length) - inputs.append(x_padded) - - fd[self.inputs] = inputs - fd[self._input_lengths] = lengths - - return fd diff --git a/tests/audio-classifier.ini b/tests/audio-classifier.ini index 255078174..455e7331b 100644 --- a/tests/audio-classifier.ini +++ b/tests/audio-classifier.ini @@ -50,12 +50,14 @@ path="tests/data/dtmf/labels.vocab" contains_header=False contains_frequencies=False - -[encoder] -class=encoders.raw_rnn_encoder.RawRNNEncoder -name="encoder" +[input_seq] +class=encoders.numpy_stateful_filler.TemporalFiller data_id="features" input_size=26 + +[encoder] +class=encoders.RecurrentEncoder +input_sequence= rnn_layers=[(7)] dropout_keep_prob=0.5 diff --git a/tests/ctc.ini b/tests/ctc.ini index 34d6f689d..2a8635ddc 100644 --- a/tests/ctc.ini +++ b/tests/ctc.ini @@ -52,11 +52,14 @@ path="tests/data/yesno/yesno.vocab" contains_header=False contains_frequencies=False -[audio_encoder] -class=encoders.raw_rnn_encoder.RawRNNEncoder -name="audio_encoder" +[input_seq] +class=encoders.numpy_stateful_filler.TemporalFiller data_id="source" input_size=39 + +[audio_encoder] +class=encoders.RecurrentEncoder +input_sequence= rnn_layers=[(50,"bidirectional"),(100,"forward"),(100,"backward")] dropout_keep_prob=0.5 From 2088ca2b92b79c274a3a5142feac0c1c65d1d29e Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Wed, 3 Apr 2019 16:19:02 +0200 Subject: [PATCH 05/14] address review --- neuralmonkey/encoders/recurrent.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py index 8a7d4e9aa..b544f10d5 100644 --- a/neuralmonkey/encoders/recurrent.py +++ b/neuralmonkey/encoders/recurrent.py @@ 
-119,6 +119,7 @@ def __init__,
                  rnn_layers: List[RNNSpecTuple],
                  add_residual: bool = False,
                  add_layer_norm: bool = False,
+                 include_final_layer_norm: bool = True,
                  dropout_keep_prob: float = 1.0,
                  reuse: ModelPart = None,
                  save_checkpoint: str = None,
@@ -138,6 +139,7 @@ def __init__(self,
                 well as the number of encoder parameters.
             add_residual: Add residual connections to the RNN layer output.
             add_layer_norm: Add layer normalization after each RNN layer.
+            include_final_layer_norm: Also normalize the network output.
             dropout_keep_prob: 1 - dropout probability.
             save_checkpoint: ModelPart save checkpoint file.
             load_checkpoint: ModelPart load checkpoint file.
@@ -152,10 +154,19 @@ def __init__(self,
         self.rnn_specs = [_make_rnn_spec(*r) for r in rnn_layers]
         self.add_residual = add_residual
         self.add_layer_norm = add_layer_norm
+        self.include_final_layer_norm = include_final_layer_norm

         if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
             raise ValueError("Dropout keep prob must be inside (0,1].")

+        layer_sizes = [
+            2 * layer.size if layer.direction == "bidirectional"
+            else layer.size for layer in self.rnn_specs]
+        if add_residual and len(set(layer_sizes)) > 1:
+            raise ValueError(
+                "When using residual connections, all layers must have "
+                "the same size, but their sizes are {}.".format(layer_sizes))
+
         self._variable_scope.set_initializer(
             tf.random_normal_initializer(stddev=0.001))
         # pylint: enable=too-many-arguments
@@ -201,6 +212,8 @@ def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]:
                 # pylint: enable=redefined-variable-type

         assert layer_final is not None
+        if self.include_final_layer_norm:
+            return layer_norm(layer_input), layer_norm(layer_final)
         return layer_input, layer_final

     @tensor

From 98accc60a00ed61c7d199e7860c386470e873959 Mon Sep 17 00:00:00 2001
From: Jindra Helcl
Date: Fri, 3 May 2019 15:23:26 +0200
Subject: [PATCH 06/14] fix mypy for new version

---
 neuralmonkey/runners/beamsearch_runner.py | 2 +-
 neuralmonkey/runners/ctc_debug_runner.py | 2 +-
 neuralmonkey/runners/label_runner.py | 2 +-
 neuralmonkey/runners/logits_runner.py | 2 +-
 neuralmonkey/runners/plain_runner.py | 2 +-
 neuralmonkey/runners/regression_runner.py | 2 +-
 neuralmonkey/runners/runner.py | 3 +--
 neuralmonkey/runners/tensor_runner.py | 3 +--
 neuralmonkey/runners/word_alignment_runner.py | 2 +-
 neuralmonkey/runners/xent_runner.py | 3 +--
 neuralmonkey/trainers/objective.py | 2 +-
 neuralmonkey/trainers/rl_trainer.py | 2 +-
 neuralmonkey/trainers/self_critical_objective.py | 2 +-
 neuralmonkey/trainers/test_multitask_trainer.py | 3 ---
 14 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/neuralmonkey/runners/beamsearch_runner.py b/neuralmonkey/runners/beamsearch_runner.py
index 14a6fe486..2fd313f21 100644
--- a/neuralmonkey/runners/beamsearch_runner.py
+++ b/neuralmonkey/runners/beamsearch_runner.py
@@ -132,7 +132,7 @@ def __init__(self,
             postprocess: The postprocessor to apply to the output data.
""" check_argument_types() - BaseRunner[BeamSearchDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) if rank < 1 or rank > decoder.beam_size: raise ValueError( diff --git a/neuralmonkey/runners/ctc_debug_runner.py b/neuralmonkey/runners/ctc_debug_runner.py index 1e393619e..ca62209d8 100644 --- a/neuralmonkey/runners/ctc_debug_runner.py +++ b/neuralmonkey/runners/ctc_debug_runner.py @@ -43,7 +43,7 @@ def __init__(self, output_series: str, decoder: CTCDecoder) -> None: check_argument_types() - BaseRunner[CTCDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) @tensor def fetches(self) -> Dict[str, tf.Tensor]: diff --git a/neuralmonkey/runners/label_runner.py b/neuralmonkey/runners/label_runner.py index 2a286be26..4e7297e5a 100644 --- a/neuralmonkey/runners/label_runner.py +++ b/neuralmonkey/runners/label_runner.py @@ -53,7 +53,7 @@ def __init__(self, decoder: SequenceLabeler, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SequenceLabeler].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/logits_runner.py b/neuralmonkey/runners/logits_runner.py index 9868fa3a0..86dd82b10 100644 --- a/neuralmonkey/runners/logits_runner.py +++ b/neuralmonkey/runners/logits_runner.py @@ -74,7 +74,7 @@ def __init__(self, vocabulary whose logit or probability should be on output. """ check_argument_types() - BaseRunner[Classifier].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) if pick_index is not None and pick_value is not None: raise ValueError("Either a pick index or a vocabulary value can " diff --git a/neuralmonkey/runners/plain_runner.py b/neuralmonkey/runners/plain_runner.py index e2ef389d0..e90ffe0b4 100644 --- a/neuralmonkey/runners/plain_runner.py +++ b/neuralmonkey/runners/plain_runner.py @@ -49,7 +49,7 @@ def __init__(self, decoder: SupportedDecoder, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SupportedDecoder].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/regression_runner.py b/neuralmonkey/runners/regression_runner.py index 788f50b34..1cd91306b 100644 --- a/neuralmonkey/runners/regression_runner.py +++ b/neuralmonkey/runners/regression_runner.py @@ -43,7 +43,7 @@ def __init__(self, decoder: SequenceRegressor, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[SequenceRegressor].__init__(self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess @tensor diff --git a/neuralmonkey/runners/runner.py b/neuralmonkey/runners/runner.py index dc3982fe4..1e75428ff 100644 --- a/neuralmonkey/runners/runner.py +++ b/neuralmonkey/runners/runner.py @@ -67,8 +67,7 @@ def __init__(self, decoder: SupportedDecoder, postprocess: Postprocessor = None) -> None: check_argument_types() - BaseRunner[AutoregressiveDecoder].__init__( - self, output_series, decoder) + super().__init__(output_series, decoder) self.postprocess = postprocess self.vocabulary = self.decoder.vocabulary diff --git a/neuralmonkey/runners/tensor_runner.py b/neuralmonkey/runners/tensor_runner.py index b5404e83d..b5f6b9255 100644 --- a/neuralmonkey/runners/tensor_runner.py +++ b/neuralmonkey/runners/tensor_runner.py @@ -110,8 +110,7 @@ def __init__(self, if not modelparts: raise ValueError("At 
least one model part is expected") - BaseRunner[GenericModelPart].__init__( - self, output_series, modelparts[0]) + super().__init__(output_series, modelparts[0]) if len(modelparts) != len(tensors): raise ValueError("TensorRunner: 'modelparts' and 'tensors' lists " diff --git a/neuralmonkey/runners/word_alignment_runner.py b/neuralmonkey/runners/word_alignment_runner.py index 7a157b412..9bbc0d5aa 100644 --- a/neuralmonkey/runners/word_alignment_runner.py +++ b/neuralmonkey/runners/word_alignment_runner.py @@ -24,7 +24,7 @@ def __init__(self, attention: BaseAttention, decoder: Decoder) -> None: check_argument_types() - BaseRunner[BaseAttention].__init__(self, output_series, attention) + super().__init__(output_series, attention) self._key = "{}_run".format(decoder.name) diff --git a/neuralmonkey/runners/xent_runner.py b/neuralmonkey/runners/xent_runner.py index 4086a2b8a..163f1c2dc 100644 --- a/neuralmonkey/runners/xent_runner.py +++ b/neuralmonkey/runners/xent_runner.py @@ -28,8 +28,7 @@ def __init__(self, output_series: str, decoder: SupportedDecoders) -> None: check_argument_types() - BaseRunner[SupportedDecoders].__init__( - self, output_series, decoder) + super().__init__(output_series, decoder) @tensor def fetches(self) -> Dict[str, tf.Tensor]: diff --git a/neuralmonkey/trainers/objective.py b/neuralmonkey/trainers/objective.py index ad49c02a7..8fb2a21d0 100644 --- a/neuralmonkey/trainers/objective.py +++ b/neuralmonkey/trainers/objective.py @@ -84,7 +84,7 @@ def __init__(self, decoder: GenericModelPart, name = "{} - cost".format(str(decoder)) - Objective[GenericModelPart].__init__(self, name, decoder) + super().__init__(name, decoder) self._weight = weight @tensor diff --git a/neuralmonkey/trainers/rl_trainer.py b/neuralmonkey/trainers/rl_trainer.py index 98436c7f6..304165271 100644 --- a/neuralmonkey/trainers/rl_trainer.py +++ b/neuralmonkey/trainers/rl_trainer.py @@ -64,7 +64,7 @@ def __init__(self, """ check_argument_types() name = "{}_rl".format(decoder.name) - Objective[Decoder].__init__(self, name, decoder) + super().__init__(name, decoder) self.reward_function = reward_function self.subtract_baseline = subtract_baseline diff --git a/neuralmonkey/trainers/self_critical_objective.py b/neuralmonkey/trainers/self_critical_objective.py index 1e703ca27..161866c3a 100644 --- a/neuralmonkey/trainers/self_critical_objective.py +++ b/neuralmonkey/trainers/self_critical_objective.py @@ -41,7 +41,7 @@ def __init__(self, decoder: Decoder, reward_function: RewardFunction, """ check_argument_types() name = "{}_self_critical".format(decoder.name) - Objective[Decoder].__init__(self, name, decoder) + super().__init__(name, decoder) self.reward_function = reward_function self._weight = weight diff --git a/neuralmonkey/trainers/test_multitask_trainer.py b/neuralmonkey/trainers/test_multitask_trainer.py index 2fa6194e5..84c2e9344 100644 --- a/neuralmonkey/trainers/test_multitask_trainer.py +++ b/neuralmonkey/trainers/test_multitask_trainer.py @@ -27,9 +27,6 @@ def loss(self) -> tf.Tensor: # pylint: disable=too-few-public-methods class DummyObjective(Objective[TestMP]): - def __init__(self, name: str, decoder: TestMP) -> None: - Objective[TestMP].__init__(self, name, decoder) - @tensor def loss(self) -> tf.Tensor: return self.decoder.loss From 0542e006d3111e40b9802b1c4812f7e122d4003c Mon Sep 17 00:00:00 2001 From: varisd Date: Tue, 23 Apr 2019 16:17:19 +0200 Subject: [PATCH 07/14] Fixed Dataset.subset when buffer_size is None --- neuralmonkey/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 
1 deletion(-) diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py index ae963d75d..cfebdd48d 100644 --- a/neuralmonkey/dataset.py +++ b/neuralmonkey/dataset.py @@ -602,6 +602,12 @@ def subset(self, start: int, length: int) -> "Dataset": start, start + length) for s_id in self.iterators} + # Workaround since self.buffer_size and self.buffer_min_size + # may not be initialized + buffer_size = None + if hasattr(self, "buffer_min_size") and hasattr(self, "buffer_size"): + buffer_size = (self.buffer_min_size, self.buffer_size) + # Here, the type: ignore is because of the tied argument to the lambda # function above, which made it Callable[[Any], ...] instead of just # Callable[[], ...]. @@ -610,5 +616,5 @@ def subset(self, start: int, length: int) -> "Dataset": iterators=slices, batching=self.batching, outputs=outputs, - buffer_size=self.buffer_size, + buffer_size=buffer_size, shuffled=self.shuffled) From 109f9b699522246b7f9f84f282a688a0d5409a95 Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 2 May 2019 15:48:29 +0200 Subject: [PATCH 08/14] froze the required versions of mypy and sacrebleu --- requirements-gpu.txt | 2 +- requirements.txt | 2 +- tests/mypy_requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index ebc6da3ab..6b927aa97 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -9,5 +9,5 @@ python_speech_features pygments rouge==0.2.1 typeguard -sacrebleu +sacrebleu==1.3.1 tensorflow-gpu>=1.12.0,<1.13 diff --git a/requirements.txt b/requirements.txt index 27341cc05..60332e25f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ python_speech_features pygments rouge==0.2.1 typeguard -sacrebleu +sacrebleu==1.3.1 tensorflow>=1.12.0,<1.13 diff --git a/tests/mypy_requirements.txt b/tests/mypy_requirements.txt index f0aa93ac8..87d25c2ea 100644 --- a/tests/mypy_requirements.txt +++ b/tests/mypy_requirements.txt @@ -1 +1 @@ -mypy +mypy==0.660 From 02071ec3b65b72d78df3c40d5cb3a99b6cd21942 Mon Sep 17 00:00:00 2001 From: varisd Date: Wed, 24 Apr 2019 14:14:15 +0200 Subject: [PATCH 09/14] Fixed Dataset.subset when buffer_size is None --- neuralmonkey/dataset.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py index cfebdd48d..5aac44130 100644 --- a/neuralmonkey/dataset.py +++ b/neuralmonkey/dataset.py @@ -602,12 +602,6 @@ def subset(self, start: int, length: int) -> "Dataset": start, start + length) for s_id in self.iterators} - # Workaround since self.buffer_size and self.buffer_min_size - # may not be initialized - buffer_size = None - if hasattr(self, "buffer_min_size") and hasattr(self, "buffer_size"): - buffer_size = (self.buffer_min_size, self.buffer_size) - # Here, the type: ignore is because of the tied argument to the lambda # function above, which made it Callable[[Any], ...] instead of just # Callable[[], ...]. 
@@ -616,5 +610,6 @@ def subset(self, start: int, length: int) -> "Dataset":
             iterators=slices,
             batching=self.batching,
             outputs=outputs,
-            buffer_size=buffer_size,
+            buffer_size=((self.buffer_min_size, self.buffer_size)
+                         if self.lazy else None),
             shuffled=self.shuffled)

From dbe69e0fd9ccb95ff620a1e2dc31ddd99b66d5ef Mon Sep 17 00:00:00 2001
From: varisd
Date: Thu, 14 Mar 2019 17:42:09 +0100
Subject: [PATCH 10/14] created inheritable decoders.Attentive class providing
 interface for encoder attention

---
 neuralmonkey/decoders/attentive.py | 167 +++++++++++++++++++
 neuralmonkey/decoders/autoregressive.py | 5 +-
 neuralmonkey/decoders/beam_search_decoder.py | 22 ++-
 neuralmonkey/decoders/transformer.py | 117 +++----------
 4 files changed, 201 insertions(+), 110 deletions(-)
 create mode 100644 neuralmonkey/decoders/attentive.py

diff --git a/neuralmonkey/decoders/attentive.py b/neuralmonkey/decoders/attentive.py
new file mode 100644
index 000000000..a1dcbe0e0
--- /dev/null
+++ b/neuralmonkey/decoders/attentive.py
@@ -0,0 +1,167 @@
+"""TODO."""
+from typing import Callable, List, Union
+
+import tensorflow as tf
+
+from neuralmonkey.attention.base_attention import (
+    Attendable, get_attention_states, get_attention_mask)
+from neuralmonkey.attention.transformer_cross_layer import (
+    serial, parallel, flat, hierarchical)
+from neuralmonkey.logging import warn
+from neuralmonkey.model.model_part import ModelPart
+from neuralmonkey.model.parameterized import InitializerSpecs
+from neuralmonkey.nn.utils import dropout
+
+STRATEGIES = ["serial", "parallel", "flat", "hierarchical"]
+
+
+# We inherit from ModelPart to access self.train_mode, potentially creating
+# a diamond inheritance pattern in the derived class. However, this should
+# be fine since we do not override any of the class methods/attributes.
+# pylint: disable=too-few-public-methods
+class Attentive(ModelPart):
+
+    # pylint: disable=too-many-arguments
+    def __init__(self,
+                 name: str,
+                 encoders: List[Attendable],
+                 n_heads_enc: Union[List[int], int],
+                 n_heads_hier: int = None,
+                 attention_combination_strategy: str = "serial",
+                 dropout_keep_prob: float = 1.0,
+                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
+                 use_att_transform_bias: bool = False,
+                 reuse: ModelPart = None,
+                 save_checkpoint: str = None,
+                 load_checkpoint: str = None,
+                 initializers: InitializerSpecs = None) -> None:
+        """Initialize the common parameters.
+
+        Provides methods and attributes necessary for computing attention
+        across the input encoders.
+
+        Arguments:
+            name: Name of the decoder. Should be unique across all Neural
+                Monkey objects.
+            encoders: Input encoders for the decoder to attend to.
+            n_heads_enc: Number of the attention heads over each encoder.
+                Either a list of the same length as ``encoders``, or a
+                single integer. In the latter case, the number of heads is
+                equal for all encoders.
+            n_heads_hier: Number of the attention heads for the second
+                attention in the ``hierarchical`` attention combination.
+            attention_combination_strategy: One of ``serial``, ``parallel``,
+                ``flat``, ``hierarchical``. Controls the attention combination
+                strategy for enc-dec attention.
+            dropout_keep_prob: Probability of keeping a value during dropout.
+            attention_dropout_keep_prob: Probability of keeping a value
+                during dropout on the attention output.
+            use_att_transform_bias: Add bias to the feed-forward layers in
+                the attention.
+
+        TODO:
+            Generalize the attention.
+ """ + ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint, + initializers) + + self.encoders = encoders + self.n_heads_hier = n_heads_hier + self.attention_combination_strategy = attention_combination_strategy + self.dropout_keep_prob = dropout_keep_prob + self.use_att_transform_bias = use_att_transform_bias + + if isinstance(n_heads_enc, int): + if attention_combination_strategy == "flat": + self.n_heads_enc = [n_heads_enc] + else: + self.n_heads_enc = [n_heads_enc for _ in self.encoders] + else: + self.n_heads_enc = n_heads_enc + + if isinstance(attention_dropout_keep_prob, float): + self.attention_dropout_keep_prob = [ + attention_dropout_keep_prob for _ in encoders] + else: + self.attention_dropout_keep_prob = attention_dropout_keep_prob + + self.encoder_states = lambda: [get_attention_states(e) + for e in self.encoders] + self.encoder_masks = lambda: [get_attention_mask(e) + for e in self.encoders] + + if self.attention_combination_strategy not in STRATEGIES: + raise ValueError( + "Unknown attention combination strategy '{}'. " + "Allowed: {}.".format(self.attention_combination_strategy, + ", ".join(STRATEGIES))) + + if (self.attention_combination_strategy == "hierarchical" + and self.n_heads_hier is None): + raise ValueError( + "You must provide n_heads_hier when using the hierarchical " + "attention combination strategy.") + + if (self.attention_combination_strategy != "hierarchical" + and self.n_heads_hier is not None): + warn("Ignoring n_heads_hier parameter -- use the hierarchical " + "attention combination strategy instead.") + + if (self.attention_combination_strategy == "flat" + and len(self.n_heads_enc) != 1): + raise ValueError( + "For the flat attention combination strategy, only a single " + "value is permitted in n_heads_enc.") + + if any((val < 0.0 or val > 1.0) + for val in self.attention_dropout_keep_prob): + raise ValueError( + "Attention dropout keep probabilities must be " + "a real number in the interval [0,1].") + # pylint: enable=too-many-arguments + + def encoder_attention(self, queries: tf.Tensor) -> tf.Tensor: + """Compute attention context vectors over encoders using queries.""" + enc_states = self.encoder_states() + enc_masks = self.encoder_masks() + assert enc_states is not None + assert enc_masks is not None + + # Attention dropout callbacks are created in a loop so we need to + # use a factory function to prevent late binding. 
+ def make_dropout_callback( + prob: float) -> Callable[[tf.Tensor], tf.Tensor]: + def callback(x: tf.Tensor) -> tf.Tensor: + return dropout(x, prob, self.train_mode) + return callback + + dropout_cb = make_dropout_callback(self.dropout_keep_prob) + attn_dropout_cbs = [make_dropout_callback(prob) + for prob in self.attention_dropout_keep_prob] + + if self.attention_combination_strategy == "serial": + return serial(queries, enc_states, enc_masks, self.n_heads_enc, + attn_dropout_cbs, dropout_cb) + + if self.attention_combination_strategy == "parallel": + return parallel(queries, enc_states, enc_masks, self.n_heads_enc, + attn_dropout_cbs, dropout_cb) + + if self.attention_combination_strategy == "flat": + assert len(set(self.n_heads_enc)) == 1 + assert len(set(self.attention_dropout_keep_prob)) == 1 + + return flat(queries, enc_states, enc_masks, self.n_heads_enc[0], + attn_dropout_cbs[0], dropout_cb) + + if self.attention_combination_strategy == "hierarchical": + assert self.n_heads_hier is not None + + return hierarchical( + queries, enc_states, enc_masks, self.n_heads_enc, + self.n_heads_hier, attn_dropout_cbs, dropout_cb) + + # TODO: remove this - this is already checked in the constructor + raise NotImplementedError( + "Unknown attention combination strategy: {}" + .format(self.attention_combination_strategy)) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 393e71695..1c121afad 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -6,7 +6,7 @@ The autoregressive decoder uses the while loop to get the outputs. Descendants should only specify the initial state and the while loop body. """ -from typing import NamedTuple, Callable, Optional, Any, List, Dict, Tuple +from typing import NamedTuple, Callable, Optional, Any, Dict import tensorflow as tf @@ -163,9 +163,6 @@ def __init__(self, self.tie_embeddings = tie_embeddings self.supress_unk = supress_unk - self.encoder_states = lambda: [] # type: Callable[[], List[tf.Tensor]] - self.encoder_masks = lambda: [] # type: Callable[[], List[tf.Tensor]] - # Check the values of the parameters (max_output_len, ...) if self.max_output_len <= 0: raise ValueError( diff --git a/neuralmonkey/decoders/beam_search_decoder.py b/neuralmonkey/decoders/beam_search_decoder.py index afe84ddfc..f4b6318a7 100644 --- a/neuralmonkey/decoders/beam_search_decoder.py +++ b/neuralmonkey/decoders/beam_search_decoder.py @@ -31,6 +31,7 @@ from neuralmonkey.decoders.autoregressive import ( AutoregressiveDecoder, LoopState) +from neuralmonkey.decoders.attentive import Attentive from neuralmonkey.decorators import tensor from neuralmonkey.model.model_part import ModelPart from neuralmonkey.tf_utils import ( @@ -171,22 +172,25 @@ def outputs(self) -> tf.Tensor: # the graph, replace them with beam-size-times copied originals, create # the beam search graph, and then replace the inner states back. 
- enc_states = self.parent_decoder.encoder_states - enc_masks = self.parent_decoder.encoder_masks + if isinstance(self, Attentive): + enc_states = self.parent_decoder.encoder_states + enc_masks = self.parent_decoder.encoder_masks - setattr(self.parent_decoder, "encoder_states", - lambda: [self.expand_to_beam(sts) for sts in enc_states()]) - setattr(self.parent_decoder, "encoder_masks", - lambda: [self.expand_to_beam(mask) for mask in enc_masks()]) + setattr(self.parent_decoder, "encoder_states", + lambda: [self.expand_to_beam(sts) for sts in enc_states()]) + setattr(self.parent_decoder, "encoder_masks", + lambda: [self.expand_to_beam(mask) + for mask in enc_masks()]) # Create the beam search symbolic graph. with self.use_scope(): self._initial_loop_state = self.get_initial_loop_state() outputs = self.decoding_loop() - # Reassign the original encoder states and mask back - setattr(self.parent_decoder, "encoder_states", enc_states) - setattr(self.parent_decoder, "encoder_masks", enc_masks) + if isinstance(self, Attentive): + # Reassign the original encoder states and mask back + setattr(self.parent_decoder, "encoder_states", enc_states) + setattr(self.parent_decoder, "encoder_masks", enc_masks) return outputs diff --git a/neuralmonkey/decoders/transformer.py b/neuralmonkey/decoders/transformer.py index d0a74bf9a..ed9317ebf 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -11,16 +11,13 @@ from typeguard import check_argument_types from neuralmonkey.attention.scaled_dot_product import attention -from neuralmonkey.attention.base_attention import ( - Attendable, get_attention_states, get_attention_mask) -from neuralmonkey.attention.transformer_cross_layer import ( - serial, parallel, flat, hierarchical) +from neuralmonkey.attention.base_attention import Attendable from neuralmonkey.decorators import tensor from neuralmonkey.decoders.autoregressive import ( AutoregressiveDecoder, LoopState, DecoderFeedables, DecoderHistories) +from neuralmonkey.decoders.attentive import Attentive from neuralmonkey.encoders.transformer import ( TransformerLayer, position_signal) -from neuralmonkey.logging import warn from neuralmonkey.model.sequence import EmbeddedSequence from neuralmonkey.model.parameterized import InitializerSpecs from neuralmonkey.model.model_part import ModelPart @@ -30,9 +27,6 @@ from neuralmonkey.tf_utils import append_tensor, layer_norm -STRATEGIES = ["serial", "parallel", "flat", "hierarchical"] - - class TransformerFeedables(NamedTuple( "TransformerFeedables", [("input_sequence", tf.Tensor), @@ -69,7 +63,7 @@ class TransformerHistories(NamedTuple( # pylint: disable=too-many-instance-attributes -class TransformerDecoder(AutoregressiveDecoder): +class TransformerDecoder(AutoregressiveDecoder, Attentive): # pylint: disable=too-many-arguments,too-many-locals,too-many-branches def __init__(self, @@ -151,56 +145,25 @@ def __init__(self, save_checkpoint=save_checkpoint, load_checkpoint=load_checkpoint) - self.encoders = encoders + Attentive.__init__( + self, + name=name, + encoders=encoders, + n_heads_enc=n_heads_enc, + n_heads_hier=n_heads_hier, + attention_combination_strategy=attention_combination_strategy, + dropout_keep_prob=dropout_keep_prob, + attention_dropout_keep_prob=attention_dropout_keep_prob, + use_att_transform_bias=use_att_transform_bias, + reuse=reuse, + save_checkpoint=save_checkpoint, + load_checkpoint=load_checkpoint, + initializers=initializers) + self.ff_hidden_size = ff_hidden_size self.n_heads_self = n_heads_self - - if 
isinstance(n_heads_enc, int): - if attention_combination_strategy == "flat": - self.n_heads_enc = [n_heads_enc] - else: - self.n_heads_enc = [n_heads_enc for _ in self.encoders] - else: - self.n_heads_enc = n_heads_enc - self.depth = depth - if isinstance(attention_dropout_keep_prob, float): - self.attention_dropout_keep_prob = [ - attention_dropout_keep_prob for _ in encoders] - else: - self.attention_dropout_keep_prob = attention_dropout_keep_prob self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob - self.use_att_transform_bias = use_att_transform_bias - self.attention_combination_strategy = attention_combination_strategy - self.n_heads_hier = n_heads_hier - - self.encoder_states = lambda: [get_attention_states(e) - for e in self.encoders] - self.encoder_masks = lambda: [get_attention_mask(e) - for e in self.encoders] - - if self.attention_combination_strategy not in STRATEGIES: - raise ValueError( - "Unknown attention combination strategy '{}'. " - "Allowed: {}.".format(self.attention_combination_strategy, - ", ".join(STRATEGIES))) - - if (self.attention_combination_strategy == "hierarchical" - and self.n_heads_hier is None): - raise ValueError( - "You must provide n_heads_hier when using the hierarchical " - "attention combination strategy.") - - if (self.attention_combination_strategy != "hierarchical" - and self.n_heads_hier is not None): - warn("Ignoring n_heads_hier parameter -- use the hierarchical " - "attention combination strategy instead.") - - if (self.attention_combination_strategy == "flat" - and len(self.n_heads_enc) != 1): - raise ValueError( - "For the flat attention combination strategy, only a single " - "value is permitted in n_heads_enc.") self._variable_scope.set_initializer(tf.variance_scaling_initializer( mode="fan_avg", distribution="uniform")) @@ -296,48 +259,8 @@ def self_attention_sublayer( def encoder_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor: """Create the encoder-decoder attention sublayer.""" - enc_states = self.encoder_states() - enc_masks = self.encoder_masks() - assert enc_states is not None - assert enc_masks is not None - - # Attention dropout callbacks are created in a loop so we need to - # use a factory function to prevent late binding. 
- def make_attn_callback( - prob: float) -> Callable[[tf.Tensor], tf.Tensor]: - def callback(x: tf.Tensor) -> tf.Tensor: - return dropout(x, prob, self.train_mode) - return callback - - dropout_cb = make_attn_callback(self.dropout_keep_prob) - attn_dropout_cbs = [make_attn_callback(prob) - for prob in self.attention_dropout_keep_prob] - - if self.attention_combination_strategy == "serial": - return serial(queries, enc_states, enc_masks, self.n_heads_enc, - attn_dropout_cbs, dropout_cb) - - if self.attention_combination_strategy == "parallel": - return parallel(queries, enc_states, enc_masks, self.n_heads_enc, - attn_dropout_cbs, dropout_cb) - - if self.attention_combination_strategy == "flat": - assert len(set(self.n_heads_enc)) == 1 - assert len(set(self.attention_dropout_keep_prob)) == 1 - - return flat(queries, enc_states, enc_masks, self.n_heads_enc[0], - attn_dropout_cbs[0], dropout_cb) - - if self.attention_combination_strategy == "hierarchical": - assert self.n_heads_hier is not None - - return hierarchical( - queries, enc_states, enc_masks, self.n_heads_enc, - self.n_heads_hier, attn_dropout_cbs, dropout_cb) - - raise NotImplementedError( - "Unknown attention combination strategy: {}" - .format(self.attention_combination_strategy)) + + return self.encoder_attention(queries) def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor: """Create the feed-forward network sublayer.""" From 2057e0fd2197ed76a7be9b0d9e97bd31b48aef3c Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 14 Mar 2019 17:56:25 +0100 Subject: [PATCH 11/14] bs_decoder: checking correct object for Attentive --- neuralmonkey/decoders/beam_search_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralmonkey/decoders/beam_search_decoder.py b/neuralmonkey/decoders/beam_search_decoder.py index f4b6318a7..1d566295d 100644 --- a/neuralmonkey/decoders/beam_search_decoder.py +++ b/neuralmonkey/decoders/beam_search_decoder.py @@ -172,7 +172,7 @@ def outputs(self) -> tf.Tensor: # the graph, replace them with beam-size-times copied originals, create # the beam search graph, and then replace the inner states back. 
- if isinstance(self, Attentive): + if isinstance(self.parent_decoder, Attentive): enc_states = self.parent_decoder.encoder_states enc_masks = self.parent_decoder.encoder_masks @@ -187,7 +187,7 @@ def outputs(self) -> tf.Tensor: self._initial_loop_state = self.get_initial_loop_state() outputs = self.decoding_loop() - if isinstance(self, Attentive): + if isinstance(self.parent_decoder, Attentive): # Reassign the original encoder states and mask back setattr(self.parent_decoder, "encoder_states", enc_states) setattr(self.parent_decoder, "encoder_masks", enc_masks) From 36e9e1bc643b387ee511d516b57b5df1602fb316 Mon Sep 17 00:00:00 2001 From: varisd Date: Mon, 18 Mar 2019 17:46:53 +0100 Subject: [PATCH 12/14] renamed attentive encoder to structured encoder --- neuralmonkey/encoders/{attentive.py => structured.py} | 2 +- tests/classifier.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename neuralmonkey/encoders/{attentive.py => structured.py} (98%) diff --git a/neuralmonkey/encoders/attentive.py b/neuralmonkey/encoders/structured.py similarity index 98% rename from neuralmonkey/encoders/attentive.py rename to neuralmonkey/encoders/structured.py index 0c902b9d3..dc795b940 100644 --- a/neuralmonkey/encoders/attentive.py +++ b/neuralmonkey/encoders/structured.py @@ -10,7 +10,7 @@ get_attention_states, get_attention_mask, Attendable) -class AttentiveEncoder(ModelPart, TemporalStatefulWithOutput): +class StructuredEncoder(ModelPart, TemporalStatefulWithOutput): """An encoder with attention over the input and a fixed-dimension output. Based on "A Structured Self-attentive Sentence Embedding", diff --git a/tests/classifier.ini b/tests/classifier.ini index b287a80d1..b2e54f84b 100644 --- a/tests/classifier.ini +++ b/tests/classifier.ini @@ -46,7 +46,7 @@ data_id="source" vocabulary= [encoder_attentive] -class=encoders.attentive.AttentiveEncoder +class=encoders.attentive.StructuredEncoder name="attentive_encoder" input_sequence= hidden_size=9 From cfdeb423cce37c9a1b16e98ff5afac14fbf64986 Mon Sep 17 00:00:00 2001 From: varisd Date: Tue, 19 Mar 2019 14:57:11 +0100 Subject: [PATCH 13/14] fixing tests --- neuralmonkey/decoders/autoregressive.py | 2 +- neuralmonkey/decoders/transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 1c121afad..b970ae89f 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -6,7 +6,7 @@ The autoregressive decoder uses the while loop to get the outputs. Descendants should only specify the initial state and the while loop body. 
""" -from typing import NamedTuple, Callable, Optional, Any, Dict +from typing import NamedTuple, Callable, Optional, Any, Dict, Tuple import tensorflow as tf diff --git a/neuralmonkey/decoders/transformer.py b/neuralmonkey/decoders/transformer.py index ed9317ebf..85e97b020 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -4,7 +4,7 @@ """ # TODO make this code simpler # pylint: disable=too-many-lines -from typing import Any, Callable, NamedTuple, List, Union, Tuple +from typing import Any, NamedTuple, List, Union, Tuple import math import tensorflow as tf From 38c0bf8e8a3610291545302494a6f8f161db8db0 Mon Sep 17 00:00:00 2001 From: varisd Date: Thu, 2 May 2019 15:36:26 +0200 Subject: [PATCH 14/14] fixed tests/classifier.ini --- tests/classifier.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/classifier.ini b/tests/classifier.ini index b2e54f84b..9a5342887 100644 --- a/tests/classifier.ini +++ b/tests/classifier.ini @@ -45,9 +45,9 @@ dropout_keep_prob=0.5 data_id="source" vocabulary= -[encoder_attentive] -class=encoders.attentive.StructuredEncoder -name="attentive_encoder" +[encoder_structured] +class=encoders.structured.StructuredEncoder +name="structured_encoder" input_sequence= hidden_size=9 num_heads=5 @@ -77,7 +77,7 @@ contains_frequencies=False [decoder] class=decoders.classifier.Classifier name="decoder" -encoders=[, , ] +encoders=[, , ] dropout_keep_prob=0.5 layers=[10,5] data_id="classification"