diff --git a/docs/docs/extractors/flashtext.md b/docs/docs/extractors/flashtext.md index 24f600e..6d7d461 100644 --- a/docs/docs/extractors/flashtext.md +++ b/docs/docs/extractors/flashtext.md @@ -29,6 +29,7 @@ use the parameter `non_word_boundaries` - **entity_name**: the name of the entity to attach to the message - **case_sensitive**: whether to consider case when matching entities. `False` by default. - **non_word_boundaries**: characters which shouldn't be considered word boundaries. +- **encoding**: the name of the encoding used to read the lookup text file. `None` by default, which falls back to the platform's default encoding. ## Base Usage diff --git a/rasa_nlu_examples/extractors/flashtext_entity_extractor.py b/rasa_nlu_examples/extractors/flashtext_entity_extractor.py index 59f328d..ced2e53 100644 --- a/rasa_nlu_examples/extractors/flashtext_entity_extractor.py +++ b/rasa_nlu_examples/extractors/flashtext_entity_extractor.py @@ -45,6 +45,7 @@ def get_default_config() -> Dict[Text, Any]: "non_word_boundaries": "", "path": None, "entity_name": None, + "encoding": None, } def __init__( @@ -63,7 +64,9 @@ def __init__( ) for non_word_boundary in config["non_word_boundaries"]: self.keyword_processor.add_non_word_boundary(non_word_boundary) - words = pathlib.Path(self.path).read_text().split("\n") + words = ( + pathlib.Path(self.path).read_text(encoding=config["encoding"]).split("\n") + ) if len(words) == 0: rasa.shared.utils.io.raise_warning( f"No words found in the {pathlib.Path(self.path)} file."