diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0c67983..5b71553 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,23 +2,25 @@ name: GH on: pull_request: - branches: '*' push: - branches: 'master' - tags: '*' + branches: master + release: + types: [released, prereleased] + workflow_dispatch: # allows running workflow manually from the Actions tab jobs: CI: runs-on: ubuntu-latest strategy: - max-parallel: 8 matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8] + python-version: ['2.7', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -26,52 +28,63 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements/*.txt') }} + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/requirements/*.txt') }} restore-keys: | ${{ runner.os }}-pip- - - run: pip install -U pip setuptools wheel + - name: Pre-commit cache + uses: actions/cache@v2 + with: + path: ~/.cache/pre-commit + key: ${{ runner.os }}-pre-commit-${{ matrix.python-version }}-${{ hashFiles('**/requirements/ci.txt') }}-${{ hashFiles('.pre-commit-config.yaml') }} + restore-keys: | + ${{ runner.os }}-pre-commit- - name: Install dependencies - run: make install + run: | + pip install -U pip setuptools wheel codecov + make install - name: Lint - if: matrix.python-version == 3.8 + if: matrix.python-version == 3.11 run: make lint - name: Test run: make test + - name: Docs + if: matrix.python-version == 3.11 + run: SPHINXOPTS=-W make builddocs + - name: Codecov - if: matrix.python-version == 3.8 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | - pip install codecov codecov CD: needs: CI - if: startsWith(github.ref, 'refs/tags/') + if: github.event_name == 
'release' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.x - - name: PyPi Deploy preparation + - name: Build run: | - pip install --upgrade setuptools wheel + pip install -U pip setuptools wheel twine python setup.py sdist bdist_wheel --universal - - name: PyPi Deploy - uses: pypa/gh-action-pypi-publish@v1.2.1 - with: - user: ${{ secrets.PYPI_USER }} - password: ${{ secrets.PYPI_PASSWORD }} + - name: Publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USER }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + twine upload dist/* - uses: apexskier/github-release-commenter@v1 with: diff --git a/.gitignore b/.gitignore index d20458f..0c241fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ +# docs +docs/source/README.md +docs/source/_code_reference/ + # setuptools_scm _repo_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 71ee9f0..971aa15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,45 +1,61 @@ repos: - repo: https://github.com/Yelp/detect-secrets - rev: v1.2.0 + rev: v1.4.0 hooks: - id: detect-secrets args: ['--baseline', '.secrets.baseline'] exclude: .*/tests/.* - repo: https://github.com/psf/black - rev: 22.3.0 + # when updating this version, also update blacken-docs hook below + rev: 22.10.0 hooks: - id: black +- repo: https://github.com/asottile/blacken-docs + rev: v1.12.1 + hooks: + - id: blacken-docs + additional_dependencies: ['black==22.10.0'] + - repo: https://github.com/timothycrosley/isort rev: 5.10.1 hooks: - id: isort +- repo: local + hooks: + - id: mypy + name: Run mypy + entry: python -m mypy src/ + language: system + types: [python] + pass_filenames: false + - repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 + rev: 5.0.4 hooks: - id: flake8 additional_dependencies: [ - # 'darglint~=1.5.4', + # 'darglint~=1.8.1', 
'flake8-absolute-import~=1.0', 'flake8-blind-except~=0.2.1', - 'flake8-builtins~=1.5.3', + 'flake8-builtins~=2.0.1', 'flake8-cognitive-complexity==0.1.0', - 'flake8-comprehensions~=3.8.0', - # 'flake8-docstrings~=1.5.0', - 'flake8-logging-format~=0.6.0', + 'flake8-comprehensions~=3.10.0', + # 'flake8-docstrings~=1.6.0', + 'flake8-logging-format~=0.8.1', 'flake8-mutable~=1.2.0', - 'flake8-print~=4.0.0', - 'flake8-printf-formatting~=1.1.2', + 'flake8-print~=5.0.0', + 'flake8-printf-formatting~=1.1.0', 'flake8-pytest-style~=1.6.0', 'flake8-quotes~=3.3.1', 'flake8-tuple~=0.4.1', - 'pep8-naming~=0.12.1' + 'pep8-naming~=0.13.2' ] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: mixed-line-ending args: ['--fix=lf'] @@ -50,3 +66,11 @@ repos: - id: check-toml - id: check-xml - id: check-yaml + - id: debug-statements + +- repo: https://github.com/econchick/interrogate + rev: 1.5.0 + hooks: + - id: interrogate + pass_filenames: false + args: [-c, setup.cfg] diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..83de6f4 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,25 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF and ePub +formats: all + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.8 + install: + - method: pip + path: . 
+ - requirements: requirements/docs.txt diff --git a/Makefile b/Makefile index d2013fe..b79004c 100644 --- a/Makefile +++ b/Makefile @@ -8,13 +8,27 @@ lint: test: python -m pytest tests/ +.PHONY: showcov +## Open the test coverage overview using the default HTML viewer +showcov: + xdg-open htmlcov/index.html || open htmlcov/index.html + .PHONY: install -## Install for development +## Install this repo, plus dev requirements, in editable mode install: - pip install -r requirements/ci.txt - pip install -e . + pip install -r requirements/ci.txt -r requirements/docs.txt -e . pre-commit install || true # not installed on older python versions +.PHONY: builddocs +## Build documentation using Sphinx +builddocs: + cd docs && make docs + +.PHONY: showdocs +## Open the docs using the default HTML viewer +showdocs: + xdg-open docs/_build/html/index.html || open docs/_build/html/index.html + .PHONY: help ## Print Makefile documentation help: diff --git a/README.md b/README.md index 0a6e6db..e3f547c 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,11 @@ $ pip install retrie ## Usage -The following objects are all subclasses of [`retrie.retrie.Retrie`](src/retrie/retrie.py), which handles filling the Trie and compiling the corresponding regex pattern. +[![readthedocs](https://readthedocs.org/projects/retrie/badge/?version=latest)](https://retrie.readthedocs.io) + +For documentation, see [retrie.readthedocs.io](https://retrie.readthedocs.io/en/stable/_code_reference/retrie.html). + +The following objects are all subclasses of `retrie.retrie.Retrie`, which handles filling the Trie and compiling the corresponding regex pattern. #### Blacklist diff --git a/docs/Makefile b/docs/Makefile new file mode 100755 index 0000000..5cf6a41 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,29 @@ + +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +show: + xdg-open docs/_build/html/index.html || open docs/_build/html/index.html + +docs: + rm -rf source/_code_reference + make clean + make html diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100755 index 0000000..1336faa --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,183 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+ +import inspect +import sys +from pathlib import Path + +from sphinx.ext import apidoc + +from retrie import __version__ + +current_dir = Path(__file__).parent.absolute() +base_dir = current_dir.parents[1] +code_dir = base_dir / "src" + +sys.path.insert(0, str(code_dir)) + +readme_dest = current_dir / "README.md" +readme_src = base_dir / "README.md" + +if readme_dest.exists(): + readme_dest.unlink() +readme_dest.symlink_to(readme_src) + +# -- Project information ----------------------------------------------------- + +project = "retrie" +project_url = "https://github.com/ddelange/retrie" +author = "ddelange" +copyright = "2020, ddelange" # noqa:A001 + +# The full version, including alpha/beta/rc tags +release = __version__ + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "recommonmark", + "sphinx_rtd_theme", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.linkcode", + "sphinx.ext.intersphinx", +] +autodoc_typehints = "description" + +# recommonmark extension allows mixed filetypes +source_suffix = [".rst", ".md"] + +# Add any paths that contain templates here, relative to this directory. +# templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "sphinx_rtd_theme" + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ["_static"] + + +# -- Options for sphinx.ext.autodoc ------------------------------------------ + + +def run_apidoc(_): + exclude = [] + + argv = [ + "--doc-project", + "Code Reference", + "-M", + "-f", + "-d", + "3", + "--tocfile", + "index", + "-o", + str(current_dir / "_code_reference"), + str(code_dir), + ] + exclude + + apidoc.main(argv) + + +def setup(app): + app.connect("builder-inited", run_apidoc) + + +# -- Options for sphinx.ext.linkcode: [source] links ------------------------- + + +def linkcode_resolve( # noqa:CCR001 + domain, + info, + blob_url=f"{project_url.rstrip('/')}/blob", + default_branch="master", # branch used for untagged 'latest' builds on readthedocs + tag_prefix="", # could be for instance "v" depending on tagging convention +): + """Determine a GitHub permalink (with line numbers) for a Python object. Adapted from https://github.com/numpy/numpy/blob/v1.19.4/doc/source/conf.py.""" + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + obj = getattr(obj, part) + except Exception: + return None + + # strip decorators, which would resolve to the source of the decorator + # possibly an upstream bug in getsourcefile, bpo-1764286 + obj = inspect.unwrap(obj) + + try: + sourcefile = inspect.getsourcefile(obj) + except Exception: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except Exception: + linespec = "" + else: + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" + + try: + # editable install + relsourcefile = Path(sourcefile).relative_to(base_dir) + except ValueError as exc: + # site-packages + if "site-packages/" not in sourcefile: + raise RuntimeError( + "Expected a pip install -e, or install to 
site-packages" + ) from exc + + relsourcefile = (code_dir / sourcefile.split("site-packages/")[-1]).relative_to( + base_dir + ) + + if "dev" in release: + # setuptools_scm (setup.py) appends a dev identifier to __version__ if there are + # commits since last tag. For readthedocs, this is only the case when building + # 'latest' that is newer than 'stable', for which the default_branch is assumed. + return f"{blob_url}/{default_branch}/{relsourcefile}{linespec}" + else: + return f"{blob_url}/{tag_prefix}{release}/{relsourcefile}{linespec}" + + +# -- Options for sphinx.ext.intersphinx -------------------------------------- + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), +} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100755 index 0000000..b964b4c --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,14 @@ +.. retrie documentation master file, created by + sphinx-quickstart on Thu Jul 2 09:53:53 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to retrie's documentation! +=================================================== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + Introduction + _code_reference/index diff --git a/requirements/ci.txt b/requirements/ci.txt index 3dce88f..d16a979 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -1,5 +1,7 @@ +detect-secrets~=1.4.0 ; python_version >= "3.6" +mypy~=0.782 ; python_version >= "3.6" pre-commit~=2.9 ; python_version >= "3.6" -pytest~=4.6 # pytest 5 requires py3 +pytest pytest-cov~=2.8 pytest-env~=0.6 pytest-sugar~=0.9 diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000..f344818 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,3 @@ +recommonmark~=0.7 ; python_version >= "3.6" +sphinx-rtd-theme~=1.0 ; python_version >= "3.6" +sphinx~=4.2 ; python_version >= "3.6" diff --git a/setup.cfg b/setup.cfg index 1237f6b..bbaf00b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [tool:pytest] -addopts = -s --strict -vv --cache-clear --maxfail=1 --cov=retrie --cov-report=term --cov-report=html --cov-branch --no-cov-on-fail +addopts = -s --strict -vv --cache-clear --maxfail=1 --cov=retrie --cov-report=term --cov-report=html --cov-branch --no-cov-on-fail --ignore=docs [isort] profile = black @@ -7,20 +7,36 @@ default_section = THIRDPARTY known_first_party = tests [flake8] -extend-ignore = D10,E203,E501 +ignore = B902,D10,E203,E501,W503 max-line-length = 88 -select = A,B,C4,D,E,F,M,Q,T,W,ABS inline-quotes = double docstring-convention = google +max-cognitive-complexity = 10 [coverage:run] branch = True -omit = site-packages + +[coverage:report] +exclude_lines = + pragma: no cover [mypy] files = src/**/*.py -python_version = 3.7 ignore_missing_imports = True warn_no_return = False disallow_untyped_defs = False allow_redefinition = True + +[darglint] +strictness = short + +[tool:interrogate] +fail-under = 100 +exclude = docs,tests,setup.py,.eggs,.env,.venv,dist,src/mapply/_version.py +verbose = 1 +quiet = false +color = true +ignore-module = true +ignore-nested-functions = true 
+ignore-private = true +ignore-semiprivate = true diff --git a/setup.py b/setup.py index 26812b9..b3f8481 100644 --- a/setup.py +++ b/setup.py @@ -63,6 +63,9 @@ def read_readme(path): "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Utilities", ], diff --git a/src/retrie/__init__.py b/src/retrie/__init__.py index b280c29..100533b 100644 --- a/src/retrie/__init__.py +++ b/src/retrie/__init__.py @@ -1,7 +1,7 @@ try: from functools import cached_property # type: ignore except ImportError: # pragma: no cover - from cached_property import cached_property # noqa:F401 + from cached_property import cached_property # type: ignore # noqa:F401 # version based on .git/refs/tags - make a tag/release locally, or on GitHub (and pull) from . import _repo_version # noqa:ABS101 diff --git a/src/retrie/retrie.py b/src/retrie/retrie.py index 630d44e..92b2c78 100644 --- a/src/retrie/retrie.py +++ b/src/retrie/retrie.py @@ -1,3 +1,80 @@ +"""Submodule containing the :class:`Retrie` class, which handles filling the Trie and compiling the corresponding regex pattern, and its high-level wrappers. 
+ +The :class:`Blacklist` class can be used to filter out bad occurrences in a text or a sequence of strings: +:: + + from retrie.retrie import Blacklist + + # check out docstrings and methods + help(Blacklist) + + blacklist = Blacklist(["abc", "foo", "abs"], match_substrings=False) + blacklist.compiled + # re.compile(r'(?<=\b)(?:ab[cs]|foo)(?=\b)', re.IGNORECASE|re.UNICODE) + assert not blacklist.is_blacklisted("a foobar") + assert tuple(blacklist.filter(("good", "abc", "foobar"))) == ("good", "foobar") + assert blacklist.cleanse_text(("good abc foobar")) == "good foobar" + + blacklist = Blacklist(["abc", "foo", "abs"], match_substrings=True) + blacklist.compiled + # re.compile(r'(?:ab[cs]|foo)', re.IGNORECASE|re.UNICODE) + assert blacklist.is_blacklisted("a foobar") + assert tuple(blacklist.filter(("good", "abc", "foobar"))) == ("good",) + assert blacklist.cleanse_text(("good abc foobar")) == "good bar" + + +Similar methods are available for the :class:`Whitelist` class: +:: + + from retrie.retrie import Whitelist + + # check out docstrings and methods + help(Whitelist) + + whitelist = Whitelist(["abc", "foo", "abs"], match_substrings=False) + whitelist.compiled + # re.compile(r'(?<=\b)(?:ab[cs]|foo)(?=\b)', re.IGNORECASE|re.UNICODE) + assert not whitelist.is_whitelisted("a foobar") + assert tuple(whitelist.filter(("bad", "abc", "foobar"))) == ("abc",) + assert whitelist.cleanse_text(("bad abc foobar")) == "abc" + + whitelist = Whitelist(["abc", "foo", "abs"], match_substrings=True) + whitelist.compiled + # re.compile(r'(?:ab[cs]|foo)', re.IGNORECASE|re.UNICODE) + assert whitelist.is_whitelisted("a foobar") + assert tuple(whitelist.filter(("bad", "abc", "foobar"))) == ("abc", "foobar") + assert whitelist.cleanse_text(("bad abc foobar")) == "abcfoo" + +The :class:`Replacer` class does a fast single-pass search & replace for occurrences of ``replacement_mapping.keys()`` with corresponding values. 
+:: + + from retrie.retrie import Replacer + + # check out docstrings and methods + help(Replacer) + + replacement_mapping = dict(zip(["abc", "foo", "abs"], ["new1", "new2", "new3"])) + + replacer = Replacer(replacement_mapping, match_substrings=True) + replacer.compiled + # re.compile(r'(?:ab[cs]|foo)', re.IGNORECASE|re.UNICODE) + assert replacer.replace("ABS ...foo... foobar") == "new3 ...new2... new2bar" + + replacer = Replacer(replacement_mapping, match_substrings=False) + replacer.compiled + # re.compile(r'\b(?:ab[cs]|foo)\b', re.IGNORECASE|re.UNICODE) + assert replacer.replace("ABS ...foo... foobar") == "new3 ...new2... foobar" + + replacer = Replacer(replacement_mapping, match_substrings=False, re_flags=None) + replacer.compiled # on py3, re.UNICODE is always enabled + # re.compile(r'\b(?:ab[cs]|foo)\b') + assert replacer.replace("ABS ...foo... foobar") == "ABS ...new2... foobar" + + replacer = Replacer(replacement_mapping, match_substrings=False, word_boundary=" ") + replacer.compiled + # re.compile(r'(?<= )(?:ab[cs]|foo)(?= )', re.IGNORECASE|re.UNICODE) + assert replacer.replace(". ABS ...foo... foobar") == ". new3 ...foo... foobar" +""" import re from typing import ( Any, @@ -32,6 +109,16 @@ def _lower_keys( class Retrie: + """Wrap a :class:`retrie.trie.Trie` to compile the corresponding regex pattern with word boundary and regex flags. + + Note: + Although the Trie is case-sensitive, by default :obj:`re.IGNORECASE` is used for better performance. Pass ``re_flags=None`` to perform case-sensitive replacements. + + Args: + word_boundary (str): Token to wrap the retrie to exclude certain matches. + re_flags (re.RegexFlag): Flags passed to regex engine. + """ + __slots__ = "trie", "word_boundary", "re_flags" def __init__( @@ -39,24 +126,20 @@ def __init__( word_boundary=WORD_BOUNDARY, # type: Text re_flags=DEFAULT_FLAGS, # type: re_flag_type ): # type: (...) -> None - """Initialize Trie and set regex config. 
- - Note: - Although the Trie is case-sensitive, by default re.IGNORECASE is used. - - Args: - word_boundary (str): Token to wrap the retrie to exclude certain matches. - re_flags (re.RegexFlag): Flags passed to regex engine. - """ + """Initialize :class:`retrie.trie.Trie` and set config.""" self.trie = trie.Trie() + """The underlying :class:`retrie.trie.Trie`.""" self.word_boundary = word_boundary or "" + """The boundary token to wrap the :class:`retrie.trie.Trie` pattern in.""" self.re_flags = self.parse_re_flags(re_flags) + """Regex flags passed to :func:`re.compile`.""" @classmethod def parse_re_flags( cls, re_flags, # type: re_flag_type ): # type: (...) -> int + """Convert re_flags to integer.""" return int(re_flags) if re_flags else 0 def pattern(self): # type: (...) -> Text @@ -72,7 +155,7 @@ def compile( # noqa:A003 word_boundary=None, # type: Optional[Text] re_flags=-1, # type: re_flag_type ): # type: (...) -> Pattern[Text] - """Compile re.Pattern for the current Trie. + """Compile a :class:`re.Pattern` for the current Trie. Optionally the following args can be passed to temporarily override class attrs. @@ -100,6 +183,18 @@ def compile( # noqa:A003 class Checklist(Retrie): + """Check and mutate strings against a Retrie. + + Note: + Although the Trie is case-sensitive, by default :obj:`re.IGNORECASE` is used for better performance. Pass ``re_flags=None`` to perform case-sensitive replacements. + + Args: + keys (Sequence): Strings to build the Retrie from. + match_substrings (bool): Whether or not to override word_boundary with "". + word_boundary (str): Token to wrap the retrie to exclude certain matches. + re_flags (re.RegexFlag): Flags passed to regex engine. + """ + def __init__( self, keys, # type: Sequence[Text] @@ -107,17 +202,7 @@ def __init__( word_boundary=WORD_BOUNDARY, # type: Text re_flags=DEFAULT_FLAGS, # type: re_flag_type ): # type: (...) -> None - """Check and mutate strings against a Retrie. 
- - Note: - Although the Trie is case-sensitive, by default re.IGNORECASE is used. - - Args: - keys (Sequence): Strings to build the Retrie from. - match_substrings (bool): Wether or not to override word_boundary with "". - word_boundary (str): Token to wrap the retrie to exclude certain matches. - re_flags (re.RegexFlag): Flags passed to regex engine. - """ + """Initialize :class:`retrie.trie.Trie` and set config.""" if match_substrings: word_boundary = "" @@ -144,6 +229,18 @@ def not_listed( class Blacklist(Checklist): + """Mutate [sequences of] strings based on their match against blacklisted. + + Note: + Although the Trie is case-sensitive, by default :obj:`re.IGNORECASE` is used for better performance. Pass ``re_flags=None`` to perform case-sensitive replacements. + + Args: + blacklisted (Sequence): Strings to build the Retrie from. + match_substrings (bool): Whether or not to override word_boundary with "". + word_boundary (str): Token to wrap the retrie to exclude certain matches. + re_flags (re.RegexFlag): Flags passed to regex engine. + """ + def __init__( self, blacklisted, # type: Sequence[Text] @@ -151,17 +248,7 @@ def __init__( word_boundary=WORD_BOUNDARY, # type: Text re_flags=DEFAULT_FLAGS, # type: re_flag_type ): - """Mutate [sequences of] strings based on their match against blacklisted. - - Note: - Although the Trie is case-sensitive, by default re.IGNORECASE is used. - - Args: - blacklisted (Sequence): Strings to build the Retrie from. - match_substrings (bool): Wether or not to override word_boundary with "". - word_boundary (str): Token to wrap the retrie to exclude certain matches. - re_flags (re.RegexFlag): Flags passed to regex engine. 
- """ + """Initialize :class:`retrie.trie.Trie` and set config.""" Checklist.__init__( self, keys=blacklisted, @@ -191,6 +278,18 @@ def cleanse_text( class Whitelist(Checklist): + """Mutate [sequences of] strings based on their match against whitelisted. 
+ + Note: + Although the Trie is case-sensitive, by default :obj:`re.IGNORECASE` is used for better performance. Pass ``re_flags=None`` to perform case-sensitive replacements. + + Args: + whitelisted (Sequence): Strings to build the Retrie from. + match_substrings (bool): Whether or not to override word_boundary with "". + word_boundary (str): Token to wrap the retrie to exclude certain matches. + re_flags (re.RegexFlag): Flags passed to regex engine. + """ + def __init__( self, whitelisted, # type: Sequence[Text] @@ -198,17 +297,7 @@ def __init__( word_boundary=WORD_BOUNDARY, # type: Text re_flags=DEFAULT_FLAGS, # type: re_flag_type ): - """Mutate [sequences of] strings based on their match against whitelisted. - - Note: - Although the Trie is case-sensitive, by default re.IGNORECASE is used. - - Args: - whitelisted (Sequence): Strings to build the Retrie from. - match_substrings (bool): Wether or not to override word_boundary with "". - word_boundary (str): Token to wrap the retrie to exclude certain matches. - re_flags (re.RegexFlag): Flags passed to regex engine. - """ + """Initialize :class:`retrie.trie.Trie` and set config.""" Checklist.__init__( self, keys=whitelisted, @@ -238,6 +327,18 @@ def cleanse_text( class Replacer(Checklist): + """Replace occurrences of ``replacement_mapping.keys()`` with corresponding values. + + Note: + Although the Trie is case-sensitive, by default :obj:`re.IGNORECASE` is used for better performance. Pass ``re_flags=None`` to perform case-sensitive replacements. + + Args: + replacement_mapping (Mapping): Mapping ``{old: new}`` to replace. + match_substrings (bool): Whether or not to override word_boundary with "". + word_boundary (str): Token to wrap the retrie to exclude certain matches. + re_flags (re.RegexFlag): Flags passed to regex engine. 
+ """ + __slots__ = "replacement_mapping" def __init__( @@ -247,17 +348,7 @@ def __init__( word_boundary=WORD_BOUNDARY, # type: Text re_flags=DEFAULT_FLAGS, # type: re_flag_type ): - """Replace occurrences of replacement_mapping.keys() with corresponding values. - - Note: - Although the Trie is case-sensitive, by default re.IGNORECASE is used. - - Args: - replacement_mapping (Mapping): Mapping {old: new} to replace. - match_substrings (bool): Wether or not to override word_boundary with "". - word_boundary (str): Token to wrap the retrie to exclude certain matches. - re_flags (re.RegexFlag): Flags passed to regex engine. - """ + """Initialize :class:`retrie.trie.Trie` and set config.""" Checklist.__init__( self, keys=tuple(replacement_mapping.keys()), @@ -291,5 +382,8 @@ def replace( Args: text (str): String to search & replace. count (int): Amount of occurences to replace. If 0 or omitted, replace all. + + Returns: + str: String with matches replaced. """ return self.compiled.sub(self._replace, text, count=count) diff --git a/src/retrie/trie.py b/src/retrie/trie.py index 7b5a8a4..42949c6 100644 --- a/src/retrie/trie.py +++ b/src/retrie/trie.py @@ -1,7 +1,28 @@ +"""Submodule containing code to build a regex pattern from a trie of strings. 
+ +Standalone usage: +:: + + from retrie.trie import Trie + + trie = Trie() + + trie.add("abc", "foo", "abs") + assert trie.pattern() == "(?:ab[cs]|foo)" # equivalent to but faster than "(?:abc|abs|foo)" + + trie.add("absolute") + assert trie.pattern() == "(?:ab(?:c|s(?:olute)?)|foo)" + + trie.add("abx") + assert trie.pattern() == "(?:ab(?:[cx]|s(?:olute)?)|foo)" + + trie.add("abxy") + assert trie.pattern() == "(?:ab(?:c|s(?:olute)?|xy?)|foo)" +""" import re -from typing import Dict, Optional, Text, Union +from typing import Dict, Optional, Text -data_type = Dict[Text, Union[int, Dict]] +data_type = Dict[Text, Dict] class Trie: @@ -15,16 +36,19 @@ class Trie: __slots__ = "data" def __init__(self): + """Initialize data dictionary.""" self.data = {} # type: data_type - def add(self, *word): + def add( + self, *word # type: Text + ): """Add one or more words to the current Trie.""" for word in word: ref = self.data for char in word: ref[char] = ref.get(char, {}) ref = ref[char] - ref[""] = 1 + ref[""] = {} def dump(self): # type: (...) -> data_type """Dump the current trie as dictionary.""" @@ -35,7 +59,9 @@ def pattern(self): # type: (...) -> Text return self._pattern(self.dump()) or "" @classmethod - def _pattern(cls, data): # type: (...) -> Optional[Text] + def _pattern( # noqa: CCR001 + cls, data # type: Dict + ): # type: (...) -> Optional[Text] """Build regex string from Trie.""" if not data or len(data) == 1 and "" in data: return None @@ -44,7 +70,7 @@ def _pattern(cls, data): # type: (...) 
-> Optional[Text] current = [] leaf_reached = False for char in sorted(data): - if data[char] == 1: + if char == "": leaf_reached = True continue diff --git a/tests/test_retrie.py b/tests/test_retrie.py index d22c82f..1e35e2d 100644 --- a/tests/test_retrie.py +++ b/tests/test_retrie.py @@ -3,10 +3,9 @@ from retrie.retrie import Blacklist, Replacer, Retrie, Whitelist -def test_Retrie(): +def test_retrie(): retrie = Retrie() - for term in ["abc", "foo", "abs"]: - retrie.trie.add(term) + retrie.trie.add("abc", "foo", "abs") assert retrie.pattern() == "(?:ab[cs]|foo)" @@ -45,7 +44,7 @@ def test_Retrie(): assert match.group(0) == "abcy" -def test_Blacklist(): +def test_blacklist(): blacklist = Blacklist(["abc", "foo", "abs"], match_substrings=False) assert not blacklist.is_blacklisted("a foobar") assert tuple(blacklist.filter(("good", "abc", "foobar"))) == ("good", "foobar") @@ -57,7 +56,7 @@ def test_Blacklist(): assert blacklist.cleanse_text(("good abc foobar")) == "good bar" -def test_Whitelist(): +def test_whitelist(): whitelist = Whitelist(["abc", "foo", "abs"], match_substrings=False) assert not whitelist.is_whitelisted("a foobar") assert tuple(whitelist.filter(("bad", "abc", "foobar"))) == ("abc",) @@ -69,7 +68,7 @@ def test_Whitelist(): assert whitelist.cleanse_text(("bad abc foobar")) == "abcfoo" -def test_Replacer(): +def test_replacer(): replacement_mapping = dict(zip(["abc", "foo", "abs"], ["new1", "new2", "new3"])) replacer = Replacer(replacement_mapping, match_substrings=True) @@ -88,7 +87,7 @@ def test_Replacer(): def test__lower_keys(): replacement_mapping = {"x": 0, "X": 1} - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match="Ambiguous replacement_mapping") as excinfo: Replacer(replacement_mapping) # re.IGNORECASE enabled by default assert ( str(excinfo.value) diff --git a/tests/test_trie.py b/tests/test_trie.py index 4483001..6f19d05 100644 --- a/tests/test_trie.py +++ b/tests/test_trie.py @@ -1,7 +1,7 @@ from 
retrie.trie import Trie -def test_Trie(): +def test_trie(): trie = Trie() assert trie.pattern() == ""