diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..f13a773d2e --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,68 @@ +# +# This workflow rebuilds documentation and stores the resulting patch as a +# workflow artifact. We can then download the artifact, apply the patch, and +# then push the changes. +# +# It's possible to do all this locally on a developer's machine, but it's not +# trivial, because it requires many pre-requisites. +# +name: Rebuild documentation +on: workflow_dispatch +jobs: + docs: + name: Rebuild documentation + timeout-minutes: 180 + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + # + # We use Py3.8 here for historical reasons. + # + python-version: "3.8" + + - name: Update pip + run: python -m pip install -U pip + + - name: Install apt packages for LaTeX rendering + run: | + sudo apt-get -yq update + sudo apt-get -yq remove texlive-binaries --purge + sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-fonts-recommended latexmk + sudo apt-get -yq install build-essential python3.8-dev + - name: Install gensim and its dependencies + run: pip install -e .[docs] + + - name: Build documentation + run: | + python setup.py build_ext --inplace + make -C docs/src clean html + + - name: Check changes to prebuilt docs + run: | + git config user.email "noreply@github.com" + git config user.name "Gensim Docs Build" + if ! git diff --quiet @ ; then + git add . 
+ branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)" + git commit -m "Import rebuilt documentation for branch $branch" + git format-patch @^ + git bundle create prebuilt-docs-changes.bundle @^...@ + git reset --mixed @^ + git diff --exit-code --stat @ + fi + + - name: Upload prebuilt docs changes + if: always() + uses: actions/upload-artifact@v3 + with: + name: prebuilt-docs-changes + path: | + *.patch + *.bundle diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 31544d6b26..6c77538911 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -36,16 +36,113 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.12.0 + uses: pypa/cibuildwheel@v2.13.1 env: CIBW_ARCHS_LINUX: x86_64 aarch64 CIBW_ARCHS_MACOS: x86_64 arm64 - CIBW_ARCHS_WINDOWS: AMD64 x86 ARM64 - CIBW_BEFORE_BUILD: pip install numpy scipy + CIBW_ARCHS_WINDOWS: AMD64 x86 CIBW_SKIP: pp* cp36-* cp37-* *-win32 *_i686 *-musllinux_* CIBW_TEST_COMMAND: pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim CIBW_TEST_REQUIRES: pytest testfixtures mock - CIBW_TEST_SKIP: cp38* cp39* cp310* *_aarch64 *_arm64 *_universal2 + CIBW_TEST_SKIP: cp38* cp39* cp310* cp311* *_aarch64 *_arm64 *_universal2 + CIBW_BUILD_VERBOSITY: 3 + + - name: Upload wheels as artifacts + if: always() + uses: actions/upload-artifact@v3 + with: + name: wheels-${{ matrix.os }} + path: wheelhouse/*.whl + + test: + name: Test wheel for ${{ matrix.os }} Python ${{ matrix.python }} + needs: build_wheels + strategy: + fail-fast: false + matrix: + include: + - {python: '3.8', os: macos-11} + - {python: '3.9', os: macos-11} + - {python: '3.10', os: macos-11} + - {python: '3.11', os: macos-11} + + - {python: '3.8', os: ubuntu-20.04} + - {python: '3.9', os: ubuntu-20.04} + - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} + + - {python: '3.8', os: windows-2019} + - {python: '3.9', os: windows-2019} + + # + # We know that 
the Windows Py3.10 wheels are broken. Don't test them, + # because it will fail the entire workflow. + # + # https://github.com/RaRe-Technologies/gensim/issues/3489 + # + # - {python: '3.10', os: windows-2019} + - {python: '3.11', os: windows-2019} + + runs-on: ${{ matrix.os }} + steps: + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + + - name: Downloads build artifacts + uses: actions/download-artifact@v3 + with: + path: artifacts/ + + # + # We want to make sure our wheels run against older Numpy versions + # + - name: Install oldest-supported-numpy + run: python -m pip install oldest-supported-numpy + + # + # Avoid checking out the entire gensim repo to get just one file + # + - name: Download installwheel.py + run: curl "https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/.github/workflows/installwheel.py" --output installwheel.py --silent + + - name: Install wheel + run: python installwheel.py artifacts/wheels-${{ matrix.os }} + + - name: Debug test environment + run: | + pip freeze + python -c 'import numpy;print(numpy.__file__)' + python -c 'import numpy;print(numpy.__version__)' + + # + # If the wheel was incorrectly built, then this will fail. + # https://github.com/RaRe-Technologies/gensim/issues/3097 + # + - name: Test wheel + run: python -c 'import gensim' + + upload: + name: Upload to S3 + if: always() + needs: build_wheels + runs-on: ubuntu-latest + steps: + + - name: Install wheel uploading tool + run: python -m pip install wheelhouse-uploader + + - name: Downloads build artifacts + uses: actions/download-artifact@v3 + with: + path: artifacts/ + + - name: Check files + run: tree artifacts/ + + - name: Move all wheels into one folder + run: mkdir wheelhouse ; find artifacts/ -name '*.whl' -exec mv -v {} wheelhouse/ \; - name: Upload wheels to s3://gensim-wheels # @@ -53,15 +150,8 @@ jobs: # This means that PRs will still build wheels, but not upload them. 
# (PRs do not have access to secrets). # - # The always() ensures this step runs even if a previous step fails. - # We want to upload wheels whenever possible (even if e.g. tests failed) - # because we don't want an innocuous test failure from blocking a release. - # - if: ${{ always() && env.WHEELHOUSE_UPLOADER_USERNAME && env.WHEELHOUSE_UPLOADER_SECRET }} - run: | - python -m pip install wheelhouse-uploader - ls wheelhouse/*.whl - python -m wheelhouse_uploader upload --local-folder wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn + if: ${{ env.WHEELHOUSE_UPLOADER_USERNAME && env.WHEELHOUSE_UPLOADER_SECRET }} + run: python -m wheelhouse_uploader upload --local-folder wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn env: WHEELHOUSE_UPLOADER_USERNAME: ${{ secrets.AWS_ACCESS_KEY_ID }} WHEELHOUSE_UPLOADER_SECRET: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/installwheel.py b/.github/workflows/installwheel.py new file mode 100644 index 0000000000..43a6b7c833 --- /dev/null +++ b/.github/workflows/installwheel.py @@ -0,0 +1,35 @@ +"""Install a wheel for the current platform.""" +import os +import platform +import subprocess +import sys + + +def main(): + subdir = sys.argv[1] + vi = sys.version_info + + if platform.system() in ('Linux', 'Darwin'): + arch = 'x86_64' + else: + arch = 'amd64' + + want = f'-cp{vi.major}{vi.minor}-' + suffix = f'_{arch}.whl' + + files = sorted(os.listdir(subdir)) + for f in files: + if want in f and f.endswith(suffix): + command = [sys.executable, '-m', 'pip', 'install', os.path.join(subdir, f)] + subprocess.check_call(command) + return 0 + + print(f'no matches for {want} / {suffix} in {subdir}:') + print('\n'.join(files)) + + return 1 + + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/.github/workflows/test_wheel.py b/.github/workflows/test_wheel.py new file mode 100755 index 0000000000..bfcbc81f2e --- /dev/null +++ b/.github/workflows/test_wheel.py @@ -0,0 
+1,76 @@ +#!/usr/bin/env python +"""Test a Gensim wheel stored on S3. + +Downloads the wheel, installs it into a fresh working environment, and then runs gensim tests. + +usage: + + python test_wheel.py URL $(which python3.10) + +where the URL comes from http://gensim-wheels.s3-website-us-east-1.amazonaws.com/ +""" + +import argparse +import io +import os +import subprocess +import tempfile +import urllib.parse +import urllib.request +import shutil +import sys + +curr_dir = os.path.dirname(os.path.abspath(__file__)) + + +def run(*command, **kwargs): + print("-" * 70, file=sys.stderr) + print(" ".join(command), file=sys.stderr) + print("-" * 70, file=sys.stderr) + subprocess.check_call(command, **kwargs) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("wheel_path", help="The location of the wheel. May be a URL or local path") + parser.add_argument("python", help="Which python binary to use to test the wheel") + parser.add_argument("--gensim-path", default=os.path.expanduser("~/git/gensim"), help="Where the gensim repo lives") + parser.add_argument("--keep", action="store_true", help="Do not delete the sandbox after testing") + parser.add_argument("--test", default="test", help="Specify which tests to run") + args = parser.parse_args() + + _, python_version = subprocess.check_output([args.python, "--version"]).decode().strip().split(" ", 1) + + tmpdir = tempfile.mkdtemp(prefix=f"test_wheel-py{python_version}-") + + try: + tmp_test_path = os.path.join(tmpdir, "test") + shutil.copytree(os.path.join(args.gensim_path, "gensim/test"), tmp_test_path) + + if args.wheel_path.startswith("http://") or args.wheel_path.startswith("https://"): + parsed = urllib.parse.urlparse(args.wheel_path) + filename = parsed.path.split('/')[-1] + wheel_path = os.path.join(tmpdir, filename) + urllib.request.urlretrieve(args.wheel_path, wheel_path) + else: + wheel_path = args.wheel_path + + env_path = os.path.join(tmpdir, "env") + run("virtualenv", "-p", args.python,
env_path) + + python_exe = os.path.join(tmpdir, "env/bin/python") + run(python_exe, "-m", "pip", "install", wheel_path) + run(python_exe, "-m", "pip", "install", "mock", "pytest", "testfixtures") + + pytest_exe = os.path.join(tmpdir, "env/bin/pytest") + run(pytest_exe, "-vvv", args.test, "--durations", "0", cwd=tmpdir) + finally: + if args.keep: + print(f"keeping {tmpdir}, remove it yourself when done") + else: + shutil.rmtree(tmpdir) + + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b63b5d89b2..c04ca3aaa4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,14 +51,10 @@ jobs: run: | python setup.py build_ext --inplace make -C docs/src clean html - # - # FIXME: do we want to store the built documentation somewhere, or is - # knowing that the docs built successfully enough? - # tests: name: test ${{ matrix.os }} python ${{ matrix.python }} - timeout-minutes: 30 + timeout-minutes: 20 runs-on: ${{ matrix.os }} defaults: run: @@ -67,13 +63,13 @@ jobs: fail-fast: false matrix: include: - - {python: 3.8, os: ubuntu-20.04} - - {python: 3.9, os: ubuntu-20.04} + - {python: '3.8', os: ubuntu-20.04} + - {python: '3.9', os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} - {python: '3.11', os: ubuntu-20.04} - - {python: 3.8, os: windows-2019} - - {python: 3.9, os: windows-2019} + - {python: '3.8', os: windows-2019} + - {python: '3.9', os: windows-2019} - {python: '3.10', os: windows-2019} - {python: '3.11', os: windows-2019} @@ -112,6 +108,7 @@ jobs: sudo apt-get update -y sudo apt-get install -y gdb ulimit -c unlimited -S # enable core dumps + - name: Install gensim and its dependencies if: matrix.os != 'windows' run: pip install -e .[test] @@ -120,21 +117,45 @@ jobs: if: matrix.os == 'windows' run: pip install -e .[test-win] - - name: Build + - run: pip freeze + + - name: Show numpy configuration + run: python -c 'import numpy;numpy.show_config()' + + - name: Show libraries 
packaged by numpy run: | - python --version - pip --version - python setup.py build_ext --inplace + python -c 'import numpy;import os;print(os.listdir(os.path.join(os.path.dirname(numpy.__file__),"..","numpy.libs")))' || echo + python -c 'import numpy;import os;print(os.listdir(os.path.join(os.path.dirname(numpy.__file__),".libs")))' || echo + python -c 'import numpy;import os;print(os.listdir(os.path.join(os.path.dirname(numpy.__file__),".dylibs")))' || echo + + + # + # Nb. pip 23.2.1 and newer will quietly run build_ext using the Cython + # version specified in pyproject.toml, so we don't need to run build_ext + # ourselves. + # + # It's helpful to know what cython version was actually used to build the + # extensions: it'll be in the file headers. + # + # + - name: Examine build output + run: | + ls gensim/models/*.so gensim/models/*.dylib gensim/models/*.dll || true + head gensim/models/*.c gensim/models/*.cpp || true + + - name: Output FAST_VERSION + run: python -c 'from gensim import models;print(models.FAST_VERSION)' + # # Some of our tests are hanging, and I strongly suspect it's because of the coverage plugin.
# - name: Run tests (without coverage) if: matrix.coverage != true - run: pytest -v gensim/test + run: pytest -v gensim/test --durations 0 - name: Run tests (with coverage) if: matrix.coverage == true - run: pytest -v gensim/test --cov=gensim/ --cov-report=xml + run: pytest -v gensim/test --cov=gensim/ --cov-report=xml --durations 0 - name: Upload coverage to Codecov if: matrix.coverage == true diff --git a/.gitignore b/.gitignore index 8853bd683a..95e8669a28 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ data *.inv *.js docs/_images/ +docs/_downloads/ # # Generated by Cython @@ -88,5 +89,6 @@ gensim/models/fasttext_inner.c gensim/models/nmf_pgd.c gensim/models/word2vec_corpusfile.cpp gensim/models/word2vec_inner.c +gensim/similarities/fastss.c .ipynb_checkpoints diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6502f68e57..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,73 +0,0 @@ -branches: - only: - - /release-.+/ - - travis -language: python -arch: arm64-graviton2 -# -# https://docs.travis-ci.com/user/reference/focal/ -# -dist: focal -virt: vm -group: edge -services: docker -env: - global: - - REPO_DIR=gensim - - BUILD_COMMIT=HEAD - - UNICODE_WIDTH=32 - - PLAT=aarch64 - - MB_ML_VER=2014 - - SKIP_NETWORK_TESTS=1 - # - # The contents of this file mirror the linux_testenv list - # in gensim's setup.py. I can't think of a better way to include - # them here for now. They'll get picked up by the multibuild stuff - # running in multibuild/common_utils.sh. - # - - TEST_DEPENDS="pytest mock cython POT testfixtures python-levenshtein==0.12.0 scikit-learn" - -matrix: - # - # See .github/workflows/build-wheels.yml for a discussion of why we - # handle numpy versions explicitly. - # - # Usually, multibuild is clever enough to pick the right docker test - # image. That is not the case for Py3.8, so we specify the image - # ourselves. 
- # - - os: linux - env: - - MB_PYTHON_VERSION=3.8 - - BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0" - - DOCKER_TEST_IMAGE=multibuild/xenial_arm64v8 - - os: linux - env: - - MB_PYTHON_VERSION=3.9 - # - # oldest-supported-numpy does not seem to handle this particular case - # (aarch64, Py3.9) explicitly, but I've double-checked that wheels for - # this numpy release are available via PyPI. - # - - BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0" - - os: linux - env: - - MB_PYTHON_VERSION=3.10 - - BUILD_DEPENDS="numpy==1.22.2 scipy==1.8.0 importlib-metadata==5.1.0" - - os: linux - env: - - MB_PYTHON_VERSION=3.11 - - BUILD_DEPENDS="numpy==1.23.2 scipy==1.9.2 importlib-metadata==5.1.0" - -before_install: - - source multibuild/common_utils.sh - - source multibuild/travis_steps.sh - - before_install -install: - - build_wheel $REPO_DIR $PLAT -script: - - install_run $PLAT -after_script: - - ls -laht ${TRAVIS_BUILD_DIR}/wheelhouse/ - - pip install wheelhouse-uploader - - python -m wheelhouse_uploader upload --local-folder ${TRAVIS_BUILD_DIR}/wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn diff --git a/CHANGELOG.md b/CHANGELOG.md index b78ff30f78..91984fded2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ Changes ======= +## 4.3.2, 2023-08-23 + +### :red_circle: Bug fixes + +* Fix incorrect conversion of cosine distance to cosine similarity (__[monash849](https://github.com/monash849)__, [#3441](https://github.com/RaRe-Technologies/gensim/pull/3441)) + +### :books: Tutorial and doc improvements + +* Fix inconsistent documentation for LdaSeqModel #3474 (__[rsokolewicz](https://github.com/rsokolewicz)__, [#3475](https://github.com/RaRe-Technologies/gensim/pull/3475)) +* Update the licence link to LGPLv2.1 (__[ERijck](https://github.com/ERijck)__, [#3471](https://github.com/RaRe-Technologies/gensim/pull/3471)) +* Replace HTTP with HTTPS in enwiki URLs (__[Holmes5](https://github.com/Holmes5)__, 
[#3459](https://github.com/RaRe-Technologies/gensim/pull/3459)) +* Update broken/redirecting/unencrypted links (__[pabs3](https://github.com/pabs3)__, [#3456](https://github.com/RaRe-Technologies/gensim/pull/3456)) +* Update Python version in docs (__[gliptak](https://github.com/gliptak)__, [#3446](https://github.com/RaRe-Technologies/gensim/pull/3446)) + +### :+1: Improvements + +* Remove unused dependency, handle ImportError (__[mpenkov](https://github.com/mpenkov)__, [#3447](https://github.com/RaRe-Technologies/gensim/pull/3447)) +* Sanity check for `hs` and `negative` in Word2Vec (__[gau-nernst](https://github.com/gau-nernst)__, [#3443](https://github.com/RaRe-Technologies/gensim/pull/3443)) + +### 🔮 Testing, CI, housekeeping + +* Fix CI test and wheel building workflow (__[mpenkov](https://github.com/mpenkov)__, [#3488](https://github.com/RaRe-Technologies/gensim/pull/3488)) +* Build wheels with oldest supported numpy (__[PrimozGodec](https://github.com/PrimozGodec)__, [#3467](https://github.com/RaRe-Technologies/gensim/pull/3467)) +* Bump pypa/cibuildwheel from 2.12.1 to 2.13.1 (__[dependabot[bot]](https://github.com/apps/dependabot)__, [#3483](https://github.com/RaRe-Technologies/gensim/pull/3483)) +* Doc fixes and separate workflow for building docs via CI (__[pabs3](https://github.com/pabs3)__, [#3462](https://github.com/RaRe-Technologies/gensim/pull/3462)) +* Move wheels upload into its own job (__[nikaro](https://github.com/nikaro)__, [#3454](https://github.com/RaRe-Technologies/gensim/pull/3454)) +* Enable arm64/aarch64 wheel builds (__[nikaro](https://github.com/nikaro)__, [#3448](https://github.com/RaRe-Technologies/gensim/pull/3448)) ## 4.3.1, 2022-03-09 @@ -290,7 +317,7 @@ Gensim 4.0 is a major release with lots of performance & robustness improvements * Dropped Python 2. Gensim 4.0 is Py3.6+. Read our [Python version support policy](https://github.com/RaRe-Technologies/gensim/wiki/Gensim-And-Compatibility). 
- If you still need Python 2 for some reason, stay at [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3). -* A new [Gensim website](https://radimrehurek.com/gensim) – finally! 🙃 +* A new [Gensim website](https://radimrehurek.com/gensim/) – finally! 🙃 So, a major clean-up release overall. We're happy with this **tighter, leaner and faster Gensim**. @@ -486,7 +513,7 @@ This is the direction we'll keep going forward: less kitchen-sink of "latest aca ### Why pre-release? -This 4.0.0beta pre-release is for users who want the **cutting edge performance and bug fixes**. Plus users who want to help out, by **testing and providing feedback**: code, documentation, workflows… Please let us know on the [mailing list](https://groups.google.com/forum/#!forum/gensim)! +This 4.0.0beta pre-release is for users who want the **cutting edge performance and bug fixes**. Plus users who want to help out, by **testing and providing feedback**: code, documentation, workflows… Please let us know on the [mailing list](https://groups.google.com/g/gensim)! 
Install the pre-release with: @@ -2557,7 +2584,7 @@ Tutorial and doc improvements: * transactional similarity server: see docs/simserver.html * website moved from university hosting to radimrehurek.com * much improved speed of lsi[corpus] transformation: -* accuracy tests of incremental svd: test/svd_error.py and http://groups.google.com/group/gensim/browse_thread/thread/4b605b72f8062770 +* accuracy tests of incremental svd: test/svd_error.py and https://groups.google.com/g/gensim/c/S2BbcvgGJ3A * further improvements to memory-efficiency of LDA and LSA * improved wiki preprocessing (thx to Luca de Alfaro) * model.print_topics() debug fncs now support std output, in addition to logging (thx to Homer Strong) @@ -2683,3 +2710,4 @@ Tutorial and doc improvements: ## 0.2.0 * First version + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 09f2f5a870..9ace1bdcda 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ First, please see [contribution-guide.org](http://www.contribution-guide.org/) f Also, please check the [Gensim FAQ](https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ) page before posting. -**The proper place for open-ended questions is the [Gensim mailing list](https://groups.google.com/forum/#!forum/gensim).** Github is not the right place for research discussions or feature requests. +**The proper place for open-ended questions is the [Gensim mailing list](https://groups.google.com/g/gensim).** Github is not the right place for research discussions or feature requests. # How to add a new feature or create a pull request? diff --git a/HACKTOBERFEST.md b/HACKTOBERFEST.md index b60d1ffa41..4682f2f135 100644 --- a/HACKTOBERFEST.md +++ b/HACKTOBERFEST.md @@ -28,7 +28,7 @@ Check out the following: ## Questions -If you have a general question about Gensim, please ask on the [mailing list](https://groups.google.com/forum/#!forum/gensim). 
+If you have a general question about Gensim, please ask on the [mailing list](https://groups.google.com/g/gensim). If you have a question about a specific issue or PR, just ask there directly, and we'll get back to you as soon as we can. Otherwise, ping @mpenkov on [Twitter](https://twitter.com/mpenkov) or [Telegram](https://t.me/mpenkov). diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 95121b30ae..91904dbb45 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -1,7 +1,7 @@